Week 5 Friday#

Announcements#

  • Videos for next week posted soon.

  • Next worksheets will be given out Monday/Wednesday. Due Tuesday of Week 7.

  • No in-class quiz during Week 6.

  • Midterms will probably be returned early Week 6.

  • (Totally optional of course.) See the two attached recruitment flyers that I was asked to share.

Introduction to Machine Learning#

Slides available in the course notes.

K-means clustering using scikit-learn#

Using KMeans from scikit-learn, cluster the penguins data using the columns “bill_length_mm” and “flipper_length_mm”.

import pandas as pd
import altair as alt
import seaborn as sns
df = sns.load_dataset("penguins")
col0 = "bill_length_mm"
col1 = "flipper_length_mm"

There is a subtle mistake in the following. We are using "col0" instead of col0, so Altair is looking for a column whose name is the string "col0".

alt.Chart(df).mark_circle().encode(
    x=alt.X("col0", scale=alt.Scale(zero=False)),
    y=alt.Y("col1", scale=alt.Scale(zero=False))
)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
File ~/miniconda3/envs/math10f22/lib/python3.9/site-packages/altair/vegalite/v4/api.py:2020, in Chart.to_dict(self, *args, **kwargs)
   2018     copy.data = core.InlineData(values=[{}])
   2019     return super(Chart, copy).to_dict(*args, **kwargs)
-> 2020 return super().to_dict(*args, **kwargs)

File ~/miniconda3/envs/math10f22/lib/python3.9/site-packages/altair/vegalite/v4/api.py:384, in TopLevelMixin.to_dict(self, *args, **kwargs)
    381 kwargs["context"] = context
    383 try:
--> 384     dct = super(TopLevelMixin, copy).to_dict(*args, **kwargs)
    385 except jsonschema.ValidationError:
    386     dct = None

File ~/miniconda3/envs/math10f22/lib/python3.9/site-packages/altair/utils/schemapi.py:326, in SchemaBase.to_dict(self, validate, ignore, context)
    324     result = _todict(self._args[0], validate=sub_validate, context=context)
    325 elif not self._args:
--> 326     result = _todict(
    327         {k: v for k, v in self._kwds.items() if k not in ignore},
    328         validate=sub_validate,
    329         context=context,
    330     )
    331 else:
    332     raise ValueError(
    333         "{} instance has both a value and properties : "
    334         "cannot serialize to dict".format(self.__class__)
    335     )

File ~/miniconda3/envs/math10f22/lib/python3.9/site-packages/altair/utils/schemapi.py:60, in _todict(obj, validate, context)
     58     return [_todict(v, validate, context) for v in obj]
     59 elif isinstance(obj, dict):
---> 60     return {
     61         k: _todict(v, validate, context)
     62         for k, v in obj.items()
     63         if v is not Undefined
     64     }
     65 elif hasattr(obj, "to_dict"):
     66     return obj.to_dict()

File ~/miniconda3/envs/math10f22/lib/python3.9/site-packages/altair/utils/schemapi.py:61, in <dictcomp>(.0)
     58     return [_todict(v, validate, context) for v in obj]
     59 elif isinstance(obj, dict):
     60     return {
---> 61         k: _todict(v, validate, context)
     62         for k, v in obj.items()
     63         if v is not Undefined
     64     }
     65 elif hasattr(obj, "to_dict"):
     66     return obj.to_dict()

File ~/miniconda3/envs/math10f22/lib/python3.9/site-packages/altair/utils/schemapi.py:56, in _todict(obj, validate, context)
     54 """Convert an object to a dict representation."""
     55 if isinstance(obj, SchemaBase):
---> 56     return obj.to_dict(validate=validate, context=context)
     57 elif isinstance(obj, (list, tuple, np.ndarray)):
     58     return [_todict(v, validate, context) for v in obj]

File ~/miniconda3/envs/math10f22/lib/python3.9/site-packages/altair/utils/schemapi.py:326, in SchemaBase.to_dict(self, validate, ignore, context)
    324     result = _todict(self._args[0], validate=sub_validate, context=context)
    325 elif not self._args:
--> 326     result = _todict(
    327         {k: v for k, v in self._kwds.items() if k not in ignore},
    328         validate=sub_validate,
    329         context=context,
    330     )
    331 else:
    332     raise ValueError(
    333         "{} instance has both a value and properties : "
    334         "cannot serialize to dict".format(self.__class__)
    335     )

File ~/miniconda3/envs/math10f22/lib/python3.9/site-packages/altair/utils/schemapi.py:60, in _todict(obj, validate, context)
     58     return [_todict(v, validate, context) for v in obj]
     59 elif isinstance(obj, dict):
---> 60     return {
     61         k: _todict(v, validate, context)
     62         for k, v in obj.items()
     63         if v is not Undefined
     64     }
     65 elif hasattr(obj, "to_dict"):
     66     return obj.to_dict()

File ~/miniconda3/envs/math10f22/lib/python3.9/site-packages/altair/utils/schemapi.py:61, in <dictcomp>(.0)
     58     return [_todict(v, validate, context) for v in obj]
     59 elif isinstance(obj, dict):
     60     return {
---> 61         k: _todict(v, validate, context)
     62         for k, v in obj.items()
     63         if v is not Undefined
     64     }
     65 elif hasattr(obj, "to_dict"):
     66     return obj.to_dict()

File ~/miniconda3/envs/math10f22/lib/python3.9/site-packages/altair/utils/schemapi.py:56, in _todict(obj, validate, context)
     54 """Convert an object to a dict representation."""
     55 if isinstance(obj, SchemaBase):
---> 56     return obj.to_dict(validate=validate, context=context)
     57 elif isinstance(obj, (list, tuple, np.ndarray)):
     58     return [_todict(v, validate, context) for v in obj]

File ~/miniconda3/envs/math10f22/lib/python3.9/site-packages/altair/vegalite/v4/schema/channels.py:40, in FieldChannelMixin.to_dict(self, validate, ignore, context)
     38 elif not (type_in_shorthand or type_defined_explicitly):
     39     if isinstance(context.get('data', None), pd.DataFrame):
---> 40         raise ValueError("{} encoding field is specified without a type; "
     41                          "the type cannot be inferred because it does not "
     42                          "match any column in the data.".format(shorthand))
     43     else:
     44         raise ValueError("{} encoding field is specified without a type; "
     45                          "the type cannot be automatically inferred because "
     46                          "the data is not specified as a pandas.DataFrame."
     47                          "".format(shorthand))

ValueError: col0 encoding field is specified without a type; the type cannot be inferred because it does not match any column in the data.
alt.Chart(...)

Here is the correct chart.

alt.Chart(df).mark_circle().encode(
    x=alt.X(col0, scale=alt.Scale(zero=False)),
    y=alt.Y(col1, scale=alt.Scale(zero=False))
)

Let’s try to divide that data into two clusters. There is a general routine used with scikit-learn for Machine Learning. The more comfortable you get with this routine, the more familiar you will be with some of the conventions of Object Oriented Programming.

  1. Import (this step only needs to be done once per session)

  2. Instantiate (i.e., create an object/instance of the class you just imported)

  3. Fit

  4. Predict (or transform).

# import
from sklearn.cluster import KMeans

If you don’t specify the number of clusters, scikit-learn will use 8 clusters as its default value. (It would be nice if scikit-learn would determine the number of clusters automatically based on the data, but it doesn’t.) Be sure to use different capitalizations for the object kmeans and the class we imported KMeans. The convention is to use lower-case letters for our objects.

# instantiate
kmeans = KMeans(n_clusters=2)
# Example of Object Oriented Programming, use a special KMeans object
type(kmeans)
sklearn.cluster._kmeans.KMeans

Python error messages are not always clear, but this one is pretty clear (scroll to the bottom).

kmeans.fit(df[[col0,col1]])
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In [10], line 1
----> 1 kmeans.fit(df[[col0,col1]])

File ~/miniconda3/envs/math10f22/lib/python3.9/site-packages/sklearn/cluster/_kmeans.py:1367, in KMeans.fit(self, X, y, sample_weight)
   1341 def fit(self, X, y=None, sample_weight=None):
   1342     """Compute k-means clustering.
   1343 
   1344     Parameters
   (...)
   1365         Fitted estimator.
   1366     """
-> 1367     X = self._validate_data(
   1368         X,
   1369         accept_sparse="csr",
   1370         dtype=[np.float64, np.float32],
   1371         order="C",
   1372         copy=self.copy_x,
   1373         accept_large_sparse=False,
   1374     )
   1376     self._check_params(X)
   1377     random_state = check_random_state(self.random_state)

File ~/miniconda3/envs/math10f22/lib/python3.9/site-packages/sklearn/base.py:577, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, **check_params)
    575     raise ValueError("Validation should be done on X, y or both.")
    576 elif not no_val_X and no_val_y:
--> 577     X = check_array(X, input_name="X", **check_params)
    578     out = X
    579 elif no_val_X and not no_val_y:

File ~/miniconda3/envs/math10f22/lib/python3.9/site-packages/sklearn/utils/validation.py:899, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
    893         raise ValueError(
    894             "Found array with dim %d. %s expected <= 2."
    895             % (array.ndim, estimator_name)
    896         )
    898     if force_all_finite:
--> 899         _assert_all_finite(
    900             array,
    901             input_name=input_name,
    902             estimator_name=estimator_name,
    903             allow_nan=force_all_finite == "allow-nan",
    904         )
    906 if ensure_min_samples > 0:
    907     n_samples = _num_samples(array)

File ~/miniconda3/envs/math10f22/lib/python3.9/site-packages/sklearn/utils/validation.py:146, in _assert_all_finite(X, allow_nan, msg_dtype, estimator_name, input_name)
    124         if (
    125             not allow_nan
    126             and estimator_name
   (...)
    130             # Improve the error message on how to handle missing values in
    131             # scikit-learn.
    132             msg_err += (
    133                 f"\n{estimator_name} does not accept missing values"
    134                 " encoded as NaN natively. For supervised learning, you might want"
   (...)
    144                 "#estimators-that-handle-nan-values"
    145             )
--> 146         raise ValueError(msg_err)
    148 # for object dtype data, we only check for NaNs (GH-13254)
    149 elif X.dtype == np.dtype("object") and not allow_nan:

ValueError: Input X contains NaN.
KMeans does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

We drop the rows that contain missing values (in any column). (Stop and convince yourself that axis=0 is correct if we want to drop rows. We are changing the rows axis.) The copy is to prevent a warning later.

df2 = df.dropna(axis=0).copy()

The displayed output is new to me (at least how it is displayed on Deepnote). Think of the following cell as changing the object kmeans.

# Step 3: fit
kmeans.fit(df2[[col0,col1]])
KMeans(n_clusters=2)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

Your output may look different, because there is some randomness to the K-Means algorithm. In particular, the specific cluster labels (0 and 1) could be swapped.

# Step 4: predict
arr = kmeans.predict(df2[[col0, col1]])
arr
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0], dtype=int32)

The variable arr is a NumPy array. It has length 333.

len(arr)
333

That’s not the same length as df, because we removed some rows from df when we created df2.

len(df)
344

Here it’s the same.

len(df2)
333

Let’s try the routine, this time looking for 4 clusters.

kmeans2 = KMeans(n_clusters=4)
kmeans2.fit(df2[[col0,col1]])
KMeans(n_clusters=4)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

Notice how now we see 4 different values in the NumPy array, corresponding to the 4 clusters.

arr2 = kmeans2.predict(df2[[col0,col1]])
arr2
array([3, 3, 1, 3, 3, 3, 1, 3, 3, 1, 3, 1, 1, 3, 1, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 1, 3, 1, 3, 3, 3, 3, 3,
       3, 3, 3, 1, 3, 3, 3, 1, 3, 3, 3, 1, 3, 3, 3, 1, 3, 3, 3, 1, 3, 3,
       1, 1, 3, 1, 3, 3, 3, 1, 3, 1, 3, 3, 3, 1, 3, 3, 3, 3, 1, 0, 3, 3,
       3, 0, 3, 1, 3, 1, 3, 1, 3, 3, 3, 3, 1, 3, 3, 1, 1, 1, 3, 1, 3, 1,
       3, 1, 3, 3, 3, 1, 3, 1, 3, 1, 3, 1, 3, 0, 3, 1, 3, 1, 3, 3, 3, 1,
       3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 3, 1, 1, 3, 1,
       1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, 0,
       3, 1, 3, 1, 1, 1, 1, 0, 1, 0, 0, 3, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 3, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 2, 0, 2, 0, 0,
       0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0,
       2, 2, 0, 0, 2, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, 2, 0, 0, 2, 0, 0, 0,
       2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0,
       0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 2, 2, 0, 2, 0, 2,
       0, 2, 1, 2, 2, 2, 0, 2, 0, 0, 2, 0, 0, 0, 2, 0, 2, 0, 2, 2, 0, 0,
       2, 0, 0], dtype=int32)

Let’s make a new column in df2 holding these cluster values. (This next line would raise a warning if we hadn’t used copy when we dropped rows with missing values.)

df2["cluster"] = arr2

The default encoding type for the “cluster” column is Quantitative (:Q), but that is probably the worst choice from among Quantitative, Ordinal, and Nominal.

alt.Chart(df2).mark_circle().encode(
    x=col0,
    y=col1,
    color="cluster"
)

The ordering of the cluster numbers 0,1,2,3 is not significant, so Nominal is by far the best choice of encoding type.

Notice how the clusters seem to be lying in horizontal bands. Look at the scales of the x-axis and the y-axis. The data along the x-axis covers a range of about 20, whereas the data along the y-axis covers a range of about 50. This difference in magnitude is causing the clusters to lie along horizontal bands. We will see how to get more appropriate clusterings next week.

alt.Chart(df2).mark_circle().encode(
    x=col0,
    y=col1,
    color="cluster:N"
)

Here is a zoomed-in version where we remove the default inclusion of zero in the axes.

alt.Chart(df2).mark_circle().encode(
    x=alt.X(col0, scale=alt.Scale(zero=False)),
    y=alt.Y(col1, scale=alt.Scale(zero=False)),
    color="cluster:N"
)