Week 5 Friday
Announcements
Videos for next week will be posted soon.
The next worksheets will be given out Monday/Wednesday; they are due Tuesday of Week 7.
No in-class quiz during Week 6.
Midterms will probably be returned early Week 6.
See the attached flyers for two recruitment opportunities I was asked to share. (Totally optional, of course.)
Introduction to Machine Learning
Slides available in the course notes.
K-means clustering using scikit-learn
Using KMeans from scikit-learn, cluster the penguins data using the columns “bill_length_mm” and “flipper_length_mm”.
import pandas as pd
import altair as alt
import seaborn as sns
df = sns.load_dataset("penguins")
col0 = "bill_length_mm"
col1 = "flipper_length_mm"
There is a subtle mistake in the following. We are using "col0" (a string) instead of col0 (our variable), so Altair is looking for a column whose name is the string "col0".
alt.Chart(df).mark_circle().encode(
x=alt.X("col0", scale=alt.Scale(zero=False)),
y=alt.Y("col1", scale=alt.Scale(zero=False))
)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
File ~/miniconda3/envs/math10f22/lib/python3.9/site-packages/altair/vegalite/v4/api.py:2020, in Chart.to_dict(self, *args, **kwargs)
2018 copy.data = core.InlineData(values=[{}])
2019 return super(Chart, copy).to_dict(*args, **kwargs)
-> 2020 return super().to_dict(*args, **kwargs)
File ~/miniconda3/envs/math10f22/lib/python3.9/site-packages/altair/vegalite/v4/api.py:384, in TopLevelMixin.to_dict(self, *args, **kwargs)
381 kwargs["context"] = context
383 try:
--> 384 dct = super(TopLevelMixin, copy).to_dict(*args, **kwargs)
385 except jsonschema.ValidationError:
386 dct = None
File ~/miniconda3/envs/math10f22/lib/python3.9/site-packages/altair/utils/schemapi.py:326, in SchemaBase.to_dict(self, validate, ignore, context)
324 result = _todict(self._args[0], validate=sub_validate, context=context)
325 elif not self._args:
--> 326 result = _todict(
327 {k: v for k, v in self._kwds.items() if k not in ignore},
328 validate=sub_validate,
329 context=context,
330 )
331 else:
332 raise ValueError(
333 "{} instance has both a value and properties : "
334 "cannot serialize to dict".format(self.__class__)
335 )
File ~/miniconda3/envs/math10f22/lib/python3.9/site-packages/altair/utils/schemapi.py:60, in _todict(obj, validate, context)
58 return [_todict(v, validate, context) for v in obj]
59 elif isinstance(obj, dict):
---> 60 return {
61 k: _todict(v, validate, context)
62 for k, v in obj.items()
63 if v is not Undefined
64 }
65 elif hasattr(obj, "to_dict"):
66 return obj.to_dict()
File ~/miniconda3/envs/math10f22/lib/python3.9/site-packages/altair/utils/schemapi.py:61, in <dictcomp>(.0)
58 return [_todict(v, validate, context) for v in obj]
59 elif isinstance(obj, dict):
60 return {
---> 61 k: _todict(v, validate, context)
62 for k, v in obj.items()
63 if v is not Undefined
64 }
65 elif hasattr(obj, "to_dict"):
66 return obj.to_dict()
File ~/miniconda3/envs/math10f22/lib/python3.9/site-packages/altair/utils/schemapi.py:56, in _todict(obj, validate, context)
54 """Convert an object to a dict representation."""
55 if isinstance(obj, SchemaBase):
---> 56 return obj.to_dict(validate=validate, context=context)
57 elif isinstance(obj, (list, tuple, np.ndarray)):
58 return [_todict(v, validate, context) for v in obj]
File ~/miniconda3/envs/math10f22/lib/python3.9/site-packages/altair/utils/schemapi.py:326, in SchemaBase.to_dict(self, validate, ignore, context)
324 result = _todict(self._args[0], validate=sub_validate, context=context)
325 elif not self._args:
--> 326 result = _todict(
327 {k: v for k, v in self._kwds.items() if k not in ignore},
328 validate=sub_validate,
329 context=context,
330 )
331 else:
332 raise ValueError(
333 "{} instance has both a value and properties : "
334 "cannot serialize to dict".format(self.__class__)
335 )
File ~/miniconda3/envs/math10f22/lib/python3.9/site-packages/altair/utils/schemapi.py:60, in _todict(obj, validate, context)
58 return [_todict(v, validate, context) for v in obj]
59 elif isinstance(obj, dict):
---> 60 return {
61 k: _todict(v, validate, context)
62 for k, v in obj.items()
63 if v is not Undefined
64 }
65 elif hasattr(obj, "to_dict"):
66 return obj.to_dict()
File ~/miniconda3/envs/math10f22/lib/python3.9/site-packages/altair/utils/schemapi.py:61, in <dictcomp>(.0)
58 return [_todict(v, validate, context) for v in obj]
59 elif isinstance(obj, dict):
60 return {
---> 61 k: _todict(v, validate, context)
62 for k, v in obj.items()
63 if v is not Undefined
64 }
65 elif hasattr(obj, "to_dict"):
66 return obj.to_dict()
File ~/miniconda3/envs/math10f22/lib/python3.9/site-packages/altair/utils/schemapi.py:56, in _todict(obj, validate, context)
54 """Convert an object to a dict representation."""
55 if isinstance(obj, SchemaBase):
---> 56 return obj.to_dict(validate=validate, context=context)
57 elif isinstance(obj, (list, tuple, np.ndarray)):
58 return [_todict(v, validate, context) for v in obj]
File ~/miniconda3/envs/math10f22/lib/python3.9/site-packages/altair/vegalite/v4/schema/channels.py:40, in FieldChannelMixin.to_dict(self, validate, ignore, context)
38 elif not (type_in_shorthand or type_defined_explicitly):
39 if isinstance(context.get('data', None), pd.DataFrame):
---> 40 raise ValueError("{} encoding field is specified without a type; "
41 "the type cannot be inferred because it does not "
42 "match any column in the data.".format(shorthand))
43 else:
44 raise ValueError("{} encoding field is specified without a type; "
45 "the type cannot be automatically inferred because "
46 "the data is not specified as a pandas.DataFrame."
47 "".format(shorthand))
ValueError: col0 encoding field is specified without a type; the type cannot be inferred because it does not match any column in the data.
alt.Chart(...)
Here is the correct chart.
alt.Chart(df).mark_circle().encode(
x=alt.X(col0, scale=alt.Scale(zero=False)),
y=alt.Y(col1, scale=alt.Scale(zero=False))
)
Let’s try to divide that data into two clusters. There is a general routine used with scikit-learn for machine learning, summarized in the list and sketch below. The more comfortable you get with this routine, the more familiar you will be with some of the conventions of Object-Oriented Programming.
1. Import (this step only needs to be done once per session).
2. Instantiate (i.e., create an object/instance of the class you just imported).
3. Fit.
4. Predict (or transform).
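Here is the whole routine condensed into one sketch (a summary only; the rest of this section performs the same steps one cell at a time, including the errors we hit along the way):
# 1. import
from sklearn.cluster import KMeans
# 2. instantiate
kmeans = KMeans(n_clusters=2)
# 3. fit (dropping the rows with missing values, for reasons we will see below)
kmeans.fit(df.dropna()[[col0, col1]])
# 4. predict
kmeans.predict(df.dropna()[[col0, col1]])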
# import
from sklearn.cluster import KMeans
If you don’t specify the number of clusters, scikit-learn will use 8 clusters as its default value. (It would be nice if scikit-learn determined the number of clusters automatically from the data, but it doesn’t.) Be sure to use different capitalizations for our object kmeans and the imported class KMeans; the convention is to use lower-case letters for our objects.
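As a quick check (an optional aside), the default is visible as the n_clusters attribute of an object instantiated with no arguments:
# With no arguments, KMeans defaults to 8 clusters
KMeans().n_clusters
8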
# instantiate
kmeans = KMeans(n_clusters=2)
# Example of Object-Oriented Programming: kmeans is an object (instance) of the KMeans class
type(kmeans)
sklearn.cluster._kmeans.KMeans
Python error messages are not always clear, but this one is pretty clear (scroll to the bottom).
kmeans.fit(df[[col0,col1]])
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In [10], line 1
----> 1 kmeans.fit(df[[col0,col1]])
File ~/miniconda3/envs/math10f22/lib/python3.9/site-packages/sklearn/cluster/_kmeans.py:1367, in KMeans.fit(self, X, y, sample_weight)
1341 def fit(self, X, y=None, sample_weight=None):
1342 """Compute k-means clustering.
1343
1344 Parameters
(...)
1365 Fitted estimator.
1366 """
-> 1367 X = self._validate_data(
1368 X,
1369 accept_sparse="csr",
1370 dtype=[np.float64, np.float32],
1371 order="C",
1372 copy=self.copy_x,
1373 accept_large_sparse=False,
1374 )
1376 self._check_params(X)
1377 random_state = check_random_state(self.random_state)
File ~/miniconda3/envs/math10f22/lib/python3.9/site-packages/sklearn/base.py:577, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, **check_params)
575 raise ValueError("Validation should be done on X, y or both.")
576 elif not no_val_X and no_val_y:
--> 577 X = check_array(X, input_name="X", **check_params)
578 out = X
579 elif no_val_X and not no_val_y:
File ~/miniconda3/envs/math10f22/lib/python3.9/site-packages/sklearn/utils/validation.py:899, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
893 raise ValueError(
894 "Found array with dim %d. %s expected <= 2."
895 % (array.ndim, estimator_name)
896 )
898 if force_all_finite:
--> 899 _assert_all_finite(
900 array,
901 input_name=input_name,
902 estimator_name=estimator_name,
903 allow_nan=force_all_finite == "allow-nan",
904 )
906 if ensure_min_samples > 0:
907 n_samples = _num_samples(array)
File ~/miniconda3/envs/math10f22/lib/python3.9/site-packages/sklearn/utils/validation.py:146, in _assert_all_finite(X, allow_nan, msg_dtype, estimator_name, input_name)
124 if (
125 not allow_nan
126 and estimator_name
(...)
130 # Improve the error message on how to handle missing values in
131 # scikit-learn.
132 msg_err += (
133 f"\n{estimator_name} does not accept missing values"
134 " encoded as NaN natively. For supervised learning, you might want"
(...)
144 "#estimators-that-handle-nan-values"
145 )
--> 146 raise ValueError(msg_err)
148 # for object dtype data, we only check for NaNs (GH-13254)
149 elif X.dtype == np.dtype("object") and not allow_nan:
ValueError: Input X contains NaN.
KMeans does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values
We drop the missing values. (Stop and convince yourself that axis=0 is correct if we want to drop rows: we are changing the rows axis.) The copy is to prevent a warning later.
df2 = df.dropna(axis=0).copy()
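As a sanity check (an optional aside), we can count the missing values in each column to see what dropna is removing:
# Number of missing values in each column of the original DataFrame
df.isna().sum()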
The displayed output is new to me (at least how it is displayed on Deepnote). Think of the following cell as changing the object kmeans.
# Step 3: fit
kmeans.fit(df2[[col0,col1]])
KMeans(n_clusters=2)
Your output may look different, because there is some randomness in the K-means algorithm. In particular, the labels 0 and 1 could be swapped.
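If you want the same labels every time the notebook is run, you can fix the randomness when instantiating (a sketch; random_state is a standard scikit-learn parameter, the seed 0 is an arbitrary choice, and kmeans_seeded is a hypothetical name):
# A seeded instance (hypothetical name) whose random initialization is reproducible
kmeans_seeded = KMeans(n_clusters=2, random_state=0)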
# Step 4: predict
arr = kmeans.predict(df2[[col0, col1]])
arr
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0], dtype=int32)
The variable arr is a NumPy array. It has length 333.
len(arr)
333
That’s not the same length as df, because we removed some rows from df.
len(df)
344
Here it’s the same.
len(df2)
333
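In other words, dropna removed 344 - 333 = 11 rows:
# How many rows were dropped because of missing values
len(df) - len(df2)
11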
Let’s try the routine again, this time looking for 4 clusters.
kmeans2 = KMeans(n_clusters=4)
kmeans2.fit(df2[[col0,col1]])
KMeans(n_clusters=4)
Notice that we now see 4 different values in the NumPy array, corresponding to the 4 clusters.
arr2 = kmeans2.predict(df2[[col0,col1]])
arr2
array([3, 3, 1, 3, 3, 3, 1, 3, 3, 1, 3, 1, 1, 3, 1, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 1, 3, 1, 3, 3, 3, 3, 3,
3, 3, 3, 1, 3, 3, 3, 1, 3, 3, 3, 1, 3, 3, 3, 1, 3, 3, 3, 1, 3, 3,
1, 1, 3, 1, 3, 3, 3, 1, 3, 1, 3, 3, 3, 1, 3, 3, 3, 3, 1, 0, 3, 3,
3, 0, 3, 1, 3, 1, 3, 1, 3, 3, 3, 3, 1, 3, 3, 1, 1, 1, 3, 1, 3, 1,
3, 1, 3, 3, 3, 1, 3, 1, 3, 1, 3, 1, 3, 0, 3, 1, 3, 1, 3, 3, 3, 1,
3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 3, 1, 1, 3, 1,
1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, 0,
3, 1, 3, 1, 1, 1, 1, 0, 1, 0, 0, 3, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
1, 1, 3, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 2, 0, 2, 0, 0,
0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0,
2, 2, 0, 0, 2, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, 2, 0, 0, 2, 0, 0, 0,
2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0,
0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 2, 2, 0, 2, 0, 2,
0, 2, 1, 2, 2, 2, 0, 2, 0, 0, 2, 0, 0, 0, 2, 0, 2, 0, 2, 2, 0, 0,
2, 0, 0], dtype=int32)
Let’s make a new column in df2 holding these cluster values. (This next line would raise a warning if we hadn’t used copy when we dropped the rows with missing values.)
df2["cluster"] = arr2
The default encoding type for the “cluster” column is Quantitative (:Q), but that is probably the worst choice from among Quantitative, Ordinal, and Nominal.
alt.Chart(df2).mark_circle().encode(
x=col0,
y=col1,
color="cluster"
)
The ordering of the cluster numbers 0,1,2,3 is not significant, so Nominal is by far the best choice of encoding type.
Notice how the clusters seem to lie in horizontal bands. Look at the scales of the x-axis and the y-axis: the data along the x-axis covers a range of about 20, whereas the y-axis data covers a range of about 50. This difference in magnitude is what causes the clusters to lie in horizontal bands. We will see how to get more appropriate clusterings next week; a preview sketch appears at the end of these notes.
alt.Chart(df2).mark_circle().encode(
x=col0,
y=col1,
color="cluster:N"
)
Here is a zoomed-in version, where we remove the default inclusion of zero on the axes.
alt.Chart(df2).mark_circle().encode(
x=alt.X(col0, scale=alt.Scale(zero=False)),
y=alt.Y(col1, scale=alt.Scale(zero=False)),
color="cluster:N"
)
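As a preview of next week’s fix (a sketch we did not run in class): one standard remedy for the horizontal-band problem is to rescale the columns before clustering, for example with scikit-learn’s StandardScaler, so that the flipper length column no longer dominates the distance computations. The variable names below are our own.
# Sketch: standardize both columns (mean 0, standard deviation 1), then cluster
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(df2[[col0, col1]])
kmeans3 = KMeans(n_clusters=4)
kmeans3.fit(X_scaled)
df2["cluster_scaled"] = kmeans3.predict(X_scaled)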