K-Means clustering using scikit-learn
import numpy as np
import sklearn
import pandas as pd
from pandas.api.types import is_numeric_dtype
import altair as alt
# Location of the Spotify chart CSV, relative to this notebook.
# Change the path to wherever the file lives on your machine.
csv_path = "../data/spotify_dataset.csv"
df = pd.read_csv(csv_path)
# Inspect the dtype pandas inferred for each column.
df.dtypes
Index int64
Highest Charting Position int64
Number of Times Charted int64
Week of Highest Charting object
Song Name object
Streams object
Artist object
Artist Followers object
Song ID object
Genre object
Release Date object
Weeks Charted object
Popularity object
Danceability object
Energy object
Loudness object
Speechiness object
Acousticness object
Liveness object
Tempo object
Duration (ms) object
Valence object
Chord object
dtype: object
# Reload the CSV, this time telling pandas that a field holding a single
# space means "missing".  Compare the dtypes shown below with the first load:
# columns such as Popularity and Energy are now parsed as float64 instead of
# object.
# Change the path
df = pd.read_csv(
    "../data/spotify_dataset.csv",
    na_values=" ",
)
df.dtypes
Index int64
Highest Charting Position int64
Number of Times Charted int64
Week of Highest Charting object
Song Name object
Streams object
Artist object
Artist Followers float64
Song ID object
Genre object
Release Date object
Weeks Charted object
Popularity float64
Danceability float64
Energy float64
Loudness float64
Speechiness float64
Acousticness float64
Liveness float64
Tempo float64
Duration (ms) float64
Valence float64
Chord object
dtype: object
# List every column label in df.
df.columns
Index(['Index', 'Highest Charting Position', 'Number of Times Charted',
'Week of Highest Charting', 'Song Name', 'Streams', 'Artist',
'Artist Followers', 'Song ID', 'Genre', 'Release Date', 'Weeks Charted',
'Popularity', 'Danceability', 'Energy', 'Loudness', 'Speechiness',
'Acousticness', 'Liveness', 'Tempo', 'Duration (ms)', 'Valence',
'Chord'],
dtype='object')
# Check whether the "Artist" column has a numeric dtype.
is_numeric_dtype(df["Artist"])
False
# Collect the labels of all columns whose dtype is numeric.
numeric_cols = [
    label for label, column in df.items() if is_numeric_dtype(column)
]
# Work on an independent copy restricted to those numeric columns.
df2 = df.loc[:, numeric_cols].copy()
df2.head(10)
Index | Highest Charting Position | Number of Times Charted | Artist Followers | Popularity | Danceability | Energy | Loudness | Speechiness | Acousticness | Liveness | Tempo | Duration (ms) | Valence | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 8 | 3377762.0 | 100.0 | 0.714 | 0.800 | -4.808 | 0.0504 | 0.12700 | 0.3590 | 134.002 | 211560.0 | 0.589 |
1 | 2 | 2 | 3 | 2230022.0 | 99.0 | 0.591 | 0.764 | -5.484 | 0.0483 | 0.03830 | 0.1030 | 169.928 | 141806.0 | 0.478 |
2 | 3 | 1 | 11 | 6266514.0 | 99.0 | 0.563 | 0.664 | -5.044 | 0.1540 | 0.33500 | 0.0849 | 166.928 | 178147.0 | 0.688 |
3 | 4 | 3 | 5 | 83293380.0 | 98.0 | 0.808 | 0.897 | -3.712 | 0.0348 | 0.04690 | 0.3640 | 126.026 | 231041.0 | 0.591 |
4 | 5 | 5 | 1 | 5473565.0 | 96.0 | 0.736 | 0.704 | -7.409 | 0.0615 | 0.02030 | 0.0501 | 149.995 | 212000.0 | 0.894 |
5 | 6 | 1 | 18 | 5473565.0 | 97.0 | 0.610 | 0.508 | -6.682 | 0.1520 | 0.29700 | 0.3840 | 178.818 | 137876.0 | 0.758 |
6 | 7 | 3 | 16 | 8640063.0 | 94.0 | 0.762 | 0.701 | -3.541 | 0.0286 | 0.23500 | 0.1230 | 110.968 | 208867.0 | 0.742 |
7 | 8 | 2 | 10 | 6080597.0 | 95.0 | 0.780 | 0.718 | -3.605 | 0.0506 | 0.31000 | 0.0932 | 127.949 | 199604.0 | 0.342 |
8 | 9 | 3 | 8 | 36142273.0 | 96.0 | 0.644 | 0.648 | -4.601 | 0.1180 | 0.27600 | 0.1350 | 179.951 | 206710.0 | 0.440 |
9 | 10 | 8 | 10 | 3377762.0 | 95.0 | 0.750 | 0.608 | -4.008 | 0.0387 | 0.00165 | 0.1780 | 132.507 | 173347.0 | 0.958 |
# Boolean Series that is True wherever "Energy" is missing (NaN).
df2["Energy"].isna()
0 False
1 False
2 False
3 False
4 False
...
1551 False
1552 False
1553 False
1554 False
1555 False
Name: Energy, Length: 1556, dtype: bool
# Boolean Series that is True wherever "Energy" is present (not NaN).
df2["Energy"].notna()
0 True
1 True
2 True
3 True
4 True
...
1551 True
1552 True
1553 True
1554 True
1555 True
Name: Energy, Length: 1556, dtype: bool
# Use the missing-"Energy" mask from df2 to select rows of the full DataFrame,
# so every original column is shown for those rows.  df2 was built from df's
# numeric columns, so both share the same row index.
df[df2["Energy"].isna()]
Index | Highest Charting Position | Number of Times Charted | Week of Highest Charting | Song Name | Streams | Artist | Artist Followers | Song ID | Genre | ... | Danceability | Energy | Loudness | Speechiness | Acousticness | Liveness | Tempo | Duration (ms) | Valence | Chord | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
35 | 36 | 36 | 1 | 2021-07-23--2021-07-30 | NOT SOBER (feat. Polo G & Stunna Gambino) | 11,869,336 | The Kid LAROI | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
163 | 164 | 5 | 39 | 2020-10-30--2020-11-06 | 34+35 | 5,453,159 | Ariana Grande | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
464 | 465 | 118 | 1 | 2021-03-26--2021-04-02 | Richer (feat. Polo G) | 6,292,362 | Rod Wave | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
530 | 531 | 20 | 5 | 2021-01-15--2021-01-22 | 34+35 Remix (feat. Doja Cat, Megan Thee Stalli... | 6,162,453 | Ariana Grande | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
636 | 637 | 22 | 6 | 2020-12-18--2020-12-25 | Driving Home for Christmas - 2019 Remaster | 8,804,531 | Chris Rea | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
654 | 655 | 73 | 1 | 2020-12-18--2020-12-25 | Thank God It's Christmas - Non-Album Single | 10,509,961 | Queen | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
750 | 751 | 19 | 20 | 2020-07-31--2020-08-07 | Agua (with J Balvin) - Music From "Sponge On T... | 5,358,940 | Tainy | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
784 | 785 | 76 | 14 | 2020-09-04--2020-09-11 | Lean (feat. Towy, Osquel, Beltito & Sammy & Fa... | 4,739,241 | Super Yei, Jone Quest | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
876 | 877 | 164 | 4 | 2020-09-18--2020-09-25 | +Linda | 4,964,708 | Dalex | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1140 | 1141 | 131 | 1 | 2020-05-29--2020-06-05 | In meinem Benz | 5,494,500 | AK AUSSERKONTROLLE, Bonez MC | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1538 | 1539 | 176 | 1 | 2020-01-03--2020-01-10 | fuck, i'm lonely (with Anne-Marie) - from “13 ... | 4,856,458 | Lauv | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
11 rows × 23 columns
# Keep only the rows that have an "Energy" value; copy so that columns added
# to df3 later do not affect df2.
has_energy = df2["Energy"].notna()
df3 = df2[has_energy].copy()
from sklearn.cluster import KMeans
KMeans?
# Build a K-Means model that looks for six clusters, then fit it on every
# column of df3.
kmeans = KMeans(n_clusters=6)
kmeans.fit(df3)
KMeans(n_clusters=6)
# Cluster label the fitted model assigns to each row of df3.
kmeans.predict(df3)
array([5, 5, 0, ..., 1, 5, 3], dtype=int32)
# Compute the labels first, while df3 still has exactly the columns the model
# was fitted on, then store them as a new "cluster" column.
cluster_labels = kmeans.predict(df3)
df3["cluster"] = cluster_labels
df3.head(3)
Index | Highest Charting Position | Number of Times Charted | Artist Followers | Popularity | Danceability | Energy | Loudness | Speechiness | Acousticness | Liveness | Tempo | Duration (ms) | Valence | cluster | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 8 | 3377762.0 | 100.0 | 0.714 | 0.800 | -4.808 | 0.0504 | 0.1270 | 0.3590 | 134.002 | 211560.0 | 0.589 | 5 |
1 | 2 | 2 | 3 | 2230022.0 | 99.0 | 0.591 | 0.764 | -5.484 | 0.0483 | 0.0383 | 0.1030 | 169.928 | 141806.0 | 0.478 | 5 |
2 | 3 | 1 | 11 | 6266514.0 | 99.0 | 0.563 | 0.664 | -5.044 | 0.1540 | 0.3350 | 0.0849 | 166.928 | 178147.0 | 0.688 | 0 |
# Scatter plot of Popularity against Energy, one circle per song.  The ":O"
# suffix asks Altair to treat the cluster labels as ordinal when choosing
# the colour scale.
alt.Chart(df3).mark_circle().encode(
    x="Energy",
    y="Popularity",
    color="cluster:O",
)
# Same Energy/Popularity scatter plot, but with ":N" so Altair treats the
# cluster labels as nominal (unordered categories) when colouring.
energy_popularity = alt.Chart(df3).mark_circle()
energy_popularity.encode(
    x="Energy",
    y="Popularity",
    color="cluster:N",
)
# Display the list of numeric column names collected earlier.
numeric_cols
['Index',
'Highest Charting Position',
'Number of Times Charted',
'Artist Followers',
'Popularity',
'Danceability',
'Energy',
'Loudness',
'Speechiness',
'Acousticness',
'Liveness',
'Tempo',
'Duration (ms)',
'Valence']
# Energy/Popularity plot using point marks so that more channels can carry
# information: Loudness drives the marker size, and the cluster label
# (as a nominal field) drives both marker shape and colour.
alt.Chart(df3).mark_point().encode(
    x="Energy",
    y="Popularity",
    size="Loudness",
    shape="cluster:N",
    color="cluster:N",
)
# Fit a second model that looks for only two clusters.
# By this point df3 also holds the "cluster" label column produced by the
# first model, so restrict the features to the original numeric columns;
# fitting on all of df3 would treat those earlier labels as a feature.
kmeans_new = KMeans(n_clusters=2)
kmeans_new.fit(df3[numeric_cols])
KMeans(n_clusters=2)
# Predict on the same feature columns the model was fitted on and store the
# result next to the earlier labels.
df3["new_cluster"] = kmeans_new.predict(df3[numeric_cols])
df3.head()
Index | Highest Charting Position | Number of Times Charted | Artist Followers | Popularity | Danceability | Energy | Loudness | Speechiness | Acousticness | Liveness | Tempo | Duration (ms) | Valence | cluster | new_cluster | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 8 | 3377762.0 | 100.0 | 0.714 | 0.800 | -4.808 | 0.0504 | 0.1270 | 0.3590 | 134.002 | 211560.0 | 0.589 | 5 | 1 |
1 | 2 | 2 | 3 | 2230022.0 | 99.0 | 0.591 | 0.764 | -5.484 | 0.0483 | 0.0383 | 0.1030 | 169.928 | 141806.0 | 0.478 | 5 | 1 |
2 | 3 | 1 | 11 | 6266514.0 | 99.0 | 0.563 | 0.664 | -5.044 | 0.1540 | 0.3350 | 0.0849 | 166.928 | 178147.0 | 0.688 | 0 | 1 |
3 | 4 | 3 | 5 | 83293380.0 | 98.0 | 0.808 | 0.897 | -3.712 | 0.0348 | 0.0469 | 0.3640 | 126.026 | 231041.0 | 0.591 | 4 | 0 |
4 | 5 | 5 | 1 | 5473565.0 | 96.0 | 0.736 | 0.704 | -7.409 | 0.0615 | 0.0203 | 0.0501 | 149.995 | 212000.0 | 0.894 | 0 | 1 |
# Plot Popularity against Artist Followers, colouring each song by the label
# from the two-cluster model (treated as a nominal field).
followers_popularity = alt.Chart(df3).mark_circle()
followers_popularity.encode(
    x="Artist Followers",
    y="Popularity",
    color="new_cluster:N",
)