K-Means clustering using scikit-learn

import numpy as np
import sklearn
import pandas as pd
from pandas.api.types import is_numeric_dtype
import altair as alt
# Change the path so that it points at your local copy of spotify_dataset.csv.
# First load uses pandas' default parsing; the dtype listing that follows
# shows which columns were read as numbers and which as generic objects.
df = pd.read_csv("../data/spotify_dataset.csv")
df.dtypes
Index                         int64
Highest Charting Position     int64
Number of Times Charted       int64
Week of Highest Charting     object
Song Name                    object
Streams                      object
Artist                       object
Artist Followers             object
Song ID                      object
Genre                        object
Release Date                 object
Weeks Charted                object
Popularity                   object
Danceability                 object
Energy                       object
Loudness                     object
Speechiness                  object
Acousticness                 object
Liveness                     object
Tempo                        object
Duration (ms)                object
Valence                      object
Chord                        object
dtype: object
# Change the path so that it points at your local copy of spotify_dataset.csv.
# Reload the file, this time telling pandas to treat a field consisting of a
# single space as a missing value (NaN) rather than as text.
df = pd.read_csv(
    "../data/spotify_dataset.csv",
    na_values=" ",
)
df.dtypes
Index                          int64
Highest Charting Position      int64
Number of Times Charted        int64
Week of Highest Charting      object
Song Name                     object
Streams                       object
Artist                        object
Artist Followers             float64
Song ID                       object
Genre                         object
Release Date                  object
Weeks Charted                 object
Popularity                   float64
Danceability                 float64
Energy                       float64
Loudness                     float64
Speechiness                  float64
Acousticness                 float64
Liveness                     float64
Tempo                        float64
Duration (ms)                float64
Valence                      float64
Chord                         object
dtype: object
# List the column labels of the loaded frame.
df.columns
Index(['Index', 'Highest Charting Position', 'Number of Times Charted',
       'Week of Highest Charting', 'Song Name', 'Streams', 'Artist',
       'Artist Followers', 'Song ID', 'Genre', 'Release Date', 'Weeks Charted',
       'Popularity', 'Danceability', 'Energy', 'Loudness', 'Speechiness',
       'Acousticness', 'Liveness', 'Tempo', 'Duration (ms)', 'Valence',
       'Chord'],
      dtype='object')
# Sanity-check the helper on a column that was read with dtype `object`.
is_numeric_dtype(df["Artist"])
False
# Names of every column whose dtype is numeric, in their original order.
numeric_cols = [
    name for name, dtype in df.dtypes.items() if is_numeric_dtype(dtype)
]
# Independent copy holding just those columns, so later edits leave df intact.
df2 = df.loc[:, numeric_cols].copy()
df2.head(10)
Index Highest Charting Position Number of Times Charted Artist Followers Popularity Danceability Energy Loudness Speechiness Acousticness Liveness Tempo Duration (ms) Valence
0 1 1 8 3377762.0 100.0 0.714 0.800 -4.808 0.0504 0.12700 0.3590 134.002 211560.0 0.589
1 2 2 3 2230022.0 99.0 0.591 0.764 -5.484 0.0483 0.03830 0.1030 169.928 141806.0 0.478
2 3 1 11 6266514.0 99.0 0.563 0.664 -5.044 0.1540 0.33500 0.0849 166.928 178147.0 0.688
3 4 3 5 83293380.0 98.0 0.808 0.897 -3.712 0.0348 0.04690 0.3640 126.026 231041.0 0.591
4 5 5 1 5473565.0 96.0 0.736 0.704 -7.409 0.0615 0.02030 0.0501 149.995 212000.0 0.894
5 6 1 18 5473565.0 97.0 0.610 0.508 -6.682 0.1520 0.29700 0.3840 178.818 137876.0 0.758
6 7 3 16 8640063.0 94.0 0.762 0.701 -3.541 0.0286 0.23500 0.1230 110.968 208867.0 0.742
7 8 2 10 6080597.0 95.0 0.780 0.718 -3.605 0.0506 0.31000 0.0932 127.949 199604.0 0.342
8 9 3 8 36142273.0 96.0 0.644 0.648 -4.601 0.1180 0.27600 0.1350 179.951 206710.0 0.440
9 10 8 10 3377762.0 95.0 0.750 0.608 -4.008 0.0387 0.00165 0.1780 132.507 173347.0 0.958
# Boolean mask: True for rows whose Energy value is missing.
df2["Energy"].isna()
0       False
1       False
2       False
3       False
4       False
        ...  
1551    False
1552    False
1553    False
1554    False
1555    False
Name: Energy, Length: 1556, dtype: bool
# The complementary mask: True for rows whose Energy value is present.
df2["Energy"].notna()
0       True
1       True
2       True
3       True
4       True
        ... 
1551    True
1552    True
1553    True
1554    True
1555    True
Name: Energy, Length: 1556, dtype: bool
# Show the full (all-column) records for the rows where Energy is missing.
df.loc[df2["Energy"].isna()]
Index Highest Charting Position Number of Times Charted Week of Highest Charting Song Name Streams Artist Artist Followers Song ID Genre ... Danceability Energy Loudness Speechiness Acousticness Liveness Tempo Duration (ms) Valence Chord
35 36 36 1 2021-07-23--2021-07-30 NOT SOBER (feat. Polo G & Stunna Gambino) 11,869,336 The Kid LAROI NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
163 164 5 39 2020-10-30--2020-11-06 34+35 5,453,159 Ariana Grande NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
464 465 118 1 2021-03-26--2021-04-02 Richer (feat. Polo G) 6,292,362 Rod Wave NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
530 531 20 5 2021-01-15--2021-01-22 34+35 Remix (feat. Doja Cat, Megan Thee Stalli... 6,162,453 Ariana Grande NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
636 637 22 6 2020-12-18--2020-12-25 Driving Home for Christmas - 2019 Remaster 8,804,531 Chris Rea NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
654 655 73 1 2020-12-18--2020-12-25 Thank God It's Christmas - Non-Album Single 10,509,961 Queen NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
750 751 19 20 2020-07-31--2020-08-07 Agua (with J Balvin) - Music From "Sponge On T... 5,358,940 Tainy NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
784 785 76 14 2020-09-04--2020-09-11 Lean (feat. Towy, Osquel, Beltito & Sammy & Fa... 4,739,241 Super Yei, Jone Quest NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
876 877 164 4 2020-09-18--2020-09-25 +Linda 4,964,708 Dalex NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1140 1141 131 1 2020-05-29--2020-06-05 In meinem Benz 5,494,500 AK AUSSERKONTROLLE, Bonez MC NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1538 1539 176 1 2020-01-03--2020-01-10 fuck, i'm lonely (with Anne-Marie) - from “13 ... 4,856,458 Lauv NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

11 rows × 23 columns

# Drop every row that has a missing value in any numeric column (not only
# Energy): KMeans rejects NaN input, so the frame must be fully populated.
df3 = df2.dropna().copy()
from sklearn.cluster import KMeans
# `KMeans?` is IPython-only syntax; help() shows the same docstring and also
# works in a plain Python interpreter.
help(KMeans)
# Fix the seed so cluster labels are reproducible between runs, and pin
# n_init so the result does not depend on the installed scikit-learn's default.
# NOTE(review): the features are not standardized, so large-magnitude columns
# such as "Artist Followers" and "Duration (ms)" will dominate the distances.
kmeans = KMeans(n_clusters=6, n_init=10, random_state=0)
kmeans.fit(df3)
KMeans(n_clusters=6)
# Assign each row to its closest fitted cluster. Selecting numeric_cols
# explicitly keeps the feature set identical to the one used for fitting,
# even if extra columns have since been added to df3.
kmeans.predict(df3[numeric_cols])
array([5, 5, 0, ..., 1, 5, 3], dtype=int32)
# Store the 6-cluster label next to the features. Predicting on numeric_cols
# only makes this statement safe to re-run: once "cluster" exists in df3,
# passing the whole frame would no longer match the fitted feature set.
df3["cluster"] = kmeans.predict(df3[numeric_cols])
df3.head(3)
Index Highest Charting Position Number of Times Charted Artist Followers Popularity Danceability Energy Loudness Speechiness Acousticness Liveness Tempo Duration (ms) Valence cluster
0 1 1 8 3377762.0 100.0 0.714 0.800 -4.808 0.0504 0.1270 0.3590 134.002 211560.0 0.589 5
1 2 2 3 2230022.0 99.0 0.591 0.764 -5.484 0.0483 0.0383 0.1030 169.928 141806.0 0.478 5
2 3 1 11 6266514.0 99.0 0.563 0.664 -5.044 0.1540 0.3350 0.0849 166.928 178147.0 0.688 0
# Scatter plot of Popularity against Energy, with the cluster label encoded
# as an ordinal (":O") color channel.
alt.Chart(df3).mark_circle().encode(
    x="Energy",
    y="Popularity",
    color="cluster:O",
)
# Same scatter plot, but with the cluster label encoded as a nominal (":N")
# color channel, i.e. as unordered categories.
alt.Chart(df3).mark_circle().encode(
    x="Energy",
    y="Popularity",
    color="cluster:N",
)
# Recall which numeric columns are available for use as chart encodings.
numeric_cols
['Index',
 'Highest Charting Position',
 'Number of Times Charted',
 'Artist Followers',
 'Popularity',
 'Danceability',
 'Energy',
 'Loudness',
 'Speechiness',
 'Acousticness',
 'Liveness',
 'Tempo',
 'Duration (ms)',
 'Valence']
# Point chart of Popularity against Energy: marker size follows Loudness,
# and the cluster label drives both marker shape and color.
alt.Chart(df3).mark_point().encode(
    x="Energy",
    y="Popularity",
    size="Loudness",
    shape="cluster:N",
    color="cluster:N",
)
# Fit a second, coarser model (2 clusters) on the original numeric features
# only. By this point df3 also carries the "cluster" label column from the
# 6-cluster model; fitting on the whole frame would feed those labels back in
# as if they were a feature.
# The seed is fixed so labels are reproducible, and n_init is pinned so the
# result does not depend on the installed scikit-learn's default.
# NOTE(review): the features are not standardized, so large-magnitude columns
# such as "Artist Followers" will dominate the distances.
kmeans_new = KMeans(n_clusters=2, n_init=10, random_state=0)
kmeans_new.fit(df3[numeric_cols])
# Predict on the same feature columns that were used for fitting.
df3["new_cluster"] = kmeans_new.predict(df3[numeric_cols])
df3.head()
Index Highest Charting Position Number of Times Charted Artist Followers Popularity Danceability Energy Loudness Speechiness Acousticness Liveness Tempo Duration (ms) Valence cluster new_cluster
0 1 1 8 3377762.0 100.0 0.714 0.800 -4.808 0.0504 0.1270 0.3590 134.002 211560.0 0.589 5 1
1 2 2 3 2230022.0 99.0 0.591 0.764 -5.484 0.0483 0.0383 0.1030 169.928 141806.0 0.478 5 1
2 3 1 11 6266514.0 99.0 0.563 0.664 -5.044 0.1540 0.3350 0.0849 166.928 178147.0 0.688 0 1
3 4 3 5 83293380.0 98.0 0.808 0.897 -3.712 0.0348 0.0469 0.3640 126.026 231041.0 0.591 4 0
4 5 5 1 5473565.0 96.0 0.736 0.704 -7.409 0.0615 0.0203 0.0501 149.995 212000.0 0.894 0 1
# Scatter plot of Popularity against Artist Followers, colored by the
# 2-cluster label treated as a nominal category.
alt.Chart(df3).mark_circle().encode(
    x="Artist Followers",
    y="Popularity",
    color="new_cluster:N",
)