Week 7 Video notebooks

import numpy as np
import pandas as pd
import altair as alt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss

Using `.copy()` to avoid the pandas `SettingWithCopyWarning`

# Load the Spotify dataset. na_values=" " tells pandas to also treat cells
# containing a single space as missing (NaN), on top of its default NA markers.
df = pd.read_csv("../data/spotify_dataset.csv", na_values=" ")
# (rows, columns) of the raw table.
df.shape
(1556, 23)
df.dropna(inplace=True)
df.shape
(1545, 23)
df.Chord.value_counts()[:4]
C#/Db    214
C        155
B        141
G        136
Name: Chord, dtype: int64
df.Chord.value_counts()[:4].index
Index(['C#/Db', 'C', 'B', 'G'], dtype='object')
df.Chord.isin(df.Chord.value_counts()[:4].index).sum()
646
df.Chord.isin(df.Chord.value_counts()[:4].index)
0        True
1        True
2       False
3        True
4       False
        ...  
1551    False
1552     True
1553    False
1554    False
1555     True
Name: Chord, Length: 1545, dtype: bool
df.head()
Index Highest Charting Position Number of Times Charted Week of Highest Charting Song Name Streams Artist Artist Followers Song ID Genre ... Danceability Energy Loudness Speechiness Acousticness Liveness Tempo Duration (ms) Valence Chord
0 1 1 8 2021-07-23--2021-07-30 Beggin' 48,633,449 Måneskin 3377762.0 3Wrjm47oTz2sjIgck11l5e ['indie rock italiano', 'italian pop'] ... 0.714 0.800 -4.808 0.0504 0.1270 0.3590 134.002 211560.0 0.589 B
1 2 2 3 2021-07-23--2021-07-30 STAY (with Justin Bieber) 47,248,719 The Kid LAROI 2230022.0 5HCyWlXZPP0y6Gqq8TgA20 ['australian hip hop'] ... 0.591 0.764 -5.484 0.0483 0.0383 0.1030 169.928 141806.0 0.478 C#/Db
2 3 1 11 2021-06-25--2021-07-02 good 4 u 40,162,559 Olivia Rodrigo 6266514.0 4ZtFanR9U6ndgddUvNcjcG ['pop'] ... 0.563 0.664 -5.044 0.1540 0.3350 0.0849 166.928 178147.0 0.688 A
3 4 3 5 2021-07-02--2021-07-09 Bad Habits 37,799,456 Ed Sheeran 83293380.0 6PQ88X9TkUIAUIZJHW2upE ['pop', 'uk pop'] ... 0.808 0.897 -3.712 0.0348 0.0469 0.3640 126.026 231041.0 0.591 B
4 5 5 1 2021-07-23--2021-07-30 INDUSTRY BABY (feat. Jack Harlow) 33,948,454 Lil Nas X 5473565.0 27NovPIUIRrOZoCHxABJwK ['lgbtq+ hip hop', 'pop rap'] ... 0.736 0.704 -7.409 0.0615 0.0203 0.0501 149.995 212000.0 0.894 D#/Eb

5 rows × 23 columns

# Restrict the data to songs whose chord is among the four most frequent chords.
# value_counts() sorts by frequency (descending), so the first four index labels
# are the four most common chord names.
top_chords = df.Chord.value_counts()[:4].index
in_top_chords = df.Chord.isin(top_chords)
df2 = df[in_top_chords]
df2.head()
Index Highest Charting Position Number of Times Charted Week of Highest Charting Song Name Streams Artist Artist Followers Song ID Genre ... Danceability Energy Loudness Speechiness Acousticness Liveness Tempo Duration (ms) Valence Chord
0 1 1 8 2021-07-23--2021-07-30 Beggin' 48,633,449 Måneskin 3377762.0 3Wrjm47oTz2sjIgck11l5e ['indie rock italiano', 'italian pop'] ... 0.714 0.800 -4.808 0.0504 0.12700 0.359 134.002 211560.0 0.589 B
1 2 2 3 2021-07-23--2021-07-30 STAY (with Justin Bieber) 47,248,719 The Kid LAROI 2230022.0 5HCyWlXZPP0y6Gqq8TgA20 ['australian hip hop'] ... 0.591 0.764 -5.484 0.0483 0.03830 0.103 169.928 141806.0 0.478 C#/Db
3 4 3 5 2021-07-02--2021-07-09 Bad Habits 37,799,456 Ed Sheeran 83293380.0 6PQ88X9TkUIAUIZJHW2upE ['pop', 'uk pop'] ... 0.808 0.897 -3.712 0.0348 0.04690 0.364 126.026 231041.0 0.591 B
8 9 3 8 2021-06-18--2021-06-25 Yonaguni 25,030,128 Bad Bunny 36142273.0 2JPLbjOn0wPCngEot2STUS ['latin', 'reggaeton', 'trap latino'] ... 0.644 0.648 -4.601 0.1180 0.27600 0.135 179.951 206710.0 0.440 C#/Db
9 10 8 10 2021-07-02--2021-07-09 I WANNA BE YOUR SLAVE 24,551,591 Måneskin 3377762.0 4pt5fDVTg5GhEvEtlz9dKk ['indie rock italiano', 'italian pop'] ... 0.750 0.608 -4.008 0.0387 0.00165 0.178 132.507 173347.0 0.958 C#/Db

5 rows × 23 columns

df2["pred"] = np.nan
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_59597/504453420.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2["pred"] = np.nan
# Same row filter as df2 (songs in the four most frequent chords), but with an
# explicit .copy() so df3 is an independent DataFrame; adding a column to it
# does not trigger the SettingWithCopyWarning seen with df2.
df3 = df[df.Chord.isin(df.Chord.value_counts()[:4].index)].copy()
# Placeholder column for predictions, initialised to NaN.
df3["pred"] = np.nan

KNeighborsClassifier

df3
Index Highest Charting Position Number of Times Charted Week of Highest Charting Song Name Streams Artist Artist Followers Song ID Genre ... Energy Loudness Speechiness Acousticness Liveness Tempo Duration (ms) Valence Chord pred
0 1 1 8 2021-07-23--2021-07-30 Beggin' 48,633,449 Måneskin 3377762.0 3Wrjm47oTz2sjIgck11l5e ['indie rock italiano', 'italian pop'] ... 0.800 -4.808 0.0504 0.12700 0.3590 134.002 211560.0 0.5890 B NaN
1 2 2 3 2021-07-23--2021-07-30 STAY (with Justin Bieber) 47,248,719 The Kid LAROI 2230022.0 5HCyWlXZPP0y6Gqq8TgA20 ['australian hip hop'] ... 0.764 -5.484 0.0483 0.03830 0.1030 169.928 141806.0 0.4780 C#/Db NaN
3 4 3 5 2021-07-02--2021-07-09 Bad Habits 37,799,456 Ed Sheeran 83293380.0 6PQ88X9TkUIAUIZJHW2upE ['pop', 'uk pop'] ... 0.897 -3.712 0.0348 0.04690 0.3640 126.026 231041.0 0.5910 B NaN
8 9 3 8 2021-06-18--2021-06-25 Yonaguni 25,030,128 Bad Bunny 36142273.0 2JPLbjOn0wPCngEot2STUS ['latin', 'reggaeton', 'trap latino'] ... 0.648 -4.601 0.1180 0.27600 0.1350 179.951 206710.0 0.4400 C#/Db NaN
9 10 8 10 2021-07-02--2021-07-09 I WANNA BE YOUR SLAVE 24,551,591 Måneskin 3377762.0 4pt5fDVTg5GhEvEtlz9dKk ['indie rock italiano', 'italian pop'] ... 0.608 -4.008 0.0387 0.00165 0.1780 132.507 173347.0 0.9580 C#/Db NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1546 1547 143 1 2019-12-27--2020-01-03 JACKBOYS 5,363,493 JACKBOYS 437907.0 62zKJrpbLxz6InR3tGyr7o ['rap', 'trap'] ... 0.130 -25.166 0.0336 0.90000 0.1110 123.342 46837.0 0.0676 C NaN
1547 1548 156 1 2019-12-27--2020-01-03 Combatchy (feat. MC Rebecca) 5,149,797 Anitta, Lexa, Luísa Sonza 10741972.0 2bPtwnrpFNEe8N7Q85kLHw ['funk carioca', 'funk pop', 'pagode baiano', ... ... 0.730 -3.032 0.0809 0.38300 0.0197 150.134 157600.0 0.6050 C#/Db NaN
1549 1550 187 1 2019-12-27--2020-01-03 Let Me Know (I Wonder Why Freestyle) 4,701,532 Juice WRLD 19102888.0 3wwo0bJvDSorOpNfzEkfXx ['chicago rap', 'melodic rap'] ... 0.537 -7.895 0.0832 0.17200 0.4180 125.028 215381.0 0.3830 G NaN
1552 1553 196 1 2019-12-27--2020-01-03 Cheirosa - Ao Vivo 4,623,030 Jorge & Mateus 15019109.0 2PWjKmjyTZeDpmOUa3a5da ['sertanejo', 'sertanejo universitario'] ... 0.870 -3.123 0.0851 0.24000 0.3330 152.370 181930.0 0.7140 B NaN
1555 1556 199 1 2019-12-27--2020-01-03 Lover (Remix) [feat. Shawn Mendes] 4,595,450 Taylor Swift 42227614.0 3i9UVldZOE0aD0JnyfAZZ0 ['pop', 'post-teen pop'] ... 0.603 -7.176 0.0640 0.43300 0.0862 205.272 221307.0 0.4220 G NaN

646 rows × 24 columns

df3.dtypes
Index                          int64
Highest Charting Position      int64
Number of Times Charted        int64
Week of Highest Charting      object
Song Name                     object
Streams                       object
Artist                        object
Artist Followers             float64
Song ID                       object
Genre                         object
Release Date                  object
Weeks Charted                 object
Popularity                   float64
Danceability                 float64
Energy                       float64
Loudness                     float64
Speechiness                  float64
Acousticness                 float64
Liveness                     float64
Tempo                        float64
Duration (ms)                float64
Valence                      float64
Chord                         object
pred                         float64
dtype: object
# Feature matrix: five numeric columns of df3. The .copy() makes X independent
# of df3, so columns can be added to X later without a pandas warning.
X = df3[["Artist Followers", "Danceability", "Energy", "Loudness", "Acousticness"]].copy()
X
Artist Followers Danceability Energy Loudness Acousticness
0 3377762.0 0.714 0.800 -4.808 0.12700
1 2230022.0 0.591 0.764 -5.484 0.03830
3 83293380.0 0.808 0.897 -3.712 0.04690
8 36142273.0 0.644 0.648 -4.601 0.27600
9 3377762.0 0.750 0.608 -4.008 0.00165
... ... ... ... ... ...
1546 437907.0 0.413 0.130 -25.166 0.90000
1547 10741972.0 0.826 0.730 -3.032 0.38300
1549 19102888.0 0.635 0.537 -7.895 0.17200
1552 15019109.0 0.528 0.870 -3.123 0.24000
1555 42227614.0 0.448 0.603 -7.176 0.43300

646 rows × 5 columns

# Target labels: the chord of each song.
y = df3["Chord"]
# 4-nearest-neighbours classifier fit on the raw (unscaled) features.
# NOTE(review): "Artist Followers" is in the millions while the other features
# are roughly in [-25, 1], so it presumably dominates the neighbour distances.
clf = KNeighborsClassifier(n_neighbors=4)
clf.fit(X,y)
KNeighborsClassifier(n_neighbors=4)
# Predict on the same rows the classifier was fit on and store the result as a
# new column. After this line X has six columns (five features plus "pred").
X["pred"] = clf.predict(X)
X
Artist Followers Danceability Energy Loudness Acousticness pred
0 3377762.0 0.714 0.800 -4.808 0.12700 C#/Db
1 2230022.0 0.591 0.764 -5.484 0.03830 C
3 83293380.0 0.808 0.897 -3.712 0.04690 C
8 36142273.0 0.644 0.648 -4.601 0.27600 C#/Db
9 3377762.0 0.750 0.608 -4.008 0.00165 C#/Db
... ... ... ... ... ... ...
1546 437907.0 0.413 0.130 -25.166 0.90000 C#/Db
1547 10741972.0 0.826 0.730 -3.032 0.38300 C#/Db
1549 19102888.0 0.635 0.537 -7.895 0.17200 C#/Db
1552 15019109.0 0.528 0.870 -3.123 0.24000 B
1555 42227614.0 0.448 0.603 -7.176 0.43300 C

646 rows × 6 columns

# Predicted chord over Energy vs. Danceability (classifier fit on raw features).
alt.Chart(X).mark_circle().encode(x="Energy", y="Danceability", color="pred")

# Both comparison charts share the same axes; only the data source and the
# colour field differ.
followers_axes = {"x": "Energy", "y": "Artist Followers"}

# Right-hand chart: predicted chord (from X).
c1 = alt.Chart(X).mark_circle().encode(color="pred", **followers_axes)

# Left-hand chart: true chord (from df3).
c0 = alt.Chart(df3).mark_circle().encode(color="Chord", **followers_axes)

# Horizontal concatenation: true labels on the left, predictions on the right.
c0 | c1
X
Artist Followers Danceability Energy Loudness Acousticness pred
0 3377762.0 0.714 0.800 -4.808 0.12700 C#/Db
1 2230022.0 0.591 0.764 -5.484 0.03830 C
3 83293380.0 0.808 0.897 -3.712 0.04690 C
8 36142273.0 0.644 0.648 -4.601 0.27600 C#/Db
9 3377762.0 0.750 0.608 -4.008 0.00165 C#/Db
... ... ... ... ... ... ...
1546 437907.0 0.413 0.130 -25.166 0.90000 C#/Db
1547 10741972.0 0.826 0.730 -3.032 0.38300 C#/Db
1549 19102888.0 0.635 0.537 -7.895 0.17200 C#/Db
1552 15019109.0 0.528 0.870 -3.123 0.24000 B
1555 42227614.0 0.448 0.603 -7.176 0.43300 C

646 rows × 6 columns

StandardScaler

# Numeric feature columns to standardize (the same five used for X above).
num_cols = ["Artist Followers", "Danceability", "Energy", "Loudness", "Acousticness"]
scaler = StandardScaler()
# Learn each column's mean and standard deviation from df3; nothing is
# transformed yet.
scaler.fit(df3[num_cols])
StandardScaler()
# Standardized features as a NumPy array, columns in num_cols order.
X_scaled = scaler.transform(df3[num_cols])
# Fresh 4-nearest-neighbours classifier, this time fit on the scaled features
# (rebinds clf, replacing the classifier fit on the raw features).
clf = KNeighborsClassifier(n_neighbors=4)
clf.fit(X_scaled,df3["Chord"])
KNeighborsClassifier(n_neighbors=4)
# Replace the NaN placeholder with predictions from the classifier fit on the
# scaled features (predicting on the same rows it was fit on).
df3["pred"] = clf.predict(X_scaled)
# Scatter of the original (unscaled) feature values in df3, coloured by the
# new predictions. The chart is assigned to c1 but not displayed here.
c1 = alt.Chart(df3).mark_circle().encode(
    x = "Energy",
    y = "Artist Followers",
    color = "pred"
)
# Work on a copy so df3 keeps the original (unscaled) feature values.
df4 = df3.copy()
# Overwrite the numeric feature columns with their standardized values.
# X_scaled came from scaler.transform(df3[num_cols]), so its columns line up
# one-to-one with num_cols and its rows with df3/df4.
df4[num_cols] = X_scaled
# Scatter of Energy vs. Artist Followers taken from df4, coloured by the KNN
# prediction stored in the "pred" column (rebinds c1).
c1 = alt.Chart(df4).mark_circle().encode(
    x = "Energy",
    y = "Artist Followers",
    color = "pred"
)
# Display the chart.
c1
# Column means. numeric_only=True restricts the reduction to numeric columns,
# so string columns (e.g. "Song Name", "Chord") are skipped explicitly instead
# of relying on the deprecated silent dropping of non-numeric columns.
df4.mean(axis=0, numeric_only=True)
Index                        7.907663e+02
Highest Charting Position    8.622601e+01
Number of Times Charted      1.057740e+01
Artist Followers             1.469321e+07
Popularity                   6.941331e+01
Danceability                 6.938994e-01
Energy                       6.281130e-01
Loudness                    -6.409364e+00
Speechiness                  1.248567e-01
Acousticness                 2.430600e-01
Liveness                     1.794019e-01
Tempo                        1.236974e+02
Duration (ms)                1.992237e+05
Valence                      5.107998e-01
dtype: float64
# Show the names of the feature columns that were standardized.
num_cols
# Column standard deviations. numeric_only=True restricts the reduction to
# numeric columns, so string columns are skipped explicitly instead of relying
# on the deprecated silent dropping of non-numeric columns.
df4.std(axis=0, numeric_only=True)
Index                        4.491292e+02
Highest Charting Position    5.671161e+01
Number of Times Charted      1.633495e+01
Artist Followers             1.689649e+07
Popularity                   1.635074e+01
Danceability                 1.426079e-01
Energy                       1.569726e-01
Loudness                     2.489291e+00
Speechiness                  1.110082e-01
Acousticness                 2.450287e-01
Liveness                     1.457320e-01
Tempo                        2.967576e+01
Duration (ms)                5.062622e+04
Valence                      2.250633e-01
dtype: float64

log_loss

# Mean accuracy on the same rows the classifier was fit on (X_scaled and
# df3["Chord"]), i.e. a training-set score rather than an estimate of
# performance on unseen songs.
clf.score(X_scaled,df3["Chord"])
0.5386996904024768
clf.predict_proba(X_scaled).shape
(646, 4)
clf.predict_proba(X_scaled)
array([[0.25, 0.25, 0.25, 0.25],
       [0.  , 0.25, 0.5 , 0.25],
       [0.25, 0.25, 0.5 , 0.  ],
       ...,
       [0.  , 0.25, 0.25, 0.5 ],
       [0.5 , 0.25, 0.25, 0.  ],
       [0.  , 0.25, 0.25, 0.5 ]])
clf.classes_
array(['B', 'C', 'C#/Db', 'G'], dtype=object)
# Log loss of the predicted class probabilities against the true chords,
# evaluated on the same rows the classifier was fit on.
# NOTE(review): the probability columns follow clf.classes_, while log_loss
# infers the label order from the true labels; these presumably agree here
# (clf.classes_ is already in sorted order) -- confirm if the labels change.
log_loss(df3["Chord"],clf.predict_proba(X_scaled))
0.8816211146814463
Created in Deepnote (deepnote.com)