Week 7 Video notebooks
Contents
Week 7 Video notebooks
import numpy as np
import pandas as pd
import altair as alt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss
Using copy to avoid pandas warnings
# Read the Spotify dataset; cells that contain a single space are parsed as missing (NaN).
df = pd.read_csv("../data/spotify_dataset.csv", na_values=" ")
df.shape
(1556, 23)
# Discard every row that has at least one missing value, then re-check the size.
df = df.dropna()
df.shape
(1545, 23)
# Row counts for the four most frequent values of the Chord column.
df["Chord"].value_counts().head(4)
C#/Db 214
C 155
B 141
G 136
Name: Chord, dtype: int64
# Just the labels of the four most frequent chords.
df["Chord"].value_counts().head(4).index
Index(['C#/Db', 'C', 'B', 'G'], dtype='object')
# Number of rows whose chord is one of the four most frequent chords.
(
    df["Chord"]
    .isin(df["Chord"].value_counts().head(4).index)
    .sum()
)
646
# Boolean mask: True where the row's chord is one of the four most frequent chords.
df["Chord"].isin(df["Chord"].value_counts().head(4).index)
0 True
1 True
2 False
3 True
4 False
...
1551 False
1552 True
1553 False
1554 False
1555 True
Name: Chord, Length: 1545, dtype: bool
# Preview the first five rows of the cleaned frame.
df.head()
Index | Highest Charting Position | Number of Times Charted | Week of Highest Charting | Song Name | Streams | Artist | Artist Followers | Song ID | Genre | ... | Danceability | Energy | Loudness | Speechiness | Acousticness | Liveness | Tempo | Duration (ms) | Valence | Chord | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 8 | 2021-07-23--2021-07-30 | Beggin' | 48,633,449 | Måneskin | 3377762.0 | 3Wrjm47oTz2sjIgck11l5e | ['indie rock italiano', 'italian pop'] | ... | 0.714 | 0.800 | -4.808 | 0.0504 | 0.1270 | 0.3590 | 134.002 | 211560.0 | 0.589 | B |
1 | 2 | 2 | 3 | 2021-07-23--2021-07-30 | STAY (with Justin Bieber) | 47,248,719 | The Kid LAROI | 2230022.0 | 5HCyWlXZPP0y6Gqq8TgA20 | ['australian hip hop'] | ... | 0.591 | 0.764 | -5.484 | 0.0483 | 0.0383 | 0.1030 | 169.928 | 141806.0 | 0.478 | C#/Db |
2 | 3 | 1 | 11 | 2021-06-25--2021-07-02 | good 4 u | 40,162,559 | Olivia Rodrigo | 6266514.0 | 4ZtFanR9U6ndgddUvNcjcG | ['pop'] | ... | 0.563 | 0.664 | -5.044 | 0.1540 | 0.3350 | 0.0849 | 166.928 | 178147.0 | 0.688 | A |
3 | 4 | 3 | 5 | 2021-07-02--2021-07-09 | Bad Habits | 37,799,456 | Ed Sheeran | 83293380.0 | 6PQ88X9TkUIAUIZJHW2upE | ['pop', 'uk pop'] | ... | 0.808 | 0.897 | -3.712 | 0.0348 | 0.0469 | 0.3640 | 126.026 | 231041.0 | 0.591 | B |
4 | 5 | 5 | 1 | 2021-07-23--2021-07-30 | INDUSTRY BABY (feat. Jack Harlow) | 33,948,454 | Lil Nas X | 5473565.0 | 27NovPIUIRrOZoCHxABJwK | ['lgbtq+ hip hop', 'pop rap'] | ... | 0.736 | 0.704 | -7.409 | 0.0615 | 0.0203 | 0.0501 | 149.995 | 212000.0 | 0.894 | D#/Eb |
5 rows × 23 columns
# Keep only the rows whose chord is one of the four most frequent chords.
# The result is taken by plain boolean indexing, without .copy().
df2 = df[df["Chord"].isin(df["Chord"].value_counts().head(4).index)]
df2.head()
Index | Highest Charting Position | Number of Times Charted | Week of Highest Charting | Song Name | Streams | Artist | Artist Followers | Song ID | Genre | ... | Danceability | Energy | Loudness | Speechiness | Acousticness | Liveness | Tempo | Duration (ms) | Valence | Chord | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 8 | 2021-07-23--2021-07-30 | Beggin' | 48,633,449 | Måneskin | 3377762.0 | 3Wrjm47oTz2sjIgck11l5e | ['indie rock italiano', 'italian pop'] | ... | 0.714 | 0.800 | -4.808 | 0.0504 | 0.12700 | 0.359 | 134.002 | 211560.0 | 0.589 | B |
1 | 2 | 2 | 3 | 2021-07-23--2021-07-30 | STAY (with Justin Bieber) | 47,248,719 | The Kid LAROI | 2230022.0 | 5HCyWlXZPP0y6Gqq8TgA20 | ['australian hip hop'] | ... | 0.591 | 0.764 | -5.484 | 0.0483 | 0.03830 | 0.103 | 169.928 | 141806.0 | 0.478 | C#/Db |
3 | 4 | 3 | 5 | 2021-07-02--2021-07-09 | Bad Habits | 37,799,456 | Ed Sheeran | 83293380.0 | 6PQ88X9TkUIAUIZJHW2upE | ['pop', 'uk pop'] | ... | 0.808 | 0.897 | -3.712 | 0.0348 | 0.04690 | 0.364 | 126.026 | 231041.0 | 0.591 | B |
8 | 9 | 3 | 8 | 2021-06-18--2021-06-25 | Yonaguni | 25,030,128 | Bad Bunny | 36142273.0 | 2JPLbjOn0wPCngEot2STUS | ['latin', 'reggaeton', 'trap latino'] | ... | 0.644 | 0.648 | -4.601 | 0.1180 | 0.27600 | 0.135 | 179.951 | 206710.0 | 0.440 | C#/Db |
9 | 10 | 8 | 10 | 2021-07-02--2021-07-09 | I WANNA BE YOUR SLAVE | 24,551,591 | Måneskin | 3377762.0 | 4pt5fDVTg5GhEvEtlz9dKk | ['indie rock italiano', 'italian pop'] | ... | 0.750 | 0.608 | -4.008 | 0.0387 | 0.00165 | 0.178 | 132.507 | 173347.0 | 0.958 | C#/Db |
5 rows × 23 columns
# df2 was produced by boolean indexing on df without .copy(); adding a column to it
# is what triggers the SettingWithCopyWarning shown below.
df2["pred"] = np.nan
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_59597/504453420.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df2["pred"] = np.nan
# Same filter as for df2, but with an explicit .copy() so that df3 is an independent
# frame and the column assignment below does not warn.
df3 = df[df["Chord"].isin(df["Chord"].value_counts().head(4).index)].copy()
# Placeholder column; it is overwritten with real predictions later.
df3["pred"] = np.nan
KNeighborsClassifier
# Display the filtered frame, now including the placeholder pred column.
df3
Index | Highest Charting Position | Number of Times Charted | Week of Highest Charting | Song Name | Streams | Artist | Artist Followers | Song ID | Genre | ... | Energy | Loudness | Speechiness | Acousticness | Liveness | Tempo | Duration (ms) | Valence | Chord | pred | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 8 | 2021-07-23--2021-07-30 | Beggin' | 48,633,449 | Måneskin | 3377762.0 | 3Wrjm47oTz2sjIgck11l5e | ['indie rock italiano', 'italian pop'] | ... | 0.800 | -4.808 | 0.0504 | 0.12700 | 0.3590 | 134.002 | 211560.0 | 0.5890 | B | NaN |
1 | 2 | 2 | 3 | 2021-07-23--2021-07-30 | STAY (with Justin Bieber) | 47,248,719 | The Kid LAROI | 2230022.0 | 5HCyWlXZPP0y6Gqq8TgA20 | ['australian hip hop'] | ... | 0.764 | -5.484 | 0.0483 | 0.03830 | 0.1030 | 169.928 | 141806.0 | 0.4780 | C#/Db | NaN |
3 | 4 | 3 | 5 | 2021-07-02--2021-07-09 | Bad Habits | 37,799,456 | Ed Sheeran | 83293380.0 | 6PQ88X9TkUIAUIZJHW2upE | ['pop', 'uk pop'] | ... | 0.897 | -3.712 | 0.0348 | 0.04690 | 0.3640 | 126.026 | 231041.0 | 0.5910 | B | NaN |
8 | 9 | 3 | 8 | 2021-06-18--2021-06-25 | Yonaguni | 25,030,128 | Bad Bunny | 36142273.0 | 2JPLbjOn0wPCngEot2STUS | ['latin', 'reggaeton', 'trap latino'] | ... | 0.648 | -4.601 | 0.1180 | 0.27600 | 0.1350 | 179.951 | 206710.0 | 0.4400 | C#/Db | NaN |
9 | 10 | 8 | 10 | 2021-07-02--2021-07-09 | I WANNA BE YOUR SLAVE | 24,551,591 | Måneskin | 3377762.0 | 4pt5fDVTg5GhEvEtlz9dKk | ['indie rock italiano', 'italian pop'] | ... | 0.608 | -4.008 | 0.0387 | 0.00165 | 0.1780 | 132.507 | 173347.0 | 0.9580 | C#/Db | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1546 | 1547 | 143 | 1 | 2019-12-27--2020-01-03 | JACKBOYS | 5,363,493 | JACKBOYS | 437907.0 | 62zKJrpbLxz6InR3tGyr7o | ['rap', 'trap'] | ... | 0.130 | -25.166 | 0.0336 | 0.90000 | 0.1110 | 123.342 | 46837.0 | 0.0676 | C | NaN |
1547 | 1548 | 156 | 1 | 2019-12-27--2020-01-03 | Combatchy (feat. MC Rebecca) | 5,149,797 | Anitta, Lexa, Luísa Sonza | 10741972.0 | 2bPtwnrpFNEe8N7Q85kLHw | ['funk carioca', 'funk pop', 'pagode baiano', ... | ... | 0.730 | -3.032 | 0.0809 | 0.38300 | 0.0197 | 150.134 | 157600.0 | 0.6050 | C#/Db | NaN |
1549 | 1550 | 187 | 1 | 2019-12-27--2020-01-03 | Let Me Know (I Wonder Why Freestyle) | 4,701,532 | Juice WRLD | 19102888.0 | 3wwo0bJvDSorOpNfzEkfXx | ['chicago rap', 'melodic rap'] | ... | 0.537 | -7.895 | 0.0832 | 0.17200 | 0.4180 | 125.028 | 215381.0 | 0.3830 | G | NaN |
1552 | 1553 | 196 | 1 | 2019-12-27--2020-01-03 | Cheirosa - Ao Vivo | 4,623,030 | Jorge & Mateus | 15019109.0 | 2PWjKmjyTZeDpmOUa3a5da | ['sertanejo', 'sertanejo universitario'] | ... | 0.870 | -3.123 | 0.0851 | 0.24000 | 0.3330 | 152.370 | 181930.0 | 0.7140 | B | NaN |
1555 | 1556 | 199 | 1 | 2019-12-27--2020-01-03 | Lover (Remix) [feat. Shawn Mendes] | 4,595,450 | Taylor Swift | 42227614.0 | 3i9UVldZOE0aD0JnyfAZZ0 | ['pop', 'post-teen pop'] | ... | 0.603 | -7.176 | 0.0640 | 0.43300 | 0.0862 | 205.272 | 221307.0 | 0.4220 | G | NaN |
646 rows × 24 columns
# Show the dtype of every column of df3.
df3.dtypes
Index int64
Highest Charting Position int64
Number of Times Charted int64
Week of Highest Charting object
Song Name object
Streams object
Artist object
Artist Followers float64
Song ID object
Genre object
Release Date object
Weeks Charted object
Popularity float64
Danceability float64
Energy float64
Loudness float64
Speechiness float64
Acousticness float64
Liveness float64
Tempo float64
Duration (ms) float64
Valence float64
Chord object
pred float64
dtype: object
# Feature matrix: an independent copy of five numeric columns of df3.
X = df3.loc[
    :,
    ["Artist Followers", "Danceability", "Energy", "Loudness", "Acousticness"],
].copy()
X
Artist Followers | Danceability | Energy | Loudness | Acousticness | |
---|---|---|---|---|---|
0 | 3377762.0 | 0.714 | 0.800 | -4.808 | 0.12700 |
1 | 2230022.0 | 0.591 | 0.764 | -5.484 | 0.03830 |
3 | 83293380.0 | 0.808 | 0.897 | -3.712 | 0.04690 |
8 | 36142273.0 | 0.644 | 0.648 | -4.601 | 0.27600 |
9 | 3377762.0 | 0.750 | 0.608 | -4.008 | 0.00165 |
... | ... | ... | ... | ... | ... |
1546 | 437907.0 | 0.413 | 0.130 | -25.166 | 0.90000 |
1547 | 10741972.0 | 0.826 | 0.730 | -3.032 | 0.38300 |
1549 | 19102888.0 | 0.635 | 0.537 | -7.895 | 0.17200 |
1552 | 15019109.0 | 0.528 | 0.870 | -3.123 | 0.24000 |
1555 | 42227614.0 | 0.448 | 0.603 | -7.176 | 0.43300 |
646 rows × 5 columns
# Target labels: the chord of each row.
y = df3["Chord"]
# 4-nearest-neighbours classifier, fit on the feature columns exactly as they appear
# in df3 (no rescaling has been applied at this point).
clf = KNeighborsClassifier(n_neighbors=4)
clf.fit(X,y)
KNeighborsClassifier(n_neighbors=4)
# Predict on the same rows the classifier was fit on and store the result in X.
# The right-hand side is evaluated before the new column exists, so clf.predict
# still receives only the five feature columns.
X["pred"] = clf.predict(X)
X
Artist Followers | Danceability | Energy | Loudness | Acousticness | pred | |
---|---|---|---|---|---|---|
0 | 3377762.0 | 0.714 | 0.800 | -4.808 | 0.12700 | C#/Db |
1 | 2230022.0 | 0.591 | 0.764 | -5.484 | 0.03830 | C |
3 | 83293380.0 | 0.808 | 0.897 | -3.712 | 0.04690 | C |
8 | 36142273.0 | 0.644 | 0.648 | -4.601 | 0.27600 | C#/Db |
9 | 3377762.0 | 0.750 | 0.608 | -4.008 | 0.00165 | C#/Db |
... | ... | ... | ... | ... | ... | ... |
1546 | 437907.0 | 0.413 | 0.130 | -25.166 | 0.90000 | C#/Db |
1547 | 10741972.0 | 0.826 | 0.730 | -3.032 | 0.38300 | C#/Db |
1549 | 19102888.0 | 0.635 | 0.537 | -7.895 | 0.17200 | C#/Db |
1552 | 15019109.0 | 0.528 | 0.870 | -3.123 | 0.24000 | B |
1555 | 42227614.0 | 0.448 | 0.603 | -7.176 | 0.43300 | C |
646 rows × 6 columns
# Predicted chord for each row, plotted as Danceability against Energy.
alt.Chart(X).mark_circle().encode(x="Energy", y="Danceability", color="pred")

# c1: rows coloured by the predicted chord (from X).
c1 = alt.Chart(X).mark_circle().encode(x="Energy", y="Artist Followers", color="pred")
# c0: the same axes, coloured by the true chord (from df3).
c0 = alt.Chart(df3).mark_circle().encode(x="Energy", y="Artist Followers", color="Chord")
# Show the two charts side by side: true chords on the left, predictions on the right.
c0 | c1
X
Artist Followers | Danceability | Energy | Loudness | Acousticness | pred | |
---|---|---|---|---|---|---|
0 | 3377762.0 | 0.714 | 0.800 | -4.808 | 0.12700 | C#/Db |
1 | 2230022.0 | 0.591 | 0.764 | -5.484 | 0.03830 | C |
3 | 83293380.0 | 0.808 | 0.897 | -3.712 | 0.04690 | C |
8 | 36142273.0 | 0.644 | 0.648 | -4.601 | 0.27600 | C#/Db |
9 | 3377762.0 | 0.750 | 0.608 | -4.008 | 0.00165 | C#/Db |
... | ... | ... | ... | ... | ... | ... |
1546 | 437907.0 | 0.413 | 0.130 | -25.166 | 0.90000 | C#/Db |
1547 | 10741972.0 | 0.826 | 0.730 | -3.032 | 0.38300 | C#/Db |
1549 | 19102888.0 | 0.635 | 0.537 | -7.895 | 0.17200 | C#/Db |
1552 | 15019109.0 | 0.528 | 0.870 | -3.123 | 0.24000 | B |
1555 | 42227614.0 | 0.448 | 0.603 | -7.176 | 0.43300 | C |
646 rows × 6 columns
StandardScaler
# The five numeric feature columns to be rescaled.
num_cols = [
    "Artist Followers",
    "Danceability",
    "Energy",
    "Loudness",
    "Acousticness",
]
scaler = StandardScaler()
# Fit the scaler on those columns of df3.
scaler.fit(df3[num_cols])
StandardScaler()
# Apply the fitted scaler to the feature columns.
X_scaled = scaler.transform(df3[num_cols])
# Fresh 4-nearest-neighbours classifier, this time fit on the rescaled features.
clf = KNeighborsClassifier(n_neighbors=4)
clf.fit(X_scaled,df3["Chord"])
KNeighborsClassifier(n_neighbors=4)
# Replace the NaN placeholder with predictions from the classifier fit on scaled features.
df3["pred"] = clf.predict(X_scaled)
# Chart of the new predictions; the axes use the original-scale columns of df3.
c1 = alt.Chart(df3).mark_circle().encode(x="Energy", y="Artist Followers", color="pred")
df4 = df3.copy()
# NOTE(review): `cols` is not defined anywhere in the visible notebook, so this line
# raises the NameError shown below and df4 remains an unscaled copy of df3.
# The feature list defined earlier is `num_cols`; `df4[num_cols] = X_scaled` is
# presumably what was intended.
df4[cols] = X_scaled
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_59597/4039862807.py in <module>
----> 1 df4[cols] = X_scaled
NameError: name 'cols' is not defined
# Rebuild c1 from df4 (replacing the earlier df3-based chart) and display it.
c1 = alt.Chart(df4).mark_circle().encode(x="Energy", y="Artist Followers", color="pred")
c1
# Column means of the numeric columns only. numeric_only=True makes that selection
# explicit instead of relying on pandas silently dropping non-numeric columns, which
# is deprecated and becomes a TypeError in later pandas versions.
df4.mean(axis=0, numeric_only=True)
Index 7.907663e+02
Highest Charting Position 8.622601e+01
Number of Times Charted 1.057740e+01
Artist Followers 1.469321e+07
Popularity 6.941331e+01
Danceability 6.938994e-01
Energy 6.281130e-01
Loudness -6.409364e+00
Speechiness 1.248567e-01
Acousticness 2.430600e-01
Liveness 1.794019e-01
Tempo 1.236974e+02
Duration (ms) 1.992237e+05
Valence 5.107998e-01
dtype: float64
# `cols` does not exist (the feature list defined earlier is named `num_cols`),
# hence the NameError below.
cols
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_59597/4199983457.py in <module>
----> 1 cols
NameError: name 'cols' is not defined
# Column standard deviations of the numeric columns only. numeric_only=True makes
# that selection explicit instead of relying on pandas silently dropping non-numeric
# columns, which is deprecated and becomes a TypeError in later pandas versions.
df4.std(axis=0, numeric_only=True)
Index 4.491292e+02
Highest Charting Position 5.671161e+01
Number of Times Charted 1.633495e+01
Artist Followers 1.689649e+07
Popularity 1.635074e+01
Danceability 1.426079e-01
Energy 1.569726e-01
Loudness 2.489291e+00
Speechiness 1.110082e-01
Acousticness 2.450287e-01
Liveness 1.457320e-01
Tempo 2.967576e+01
Duration (ms) 5.062622e+04
Valence 2.250633e-01
dtype: float64
log_loss
# Accuracy of the classifier, evaluated on the same rows it was fit on.
clf.score(X_scaled,df3["Chord"])
0.5386996904024768
# Shape of the predicted-probability array: one row per sample, one column per class.
clf.predict_proba(X_scaled).shape
(646, 4)
# Predicted class probabilities. With n_neighbors=4 every entry shown is a multiple
# of 0.25 (presumably the share of the four neighbours in each class).
clf.predict_proba(X_scaled)
array([[0.25, 0.25, 0.25, 0.25],
[0. , 0.25, 0.5 , 0.25],
[0.25, 0.25, 0.5 , 0. ],
...,
[0. , 0.25, 0.25, 0.5 ],
[0.5 , 0.25, 0.25, 0. ],
[0. , 0.25, 0.25, 0.5 ]])
# Class labels known to the classifier; the columns of predict_proba follow this order.
clf.classes_
array(['B', 'C', 'C#/Db', 'G'], dtype=object)
# Log loss (cross-entropy) of the predicted probabilities against the true chords,
# computed on the same rows the classifier was fit on.
# labels=clf.classes_ states the column order of predict_proba explicitly instead of
# letting log_loss infer the label set from the true values.
log_loss(df3["Chord"], clf.predict_proba(X_scaled), labels=clf.classes_)
0.8816211146814463