Cleaning the “Genre” column
import altair as alt
import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype
pd.set_option('display.float_format', lambda x: '%.3f' % x)
from sklearn.preprocessing import StandardScaler
rng = np.random.default_rng()
# Load the chart data; cells containing a single space are read as missing.
df = pd.read_csv("../data/spotify_dataset.csv", na_values=" ")

# "Streams" values are strings containing commas, so strip the commas
# before converting the column to numbers.
df["Streams"] = pd.to_numeric(
    df["Streams"].map(lambda raw: raw.replace(",", ""))
)

# Keep only the rows that have no missing value in any column.
df = df.dropna(axis=0, how="any").copy()

# Rescale every numeric column with StandardScaler, writing the result
# back into the same columns.
numeric_cols = [col for col in df.columns if is_numeric_dtype(df[col])]
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
df
Index | Highest Charting Position | Number of Times Charted | Week of Highest Charting | Song Name | Streams | Artist | Artist Followers | Song ID | Genre | ... | Danceability | Energy | Loudness | Speechiness | Acousticness | Liveness | Tempo | Duration (ms) | Valence | Chord | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -1.731 | -1.494 | -0.164 | 2021-07-23--2021-07-30 | Beggin' | 12.535 | Måneskin | -0.680 | 3Wrjm47oTz2sjIgck11l5e | ['indie rock italiano', 'italian pop'] | ... | 0.169 | 1.031 | 0.614 | -0.664 | -0.486 | 1.234 | 0.378 | 0.289 | 0.327 | B |
1 | -1.729 | -1.477 | -0.469 | 2021-07-23--2021-07-30 | STAY (with Justin Bieber) | 12.124 | The Kid LAROI | -0.749 | 5HCyWlXZPP0y6Gqq8TgA20 | ['australian hip hop'] | ... | -0.695 | 0.808 | 0.345 | -0.683 | -0.841 | -0.543 | 1.593 | -1.191 | -0.162 | C#/Db |
2 | -1.727 | -1.494 | 0.019 | 2021-06-25--2021-07-02 | good 4 u | 10.024 | Olivia Rodrigo | -0.507 | 4ZtFanR9U6ndgddUvNcjcG | ['pop'] | ... | -0.892 | 0.189 | 0.520 | 0.275 | 0.345 | -0.669 | 1.491 | -0.420 | 0.763 | A |
3 | -1.725 | -1.460 | -0.347 | 2021-07-02--2021-07-09 | Bad Habits | 9.324 | Ed Sheeran | 4.114 | 6PQ88X9TkUIAUIZJHW2upE | ['pop', 'uk pop'] | ... | 0.829 | 1.631 | 1.051 | -0.805 | -0.806 | 1.269 | 0.109 | 0.702 | 0.336 | B |
4 | -1.722 | -1.425 | -0.591 | 2021-07-23--2021-07-30 | INDUSTRY BABY (feat. Jack Harlow) | 8.183 | Lil Nas X | -0.554 | 27NovPIUIRrOZoCHxABJwK | ['lgbtq+ hip hop', 'pop rap'] | ... | 0.323 | 0.436 | -0.423 | -0.563 | -0.913 | -0.910 | 0.919 | 0.298 | 1.669 | D#/Eb |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1551 | 1.720 | 1.844 | -0.591 | 2019-12-27--2020-01-03 | New Rules | -0.506 | Dua Lipa | 0.747 | 2ekn2ttSfGqwhhate0LSR0 | ['dance pop', 'pop', 'uk pop'] | ... | 0.506 | 0.412 | 0.131 | -0.492 | -0.983 | -0.196 | -0.228 | 0.241 | 0.411 | A |
1552 | 1.722 | 1.861 | -0.591 | 2019-12-27--2020-01-03 | Cheirosa - Ao Vivo | -0.508 | Jorge & Mateus | 0.018 | 2PWjKmjyTZeDpmOUa3a5da | ['sertanejo', 'sertanejo universitario'] | ... | -1.138 | 1.464 | 1.286 | -0.349 | -0.035 | 1.054 | 0.999 | -0.340 | 0.877 | B |
1553 | 1.724 | 1.879 | -0.591 | 2019-12-27--2020-01-03 | Havana (feat. Young Thug) | -0.509 | Camila Cabello | 0.479 | 1rfofaqEpACxVEHIZBJe6W | ['dance pop', 'electropop', 'pop', 'post-teen ... | ... | 0.527 | -0.684 | 0.803 | -0.849 | -0.259 | -0.342 | -0.603 | 0.411 | -0.531 | D |
1554 | 1.726 | 1.896 | -0.591 | 2019-12-27--2020-01-03 | Surtada - Remix Brega Funk | -0.513 | Dadá Boladão, Tati Zaqui, OIK | -0.870 | 5F8ffc8KWKNawllr5WsW0r | ['brega funk', 'funk carioca'] | ... | 0.997 | -0.517 | -0.270 | -0.589 | 0.001 | 0.006 | 1.057 | -0.958 | 1.612 | F |
1555 | 1.728 | 1.913 | -0.591 | 2019-12-27--2020-01-03 | Lover (Remix) [feat. Shawn Mendes] | -0.516 | Taylor Swift | 1.650 | 3i9UVldZOE0aD0JnyfAZZ0 | ['pop', 'post-teen pop'] | ... | -1.699 | -0.189 | -0.330 | -0.541 | 0.737 | -0.660 | 2.788 | 0.496 | -0.408 | G |
1545 rows × 23 columns
# Look at the first element of every "Genre" entry. If the entries were real
# lists this would be the first genre; a "[" instead means each entry is a
# string that merely looks like a list.
df["Genre"].map(lambda entry: entry[0])
0 [
1 [
2 [
3 [
4 [
..
1551 [
1552 [
1553 [
1554 [
1555 [
Name: Genre, Length: 1545, dtype: object
# Pull out a single "Genre" entry (the row labelled 1) to experiment with.
s = df.loc[1,"Genre"]
s
"['australian hip hop']"
# list() on a string just splits it into individual characters; it does not
# interpret the text as a list literal.
list(s)
['[',
"'",
'a',
'u',
's',
't',
'r',
'a',
'l',
'i',
'a',
'n',
' ',
'h',
'i',
'p',
' ',
'h',
'o',
'p',
"'",
']']
# eval() runs the string as a Python expression, so the list-looking text
# becomes an actual list.
# NOTE(review): eval executes arbitrary code; only use it on text you trust.
eval(s)
['australian hip hop']
# eval() accepts any Python expression written as a string.
eval("3+5")
8
# Deliberate error: 3+5 is computed to the integer 8 before eval() is called,
# and eval() only accepts a string, bytes or code object, so this raises
# TypeError.
eval(3+5)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_83360/1095583267.py in <module>
----> 1 eval(3+5)
TypeError: eval() arg 1 must be a string, bytes or code object
# Preview of applying eval() to every entry. The result is only displayed,
# not assigned, so the "Genre" column itself is left unchanged.
# NOTE(review): eval executes arbitrary code; only use it on text you trust.
df["Genre"].map(lambda s: eval(s))
0 [indie rock italiano, italian pop]
1 [australian hip hop]
2 [pop]
3 [pop, uk pop]
4 [lgbtq+ hip hop, pop rap]
...
1551 [dance pop, pop, uk pop]
1552 [sertanejo, sertanejo universitario]
1553 [dance pop, electropop, pop, post-teen pop]
1554 [brega funk, funk carioca]
1555 [pop, post-teen pop]
Name: Genre, Length: 1545, dtype: object
# Check the Python type of each "Genre" entry; the column still holds plain
# strings because nothing has been assigned back to it.
df["Genre"].map(type)
0 <class 'str'>
1 <class 'str'>
2 <class 'str'>
3 <class 'str'>
4 <class 'str'>
...
1551 <class 'str'>
1552 <class 'str'>
1553 <class 'str'>
1554 <class 'str'>
1555 <class 'str'>
Name: Genre, Length: 1545, dtype: object
import ast

# Parse each "Genre" string into a real Python list and store it in a new
# column. ast.literal_eval is used instead of eval: it accepts only Python
# literals (lists, strings, numbers, ...), so text read from the CSV cannot
# execute arbitrary code, and it gives the same result for list literals
# such as "['pop', 'uk pop']".
df["Genre_list"] = df["Genre"].map(ast.literal_eval)

# Sanity check: count how many entries of the new column have each type.
df["Genre_list"].map(type).value_counts()
<class 'list'> 1545
Name: Genre_list, dtype: int64
# Count how many rows have each value in the "Chord" column, most frequent first.
df["Chord"].value_counts()
C#/Db 214
C 155
B 141
G 136
F 131
G#/Ab 130
D 125
A#/Bb 122
F#/Gb 121
A 118
E 112
D#/Eb 40
Name: Chord, dtype: int64
# Show the first four rows to confirm the new "Genre_list" column is present.
df.head(4)
Index | Highest Charting Position | Number of Times Charted | Week of Highest Charting | Song Name | Streams | Artist | Artist Followers | Song ID | Genre | ... | Energy | Loudness | Speechiness | Acousticness | Liveness | Tempo | Duration (ms) | Valence | Chord | Genre_list | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -1.731 | -1.494 | -0.164 | 2021-07-23--2021-07-30 | Beggin' | 12.535 | Måneskin | -0.680 | 3Wrjm47oTz2sjIgck11l5e | ['indie rock italiano', 'italian pop'] | ... | 1.031 | 0.614 | -0.664 | -0.486 | 1.234 | 0.378 | 0.289 | 0.327 | B | [indie rock italiano, italian pop] |
1 | -1.729 | -1.477 | -0.469 | 2021-07-23--2021-07-30 | STAY (with Justin Bieber) | 12.124 | The Kid LAROI | -0.749 | 5HCyWlXZPP0y6Gqq8TgA20 | ['australian hip hop'] | ... | 0.808 | 0.345 | -0.683 | -0.841 | -0.543 | 1.593 | -1.191 | -0.162 | C#/Db | [australian hip hop] |
2 | -1.727 | -1.494 | 0.019 | 2021-06-25--2021-07-02 | good 4 u | 10.024 | Olivia Rodrigo | -0.507 | 4ZtFanR9U6ndgddUvNcjcG | ['pop'] | ... | 0.189 | 0.520 | 0.275 | 0.345 | -0.669 | 1.491 | -0.420 | 0.763 | A | [pop] |
3 | -1.725 | -1.460 | -0.347 | 2021-07-02--2021-07-09 | Bad Habits | 9.324 | Ed Sheeran | 4.114 | 6PQ88X9TkUIAUIZJHW2upE | ['pop', 'uk pop'] | ... | 1.631 | 1.051 | -0.805 | -0.806 | 1.269 | 0.109 | 0.702 | 0.336 | B | [pop, uk pop] |
4 rows × 24 columns
# Flag the songs whose genre list contains the exact genre "pop". Because each
# entry is a list, `in` tests whole elements: a song tagged only 'italian pop'
# or 'pop rap' is False here.
df["Genre_list"].map(lambda genres: "pop" in genres)
0 False
1 False
2 True
3 True
4 False
...
1551 True
1552 False
1553 True
1554 False
1555 True
Name: Genre_list, Length: 1545, dtype: bool