Cleaning the “Genre” column

import ast

import altair as alt
import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype
pd.set_option('display.float_format', lambda x: '%.3f' % x)

from sklearn.preprocessing import StandardScaler

# Load the chart data; a cell holding a single space is read as missing (NaN).
rng = np.random.default_rng()
df = pd.read_csv("../data/spotify_dataset.csv", na_values=" ")

# Strip the commas from "Streams" before converting it to numbers.
# The vectorised .str accessor passes missing entries through as NaN, whereas
# calling s.replace on each element would raise AttributeError on a NaN float
# (missing rows are only dropped on the next step).
df["Streams"] = pd.to_numeric(df["Streams"].str.replace(",", "", regex=False))

# Keep only rows with no missing values; .copy() gives an independent frame
# so the column assignments below do not act on a slice of the raw data.
df = df.dropna().copy()
numeric_cols = [c for c in df.columns if is_numeric_dtype(df[c])]

# Fit the scaler on every numeric column and overwrite those columns with the
# scaled values.
# NOTE(review): "Index" is numeric, so it is rescaled as well — confirm that
# is intended.
scaler = StandardScaler()
scaler.fit(df[numeric_cols])

df[numeric_cols] = scaler.transform(df[numeric_cols])
df
Index Highest Charting Position Number of Times Charted Week of Highest Charting Song Name Streams Artist Artist Followers Song ID Genre ... Danceability Energy Loudness Speechiness Acousticness Liveness Tempo Duration (ms) Valence Chord
0 -1.731 -1.494 -0.164 2021-07-23--2021-07-30 Beggin' 12.535 Måneskin -0.680 3Wrjm47oTz2sjIgck11l5e ['indie rock italiano', 'italian pop'] ... 0.169 1.031 0.614 -0.664 -0.486 1.234 0.378 0.289 0.327 B
1 -1.729 -1.477 -0.469 2021-07-23--2021-07-30 STAY (with Justin Bieber) 12.124 The Kid LAROI -0.749 5HCyWlXZPP0y6Gqq8TgA20 ['australian hip hop'] ... -0.695 0.808 0.345 -0.683 -0.841 -0.543 1.593 -1.191 -0.162 C#/Db
2 -1.727 -1.494 0.019 2021-06-25--2021-07-02 good 4 u 10.024 Olivia Rodrigo -0.507 4ZtFanR9U6ndgddUvNcjcG ['pop'] ... -0.892 0.189 0.520 0.275 0.345 -0.669 1.491 -0.420 0.763 A
3 -1.725 -1.460 -0.347 2021-07-02--2021-07-09 Bad Habits 9.324 Ed Sheeran 4.114 6PQ88X9TkUIAUIZJHW2upE ['pop', 'uk pop'] ... 0.829 1.631 1.051 -0.805 -0.806 1.269 0.109 0.702 0.336 B
4 -1.722 -1.425 -0.591 2021-07-23--2021-07-30 INDUSTRY BABY (feat. Jack Harlow) 8.183 Lil Nas X -0.554 27NovPIUIRrOZoCHxABJwK ['lgbtq+ hip hop', 'pop rap'] ... 0.323 0.436 -0.423 -0.563 -0.913 -0.910 0.919 0.298 1.669 D#/Eb
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1551 1.720 1.844 -0.591 2019-12-27--2020-01-03 New Rules -0.506 Dua Lipa 0.747 2ekn2ttSfGqwhhate0LSR0 ['dance pop', 'pop', 'uk pop'] ... 0.506 0.412 0.131 -0.492 -0.983 -0.196 -0.228 0.241 0.411 A
1552 1.722 1.861 -0.591 2019-12-27--2020-01-03 Cheirosa - Ao Vivo -0.508 Jorge & Mateus 0.018 2PWjKmjyTZeDpmOUa3a5da ['sertanejo', 'sertanejo universitario'] ... -1.138 1.464 1.286 -0.349 -0.035 1.054 0.999 -0.340 0.877 B
1553 1.724 1.879 -0.591 2019-12-27--2020-01-03 Havana (feat. Young Thug) -0.509 Camila Cabello 0.479 1rfofaqEpACxVEHIZBJe6W ['dance pop', 'electropop', 'pop', 'post-teen ... ... 0.527 -0.684 0.803 -0.849 -0.259 -0.342 -0.603 0.411 -0.531 D
1554 1.726 1.896 -0.591 2019-12-27--2020-01-03 Surtada - Remix Brega Funk -0.513 Dadá Boladão, Tati Zaqui, OIK -0.870 5F8ffc8KWKNawllr5WsW0r ['brega funk', 'funk carioca'] ... 0.997 -0.517 -0.270 -0.589 0.001 0.006 1.057 -0.958 1.612 F
1555 1.728 1.913 -0.591 2019-12-27--2020-01-03 Lover (Remix) [feat. Shawn Mendes] -0.516 Taylor Swift 1.650 3i9UVldZOE0aD0JnyfAZZ0 ['pop', 'post-teen pop'] ... -1.699 -0.189 -0.330 -0.541 0.737 -0.660 2.788 0.496 -0.408 G

1545 rows × 23 columns

# Look at the first character of every "Genre" entry: it is "[" because each
# entry is a string that merely looks like a list.
df["Genre"].map(lambda genre_text: genre_text[0])
0       [
1       [
2       [
3       [
4       [
       ..
1551    [
1552    [
1553    [
1554    [
1555    [
Name: Genre, Length: 1545, dtype: object
# Take the raw "Genre" entry from the row labelled 1 as a sample to experiment on.
s = df.loc[1, "Genre"]
s
"['australian hip hop']"
# list() on a string does not parse it: it just splits the text into its
# individual characters, brackets and quotes included.
list(s)
['[',
 "'",
 'a',
 'u',
 's',
 't',
 'r',
 'a',
 'l',
 'i',
 'a',
 'n',
 ' ',
 'h',
 'i',
 'p',
 ' ',
 'h',
 'o',
 'p',
 "'",
 ']']
# eval() evaluates the text as a Python expression, which turns the string
# into an actual list.
# NOTE(review): eval executes whatever code the string contains; only use it
# on data you trust (ast.literal_eval is the safe alternative for literals).
eval(s)
['australian hip hop']
# The argument is a string containing an expression; eval returns its value.
eval("3+5")
8
# Deliberate error: 3+5 is computed first, so eval receives an int rather
# than a string and raises TypeError.
eval(3+5)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_83360/1095583267.py in <module>
----> 1 eval(3+5)

TypeError: eval() arg 1 must be a string, bytes or code object
# Parse every "Genre" string into a real Python list (preview only; nothing
# is stored). ast.literal_eval accepts only Python literals, so unlike eval
# it cannot run arbitrary code that might be embedded in the CSV text.
df["Genre"].map(ast.literal_eval)
0                [indie rock italiano, italian pop]
1                              [australian hip hop]
2                                             [pop]
3                                     [pop, uk pop]
4                         [lgbtq+ hip hop, pop rap]
                           ...                     
1551                       [dance pop, pop, uk pop]
1552           [sertanejo, sertanejo universitario]
1553    [dance pop, electropop, pop, post-teen pop]
1554                     [brega funk, funk carioca]
1555                           [pop, post-teen pop]
Name: Genre, Length: 1545, dtype: object
# Check the Python type of each "Genre" entry. The previous cell only
# displayed a result, so the column itself still holds strings.
df["Genre"].map(type)
0       <class 'str'>
1       <class 'str'>
2       <class 'str'>
3       <class 'str'>
4       <class 'str'>
            ...      
1551    <class 'str'>
1552    <class 'str'>
1553    <class 'str'>
1554    <class 'str'>
1555    <class 'str'>
Name: Genre, Length: 1545, dtype: object
# Store the parsed genres in a new "Genre_list" column. ast.literal_eval only
# accepts Python literals, so malformed or malicious text from the CSV raises
# an error instead of being executed the way eval would run it.
df["Genre_list"] = df["Genre"].map(ast.literal_eval)
# Sanity check: count the Python types now present in the new column.
df["Genre_list"].map(type).value_counts()
<class 'list'>    1545
Name: Genre_list, dtype: int64
# Count how many rows have each distinct value in the "Chord" column.
df["Chord"].value_counts()
C#/Db    214
C        155
B        141
G        136
F        131
G#/Ab    130
D        125
A#/Bb    122
F#/Gb    121
A        118
E        112
D#/Eb     40
Name: Chord, dtype: int64
# Show the first four rows of the DataFrame.
df.head(4)
Index Highest Charting Position Number of Times Charted Week of Highest Charting Song Name Streams Artist Artist Followers Song ID Genre ... Energy Loudness Speechiness Acousticness Liveness Tempo Duration (ms) Valence Chord Genre_list
0 -1.731 -1.494 -0.164 2021-07-23--2021-07-30 Beggin' 12.535 Måneskin -0.680 3Wrjm47oTz2sjIgck11l5e ['indie rock italiano', 'italian pop'] ... 1.031 0.614 -0.664 -0.486 1.234 0.378 0.289 0.327 B [indie rock italiano, italian pop]
1 -1.729 -1.477 -0.469 2021-07-23--2021-07-30 STAY (with Justin Bieber) 12.124 The Kid LAROI -0.749 5HCyWlXZPP0y6Gqq8TgA20 ['australian hip hop'] ... 0.808 0.345 -0.683 -0.841 -0.543 1.593 -1.191 -0.162 C#/Db [australian hip hop]
2 -1.727 -1.494 0.019 2021-06-25--2021-07-02 good 4 u 10.024 Olivia Rodrigo -0.507 4ZtFanR9U6ndgddUvNcjcG ['pop'] ... 0.189 0.520 0.275 0.345 -0.669 1.491 -0.420 0.763 A [pop]
3 -1.725 -1.460 -0.347 2021-07-02--2021-07-09 Bad Habits 9.324 Ed Sheeran 4.114 6PQ88X9TkUIAUIZJHW2upE ['pop', 'uk pop'] ... 1.631 1.051 -0.805 -0.806 1.269 0.109 0.702 0.336 B [pop, uk pop]

4 rows × 24 columns

# Flag the rows whose genre list contains the exact entry "pop". This is list
# membership, not a substring test, so a list holding only "uk pop" or
# "italian pop" gives False.
df["Genre_list"].map(lambda genres: "pop" in genres)
0       False
1       False
2        True
3        True
4       False
        ...  
1551     True
1552    False
1553     True
1554    False
1555     True
Name: Genre_list, Length: 1545, dtype: bool