{ "cells": [ { "cell_type": "markdown", "id": "df958f0a", "metadata": {}, "source": [ "# Cleaning the \"Genre\" column" ] }, { "cell_type": "code", "execution_count": 1, "id": "a28a15b9", "metadata": {}, "outputs": [], "source": [ "import altair as alt\n", "import numpy as np\n", "import pandas as pd\n", "from pandas.api.types import is_numeric_dtype\n", "from sklearn.preprocessing import StandardScaler\n", "\n", "# Show floats with three decimals in every DataFrame repr.\n", "pd.set_option('display.float_format', lambda x: '%.3f' % x)\n", "\n", "# Fixed seed so any random draw below is identical after Restart & Run All.\n", "SEED = 42\n", "rng = np.random.default_rng(SEED)" ] }, { "cell_type": "code", "execution_count": 2, "id": "01c87f3d", "metadata": {}, "outputs": [], "source": [ "# Cells holding a single space in the CSV are read as NaN.\n", "df = pd.read_csv(\"../data/spotify_dataset.csv\", na_values = \" \")\n", "\n", "# Streams is text with thousands separators. The vectorized .str accessor\n", "# passes NaN through, whereas calling s.replace on each value via map()\n", "# raises AttributeError on a missing entry.\n", "df[\"Streams\"] = pd.to_numeric(df[\"Streams\"].str.replace(\",\", \"\", regex=False))\n", "\n", "# Keep only fully populated rows; the original index labels are preserved.\n", "df = df.dropna().copy()\n", "numeric_cols = [c for c in df.columns if is_numeric_dtype(df[c])]\n", "\n", "# Standardize every numeric column (StandardScaler defaults: zero mean, unit variance).\n", "# NOTE(review): this also rescales identifier-like columns such as Index -- confirm that is intended.\n", "scaler = StandardScaler()\n", "df[numeric_cols] = scaler.fit_transform(df[numeric_cols])" ] }, { "cell_type": "code", "execution_count": 3, "id": "156bd759-1e39-4b33-ba60-e5bbb154b1db", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | Index | \n", "Highest Charting Position | \n", "Number of Times Charted | \n", "Week of Highest Charting | \n", "Song Name | \n", "Streams | \n", "Artist | \n", "Artist Followers | \n", "Song ID | \n", "Genre | \n", "... | \n", "Danceability | \n", "Energy | \n", "Loudness | \n", "Speechiness | \n", "Acousticness | \n", "Liveness | \n", "Tempo | \n", "Duration (ms) | \n", "Valence | \n", "Chord | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "-1.731 | \n", "-1.494 | \n", "-0.164 | \n", "2021-07-23--2021-07-30 | \n", "Beggin' | \n", "12.535 | \n", "Måneskin | \n", "-0.680 | \n", "3Wrjm47oTz2sjIgck11l5e | \n", "['indie rock italiano', 'italian pop'] | \n", "... | \n", "0.169 | \n", "1.031 | \n", "0.614 | \n", "-0.664 | \n", "-0.486 | \n", "1.234 | \n", "0.378 | \n", "0.289 | \n", "0.327 | \n", "B | \n", "
| 1 | \n", "-1.729 | \n", "-1.477 | \n", "-0.469 | \n", "2021-07-23--2021-07-30 | \n", "STAY (with Justin Bieber) | \n", "12.124 | \n", "The Kid LAROI | \n", "-0.749 | \n", "5HCyWlXZPP0y6Gqq8TgA20 | \n", "['australian hip hop'] | \n", "... | \n", "-0.695 | \n", "0.808 | \n", "0.345 | \n", "-0.683 | \n", "-0.841 | \n", "-0.543 | \n", "1.593 | \n", "-1.191 | \n", "-0.162 | \n", "C#/Db | \n", "
| 2 | \n", "-1.727 | \n", "-1.494 | \n", "0.019 | \n", "2021-06-25--2021-07-02 | \n", "good 4 u | \n", "10.024 | \n", "Olivia Rodrigo | \n", "-0.507 | \n", "4ZtFanR9U6ndgddUvNcjcG | \n", "['pop'] | \n", "... | \n", "-0.892 | \n", "0.189 | \n", "0.520 | \n", "0.275 | \n", "0.345 | \n", "-0.669 | \n", "1.491 | \n", "-0.420 | \n", "0.763 | \n", "A | \n", "
| 3 | \n", "-1.725 | \n", "-1.460 | \n", "-0.347 | \n", "2021-07-02--2021-07-09 | \n", "Bad Habits | \n", "9.324 | \n", "Ed Sheeran | \n", "4.114 | \n", "6PQ88X9TkUIAUIZJHW2upE | \n", "['pop', 'uk pop'] | \n", "... | \n", "0.829 | \n", "1.631 | \n", "1.051 | \n", "-0.805 | \n", "-0.806 | \n", "1.269 | \n", "0.109 | \n", "0.702 | \n", "0.336 | \n", "B | \n", "
| 4 | \n", "-1.722 | \n", "-1.425 | \n", "-0.591 | \n", "2021-07-23--2021-07-30 | \n", "INDUSTRY BABY (feat. Jack Harlow) | \n", "8.183 | \n", "Lil Nas X | \n", "-0.554 | \n", "27NovPIUIRrOZoCHxABJwK | \n", "['lgbtq+ hip hop', 'pop rap'] | \n", "... | \n", "0.323 | \n", "0.436 | \n", "-0.423 | \n", "-0.563 | \n", "-0.913 | \n", "-0.910 | \n", "0.919 | \n", "0.298 | \n", "1.669 | \n", "D#/Eb | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 1551 | \n", "1.720 | \n", "1.844 | \n", "-0.591 | \n", "2019-12-27--2020-01-03 | \n", "New Rules | \n", "-0.506 | \n", "Dua Lipa | \n", "0.747 | \n", "2ekn2ttSfGqwhhate0LSR0 | \n", "['dance pop', 'pop', 'uk pop'] | \n", "... | \n", "0.506 | \n", "0.412 | \n", "0.131 | \n", "-0.492 | \n", "-0.983 | \n", "-0.196 | \n", "-0.228 | \n", "0.241 | \n", "0.411 | \n", "A | \n", "
| 1552 | \n", "1.722 | \n", "1.861 | \n", "-0.591 | \n", "2019-12-27--2020-01-03 | \n", "Cheirosa - Ao Vivo | \n", "-0.508 | \n", "Jorge & Mateus | \n", "0.018 | \n", "2PWjKmjyTZeDpmOUa3a5da | \n", "['sertanejo', 'sertanejo universitario'] | \n", "... | \n", "-1.138 | \n", "1.464 | \n", "1.286 | \n", "-0.349 | \n", "-0.035 | \n", "1.054 | \n", "0.999 | \n", "-0.340 | \n", "0.877 | \n", "B | \n", "
| 1553 | \n", "1.724 | \n", "1.879 | \n", "-0.591 | \n", "2019-12-27--2020-01-03 | \n", "Havana (feat. Young Thug) | \n", "-0.509 | \n", "Camila Cabello | \n", "0.479 | \n", "1rfofaqEpACxVEHIZBJe6W | \n", "['dance pop', 'electropop', 'pop', 'post-teen ... | \n", "... | \n", "0.527 | \n", "-0.684 | \n", "0.803 | \n", "-0.849 | \n", "-0.259 | \n", "-0.342 | \n", "-0.603 | \n", "0.411 | \n", "-0.531 | \n", "D | \n", "
| 1554 | \n", "1.726 | \n", "1.896 | \n", "-0.591 | \n", "2019-12-27--2020-01-03 | \n", "Surtada - Remix Brega Funk | \n", "-0.513 | \n", "Dadá Boladão, Tati Zaqui, OIK | \n", "-0.870 | \n", "5F8ffc8KWKNawllr5WsW0r | \n", "['brega funk', 'funk carioca'] | \n", "... | \n", "0.997 | \n", "-0.517 | \n", "-0.270 | \n", "-0.589 | \n", "0.001 | \n", "0.006 | \n", "1.057 | \n", "-0.958 | \n", "1.612 | \n", "F | \n", "
| 1555 | \n", "1.728 | \n", "1.913 | \n", "-0.591 | \n", "2019-12-27--2020-01-03 | \n", "Lover (Remix) [feat. Shawn Mendes] | \n", "-0.516 | \n", "Taylor Swift | \n", "1.650 | \n", "3i9UVldZOE0aD0JnyfAZZ0 | \n", "['pop', 'post-teen pop'] | \n", "... | \n", "-1.699 | \n", "-0.189 | \n", "-0.330 | \n", "-0.541 | \n", "0.737 | \n", "-0.660 | \n", "2.788 | \n", "0.496 | \n", "-0.408 | \n", "G | \n", "
1545 rows × 23 columns
\n", "