{ "cells": [ { "cell_type": "markdown", "id": "df958f0a", "metadata": {}, "source": [ "# Cleaning the \"Genre\" column" ] }, { "cell_type": "code", "execution_count": 1, "id": "a28a15b9", "metadata": {}, "outputs": [], "source": [ "import altair as alt\n", "import numpy as np\n", "import pandas as pd\n", "from pandas.api.types import is_numeric_dtype\n", "from sklearn.preprocessing import StandardScaler\n", "\n", "pd.set_option('display.float_format', lambda x: '%.3f' % x)\n", "\n", "# Fixed seed so anything drawn from rng is reproducible after Restart & Run All.\n", "SEED = 0\n", "rng = np.random.default_rng(SEED)" ] }, { "cell_type": "code", "execution_count": 2, "id": "01c87f3d", "metadata": {}, "outputs": [], "source": [ "# Cells holding a single space are read as missing values (na_values).\n", "df = pd.read_csv(\"../data/spotify_dataset.csv\", na_values=\" \")\n", "\n", "# Strip commas before numeric conversion; .str.replace passes NaN through,\n", "# where a per-element s.replace would raise AttributeError on a missing value.\n", "df[\"Streams\"] = pd.to_numeric(df[\"Streams\"].str.replace(\",\", \"\", regex=False))\n", "\n", "# Keep only fully populated rows (row labels are preserved); copy so the\n", "# column assignment below writes to an independent frame.\n", "df = df.dropna().copy()\n", "numeric_cols = [c for c in df.columns if is_numeric_dtype(df[c])]\n", "\n", "# Standardize every numeric column in one step; scaler stays fitted for later reuse.\n", "scaler = StandardScaler()\n", "df[numeric_cols] = scaler.fit_transform(df[numeric_cols])" ] }, { "cell_type": "code", "execution_count": 3, "id": "156bd759-1e39-4b33-ba60-e5bbb154b1db", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IndexHighest Charting PositionNumber of Times ChartedWeek of Highest ChartingSong NameStreamsArtistArtist FollowersSong IDGenre...DanceabilityEnergyLoudnessSpeechinessAcousticnessLivenessTempoDuration (ms)ValenceChord
0-1.731-1.494-0.1642021-07-23--2021-07-30Beggin'12.535Måneskin-0.6803Wrjm47oTz2sjIgck11l5e['indie rock italiano', 'italian pop']...0.1691.0310.614-0.664-0.4861.2340.3780.2890.327B
1-1.729-1.477-0.4692021-07-23--2021-07-30STAY (with Justin Bieber)12.124The Kid LAROI-0.7495HCyWlXZPP0y6Gqq8TgA20['australian hip hop']...-0.6950.8080.345-0.683-0.841-0.5431.593-1.191-0.162C#/Db
2-1.727-1.4940.0192021-06-25--2021-07-02good 4 u10.024Olivia Rodrigo-0.5074ZtFanR9U6ndgddUvNcjcG['pop']...-0.8920.1890.5200.2750.345-0.6691.491-0.4200.763A
3-1.725-1.460-0.3472021-07-02--2021-07-09Bad Habits9.324Ed Sheeran4.1146PQ88X9TkUIAUIZJHW2upE['pop', 'uk pop']...0.8291.6311.051-0.805-0.8061.2690.1090.7020.336B
4-1.722-1.425-0.5912021-07-23--2021-07-30INDUSTRY BABY (feat. Jack Harlow)8.183Lil Nas X-0.55427NovPIUIRrOZoCHxABJwK['lgbtq+ hip hop', 'pop rap']...0.3230.436-0.423-0.563-0.913-0.9100.9190.2981.669D#/Eb
..................................................................
15511.7201.844-0.5912019-12-27--2020-01-03New Rules-0.506Dua Lipa0.7472ekn2ttSfGqwhhate0LSR0['dance pop', 'pop', 'uk pop']...0.5060.4120.131-0.492-0.983-0.196-0.2280.2410.411A
15521.7221.861-0.5912019-12-27--2020-01-03Cheirosa - Ao Vivo-0.508Jorge & Mateus0.0182PWjKmjyTZeDpmOUa3a5da['sertanejo', 'sertanejo universitario']...-1.1381.4641.286-0.349-0.0351.0540.999-0.3400.877B
15531.7241.879-0.5912019-12-27--2020-01-03Havana (feat. Young Thug)-0.509Camila Cabello0.4791rfofaqEpACxVEHIZBJe6W['dance pop', 'electropop', 'pop', 'post-teen ......0.527-0.6840.803-0.849-0.259-0.342-0.6030.411-0.531D
15541.7261.896-0.5912019-12-27--2020-01-03Surtada - Remix Brega Funk-0.513Dadá Boladão, Tati Zaqui, OIK-0.8705F8ffc8KWKNawllr5WsW0r['brega funk', 'funk carioca']...0.997-0.517-0.270-0.5890.0010.0061.057-0.9581.612F
15551.7281.913-0.5912019-12-27--2020-01-03Lover (Remix) [feat. Shawn Mendes]-0.516Taylor Swift1.6503i9UVldZOE0aD0JnyfAZZ0['pop', 'post-teen pop']...-1.699-0.189-0.330-0.5410.737-0.6602.7880.496-0.408G
\n", "

1545 rows × 23 columns

\n", "
" ], "text/plain": [ " Index Highest Charting Position Number of Times Charted \\\n", "0 -1.731 -1.494 -0.164 \n", "1 -1.729 -1.477 -0.469 \n", "2 -1.727 -1.494 0.019 \n", "3 -1.725 -1.460 -0.347 \n", "4 -1.722 -1.425 -0.591 \n", "... ... ... ... \n", "1551 1.720 1.844 -0.591 \n", "1552 1.722 1.861 -0.591 \n", "1553 1.724 1.879 -0.591 \n", "1554 1.726 1.896 -0.591 \n", "1555 1.728 1.913 -0.591 \n", "\n", " Week of Highest Charting Song Name Streams \\\n", "0 2021-07-23--2021-07-30 Beggin' 12.535 \n", "1 2021-07-23--2021-07-30 STAY (with Justin Bieber) 12.124 \n", "2 2021-06-25--2021-07-02 good 4 u 10.024 \n", "3 2021-07-02--2021-07-09 Bad Habits 9.324 \n", "4 2021-07-23--2021-07-30 INDUSTRY BABY (feat. Jack Harlow) 8.183 \n", "... ... ... ... \n", "1551 2019-12-27--2020-01-03 New Rules -0.506 \n", "1552 2019-12-27--2020-01-03 Cheirosa - Ao Vivo -0.508 \n", "1553 2019-12-27--2020-01-03 Havana (feat. Young Thug) -0.509 \n", "1554 2019-12-27--2020-01-03 Surtada - Remix Brega Funk -0.513 \n", "1555 2019-12-27--2020-01-03 Lover (Remix) [feat. Shawn Mendes] -0.516 \n", "\n", " Artist Artist Followers Song ID \\\n", "0 Måneskin -0.680 3Wrjm47oTz2sjIgck11l5e \n", "1 The Kid LAROI -0.749 5HCyWlXZPP0y6Gqq8TgA20 \n", "2 Olivia Rodrigo -0.507 4ZtFanR9U6ndgddUvNcjcG \n", "3 Ed Sheeran 4.114 6PQ88X9TkUIAUIZJHW2upE \n", "4 Lil Nas X -0.554 27NovPIUIRrOZoCHxABJwK \n", "... ... ... ... \n", "1551 Dua Lipa 0.747 2ekn2ttSfGqwhhate0LSR0 \n", "1552 Jorge & Mateus 0.018 2PWjKmjyTZeDpmOUa3a5da \n", "1553 Camila Cabello 0.479 1rfofaqEpACxVEHIZBJe6W \n", "1554 Dadá Boladão, Tati Zaqui, OIK -0.870 5F8ffc8KWKNawllr5WsW0r \n", "1555 Taylor Swift 1.650 3i9UVldZOE0aD0JnyfAZZ0 \n", "\n", " Genre ... Danceability \\\n", "0 ['indie rock italiano', 'italian pop'] ... 0.169 \n", "1 ['australian hip hop'] ... -0.695 \n", "2 ['pop'] ... -0.892 \n", "3 ['pop', 'uk pop'] ... 0.829 \n", "4 ['lgbtq+ hip hop', 'pop rap'] ... 0.323 \n", "... ... ... ... \n", "1551 ['dance pop', 'pop', 'uk pop'] ... 
0.506 \n", "1552 ['sertanejo', 'sertanejo universitario'] ... -1.138 \n", "1553 ['dance pop', 'electropop', 'pop', 'post-teen ... ... 0.527 \n", "1554 ['brega funk', 'funk carioca'] ... 0.997 \n", "1555 ['pop', 'post-teen pop'] ... -1.699 \n", "\n", " Energy Loudness Speechiness Acousticness Liveness Tempo \\\n", "0 1.031 0.614 -0.664 -0.486 1.234 0.378 \n", "1 0.808 0.345 -0.683 -0.841 -0.543 1.593 \n", "2 0.189 0.520 0.275 0.345 -0.669 1.491 \n", "3 1.631 1.051 -0.805 -0.806 1.269 0.109 \n", "4 0.436 -0.423 -0.563 -0.913 -0.910 0.919 \n", "... ... ... ... ... ... ... \n", "1551 0.412 0.131 -0.492 -0.983 -0.196 -0.228 \n", "1552 1.464 1.286 -0.349 -0.035 1.054 0.999 \n", "1553 -0.684 0.803 -0.849 -0.259 -0.342 -0.603 \n", "1554 -0.517 -0.270 -0.589 0.001 0.006 1.057 \n", "1555 -0.189 -0.330 -0.541 0.737 -0.660 2.788 \n", "\n", " Duration (ms) Valence Chord \n", "0 0.289 0.327 B \n", "1 -1.191 -0.162 C#/Db \n", "2 -0.420 0.763 A \n", "3 0.702 0.336 B \n", "4 0.298 1.669 D#/Eb \n", "... ... ... ... 
\n", "1551 0.241 0.411 A \n", "1552 -0.340 0.877 B \n", "1553 0.411 -0.531 D \n", "1554 -0.958 1.612 F \n", "1555 0.496 -0.408 G \n", "\n", "[1545 rows x 23 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 4, "id": "a1663752", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 [\n", "1 [\n", "2 [\n", "3 [\n", "4 [\n", " ..\n", "1551 [\n", "1552 [\n", "1553 [\n", "1554 [\n", "1555 [\n", "Name: Genre, Length: 1545, dtype: object" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[\"Genre\"].map(lambda s: s[0])" ] }, { "cell_type": "code", "execution_count": 5, "id": "3f609658", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "\"['australian hip hop']\"" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s = df.loc[1,\"Genre\"]\n", "s" ] }, { "cell_type": "code", "execution_count": 6, "id": "0fed6d0a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['[',\n", " \"'\",\n", " 'a',\n", " 'u',\n", " 's',\n", " 't',\n", " 'r',\n", " 'a',\n", " 'l',\n", " 'i',\n", " 'a',\n", " 'n',\n", " ' ',\n", " 'h',\n", " 'i',\n", " 'p',\n", " ' ',\n", " 'h',\n", " 'o',\n", " 'p',\n", " \"'\",\n", " ']']" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "list(s)" ] }, { "cell_type": "code", "execution_count": 7, "id": "5bfe03dc", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['australian hip hop']" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "eval(s)" ] }, { "cell_type": "code", "execution_count": 8, "id": "433d4390", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "8" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "eval(\"3+5\")" ] }, { "cell_type": "code", "execution_count": 9, "id": "c7ac025b", "metadata": {}, 
"outputs": [ { "ename": "TypeError", "evalue": "eval() arg 1 must be a string, bytes or code object", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_38831/1095583267.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0meval\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mTypeError\u001b[0m: eval() arg 1 must be a string, bytes or code object" ] } ], "source": [ "eval(3+5)" ] }, { "cell_type": "code", "execution_count": null, "id": "c8cdad35", "metadata": {}, "outputs": [], "source": [ "import ast\n", "\n", "# SECURITY: eval would execute any expression stored in the CSV.\n", "# ast.literal_eval only parses Python literals (here: lists of strings),\n", "# so it gives the same lists without running code from the data file.\n", "df[\"Genre\"].map(ast.literal_eval)" ] }, { "cell_type": "code", "execution_count": null, "id": "51f62987", "metadata": {}, "outputs": [], "source": [ "df[\"Genre\"].map(type)" ] }, { "cell_type": "code", "execution_count": null, "id": "095a5026", "metadata": {}, "outputs": [], "source": [ "# Store the parsed genre lists next to the raw strings (literal_eval instead of eval, see above).\n", "df[\"Genre_list\"] = df[\"Genre\"].map(ast.literal_eval)" ] }, { "cell_type": "code", "execution_count": null, "id": "e14cf428", "metadata": { "scrolled": true }, "outputs": [], "source": [ "df[\"Genre_list\"].map(type).value_counts()" ] }, { "cell_type": "code", "execution_count": null, "id": "af86e0e0", "metadata": { "scrolled": true }, "outputs": [], "source": [ "df[\"Chord\"].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "id": "cbc2dda8", "metadata": {}, "outputs": [], "source": [ "df.head(4)" ] }, { "cell_type": "code", "execution_count": null, "id": "1699f9ad", "metadata": {}, "outputs": [], "source": [ "df[\"Genre_list\"].map(lambda x: \"pop\" in x)" ] }, { "cell_type": "code", "execution_count": null, "id": "1c79bf00", "metadata": {}, "outputs": [], "source": [] } ],
"metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 5 }