Data cleaning

Import the Spotify dataset from Canvas->Files->Datasets.

We’ve worked a lot with this dataset, but so far we have used some unexplained commands to “clean” the dataset after importing. Today we’ll go through the process of cleaning the dataset in a more systematic way.

import pandas as pd
import altair as alt
import numpy as np
# change path so it works on your computer
df = pd.read_csv("../data/spotify_dataset.csv") 

Data cleaning

We get strange results if we try to use the Spotify dataset without any cleaning.

alt.Chart(df[:50]).mark_circle().encode(
    x = "Energy",
    y = "Loudness"
)
df["Loudness"].sum()
'-4.808-5.484-5.044-3.712-7.409-6.682-3.541-3.605-4.601-4.008-3.787-3.964-5.33-6.181-5.187-7.885-7.222-4.645-5.015-6.509-4.649-4.601-8.81-6.9-5.934-9.72-3.955-3.818-3.424-6.865-7.964-6.967-10.497-5.102-4.209 -6.862-6.414-3.728-10.059-4.522-6.334-5.382-4.41-7.079-4.878-3.671-6.345-4.733-7.454-3.167-7.018-9.921-6.985-4.77-9.918-4.404-5.988-5.483-8.173-3.424-5.603-6.446-4.645-8.964-4.521-4.687-2.81-2.758-5.679-5.196-5.865-3.224-2.981-7.114-3.115-9.016-4.572-4.407-6.4-7.357-6.349-5.307-4.668-7.846-3.158-4.593-5.356-7.85-12.603-7.542-4.771-3.497-4.899-2.749-3.126-4.175-5.201-6.549-4.858-3.787-6.642-2.088-4.374-6.153-4.754-5.472-0.515-4.632-5.748-10.109-5.574-11.177-8.433-3.597-4.662-3.183-6.039-5.797-6.312-4.425-4.892-7.956-7.301-9.389-5.298-6.694-4.556-10.706-9.461-4.163-5.616-7.241-8.532-5.335-5.303-4.48-4.227-6.044-3.128-9.575-9.961-4.637-7.644-7.846-4.578-4.439-6.362-7.476-1.891-4.972-7.227-6.658-4.402-5.697-3.825-6.237-8.493-10.34-5.85-5.049-7.23-7.636 -10.965-6.879-6.606-12.627-7.509-3.74-9.796-9.744-7.398-6.744-6.995-6.934-8.765-4.245-6.472-5.609-3.675-6.836-6.487-3.599-7.064-7.813-5.732-4.12-7.063-8.37-3.639-10.618-10.732-5.16-4.075-5.652-8.56-11.254-8.605-4.244-8.302-7.226-6.967-8.377-7.476-7.347-7.331-7.524-5.666-10.569-8.043-6.59-6.416-4.87-4.258-8.494-9.184-5.82-7.791-5.218-5.956-5.932-5.733-3.728-7.132-6.139-12.181-4.628-1.992-5.301-5.684-4.039-3.612-5.757-3.286-5.599-4.097-3.751-5.737-8.121-7.949-10.131-4.053-6.831-9.405-14.025-7.62-7.6-6.201-3.912-3.293-2.639-5.123-6.455-7.596-6.341-6.612-5.382-4.851-4.912-6.115-6.724-7.778-5.348-6.029-7.446-3.367-5.736-3.917-5.791-5.029-4.297-8.234-8.52-5.382-5.013-7.78-4.818-7.867-7.026-9.144-7.336-6.687-3.255-4.457-8.026-5.134-5.413-8.764-3.434-8.871-4.619-4.132-5.87-8.465-3.714-6.769-5.6-5.81-8.066-7.664-6.553-8.231-3.709-6.836-8.784-6.217-7.598-7.773-4.835-4.49-6.023-6.798-8.05-3.764-5.167-6.612-5.167-4.925-6.173-6.92-9.358-4.939-5.257-4.939-4.158-8.981-6.125-4.032-7.226-3.93-4.757-2.564-5.26-7.909-7.036-8.056-5.79-5.525-7.486-6.404-8.665-5.242-5.99-5.277-2.876-3.795-5.34-4.668-8.011-6.959-6.205-3.593-9.187-7.266-4.197-6.916-8.211-5.302-5.667-6.579-6.973-5.756-7.026-6.257-5.362-5.198-5.123-6.502-3.371-4.603-3.17-7.071-6.938-10.086-2.906-8.043-14.454-7.327-3.083-6.924-5.926-5.831-5.879-6.13-5.718-3.169-2.39-5.181-5.122-4.103-6.499-5.712-4.054-4.818-7.547-7.472-6.578-6.269-14.075-7.564-4.548-8.256-6.091-3.402-8.149-8.019-7.846-10.778-5.995-5.726-7.076-5.371-4.88-5.495-7.106-8.648-3.891-5.677-7.638-7.351-4.311-8.754-2.324-9.195-6.478-5.851-5.795-4.856-4.963-3.09-4.61-4.665-7.606-6.135-7.377-4.433-6.066-8.093-7.062-3.669-4.99-3.665-3.417-11.548-8.819-4.25-5.811-6.003-8.084-5.065-5.119-4.248-10.109-6.658-2.666-4.386-5.077-4.828-5.569-8.68-5.224-7.638-4.048 -4.232-7.831-8.601-3.28-4.596-6.972-4.955-5.53-4.386-3.049-6.629-7.015-4.107-4.096-6.655-5.239-12.805-6.169-13.125-3.09-4.787-5.428-8.621-4.566-7.376-4.272-7.973-6.257-2.173-4.798-4.246-4.583-6.081-4.083-5.946-6.335-5.497-6.55-5.222-3.123-5.471-6.859-9.989-4.318-5.453-7.681-5.891-3.152-5.36-9.934-7.627-5.46-4.563-4.943-4.778-6.254-4.395-5.616-6.136-10.22-7.05-7.188-6.637-9.805-10.48 -8.96-3.944-4.904-5.372-7.16-10.936-5.128-6.208-3.855-5.664-5.041-4.423-3.71-3.128-5.132-4.787-5.723-6.061-9.75-8.275-7.567-4.097-7.066-6.326-14.66-4.89-5.526-6.433-4.954-5.466-8.991-2.827-3.616-5.331-5.507-5.664-6.319-7.882-4.426-6.519-8.746-7.499-6.585-6.143-5.24-4.65-5.453-6.899-7.675-6.79-8.357-6.764-11.052-2.028-12.077-8.932-6.965-6.564-7.628-5.995-4.057-5.661-3.766-6.687-6.628-5.254-5.13-7.147-4.819-5.655-5.479-5.037-2.118-4.81-7.088-7.052-4.352-7.589-6.832-4.214-2.749-5.277-2.474-2.415-7.214-8.659-3.672-7.443-7.463-12.472-7.364-8.749-8.463-11.714-8.435-5.439-9.004-14.014-13.336-9.049-7.013-6.936-8.914-8.308-10.023 -15.231-16.485-5.062-13.056-7.48-3.315-16.661-5.662-4.177-7.183-12.25-7.146-5.584-6.531-1.283-5.536-3.592 -9.478-9.402-6.191-3.766-4.776-6.007-4.68-4.291-4.582-12.955-7.109-8.869-8.193-8.222-12.346-22.507-10.491-6.961-12.073-16.093-8.444-3.849-9.336-2.204-7.779-16.768-14.405-13.119-3.298-15.499-7.381-9.2-6.676-4.505-6.264-3.967-10.381-11.568-6.232-10.733-9.277-8.322-11.266-10.673-7.704-9.014-11.609-10.813-7.607-7.75-6.907-8.269-6.736-6.971-7.1-4.889-15.392-8.736-8.588-4.672-4.944-5.793-5.224-5.054-6.572-4.035-6.755-7.313-4.074-5.506-4.973-5.701-3.508-6.7-4.075-6.834-6.098-6.054-5.542-5.372-8.914-7.355-4.333-5.352-5.048-5.303-7.371-4.948-8.871-13.135-1.957-3.895-3.961-7.636-7.012 -7.091-7.328-6.865-6.535-7.42-5.695-6.664-6.336-4.574-6.444-4.644-8.937-3.945-9.966-6.297-8.415-4.592-3.316-7.833-6.964-7.295-5.724-4.249-7.034-5.998-6.098-10.798-5.156-5.963-4.727-16.541-10.196-5.195 -7.458-4.681-10.755-2.737-5.75-5.308-6.558-6.321-5.2-3.617-3.743-5.563-7.258-6.842-5.334-8.113-2.982-8.045-4.04-7.154-5.03-8.632-3.726-7.513-9.155-2.998-4.793-4.392-7.746-3.417-5.405-6.168-5.267-5.634-6.929-5.363-5.025-7.648-5.893-7.346-5.077-6.883-8.251-6.972-6.429-5.491-3.844-2.762-6.186-2.259-6.266-6.591-5.16-11.715-7.47-2.791-8.953-7.83-5.768-4.689-6.439-7.137-4.845-3.089-4.731-6.756-5.625-4.333-6.494-7.272-5.319-5.893-6.642-8.259-8.711-2.691-9.232-9.869-4.068-6.917-9.297-7.571-8.431-4.212-7.617-5.582-7.199-8.914-7.117-6.184-8.024 -4.113-6.313-4.578-4.78-5.224-7.355-6.916-7.264-5.386-10.646-6.829-9.726-8.426-7.426-5.825-7.813-7.507-5.688-3.756-5.749-5.521-4.719-5.585-6.371-6.456-4.387-10.945-4.157-5.064-6.145-4.874-4.593-6.805-5.591-4.772-5.217-7.011-3.08-10.568-4.959-6.66-4.368-5.085-5.88-6.942-7.037-7.067-5.908-4.78-5.312-5.697-4.895-6.196-4.164-3.812-9.628-5.884-3.011-6.153-5.562-5.146-8.742-8.9-8.018-7.907-9.208-5.613-6.678-5.415-8.322-5.114-8.505-7.34-10.63-17.57-5.792-6.66-2.561-5.396-4.635-5.442-5.908-6.896-11.143-7.159-3.966-5.171-7.359-10.037-10.769-9.843-8.761-6.503-10.445-5.702-7.121-7.935-3.84-5.609-8.995-13.69-12.906-4.902-7.336-8.553-5.453-15.065-6.417-8.225-5.725-2.473-5.132-5.948-3.838-7.664-7.616-5.233-4.928-6.184-2.494-9.236-5.726-6.36-6.668-9.727-5.756-5.802-5.22-10.157-6.356-5.124-6.328-5.89-7.371-5.062-6.935-6.257-3.733-7.064-4.561-8.946-5.56-5.189-3.702-5.98-5.161-6.612-8.442-6.027-3.875-3.909-2.957-11.337-14.097-14.625-5.926-4.85-7.434-2.569-5.377-3.527-6.213-4.105-7.899-6.963-7.487-5.817-5.73-8.383-6.761-6.871-4.961-5.656-6.687-5.923-7.39-5.313-6.097-3.831-2.344-4.833-3.504-5.804-9.072-5.817-7.671-3.242-5.382-4.269-6.976-4.162-3.159-5.464-5.065-7.621-2.729-4.045-4.64-5.728-7.726-5.718-7.416-5.234-8.165-8.985-6.825-5.019-6.22-8.969-3.905-2.082-7.735-8.819-6.6-5.888-4.296-5.268-11.729-6.8061.509-4.524-4.452-7.246-3.981-6.117-9.884-5.353-6.202-8.57-4.356-6.342-4.394-6.759-6.024-5.714-5.35-7.546-8.027-7.246-6.363-6.159-4.219-4.95-4.198-7.206-4.773-7.598-6.366-10.068-5.245-7.34-4.287-3.634-4.262-5.746-4.035-3.875-4.842-14.534-4.372-14.534-4.466-7.002 -6.637-6.599-6.083-4.363-9.045-6.591-8.008-4.014-8.521-5.522-9.0-3.836-8.65-3.875-4.43-5.09-10.805-5.444-1.573-6.35-4.993-2.634-6.099-5.85-5.624-2.97-3.936-3.592-2.409-7.043-5.532-7.182-3.18-2.44-5.425-10.34-9.909-3.158-8.187-18.717-7.397-6.483-6.084-8.217-8.051-7.274-4.784-3.387-8.624-3.752-3.69-7.648-5.245-4.931-13.273-5.709-5.786-7.706-9.299-4.629-6.808-3.68-4.151-7.837-7.385-4.425-6.381-11.144-9.884-6.75-3.095-9.211-3.952-8.11-5.595-4.454-8.212-6.615-6.571-8.054-6.116-5.184-4.842-4.306-2.592-4.628-4.692-5.395-4.535-6.262-4.478-5.368-4.799-6.973-4.363-5.734-4.375-6.594-8.137-4.941-9.087-4.864-7.889-4.41-8.144-8.468-2.622-3.268-1.865-3.001-5.746-7.031-3.639-6.103-7.847-5.113-5.554-6.124-4.823-4.015-4.502-5.001-8.24-7.953-5.954-6.146-6.943-5.221-4.316-7.268-7.475-8.996-8.529-4.242-7.293-5.416-2.961-7.918-9.887-5.715-4.717-2.228-4.295-6.101-5.736-3.835-8.57-5.26-4.141-6.703-6.9-5.469-3.082-10.426-5.507-3.999-6.718-6.633-6.542-6.238-4.496-5.708-9.901-6.181-5.895-6.978-3.661-6.293-5.402-3.73-6.02-6.274-6.224-5.568-5.331-8.0-7.428-5.628-5.397-7.086-5.822-5.167-4.24-4.803-7.589-12.761-5.937-6.672-4.156-4.711-4.139-6.793-4.055-6.256-9.326-4.485-6.11-4.397-5.749-7.105-9.435-6.527-5.486-7.058-6.665-4.957-5.342-3.39-6.376-5.408-6.46-3.208-6.713-2.708-8.543-5.01-3.244-8.637-7.57-5.188-8.766-3.582-3.147-2.814-8.191-7.782-4.178-8.33-5.139-5.906-5.455-4.594-4.443-5.617-2.454-4.271-5.782-3.046-14.505-6.993-5.086-6.456-5.211-5.215-7.981-6.658-3.106-6.439-4.805-6.164-4.188-4.565-8.922-1.94-8.601-4.124-6.043-4.796-4.902-18.435-4.501-5.608-8.389-6.625-7.564-6.913-6.112-7.319-6.843-4.51-7.766-8.186-7.353-7.595-3.067-4.673-3.019-6.378-6.959-8.794-4.228-5.414-12.92-10.198-4.895-4.123-9.897-10.348-8.237-6.637-8.381-8.943-3.32-6.406-3.001-3.231-6.913-3.292-3.922-8.866-5.195-2.986-2.203-6.482-4.027-5.883-9.754-8.575-8.206-6.585-2.652-11.713-2.331-5.95-6.747-6.29-3.723-11.047-6.861-5.127-5.774-4.711-5.892-6.684-6.365-4.105-6.146-8.461-6.946-7.787-7.161-4.197-8.98-5.937-5.793-7.491-4.809-5.212-4.362-7.908-4.491-5.244-4.84-3.87-5.839-5.193-9.922-11.198-5.407-8.073-3.052-6.356-3.121-2.107-5.125-10.847-6.625-9.077-20.43-4.162-5.941-6.543-13.066-10.086-16.735-6.753-8.608-6.278-6.121-8.882-7.261-5.35-8.408-8.082-7.536-4.351-8.533-9.263-6.991-6.445-2.665-8.513-8.054-3.841-2.853-5.969-4.218-8.676-10.304-2.881-4.167-6.605-4.63-4.578-5.191-2.326-4.841 -6.47-4.792-3.24-5.938-5.782-5.808-5.857-25.166-3.032-5.56-7.895-3.384-6.021-3.123-4.333-7.026-7.176'
df.dtypes
Index                         int64
Highest Charting Position     int64
Number of Times Charted       int64
Week of Highest Charting     object
Song Name                    object
Streams                      object
Artist                       object
Artist Followers             object
Song ID                      object
Genre                        object
Release Date                 object
Weeks Charted                object
Popularity                   object
Danceability                 object
Energy                       object
Loudness                     object
Speechiness                  object
Acousticness                 object
Liveness                     object
Tempo                        object
Duration (ms)                object
Valence                      object
Chord                        object
dtype: object
df["Loudness"][:10]
0    -4.808
1    -5.484
2    -5.044
3    -3.712
4    -7.409
5    -6.682
6    -3.541
7    -3.605
8    -4.601
9    -4.008
Name: Loudness, dtype: object
pd.to_numeric(df["Energy"])
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/_libs/lib.pyx in pandas._libs.lib.maybe_convert_numeric()

ValueError: Unable to parse string " "

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_83288/287173585.py in <module>
----> 1 pd.to_numeric(df["Energy"])

~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/core/tools/numeric.py in to_numeric(arg, errors, downcast)
    181         coerce_numeric = errors not in ("ignore", "raise")
    182         try:
--> 183             values, _ = lib.maybe_convert_numeric(
    184                 values, set(), coerce_numeric=coerce_numeric
    185             )

~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/_libs/lib.pyx in pandas._libs.lib.maybe_convert_numeric()

ValueError: Unable to parse string " " at position 35
df.loc[30:40,"Energy"]
30    0.616
31    0.796
32    0.272
33    0.622
34    0.816
35         
36    0.536
37    0.621
38     0.71
39    0.573
40    0.541
Name: Energy, dtype: object
df.replace(" ",np.nan)
Index Highest Charting Position Number of Times Charted Week of Highest Charting Song Name Streams Artist Artist Followers Song ID Genre ... Danceability Energy Loudness Speechiness Acousticness Liveness Tempo Duration (ms) Valence Chord
0 1 1 8 2021-07-23--2021-07-30 Beggin' 48,633,449 Måneskin 3377762 3Wrjm47oTz2sjIgck11l5e ['indie rock italiano', 'italian pop'] ... 0.714 0.8 -4.808 0.0504 0.127 0.359 134.002 211560 0.589 B
1 2 2 3 2021-07-23--2021-07-30 STAY (with Justin Bieber) 47,248,719 The Kid LAROI 2230022 5HCyWlXZPP0y6Gqq8TgA20 ['australian hip hop'] ... 0.591 0.764 -5.484 0.0483 0.0383 0.103 169.928 141806 0.478 C#/Db
2 3 1 11 2021-06-25--2021-07-02 good 4 u 40,162,559 Olivia Rodrigo 6266514 4ZtFanR9U6ndgddUvNcjcG ['pop'] ... 0.563 0.664 -5.044 0.154 0.335 0.0849 166.928 178147 0.688 A
3 4 3 5 2021-07-02--2021-07-09 Bad Habits 37,799,456 Ed Sheeran 83293380 6PQ88X9TkUIAUIZJHW2upE ['pop', 'uk pop'] ... 0.808 0.897 -3.712 0.0348 0.0469 0.364 126.026 231041 0.591 B
4 5 5 1 2021-07-23--2021-07-30 INDUSTRY BABY (feat. Jack Harlow) 33,948,454 Lil Nas X 5473565 27NovPIUIRrOZoCHxABJwK ['lgbtq+ hip hop', 'pop rap'] ... 0.736 0.704 -7.409 0.0615 0.0203 0.0501 149.995 212000 0.894 D#/Eb
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1551 1552 195 1 2019-12-27--2020-01-03 New Rules 4,630,675 Dua Lipa 27167675 2ekn2ttSfGqwhhate0LSR0 ['dance pop', 'pop', 'uk pop'] ... 0.762 0.7 -6.021 0.0694 0.00261 0.153 116.073 209320 0.608 A
1552 1553 196 1 2019-12-27--2020-01-03 Cheirosa - Ao Vivo 4,623,030 Jorge & Mateus 15019109 2PWjKmjyTZeDpmOUa3a5da ['sertanejo', 'sertanejo universitario'] ... 0.528 0.87 -3.123 0.0851 0.24 0.333 152.37 181930 0.714 B
1553 1554 197 1 2019-12-27--2020-01-03 Havana (feat. Young Thug) 4,620,876 Camila Cabello 22698747 1rfofaqEpACxVEHIZBJe6W ['dance pop', 'electropop', 'pop', 'post-teen ... ... 0.765 0.523 -4.333 0.03 0.184 0.132 104.988 217307 0.394 D
1554 1555 198 1 2019-12-27--2020-01-03 Surtada - Remix Brega Funk 4,607,385 Dadá Boladão, Tati Zaqui, OIK 208630 5F8ffc8KWKNawllr5WsW0r ['brega funk', 'funk carioca'] ... 0.832 0.55 -7.026 0.0587 0.249 0.182 154.064 152784 0.881 F
1555 1556 199 1 2019-12-27--2020-01-03 Lover (Remix) [feat. Shawn Mendes] 4,595,450 Taylor Swift 42227614 3i9UVldZOE0aD0JnyfAZZ0 ['pop', 'post-teen pop'] ... 0.448 0.603 -7.176 0.064 0.433 0.0862 205.272 221307 0.422 G

1556 rows × 23 columns

pd.to_numeric(df["Energy"])
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/_libs/lib.pyx in pandas._libs.lib.maybe_convert_numeric()

ValueError: Unable to parse string " "

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_83288/287173585.py in <module>
----> 1 pd.to_numeric(df["Energy"])

~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/core/tools/numeric.py in to_numeric(arg, errors, downcast)
    181         coerce_numeric = errors not in ("ignore", "raise")
    182         try:
--> 183             values, _ = lib.maybe_convert_numeric(
    184                 values, set(), coerce_numeric=coerce_numeric
    185             )

~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/_libs/lib.pyx in pandas._libs.lib.maybe_convert_numeric()

ValueError: Unable to parse string " " at position 35
df = df.replace(" ",np.nan)
pd.to_numeric(df["Energy"])
0       0.800
1       0.764
2       0.664
3       0.897
4       0.704
        ...  
1551    0.700
1552    0.870
1553    0.523
1554    0.550
1555    0.603
Name: Energy, Length: 1556, dtype: float64
df["Energy"] = pd.to_numeric(df["Energy"])
pd.to_numeric(df["Streams"])
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/_libs/lib.pyx in pandas._libs.lib.maybe_convert_numeric()

ValueError: Unable to parse string "48,633,449"

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_83288/2129868835.py in <module>
----> 1 pd.to_numeric(df["Streams"])

~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/core/tools/numeric.py in to_numeric(arg, errors, downcast)
    181         coerce_numeric = errors not in ("ignore", "raise")
    182         try:
--> 183             values, _ = lib.maybe_convert_numeric(
    184                 values, set(), coerce_numeric=coerce_numeric
    185             )

~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/_libs/lib.pyx in pandas._libs.lib.maybe_convert_numeric()

ValueError: Unable to parse string "48,633,449" at position 0
int("48,633,449")
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_83288/1792345402.py in <module>
----> 1 int("48,633,449")

ValueError: invalid literal for int() with base 10: '48,633,449'
df["Streams"].replace(",","")
0       48,633,449
1       47,248,719
2       40,162,559
3       37,799,456
4       33,948,454
           ...    
1551     4,630,675
1552     4,623,030
1553     4,620,876
1554     4,607,385
1555     4,595,450
Name: Streams, Length: 1556, dtype: object
"mathematics".replace("t","10")
'ma10hema10ics'
"48,633,449".replace(",","")
'48633449'
# equivalent to df.replace(" ",np.nan)
df.applymap(lambda x: np.nan if x == " " else x)
Index Highest Charting Position Number of Times Charted Week of Highest Charting Song Name Streams Artist Artist Followers Song ID Genre ... Danceability Energy Loudness Speechiness Acousticness Liveness Tempo Duration (ms) Valence Chord
0 1 1 8 2021-07-23--2021-07-30 Beggin' 48,633,449 Måneskin 3377762 3Wrjm47oTz2sjIgck11l5e ['indie rock italiano', 'italian pop'] ... 0.714 0.800 -4.808 0.0504 0.127 0.359 134.002 211560 0.589 B
1 2 2 3 2021-07-23--2021-07-30 STAY (with Justin Bieber) 47,248,719 The Kid LAROI 2230022 5HCyWlXZPP0y6Gqq8TgA20 ['australian hip hop'] ... 0.591 0.764 -5.484 0.0483 0.0383 0.103 169.928 141806 0.478 C#/Db
2 3 1 11 2021-06-25--2021-07-02 good 4 u 40,162,559 Olivia Rodrigo 6266514 4ZtFanR9U6ndgddUvNcjcG ['pop'] ... 0.563 0.664 -5.044 0.154 0.335 0.0849 166.928 178147 0.688 A
3 4 3 5 2021-07-02--2021-07-09 Bad Habits 37,799,456 Ed Sheeran 83293380 6PQ88X9TkUIAUIZJHW2upE ['pop', 'uk pop'] ... 0.808 0.897 -3.712 0.0348 0.0469 0.364 126.026 231041 0.591 B
4 5 5 1 2021-07-23--2021-07-30 INDUSTRY BABY (feat. Jack Harlow) 33,948,454 Lil Nas X 5473565 27NovPIUIRrOZoCHxABJwK ['lgbtq+ hip hop', 'pop rap'] ... 0.736 0.704 -7.409 0.0615 0.0203 0.0501 149.995 212000 0.894 D#/Eb
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1551 1552 195 1 2019-12-27--2020-01-03 New Rules 4,630,675 Dua Lipa 27167675 2ekn2ttSfGqwhhate0LSR0 ['dance pop', 'pop', 'uk pop'] ... 0.762 0.700 -6.021 0.0694 0.00261 0.153 116.073 209320 0.608 A
1552 1553 196 1 2019-12-27--2020-01-03 Cheirosa - Ao Vivo 4,623,030 Jorge & Mateus 15019109 2PWjKmjyTZeDpmOUa3a5da ['sertanejo', 'sertanejo universitario'] ... 0.528 0.870 -3.123 0.0851 0.24 0.333 152.37 181930 0.714 B
1553 1554 197 1 2019-12-27--2020-01-03 Havana (feat. Young Thug) 4,620,876 Camila Cabello 22698747 1rfofaqEpACxVEHIZBJe6W ['dance pop', 'electropop', 'pop', 'post-teen ... ... 0.765 0.523 -4.333 0.03 0.184 0.132 104.988 217307 0.394 D
1554 1555 198 1 2019-12-27--2020-01-03 Surtada - Remix Brega Funk 4,607,385 Dadá Boladão, Tati Zaqui, OIK 208630 5F8ffc8KWKNawllr5WsW0r ['brega funk', 'funk carioca'] ... 0.832 0.550 -7.026 0.0587 0.249 0.182 154.064 152784 0.881 F
1555 1556 199 1 2019-12-27--2020-01-03 Lover (Remix) [feat. Shawn Mendes] 4,595,450 Taylor Swift 42227614 3i9UVldZOE0aD0JnyfAZZ0 ['pop', 'post-teen pop'] ... 0.448 0.603 -7.176 0.064 0.433 0.0862 205.272 221307 0.422 G

1556 rows × 23 columns

df["Streams"].map(lambda s: s.replace(",",""))
0       48633449
1       47248719
2       40162559
3       37799456
4       33948454
          ...   
1551     4630675
1552     4623030
1553     4620876
1554     4607385
1555     4595450
Name: Streams, Length: 1556, dtype: object
df["Streams"] = df["Streams"].map(lambda s: s.replace(",",""))
df.head()
Index Highest Charting Position Number of Times Charted Week of Highest Charting Song Name Streams Artist Artist Followers Song ID Genre ... Danceability Energy Loudness Speechiness Acousticness Liveness Tempo Duration (ms) Valence Chord
0 1 1 8 2021-07-23--2021-07-30 Beggin' 48633449 Måneskin 3377762 3Wrjm47oTz2sjIgck11l5e ['indie rock italiano', 'italian pop'] ... 0.714 0.800 -4.808 0.0504 0.127 0.359 134.002 211560 0.589 B
1 2 2 3 2021-07-23--2021-07-30 STAY (with Justin Bieber) 47248719 The Kid LAROI 2230022 5HCyWlXZPP0y6Gqq8TgA20 ['australian hip hop'] ... 0.591 0.764 -5.484 0.0483 0.0383 0.103 169.928 141806 0.478 C#/Db
2 3 1 11 2021-06-25--2021-07-02 good 4 u 40162559 Olivia Rodrigo 6266514 4ZtFanR9U6ndgddUvNcjcG ['pop'] ... 0.563 0.664 -5.044 0.154 0.335 0.0849 166.928 178147 0.688 A
3 4 3 5 2021-07-02--2021-07-09 Bad Habits 37799456 Ed Sheeran 83293380 6PQ88X9TkUIAUIZJHW2upE ['pop', 'uk pop'] ... 0.808 0.897 -3.712 0.0348 0.0469 0.364 126.026 231041 0.591 B
4 5 5 1 2021-07-23--2021-07-30 INDUSTRY BABY (feat. Jack Harlow) 33948454 Lil Nas X 5473565 27NovPIUIRrOZoCHxABJwK ['lgbtq+ hip hop', 'pop rap'] ... 0.736 0.704 -7.409 0.0615 0.0203 0.0501 149.995 212000 0.894 D#/Eb

5 rows × 23 columns

df.dtypes
Index                          int64
Highest Charting Position      int64
Number of Times Charted        int64
Week of Highest Charting      object
Song Name                     object
Streams                       object
Artist                        object
Artist Followers              object
Song ID                       object
Genre                         object
Release Date                  object
Weeks Charted                 object
Popularity                    object
Danceability                  object
Energy                       float64
Loudness                      object
Speechiness                   object
Acousticness                  object
Liveness                      object
Tempo                         object
Duration (ms)                 object
Valence                       object
Chord                         object
dtype: object

Handling errors

Let’s write a function which takes as input a number x, and as output returns 3/x.

def f(x):
    return 3/x
f(5)
0.6
f(0)
---------------------------------------------------------------------------
ZeroDivisionError                         Traceback (most recent call last)
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_83288/1333598009.py in <module>
----> 1 f(0)

/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_83288/3712785429.py in f(x)
      1 def f(x):
----> 2     return 3/x

ZeroDivisionError: division by zero

What if we want to avoid this sort of error? (For example, maybe this is part of a much longer program, and we don’t want the whole program to crash if this function gets a bad input.)

def f(x):
    if x == 0:
        return np.nan
    else:
        return 3/x
f(10)
0.3
f(0)
nan
type(f(0))
float

We fixed the 0 error, but we will never catch all possible errors this way.

f("5")
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_83288/221224352.py in <module>
----> 1 f("5")

/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_83288/3342001434.py in f(x)
      3         return np.nan
      4     else:
----> 5         return 3/x

TypeError: unsupported operand type(s) for /: 'int' and 'str'

We want something like

def f(x):
    if 3/x does not cause an error:
        return 3/x
    else:
        return np.nan

As far as I know, that can’t be accomplished using an if statement. We instead need to use the commands try and except.

def f(x):
    try:
        return 3/x
    except:
        return "It didn't work"
f(5)
0.6
f("math 10")
"It didn't work"
def can_divide(x):
    try:
        3/x
        return True
    except:
        return False
can_divide(5)
True
can_divide(0)
False

Making columns numeric

df.dtypes
Index                          int64
Highest Charting Position      int64
Number of Times Charted        int64
Week of Highest Charting      object
Song Name                     object
Streams                       object
Artist                        object
Artist Followers              object
Song ID                       object
Genre                         object
Release Date                  object
Weeks Charted                 object
Popularity                    object
Danceability                  object
Energy                       float64
Loudness                      object
Speechiness                   object
Acousticness                  object
Liveness                      object
Tempo                         object
Duration (ms)                 object
Valence                       object
Chord                         object
dtype: object
pd.to_numeric(df["Speechiness"])
0       0.0504
1       0.0483
2       0.1540
3       0.0348
4       0.0615
         ...  
1551    0.0694
1552    0.0851
1553    0.0300
1554    0.0587
1555    0.0640
Name: Speechiness, Length: 1556, dtype: float64
pd.to_numeric(df["Release Date"])
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/_libs/lib.pyx in pandas._libs.lib.maybe_convert_numeric()

ValueError: Unable to parse string "2017-12-08"

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_83288/1358387063.py in <module>
----> 1 pd.to_numeric(df["Release Date"])

~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/core/tools/numeric.py in to_numeric(arg, errors, downcast)
    181         coerce_numeric = errors not in ("ignore", "raise")
    182         try:
--> 183             values, _ = lib.maybe_convert_numeric(
    184                 values, set(), coerce_numeric=coerce_numeric
    185             )

~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/_libs/lib.pyx in pandas._libs.lib.maybe_convert_numeric()

ValueError: Unable to parse string "2017-12-08" at position 0
# c column name
def can_be_numeric(c):
    try:
        pd.to_numeric(df[c])
        return True
    except:
        return False
can_be_numeric("Release Date")
False
can_be_numeric("Speechiness")
True
# all the columns that can be numeric
good_cols = [c for c in df.columns if can_be_numeric(c)]
good_cols
['Index',
 'Highest Charting Position',
 'Number of Times Charted',
 'Streams',
 'Artist Followers',
 'Popularity',
 'Danceability',
 'Energy',
 'Loudness',
 'Speechiness',
 'Acousticness',
 'Liveness',
 'Tempo',
 'Duration (ms)',
 'Valence']
df[good_cols] = df[good_cols].apply(pd.to_numeric, axis=0)
df.dtypes
Index                          int64
Highest Charting Position      int64
Number of Times Charted        int64
Week of Highest Charting      object
Song Name                     object
Streams                        int64
Artist                        object
Artist Followers             float64
Song ID                       object
Genre                         object
Release Date                  object
Weeks Charted                 object
Popularity                   float64
Danceability                 float64
Energy                       float64
Loudness                     float64
Speechiness                  float64
Acousticness                 float64
Liveness                     float64
Tempo                        float64
Duration (ms)                float64
Valence                      float64
Chord                         object
dtype: object
alt.Chart(df).mark_circle().encode(
    x = "Energy",
    y = "Loudness"
)
chartlist = [alt.Chart(df).mark_circle().encode(
    x = "Energy",
    y = c
) for c in good_cols]
chartlist
[alt.Chart(...),
 alt.Chart(...),
 alt.Chart(...),
 alt.Chart(...),
 alt.Chart(...),
 alt.Chart(...),
 alt.Chart(...),
 alt.Chart(...),
 alt.Chart(...),
 alt.Chart(...),
 alt.Chart(...),
 alt.Chart(...),
 alt.Chart(...),
 alt.Chart(...),
 alt.Chart(...)]
chartlist[5]
alt.hconcat(chartlist[0],chartlist[1])
alt.hconcat(chartlist)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_83288/1249897995.py in <module>
----> 1 alt.hconcat(chartlist)

~/opt/anaconda3/envs/book/lib/python3.8/site-packages/altair/vegalite/v4/api.py in hconcat(*charts, **kwargs)
   2210 def hconcat(*charts, **kwargs):
   2211     """Concatenate charts horizontally"""
-> 2212     return HConcatChart(hconcat=charts, **kwargs)
   2213 
   2214 

~/opt/anaconda3/envs/book/lib/python3.8/site-packages/altair/vegalite/v4/api.py in __init__(self, data, hconcat, **kwargs)
   2184         # TODO: move common data to top level?
   2185         for spec in hconcat:
-> 2186             _check_if_valid_subspec(spec, "HConcatChart")
   2187         super(HConcatChart, self).__init__(data=data, hconcat=list(hconcat), **kwargs)
   2188         self.data, self.hconcat = _combine_subchart_data(self.data, self.hconcat)

~/opt/anaconda3/envs/book/lib/python3.8/site-packages/altair/vegalite/v4/api.py in _check_if_valid_subspec(spec, classname)
   2036 
   2037     if not isinstance(spec, (core.SchemaBase, dict)):
-> 2038         raise ValueError("Only chart objects can be used in {0}.".format(classname))
   2039     for attr in TOPLEVEL_ONLY_KEYS:
   2040         if isinstance(spec, core.SchemaBase):

ValueError: Only chart objects can be used in HConcatChart.
m = alt.hconcat(*chartlist)
m