Data cleaning¶
Import the Spotify dataset from Canvas->Files->Datasets.
We’ve worked a lot with this dataset, but so far we have used some unexplained commands to “clean” the dataset after importing. Today we’ll go through the process of cleaning the dataset in a more systematic way.
import pandas as pd
import altair as alt
import numpy as np
# change path so it works on your computer
df = pd.read_csv("../data/spotify_dataset.csv")
Data cleaning¶
We get strange results if we try to use the Spotify dataset without any cleaning.
alt.Chart(df[:50]).mark_circle().encode(
x = "Energy",
y = "Loudness"
)
df["Loudness"].sum()
'-4.808-5.484-5.044-3.712-7.409-6.682-3.541-3.605-4.601-4.008-3.787-3.964-5.33-6.181-5.187-7.885-7.222-4.645-5.015-6.509-4.649-4.601-8.81-6.9-5.934-9.72-3.955-3.818-3.424-6.865-7.964-6.967-10.497-5.102-4.209 -6.862-6.414-3.728-10.059-4.522-6.334-5.382-4.41-7.079-4.878-3.671-6.345-4.733-7.454-3.167-7.018-9.921-6.985-4.77-9.918-4.404-5.988-5.483-8.173-3.424-5.603-6.446-4.645-8.964-4.521-4.687-2.81-2.758-5.679-5.196-5.865-3.224-2.981-7.114-3.115-9.016-4.572-4.407-6.4-7.357-6.349-5.307-4.668-7.846-3.158-4.593-5.356-7.85-12.603-7.542-4.771-3.497-4.899-2.749-3.126-4.175-5.201-6.549-4.858-3.787-6.642-2.088-4.374-6.153-4.754-5.472-0.515-4.632-5.748-10.109-5.574-11.177-8.433-3.597-4.662-3.183-6.039-5.797-6.312-4.425-4.892-7.956-7.301-9.389-5.298-6.694-4.556-10.706-9.461-4.163-5.616-7.241-8.532-5.335-5.303-4.48-4.227-6.044-3.128-9.575-9.961-4.637-7.644-7.846-4.578-4.439-6.362-7.476-1.891-4.972-7.227-6.658-4.402-5.697-3.825-6.237-8.493-10.34-5.85-5.049-7.23-7.636 -10.965-6.879-6.606-12.627-7.509-3.74-9.796-9.744-7.398-6.744-6.995-6.934-8.765-4.245-6.472-5.609-3.675-6.836-6.487-3.599-7.064-7.813-5.732-4.12-7.063-8.37-3.639-10.618-10.732-5.16-4.075-5.652-8.56-11.254-8.605-4.244-8.302-7.226-6.967-8.377-7.476-7.347-7.331-7.524-5.666-10.569-8.043-6.59-6.416-4.87-4.258-8.494-9.184-5.82-7.791-5.218-5.956-5.932-5.733-3.728-7.132-6.139-12.181-4.628-1.992-5.301-5.684-4.039-3.612-5.757-3.286-5.599-4.097-3.751-5.737-8.121-7.949-10.131-4.053-6.831-9.405-14.025-7.62-7.6-6.201-3.912-3.293-2.639-5.123-6.455-7.596-6.341-6.612-5.382-4.851-4.912-6.115-6.724-7.778-5.348-6.029-7.446-3.367-5.736-3.917-5.791-5.029-4.297-8.234-8.52-5.382-5.013-7.78-4.818-7.867-7.026-9.144-7.336-6.687-3.255-4.457-8.026-5.134-5.413-8.764-3.434-8.871-4.619-4.132-5.87-8.465-3.714-6.769-5.6-5.81-8.066-7.664-6.553-8.231-3.709-6.836-8.784-6.217-7.598-7.773-4.835-4.49-6.023-6.798-8.05-3.764-5.167-6.612-5.167-4.925-6.173-6.92-9.358-4.939-5.257-4.939-4.158-8.981-6.125-4.032-7.226-3.93-4.757-2.564-5.26-7.909-7.036-8.056-5.79-5.525-7.486-6.404-8.665-5.242-5.99-5.277-2.876-3.795-5.34-4.668-8.011-6.959-6.205-3.593-9.187-7.266-4.197-6.916-8.211-5.302-5.667-6.579-6.973-5.756-7.026-6.257-5.362-5.198-5.123-6.502-3.371-4.603-3.17-7.071-6.938-10.086-2.906-8.043-14.454-7.327-3.083-6.924-5.926-5.831-5.879-6.13-5.718-3.169-2.39-5.181-5.122-4.103-6.499-5.712-4.054-4.818-7.547-7.472-6.578-6.269-14.075-7.564-4.548-8.256-6.091-3.402-8.149-8.019-7.846-10.778-5.995-5.726-7.076-5.371-4.88-5.495-7.106-8.648-3.891-5.677-7.638-7.351-4.311-8.754-2.324-9.195-6.478-5.851-5.795-4.856-4.963-3.09-4.61-4.665-7.606-6.135-7.377-4.433-6.066-8.093-7.062-3.669-4.99-3.665-3.417-11.548-8.819-4.25-5.811-6.003-8.084-5.065-5.119-4.248-10.109-6.658-2.666-4.386-5.077-4.828-5.569-8.68-5.224-7.638-4.048 -4.232-7.831-8.601-3.28-4.596-6.972-4.955-5.53-4.386-3.049-6.629-7.015-4.107-4.096-6.655-5.239-12.805-6.169-13.125-3.09-4.787-5.428-8.621-4.566-7.376-4.272-7.973-6.257-2.173-4.798-4.246-4.583-6.081-4.083-5.946-6.335-5.497-6.55-5.222-3.123-5.471-6.859-9.989-4.318-5.453-7.681-5.891-3.152-5.36-9.934-7.627-5.46-4.563-4.943-4.778-6.254-4.395-5.616-6.136-10.22-7.05-7.188-6.637-9.805-10.48 -8.96-3.944-4.904-5.372-7.16-10.936-5.128-6.208-3.855-5.664-5.041-4.423-3.71-3.128-5.132-4.787-5.723-6.061-9.75-8.275-7.567-4.097-7.066-6.326-14.66-4.89-5.526-6.433-4.954-5.466-8.991-2.827-3.616-5.331-5.507-5.664-6.319-7.882-4.426-6.519-8.746-7.499-6.585-6.143-5.24-4.65-5.453-6.899-7.675-6.79-8.357-6.764-11.052-2.028-12.077-8.932-6.965-6.564-7.628-5.995-4.057-5.661-3.766-6.687-6.628-5.254-5.13-7.147-4.819-5.655-5.479-5.037-2.118-4.81-7.088-7.052-4.352-7.589-6.832-4.214-2.749-5.277-2.474-2.415-7.214-8.659-3.672-7.443-7.463-12.472-7.364-8.749-8.463-11.714-8.435-5.439-9.004-14.014-13.336-9.049-7.013-6.936-8.914-8.308-10.023 -15.231-16.485-5.062-13.056-7.48-3.315-16.661-5.662-4.177-7.183-12.25-7.146-5.584-6.531-1.283-5.536-3.592 -9.478-9.402-6.191-3.766-4.776-6.007-4.68-4.291-4.582-12.955-7.109-8.869-8.193-8.222-12.346-22.507-10.491-6.961-12.073-16.093-8.444-3.849-9.336-2.204-7.779-16.768-14.405-13.119-3.298-15.499-7.381-9.2-6.676-4.505-6.264-3.967-10.381-11.568-6.232-10.733-9.277-8.322-11.266-10.673-7.704-9.014-11.609-10.813-7.607-7.75-6.907-8.269-6.736-6.971-7.1-4.889-15.392-8.736-8.588-4.672-4.944-5.793-5.224-5.054-6.572-4.035-6.755-7.313-4.074-5.506-4.973-5.701-3.508-6.7-4.075-6.834-6.098-6.054-5.542-5.372-8.914-7.355-4.333-5.352-5.048-5.303-7.371-4.948-8.871-13.135-1.957-3.895-3.961-7.636-7.012 -7.091-7.328-6.865-6.535-7.42-5.695-6.664-6.336-4.574-6.444-4.644-8.937-3.945-9.966-6.297-8.415-4.592-3.316-7.833-6.964-7.295-5.724-4.249-7.034-5.998-6.098-10.798-5.156-5.963-4.727-16.541-10.196-5.195 -7.458-4.681-10.755-2.737-5.75-5.308-6.558-6.321-5.2-3.617-3.743-5.563-7.258-6.842-5.334-8.113-2.982-8.045-4.04-7.154-5.03-8.632-3.726-7.513-9.155-2.998-4.793-4.392-7.746-3.417-5.405-6.168-5.267-5.634-6.929-5.363-5.025-7.648-5.893-7.346-5.077-6.883-8.251-6.972-6.429-5.491-3.844-2.762-6.186-2.259-6.266-6.591-5.16-11.715-7.47-2.791-8.953-7.83-5.768-4.689-6.439-7.137-4.845-3.089-4.731-6.756-5.625-4.333-6.494-7.272-5.319-5.893-6.642-8.259-8.711-2.691-9.232-9.869-4.068-6.917-9.297-7.571-8.431-4.212-7.617-5.582-7.199-8.914-7.117-6.184-8.024 -4.113-6.313-4.578-4.78-5.224-7.355-6.916-7.264-5.386-10.646-6.829-9.726-8.426-7.426-5.825-7.813-7.507-5.688-3.756-5.749-5.521-4.719-5.585-6.371-6.456-4.387-10.945-4.157-5.064-6.145-4.874-4.593-6.805-5.591-4.772-5.217-7.011-3.08-10.568-4.959-6.66-4.368-5.085-5.88-6.942-7.037-7.067-5.908-4.78-5.312-5.697-4.895-6.196-4.164-3.812-9.628-5.884-3.011-6.153-5.562-5.146-8.742-8.9-8.018-7.907-9.208-5.613-6.678-5.415-8.322-5.114-8.505-7.34-10.63-17.57-5.792-6.66-2.561-5.396-4.635-5.442-5.908-6.896-11.143-7.159-3.966-5.171-7.359-10.037-10.769-9.843-8.761-6.503-10.445-5.702-7.121-7.935-3.84-5.609-8.995-13.69-12.906-4.902-7.336-8.553-5.453-15.065-6.417-8.225-5.725-2.473-5.132-5.948-3.838-7.664-7.616-5.233-4.928-6.184-2.494-9.236-5.726-6.36-6.668-9.727-5.756-5.802-5.22-10.157-6.356-5.124-6.328-5.89-7.371-5.062-6.935-6.257-3.733-7.064-4.561-8.946-5.56-5.189-3.702-5.98-5.161-6.612-8.442-6.027-3.875-3.909-2.957-11.337-14.097-14.625-5.926-4.85-7.434-2.569-5.377-3.527-6.213-4.105-7.899-6.963-7.487-5.817-5.73-8.383-6.761-6.871-4.961-5.656-6.687-5.923-7.39-5.313-6.097-3.831-2.344-4.833-3.504-5.804-9.072-5.817-7.671-3.242-5.382-4.269-6.976-4.162-3.159-5.464-5.065-7.621-2.729-4.045-4.64-5.728-7.726-5.718-7.416-5.234-8.165-8.985-6.825-5.019-6.22-8.969-3.905-2.082-7.735-8.819-6.6-5.888-4.296-5.268-11.729-6.8061.509-4.524-4.452-7.246-3.981-6.117-9.884-5.353-6.202-8.57-4.356-6.342-4.394-6.759-6.024-5.714-5.35-7.546-8.027-7.246-6.363-6.159-4.219-4.95-4.198-7.206-4.773-7.598-6.366-10.068-5.245-7.34-4.287-3.634-4.262-5.746-4.035-3.875-4.842-14.534-4.372-14.534-4.466-7.002 -6.637-6.599-6.083-4.363-9.045-6.591-8.008-4.014-8.521-5.522-9.0-3.836-8.65-3.875-4.43-5.09-10.805-5.444-1.573-6.35-4.993-2.634-6.099-5.85-5.624-2.97-3.936-3.592-2.409-7.043-5.532-7.182-3.18-2.44-5.425-10.34-9.909-3.158-8.187-18.717-7.397-6.483-6.084-8.217-8.051-7.274-4.784-3.387-8.624-3.752-3.69-7.648-5.245-4.931-13.273-5.709-5.786-7.706-9.299-4.629-6.808-3.68-4.151-7.837-7.385-4.425-6.381-11.144-9.884-6.75-3.095-9.211-3.952-8.11-5.595-4.454-8.212-6.615-6.571-8.054-6.116-5.184-4.842-4.306-2.592-4.628-4.692-5.395-4.535-6.262-4.478-5.368-4.799-6.973-4.363-5.734-4.375-6.594-8.137-4.941-9.087-4.864-7.889-4.41-8.144-8.468-2.622-3.268-1.865-3.001-5.746-7.031-3.639-6.103-7.847-5.113-5.554-6.124-4.823-4.015-4.502-5.001-8.24-7.953-5.954-6.146-6.943-5.221-4.316-7.268-7.475-8.996-8.529-4.242-7.293-5.416-2.961-7.918-9.887-5.715-4.717-2.228-4.295-6.101-5.736-3.835-8.57-5.26-4.141-6.703-6.9-5.469-3.082-10.426-5.507-3.999-6.718-6.633-6.542-6.238-4.496-5.708-9.901-6.181-5.895-6.978-3.661-6.293-5.402-3.73-6.02-6.274-6.224-5.568-5.331-8.0-7.428-5.628-5.397-7.086-5.822-5.167-4.24-4.803-7.589-12.761-5.937-6.672-4.156-4.711-4.139-6.793-4.055-6.256-9.326-4.485-6.11-4.397-5.749-7.105-9.435-6.527-5.486-7.058-6.665-4.957-5.342-3.39-6.376-5.408-6.46-3.208-6.713-2.708-8.543-5.01-3.244-8.637-7.57-5.188-8.766-3.582-3.147-2.814-8.191-7.782-4.178-8.33-5.139-5.906-5.455-4.594-4.443-5.617-2.454-4.271-5.782-3.046-14.505-6.993-5.086-6.456-5.211-5.215-7.981-6.658-3.106-6.439-4.805-6.164-4.188-4.565-8.922-1.94-8.601-4.124-6.043-4.796-4.902-18.435-4.501-5.608-8.389-6.625-7.564-6.913-6.112-7.319-6.843-4.51-7.766-8.186-7.353-7.595-3.067-4.673-3.019-6.378-6.959-8.794-4.228-5.414-12.92-10.198-4.895-4.123-9.897-10.348-8.237-6.637-8.381-8.943-3.32-6.406-3.001-3.231-6.913-3.292-3.922-8.866-5.195-2.986-2.203-6.482-4.027-5.883-9.754-8.575-8.206-6.585-2.652-11.713-2.331-5.95-6.747-6.29-3.723-11.047-6.861-5.127-5.774-4.711-5.892-6.684-6.365-4.105-6.146-8.461-6.946-7.787-7.161-4.197-8.98-5.937-5.793-7.491-4.809-5.212-4.362-7.908-4.491-5.244-4.84-3.87-5.839-5.193-9.922-11.198-5.407-8.073-3.052-6.356-3.121-2.107-5.125-10.847-6.625-9.077-20.43-4.162-5.941-6.543-13.066-10.086-16.735-6.753-8.608-6.278-6.121-8.882-7.261-5.35-8.408-8.082-7.536-4.351-8.533-9.263-6.991-6.445-2.665-8.513-8.054-3.841-2.853-5.969-4.218-8.676-10.304-2.881-4.167-6.605-4.63-4.578-5.191-2.326-4.841 -6.47-4.792-3.24-5.938-5.782-5.808-5.857-25.166-3.032-5.56-7.895-3.384-6.021-3.123-4.333-7.026-7.176'
df.dtypes
Index int64
Highest Charting Position int64
Number of Times Charted int64
Week of Highest Charting object
Song Name object
Streams object
Artist object
Artist Followers object
Song ID object
Genre object
Release Date object
Weeks Charted object
Popularity object
Danceability object
Energy object
Loudness object
Speechiness object
Acousticness object
Liveness object
Tempo object
Duration (ms) object
Valence object
Chord object
dtype: object
df["Loudness"][:10]
0 -4.808
1 -5.484
2 -5.044
3 -3.712
4 -7.409
5 -6.682
6 -3.541
7 -3.605
8 -4.601
9 -4.008
Name: Loudness, dtype: object
pd.to_numeric(df["Energy"])
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/_libs/lib.pyx in pandas._libs.lib.maybe_convert_numeric()
ValueError: Unable to parse string " "
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_83288/287173585.py in <module>
----> 1 pd.to_numeric(df["Energy"])
~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/core/tools/numeric.py in to_numeric(arg, errors, downcast)
181 coerce_numeric = errors not in ("ignore", "raise")
182 try:
--> 183 values, _ = lib.maybe_convert_numeric(
184 values, set(), coerce_numeric=coerce_numeric
185 )
~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/_libs/lib.pyx in pandas._libs.lib.maybe_convert_numeric()
ValueError: Unable to parse string " " at position 35
df.loc[30:40,"Energy"]
30 0.616
31 0.796
32 0.272
33 0.622
34 0.816
35
36 0.536
37 0.621
38 0.71
39 0.573
40 0.541
Name: Energy, dtype: object
df.replace(" ",np.nan)
Index | Highest Charting Position | Number of Times Charted | Week of Highest Charting | Song Name | Streams | Artist | Artist Followers | Song ID | Genre | ... | Danceability | Energy | Loudness | Speechiness | Acousticness | Liveness | Tempo | Duration (ms) | Valence | Chord | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 8 | 2021-07-23--2021-07-30 | Beggin' | 48,633,449 | Måneskin | 3377762 | 3Wrjm47oTz2sjIgck11l5e | ['indie rock italiano', 'italian pop'] | ... | 0.714 | 0.8 | -4.808 | 0.0504 | 0.127 | 0.359 | 134.002 | 211560 | 0.589 | B |
1 | 2 | 2 | 3 | 2021-07-23--2021-07-30 | STAY (with Justin Bieber) | 47,248,719 | The Kid LAROI | 2230022 | 5HCyWlXZPP0y6Gqq8TgA20 | ['australian hip hop'] | ... | 0.591 | 0.764 | -5.484 | 0.0483 | 0.0383 | 0.103 | 169.928 | 141806 | 0.478 | C#/Db |
2 | 3 | 1 | 11 | 2021-06-25--2021-07-02 | good 4 u | 40,162,559 | Olivia Rodrigo | 6266514 | 4ZtFanR9U6ndgddUvNcjcG | ['pop'] | ... | 0.563 | 0.664 | -5.044 | 0.154 | 0.335 | 0.0849 | 166.928 | 178147 | 0.688 | A |
3 | 4 | 3 | 5 | 2021-07-02--2021-07-09 | Bad Habits | 37,799,456 | Ed Sheeran | 83293380 | 6PQ88X9TkUIAUIZJHW2upE | ['pop', 'uk pop'] | ... | 0.808 | 0.897 | -3.712 | 0.0348 | 0.0469 | 0.364 | 126.026 | 231041 | 0.591 | B |
4 | 5 | 5 | 1 | 2021-07-23--2021-07-30 | INDUSTRY BABY (feat. Jack Harlow) | 33,948,454 | Lil Nas X | 5473565 | 27NovPIUIRrOZoCHxABJwK | ['lgbtq+ hip hop', 'pop rap'] | ... | 0.736 | 0.704 | -7.409 | 0.0615 | 0.0203 | 0.0501 | 149.995 | 212000 | 0.894 | D#/Eb |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1551 | 1552 | 195 | 1 | 2019-12-27--2020-01-03 | New Rules | 4,630,675 | Dua Lipa | 27167675 | 2ekn2ttSfGqwhhate0LSR0 | ['dance pop', 'pop', 'uk pop'] | ... | 0.762 | 0.7 | -6.021 | 0.0694 | 0.00261 | 0.153 | 116.073 | 209320 | 0.608 | A |
1552 | 1553 | 196 | 1 | 2019-12-27--2020-01-03 | Cheirosa - Ao Vivo | 4,623,030 | Jorge & Mateus | 15019109 | 2PWjKmjyTZeDpmOUa3a5da | ['sertanejo', 'sertanejo universitario'] | ... | 0.528 | 0.87 | -3.123 | 0.0851 | 0.24 | 0.333 | 152.37 | 181930 | 0.714 | B |
1553 | 1554 | 197 | 1 | 2019-12-27--2020-01-03 | Havana (feat. Young Thug) | 4,620,876 | Camila Cabello | 22698747 | 1rfofaqEpACxVEHIZBJe6W | ['dance pop', 'electropop', 'pop', 'post-teen ... | ... | 0.765 | 0.523 | -4.333 | 0.03 | 0.184 | 0.132 | 104.988 | 217307 | 0.394 | D |
1554 | 1555 | 198 | 1 | 2019-12-27--2020-01-03 | Surtada - Remix Brega Funk | 4,607,385 | Dadá Boladão, Tati Zaqui, OIK | 208630 | 5F8ffc8KWKNawllr5WsW0r | ['brega funk', 'funk carioca'] | ... | 0.832 | 0.55 | -7.026 | 0.0587 | 0.249 | 0.182 | 154.064 | 152784 | 0.881 | F |
1555 | 1556 | 199 | 1 | 2019-12-27--2020-01-03 | Lover (Remix) [feat. Shawn Mendes] | 4,595,450 | Taylor Swift | 42227614 | 3i9UVldZOE0aD0JnyfAZZ0 | ['pop', 'post-teen pop'] | ... | 0.448 | 0.603 | -7.176 | 0.064 | 0.433 | 0.0862 | 205.272 | 221307 | 0.422 | G |
1556 rows × 23 columns
pd.to_numeric(df["Energy"])
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/_libs/lib.pyx in pandas._libs.lib.maybe_convert_numeric()
ValueError: Unable to parse string " "
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_83288/287173585.py in <module>
----> 1 pd.to_numeric(df["Energy"])
~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/core/tools/numeric.py in to_numeric(arg, errors, downcast)
181 coerce_numeric = errors not in ("ignore", "raise")
182 try:
--> 183 values, _ = lib.maybe_convert_numeric(
184 values, set(), coerce_numeric=coerce_numeric
185 )
~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/_libs/lib.pyx in pandas._libs.lib.maybe_convert_numeric()
ValueError: Unable to parse string " " at position 35
df = df.replace(" ",np.nan)
pd.to_numeric(df["Energy"])
0 0.800
1 0.764
2 0.664
3 0.897
4 0.704
...
1551 0.700
1552 0.870
1553 0.523
1554 0.550
1555 0.603
Name: Energy, Length: 1556, dtype: float64
df["Energy"] = pd.to_numeric(df["Energy"])
pd.to_numeric(df["Streams"])
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/_libs/lib.pyx in pandas._libs.lib.maybe_convert_numeric()
ValueError: Unable to parse string "48,633,449"
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_83288/2129868835.py in <module>
----> 1 pd.to_numeric(df["Streams"])
~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/core/tools/numeric.py in to_numeric(arg, errors, downcast)
181 coerce_numeric = errors not in ("ignore", "raise")
182 try:
--> 183 values, _ = lib.maybe_convert_numeric(
184 values, set(), coerce_numeric=coerce_numeric
185 )
~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/_libs/lib.pyx in pandas._libs.lib.maybe_convert_numeric()
ValueError: Unable to parse string "48,633,449" at position 0
int("48,633,449")
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_83288/1792345402.py in <module>
----> 1 int("48,633,449")
ValueError: invalid literal for int() with base 10: '48,633,449'
df["Streams"].replace(",","")
0 48,633,449
1 47,248,719
2 40,162,559
3 37,799,456
4 33,948,454
...
1551 4,630,675
1552 4,623,030
1553 4,620,876
1554 4,607,385
1555 4,595,450
Name: Streams, Length: 1556, dtype: object
"mathematics".replace("t","10")
'ma10hema10ics'
"48,633,449".replace(",","")
'48633449'
# equivalent to df.replace(" ",np.nan)
df.applymap(lambda x: np.nan if x == " " else x)
Index | Highest Charting Position | Number of Times Charted | Week of Highest Charting | Song Name | Streams | Artist | Artist Followers | Song ID | Genre | ... | Danceability | Energy | Loudness | Speechiness | Acousticness | Liveness | Tempo | Duration (ms) | Valence | Chord | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 8 | 2021-07-23--2021-07-30 | Beggin' | 48,633,449 | Måneskin | 3377762 | 3Wrjm47oTz2sjIgck11l5e | ['indie rock italiano', 'italian pop'] | ... | 0.714 | 0.800 | -4.808 | 0.0504 | 0.127 | 0.359 | 134.002 | 211560 | 0.589 | B |
1 | 2 | 2 | 3 | 2021-07-23--2021-07-30 | STAY (with Justin Bieber) | 47,248,719 | The Kid LAROI | 2230022 | 5HCyWlXZPP0y6Gqq8TgA20 | ['australian hip hop'] | ... | 0.591 | 0.764 | -5.484 | 0.0483 | 0.0383 | 0.103 | 169.928 | 141806 | 0.478 | C#/Db |
2 | 3 | 1 | 11 | 2021-06-25--2021-07-02 | good 4 u | 40,162,559 | Olivia Rodrigo | 6266514 | 4ZtFanR9U6ndgddUvNcjcG | ['pop'] | ... | 0.563 | 0.664 | -5.044 | 0.154 | 0.335 | 0.0849 | 166.928 | 178147 | 0.688 | A |
3 | 4 | 3 | 5 | 2021-07-02--2021-07-09 | Bad Habits | 37,799,456 | Ed Sheeran | 83293380 | 6PQ88X9TkUIAUIZJHW2upE | ['pop', 'uk pop'] | ... | 0.808 | 0.897 | -3.712 | 0.0348 | 0.0469 | 0.364 | 126.026 | 231041 | 0.591 | B |
4 | 5 | 5 | 1 | 2021-07-23--2021-07-30 | INDUSTRY BABY (feat. Jack Harlow) | 33,948,454 | Lil Nas X | 5473565 | 27NovPIUIRrOZoCHxABJwK | ['lgbtq+ hip hop', 'pop rap'] | ... | 0.736 | 0.704 | -7.409 | 0.0615 | 0.0203 | 0.0501 | 149.995 | 212000 | 0.894 | D#/Eb |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1551 | 1552 | 195 | 1 | 2019-12-27--2020-01-03 | New Rules | 4,630,675 | Dua Lipa | 27167675 | 2ekn2ttSfGqwhhate0LSR0 | ['dance pop', 'pop', 'uk pop'] | ... | 0.762 | 0.700 | -6.021 | 0.0694 | 0.00261 | 0.153 | 116.073 | 209320 | 0.608 | A |
1552 | 1553 | 196 | 1 | 2019-12-27--2020-01-03 | Cheirosa - Ao Vivo | 4,623,030 | Jorge & Mateus | 15019109 | 2PWjKmjyTZeDpmOUa3a5da | ['sertanejo', 'sertanejo universitario'] | ... | 0.528 | 0.870 | -3.123 | 0.0851 | 0.24 | 0.333 | 152.37 | 181930 | 0.714 | B |
1553 | 1554 | 197 | 1 | 2019-12-27--2020-01-03 | Havana (feat. Young Thug) | 4,620,876 | Camila Cabello | 22698747 | 1rfofaqEpACxVEHIZBJe6W | ['dance pop', 'electropop', 'pop', 'post-teen ... | ... | 0.765 | 0.523 | -4.333 | 0.03 | 0.184 | 0.132 | 104.988 | 217307 | 0.394 | D |
1554 | 1555 | 198 | 1 | 2019-12-27--2020-01-03 | Surtada - Remix Brega Funk | 4,607,385 | Dadá Boladão, Tati Zaqui, OIK | 208630 | 5F8ffc8KWKNawllr5WsW0r | ['brega funk', 'funk carioca'] | ... | 0.832 | 0.550 | -7.026 | 0.0587 | 0.249 | 0.182 | 154.064 | 152784 | 0.881 | F |
1555 | 1556 | 199 | 1 | 2019-12-27--2020-01-03 | Lover (Remix) [feat. Shawn Mendes] | 4,595,450 | Taylor Swift | 42227614 | 3i9UVldZOE0aD0JnyfAZZ0 | ['pop', 'post-teen pop'] | ... | 0.448 | 0.603 | -7.176 | 0.064 | 0.433 | 0.0862 | 205.272 | 221307 | 0.422 | G |
1556 rows × 23 columns
df["Streams"].map(lambda s: s.replace(",",""))
0 48633449
1 47248719
2 40162559
3 37799456
4 33948454
...
1551 4630675
1552 4623030
1553 4620876
1554 4607385
1555 4595450
Name: Streams, Length: 1556, dtype: object
df["Streams"] = df["Streams"].map(lambda s: s.replace(",",""))
df.head()
Index | Highest Charting Position | Number of Times Charted | Week of Highest Charting | Song Name | Streams | Artist | Artist Followers | Song ID | Genre | ... | Danceability | Energy | Loudness | Speechiness | Acousticness | Liveness | Tempo | Duration (ms) | Valence | Chord | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 8 | 2021-07-23--2021-07-30 | Beggin' | 48633449 | Måneskin | 3377762 | 3Wrjm47oTz2sjIgck11l5e | ['indie rock italiano', 'italian pop'] | ... | 0.714 | 0.800 | -4.808 | 0.0504 | 0.127 | 0.359 | 134.002 | 211560 | 0.589 | B |
1 | 2 | 2 | 3 | 2021-07-23--2021-07-30 | STAY (with Justin Bieber) | 47248719 | The Kid LAROI | 2230022 | 5HCyWlXZPP0y6Gqq8TgA20 | ['australian hip hop'] | ... | 0.591 | 0.764 | -5.484 | 0.0483 | 0.0383 | 0.103 | 169.928 | 141806 | 0.478 | C#/Db |
2 | 3 | 1 | 11 | 2021-06-25--2021-07-02 | good 4 u | 40162559 | Olivia Rodrigo | 6266514 | 4ZtFanR9U6ndgddUvNcjcG | ['pop'] | ... | 0.563 | 0.664 | -5.044 | 0.154 | 0.335 | 0.0849 | 166.928 | 178147 | 0.688 | A |
3 | 4 | 3 | 5 | 2021-07-02--2021-07-09 | Bad Habits | 37799456 | Ed Sheeran | 83293380 | 6PQ88X9TkUIAUIZJHW2upE | ['pop', 'uk pop'] | ... | 0.808 | 0.897 | -3.712 | 0.0348 | 0.0469 | 0.364 | 126.026 | 231041 | 0.591 | B |
4 | 5 | 5 | 1 | 2021-07-23--2021-07-30 | INDUSTRY BABY (feat. Jack Harlow) | 33948454 | Lil Nas X | 5473565 | 27NovPIUIRrOZoCHxABJwK | ['lgbtq+ hip hop', 'pop rap'] | ... | 0.736 | 0.704 | -7.409 | 0.0615 | 0.0203 | 0.0501 | 149.995 | 212000 | 0.894 | D#/Eb |
5 rows × 23 columns
df.dtypes
Index int64
Highest Charting Position int64
Number of Times Charted int64
Week of Highest Charting object
Song Name object
Streams object
Artist object
Artist Followers object
Song ID object
Genre object
Release Date object
Weeks Charted object
Popularity object
Danceability object
Energy float64
Loudness object
Speechiness object
Acousticness object
Liveness object
Tempo object
Duration (ms) object
Valence object
Chord object
dtype: object
Handling errors¶
Let’s write a function which takes as input a number x, and as output returns 3/x.
def f(x):
return 3/x
f(5)
0.6
f(0)
---------------------------------------------------------------------------
ZeroDivisionError Traceback (most recent call last)
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_83288/1333598009.py in <module>
----> 1 f(0)
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_83288/3712785429.py in f(x)
1 def f(x):
----> 2 return 3/x
ZeroDivisionError: division by zero
What if we want to avoid this sort of error? (For example, maybe this is part of a much longer program, and we don’t want the whole program to crash if this function gets a bad input.)
def f(x):
if x == 0:
return np.nan
else:
return 3/x
f(10)
0.3
f(0)
nan
type(f(0))
float
We fixed the 0 error, but we will never catch all possible errors this way.
f("5")
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_83288/221224352.py in <module>
----> 1 f("5")
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_83288/3342001434.py in f(x)
3 return np.nan
4 else:
----> 5 return 3/x
TypeError: unsupported operand type(s) for /: 'int' and 'str'
We want something like
def f(x):
if 3/x does not cause an error:
return 3/x
else:
return np.nan
As far as I know, that can’t be accomplished using an if
statement. We instead need to use the commands try
and except
.
def f(x):
try:
return 3/x
except:
return "It didn't work"
f(5)
0.6
f("math 10")
"It didn't work"
def can_divide(x):
try:
3/x
return True
except:
return False
can_divide(5)
True
can_divide(0)
False
Making columns numeric¶
df.dtypes
Index int64
Highest Charting Position int64
Number of Times Charted int64
Week of Highest Charting object
Song Name object
Streams object
Artist object
Artist Followers object
Song ID object
Genre object
Release Date object
Weeks Charted object
Popularity object
Danceability object
Energy float64
Loudness object
Speechiness object
Acousticness object
Liveness object
Tempo object
Duration (ms) object
Valence object
Chord object
dtype: object
pd.to_numeric(df["Speechiness"])
0 0.0504
1 0.0483
2 0.1540
3 0.0348
4 0.0615
...
1551 0.0694
1552 0.0851
1553 0.0300
1554 0.0587
1555 0.0640
Name: Speechiness, Length: 1556, dtype: float64
pd.to_numeric(df["Release Date"])
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/_libs/lib.pyx in pandas._libs.lib.maybe_convert_numeric()
ValueError: Unable to parse string "2017-12-08"
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_83288/1358387063.py in <module>
----> 1 pd.to_numeric(df["Release Date"])
~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/core/tools/numeric.py in to_numeric(arg, errors, downcast)
181 coerce_numeric = errors not in ("ignore", "raise")
182 try:
--> 183 values, _ = lib.maybe_convert_numeric(
184 values, set(), coerce_numeric=coerce_numeric
185 )
~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/_libs/lib.pyx in pandas._libs.lib.maybe_convert_numeric()
ValueError: Unable to parse string "2017-12-08" at position 0
# c column name
def can_be_numeric(c):
try:
pd.to_numeric(df[c])
return True
except:
return False
can_be_numeric("Release Date")
False
can_be_numeric("Speechiness")
True
# all the columns that can be numeric
good_cols = [c for c in df.columns if can_be_numeric(c)]
good_cols
['Index',
'Highest Charting Position',
'Number of Times Charted',
'Streams',
'Artist Followers',
'Popularity',
'Danceability',
'Energy',
'Loudness',
'Speechiness',
'Acousticness',
'Liveness',
'Tempo',
'Duration (ms)',
'Valence']
df[good_cols] = df[good_cols].apply(pd.to_numeric, axis=0)
df.dtypes
Index int64
Highest Charting Position int64
Number of Times Charted int64
Week of Highest Charting object
Song Name object
Streams int64
Artist object
Artist Followers float64
Song ID object
Genre object
Release Date object
Weeks Charted object
Popularity float64
Danceability float64
Energy float64
Loudness float64
Speechiness float64
Acousticness float64
Liveness float64
Tempo float64
Duration (ms) float64
Valence float64
Chord object
dtype: object
alt.Chart(df).mark_circle().encode(
x = "Energy",
y = "Loudness"
)
chartlist = [alt.Chart(df).mark_circle().encode(
x = "Energy",
y = c
) for c in good_cols]
chartlist
[alt.Chart(...),
alt.Chart(...),
alt.Chart(...),
alt.Chart(...),
alt.Chart(...),
alt.Chart(...),
alt.Chart(...),
alt.Chart(...),
alt.Chart(...),
alt.Chart(...),
alt.Chart(...),
alt.Chart(...),
alt.Chart(...),
alt.Chart(...),
alt.Chart(...)]
chartlist[5]
alt.hconcat(chartlist[0],chartlist[1])
alt.hconcat(chartlist)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_83288/1249897995.py in <module>
----> 1 alt.hconcat(chartlist)
~/opt/anaconda3/envs/book/lib/python3.8/site-packages/altair/vegalite/v4/api.py in hconcat(*charts, **kwargs)
2210 def hconcat(*charts, **kwargs):
2211 """Concatenate charts horizontally"""
-> 2212 return HConcatChart(hconcat=charts, **kwargs)
2213
2214
~/opt/anaconda3/envs/book/lib/python3.8/site-packages/altair/vegalite/v4/api.py in __init__(self, data, hconcat, **kwargs)
2184 # TODO: move common data to top level?
2185 for spec in hconcat:
-> 2186 _check_if_valid_subspec(spec, "HConcatChart")
2187 super(HConcatChart, self).__init__(data=data, hconcat=list(hconcat), **kwargs)
2188 self.data, self.hconcat = _combine_subchart_data(self.data, self.hconcat)
~/opt/anaconda3/envs/book/lib/python3.8/site-packages/altair/vegalite/v4/api.py in _check_if_valid_subspec(spec, classname)
2036
2037 if not isinstance(spec, (core.SchemaBase, dict)):
-> 2038 raise ValueError("Only chart objects can be used in {0}.".format(classname))
2039 for attr in TOPLEVEL_ONLY_KEYS:
2040 if isinstance(spec, core.SchemaBase):
ValueError: Only chart objects can be used in HConcatChart.
m = alt.hconcat(*chartlist)
m