Week 4 Video notebooks
Contents
Week 4 Video notebooks
This is the notebook file corresponding to the Week 4 videos.
Encoding data types
Reference: Altair documentation
import pandas as pd
import altair as alt
# Four-row toy frame with two integer columns, used by the encoding examples.
df = pd.DataFrame(
    {
        "a": [3, 2, 1, 4],
        "b": [4, 8, 3, 1],
    }
)
# Bar chart of column "b" (y) against column "a" (x).
# No encoding type is written after the column names, so Altair picks one
# itself from the column data.
alt.Chart(df).mark_bar().encode(
x = "a",
y = "b"
)
df
a | b | |
---|---|---|
0 | 3 | 4 |
1 | 2 | 8 |
2 | 1 | 3 |
3 | 4 | 1 |
# Same chart as before, but with an explicit bar width of 50 passed to the mark.
alt.Chart(df).mark_bar(width=50).encode(
x = "a",
y = "b"
)
# ":N" marks column "a" as nominal (unordered categories) instead of a number.
# The same nominal encoding is reused for the bar color.
alt.Chart(df).mark_bar().encode(
x = "a:N",
y = "b",
color = "a:N"
)
# ":O" marks column "a" as ordinal (ordered categories), for both the x
# position and the bar color.
alt.Chart(df).mark_bar().encode(
x = "a:O",
y = "b",
color = "a:O"
)
# Ordinal encoding again, but sort=None turns off Altair's sorting of the
# x categories, so they appear in the order the values occur in df.
alt.Chart(df).mark_bar().encode(
x = alt.X("a:O", sort=None),
y = "b",
color = "a:O"
)
df.a
0 3
1 2
2 1
3 4
Name: a, dtype: int64
Interactive bar chart
import pandas as pd
import altair as alt
import seaborn as sns
penguin = sns.load_dataset("penguins")
penguin
species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | |
---|---|---|---|---|---|---|---|
0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | Male |
1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | Female |
2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | Female |
3 | Adelie | Torgersen | NaN | NaN | NaN | NaN | NaN |
4 | Adelie | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | Female |
... | ... | ... | ... | ... | ... | ... | ... |
339 | Gentoo | Biscoe | NaN | NaN | NaN | NaN | NaN |
340 | Gentoo | Biscoe | 46.8 | 14.3 | 215.0 | 4850.0 | Female |
341 | Gentoo | Biscoe | 50.4 | 15.7 | 222.0 | 5750.0 | Male |
342 | Gentoo | Biscoe | 45.2 | 14.8 | 212.0 | 5200.0 | Female |
343 | Gentoo | Biscoe | 49.9 | 16.1 | 213.0 | 5400.0 | Male |
344 rows × 7 columns
penguin.columns
Index(['species', 'island', 'bill_length_mm', 'bill_depth_mm',
'flipper_length_mm', 'body_mass_g', 'sex'],
dtype='object')
# Scatter plot of the penguins: bill length (x) against flipper length (y),
# colored by species.
c1 = alt.Chart(penguin).mark_circle().encode(
# zero=False: the x axis is not forced to start at 0.
x = alt.X('bill_length_mm', scale=alt.Scale(zero=False)),
# The y axis is fixed to the range 160-240.
y = alt.Y('flipper_length_mm',scale=alt.Scale(domain=(160,240))),
color = "species"
)
type(c1)
altair.vegalite.v4.api.Chart
c1
# Interval selection: lets the user drag out a rectangular region on a chart.
brush = alt.selection_interval()
# Attach the selection to the scatter plot. The result is not assigned here,
# so the name c1 still refers to the chart without the selection.
c1.add_selection(brush)
# Bar chart with one bar per species; "count()" makes the bar height the
# number of rows for that species.
c2 = alt.Chart(penguin).mark_bar().encode(
x = "species",
y = "count()",
color = "species"
)
# Display the chart.
c2
# Rebind c1 to the chart that has the brush selection attached, so the
# selection is kept when c1 is used again.
c1 = c1.add_selection(brush)
penguin.species.unique()
array(['Adelie', 'Chinstrap', 'Gentoo'], dtype=object)
# Rebuild the species bar chart so that it responds to the brush.
# Both axis domains are pinned (all species names on x, 0-160 on y),
# presumably so the axes do not rescale as the filtered counts change.
c2 = alt.Chart(penguin).mark_bar().encode(
x = alt.X("species", scale = alt.Scale(domain=penguin.species.unique())),
y = alt.Y("count()", scale = alt.Scale(domain=(0,160))),
color = "species"
# Only the rows inside the current brush selection are counted.
).transform_filter(brush)
# Show the scatter plot and the bar chart side by side.
c1|c2
pd.to_datetime
import pandas as pd
import altair as alt
import seaborn as sns
taxis = sns.load_dataset("taxis")
taxis.head(6)
pickup | dropoff | passengers | distance | fare | tip | tolls | total | color | payment | pickup_zone | dropoff_zone | pickup_borough | dropoff_borough | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2019-03-23 20:21:09 | 2019-03-23 20:27:24 | 1 | 1.60 | 7.0 | 2.15 | 0.0 | 12.95 | yellow | credit card | Lenox Hill West | UN/Turtle Bay South | Manhattan | Manhattan |
1 | 2019-03-04 16:11:55 | 2019-03-04 16:19:00 | 1 | 0.79 | 5.0 | 0.00 | 0.0 | 9.30 | yellow | cash | Upper West Side South | Upper West Side South | Manhattan | Manhattan |
2 | 2019-03-27 17:53:01 | 2019-03-27 18:00:25 | 1 | 1.37 | 7.5 | 2.36 | 0.0 | 14.16 | yellow | credit card | Alphabet City | West Village | Manhattan | Manhattan |
3 | 2019-03-10 01:23:59 | 2019-03-10 01:49:51 | 1 | 7.70 | 27.0 | 6.15 | 0.0 | 36.95 | yellow | credit card | Hudson Sq | Yorkville West | Manhattan | Manhattan |
4 | 2019-03-30 13:27:42 | 2019-03-30 13:37:14 | 3 | 2.16 | 9.0 | 1.10 | 0.0 | 13.40 | yellow | credit card | Midtown East | Yorkville West | Manhattan | Manhattan |
5 | 2019-03-11 10:37:23 | 2019-03-11 10:47:31 | 1 | 0.49 | 7.5 | 2.16 | 0.0 | 12.96 | yellow | credit card | Times Sq/Theatre District | Midtown East | Manhattan | Manhattan |
# Scatter plot using every 10th row of taxis.
# "pickup" is given without an encoding type; at this point the column holds
# plain strings, so Altair does not treat it as a date/time axis.
alt.Chart(taxis[::10]).mark_circle().encode(
x = "pickup",
y = "distance"
)
taxis.dtypes
pickup object
dropoff object
passengers int64
distance float64
fare float64
tip float64
tolls float64
total float64
color object
payment object
pickup_zone object
dropoff_zone object
pickup_borough object
dropoff_borough object
dtype: object
taxis.loc[10,"pickup"]
'2019-03-16 10:02:25'
type(taxis.loc[10,"pickup"])
str
# ":T" tells Altair to read the pickup strings as temporal (date/time)
# values, for both the x axis and the tooltip.
alt.Chart(taxis[::10]).mark_circle().encode(
x = "pickup:T",
y = "distance",
tooltip = "pickup:T"
)
taxis["pickup"]
0 2019-03-23 20:21:09
1 2019-03-04 16:11:55
2 2019-03-27 17:53:01
3 2019-03-10 01:23:59
4 2019-03-30 13:27:42
...
6428 2019-03-31 09:51:53
6429 2019-03-31 17:38:00
6430 2019-03-23 22:55:18
6431 2019-03-04 10:09:25
6432 2019-03-13 19:31:22
Name: pickup, Length: 6433, dtype: object
pd.to_datetime(taxis["pickup"])
0 2019-03-23 20:21:09
1 2019-03-04 16:11:55
2 2019-03-27 17:53:01
3 2019-03-10 01:23:59
4 2019-03-30 13:27:42
...
6428 2019-03-31 09:51:53
6429 2019-03-31 17:38:00
6430 2019-03-23 22:55:18
6431 2019-03-04 10:09:25
6432 2019-03-13 19:31:22
Name: pickup, Length: 6433, dtype: datetime64[ns]
pd.to_datetime(taxis["pickup"]).loc[0].day_name()
'Saturday'
taxis.iloc[:3]
pickup | dropoff | passengers | distance | fare | tip | tolls | total | color | payment | pickup_zone | dropoff_zone | pickup_borough | dropoff_borough | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2019-03-23 20:21:09 | 2019-03-23 20:27:24 | 1 | 1.60 | 7.0 | 2.15 | 0.0 | 12.95 | yellow | credit card | Lenox Hill West | UN/Turtle Bay South | Manhattan | Manhattan |
1 | 2019-03-04 16:11:55 | 2019-03-04 16:19:00 | 1 | 0.79 | 5.0 | 0.00 | 0.0 | 9.30 | yellow | cash | Upper West Side South | Upper West Side South | Manhattan | Manhattan |
2 | 2019-03-27 17:53:01 | 2019-03-27 18:00:25 | 1 | 1.37 | 7.5 | 2.36 | 0.0 | 14.16 | yellow | credit card | Alphabet City | West Village | Manhattan | Manhattan |
# Same chart without the ":T" hint, drawn while the pickup column still
# holds strings (the column is converted in a later cell).
alt.Chart(taxis[::10]).mark_circle().encode(
x = "pickup",
y = "distance",
tooltip = "pickup"
)
# Overwrite the string column with parsed datetime values
# (pd.to_datetime returns a datetime64[ns] Series for this column).
taxis["pickup"] = pd.to_datetime(taxis["pickup"])
taxis.iloc[:3]
pickup | dropoff | passengers | distance | fare | tip | tolls | total | color | payment | pickup_zone | dropoff_zone | pickup_borough | dropoff_borough | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2019-03-23 20:21:09 | 2019-03-23 20:27:24 | 1 | 1.60 | 7.0 | 2.15 | 0.0 | 12.95 | yellow | credit card | Lenox Hill West | UN/Turtle Bay South | Manhattan | Manhattan |
1 | 2019-03-04 16:11:55 | 2019-03-04 16:19:00 | 1 | 0.79 | 5.0 | 0.00 | 0.0 | 9.30 | yellow | cash | Upper West Side South | Upper West Side South | Manhattan | Manhattan |
2 | 2019-03-27 17:53:01 | 2019-03-27 18:00:25 | 1 | 1.37 | 7.5 | 2.36 | 0.0 | 14.16 | yellow | credit card | Alphabet City | West Village | Manhattan | Manhattan |
# With pickup now stored as datetimes, the chart no longer needs ":T" to
# get a time axis.
alt.Chart(taxis[::10]).mark_circle().encode(
x = "pickup",
y = "distance",
tooltip = "pickup"
)
# Same chart on the first 5000 rows instead of every 10th row.
# NOTE(review): 5000 looks chosen to match Altair's default row limit - confirm.
alt.Chart(taxis[:5000]).mark_circle().encode(
x = "pickup",
y = "distance",
tooltip = "pickup"
)
map and lambda functions
import pandas as pd
import seaborn as sns
# Toy frame for the map/lambda examples: integer column "a", float column "b".
df = pd.DataFrame(
    {
        "a": [3, 2, 1, 4],
        "b": [4.3, 8.1, -2.9, 1.8],
    }
)
df
a | b | |
---|---|---|
0 | 3 | 4.3 |
1 | 2 | 8.1 |
2 | 1 | -2.9 |
3 | 4 | 1.8 |
df["b"]**2
0 18.49
1 65.61
2 8.41
3 3.24
Name: b, dtype: float64
df["b"].map(round)
0 4
1 8
2 -3
3 2
Name: b, dtype: int64
def square(x):
    """Return ``x`` raised to the power 2.

    Intended to be passed to ``Series.map`` so that it is applied to one
    element at a time; it works for any value that supports ``**``.
    """
    return x**2
df["b"].map(square)
0 18.49
1 65.61
2 8.41
3 3.24
Name: b, dtype: float64
df["b"].map(lambda x: x**2)
0 18.49
1 65.61
2 8.41
3 3.24
Name: b, dtype: float64
taxis = sns.load_dataset("taxis")
taxis["dropoff_zone"].isna().sum()
45
# Replace missing drop-off zones with the empty string so every entry in the
# column is a str; presumably this is what lets the substring test in the
# later cells run on every row - confirm.
taxis["dropoff_zone"] = taxis["dropoff_zone"].fillna("")
taxis["dropoff_zone"].isna().sum()
0
taxis["dropoff_zone"] == "Upper West Side"
0 False
1 False
2 False
3 False
4 False
...
6428 False
6429 False
6430 False
6431 False
6432 False
Name: dropoff_zone, Length: 6433, dtype: bool
taxis.loc[1,"dropoff_zone"]
'Upper West Side South'
"Upper West Side" in taxis.loc[1,"dropoff_zone"]
True
# Element-wise substring test: True where the zone name contains
# "Upper West Side" (e.g. "Upper West Side South"), unlike the exact
# comparison with == used earlier.
taxis["dropoff_zone"].map(lambda s: "Upper West Side" in s)
0 False
1 True
2 False
3 False
4 False
...
6428 False
6429 False
6430 False
6431 False
6432 False
Name: dropoff_zone, Length: 6433, dtype: bool
taxis["dropoff_zone"].map(lambda s: "Upper West Side" in s).sum()
265
# Use the boolean Series as a mask to keep only the rows whose drop-off zone
# contains "Upper West Side".
taxis[taxis["dropoff_zone"].map(lambda s: "Upper West Side" in s)]
pickup | dropoff | passengers | distance | fare | tip | tolls | total | color | payment | pickup_zone | dropoff_zone | pickup_borough | dropoff_borough | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 2019-03-04 16:11:55 | 2019-03-04 16:19:00 | 1 | 0.79 | 5.0 | 0.00 | 0.0 | 9.30 | yellow | cash | Upper West Side South | Upper West Side South | Manhattan | Manhattan |
39 | 2019-03-25 22:50:56 | 2019-03-25 22:59:51 | 1 | 2.47 | 10.0 | 2.76 | 0.0 | 16.56 | yellow | credit card | Garment District | Upper West Side South | Manhattan | Manhattan |
49 | 2019-03-04 21:35:00 | 2019-03-04 21:53:42 | 1 | 4.87 | 17.5 | 0.00 | 0.0 | 21.30 | yellow | cash | West Village | Upper West Side South | Manhattan | Manhattan |
72 | 2019-03-12 15:44:53 | 2019-03-12 16:02:05 | 1 | 1.98 | 11.5 | 2.96 | 0.0 | 17.76 | yellow | credit card | Yorkville West | Upper West Side North | Manhattan | Manhattan |
94 | 2019-03-06 20:00:28 | 2019-03-06 20:11:50 | 3 | 2.60 | 11.0 | 4.40 | 0.0 | 19.20 | yellow | credit card | Upper East Side South | Upper West Side North | Manhattan | Manhattan |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
6250 | 2019-03-17 14:00:01 | 2019-03-17 14:06:24 | 5 | 1.18 | 6.5 | 1.46 | 0.0 | 8.76 | green | credit card | East Harlem South | Upper West Side North | Manhattan | Manhattan |
6285 | 2019-03-28 20:54:51 | 2019-03-28 21:00:54 | 1 | 1.12 | 6.5 | 0.78 | 0.0 | 8.58 | green | credit card | Morningside Heights | Upper West Side North | Manhattan | Manhattan |
6349 | 2019-03-12 16:08:29 | 2019-03-12 16:16:39 | 1 | 1.92 | 8.0 | 0.00 | 0.0 | 12.55 | green | cash | East Harlem South | Upper West Side North | Manhattan | Manhattan |
6420 | 2019-03-16 15:39:23 | 2019-03-16 15:46:18 | 2 | 1.20 | 7.0 | 0.00 | 0.0 | 7.80 | green | cash | Central Harlem | Upper West Side North | Manhattan | Manhattan |
6426 | 2019-03-28 08:04:47 | 2019-03-28 08:07:46 | 1 | 0.71 | 4.5 | 0.50 | 0.0 | 5.80 | green | credit card | Central Park | Upper West Side North | Manhattan | Manhattan |
265 rows × 14 columns