Week 4 Video Notebook

This is the notebook file corresponding to the Week 4 videos.

Encoding data types

Reference: Altair documentation

import pandas as pd
import altair as alt
# Small example frame with two integer columns, "a" and "b".
df = pd.DataFrame(dict(a=[3, 2, 1, 4], b=[4, 8, 3, 1]))
# Bar chart of "b" against "a"; no type suffix is given, so the encoding
# type of each channel is left for Altair to infer.
(
    alt.Chart(df)
    .mark_bar()
    .encode(
        x=alt.X("a"),
        y=alt.Y("b"),
    )
)

# Show the underlying frame for comparison with the chart.
df
a b
0 3 4
1 2 8
2 1 3
3 4 1
# Same chart, with an explicit width of 50 passed to the bar mark.
(
    alt.Chart(df)
    .mark_bar(width=50)
    .encode(
        x=alt.X("a"),
        y=alt.Y("b"),
    )
)

# ":N" declares column "a" as nominal on both the x and color channels.
(
    alt.Chart(df)
    .mark_bar()
    .encode(
        x=alt.X("a:N"),
        y=alt.Y("b"),
        color=alt.Color("a:N"),
    )
)

# ":O" declares column "a" as ordinal instead.
(
    alt.Chart(df)
    .mark_bar()
    .encode(
        x=alt.X("a:O"),
        y=alt.Y("b"),
        color=alt.Color("a:O"),
    )
)

# Ordinal again, but with sort=None on the x channel so that no sorting is
# requested for the x-axis categories.
(
    alt.Chart(df)
    .mark_bar()
    .encode(
        x=alt.X("a:O", sort=None),
        y=alt.Y("b"),
        color=alt.Color("a:O"),
    )
)

# Column "a" in its original row order, for comparison with the x-axis above.
df["a"]
0    3
1    2
2    1
3    4
Name: a, dtype: int64

Interactive bar chart

import pandas as pd
import altair as alt
import seaborn as sns
# Load seaborn's "penguins" example dataset and display it.
penguin = sns.load_dataset("penguins")
penguin
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 Male
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 Female
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 Female
3 Adelie Torgersen NaN NaN NaN NaN NaN
4 Adelie Torgersen 36.7 19.3 193.0 3450.0 Female
... ... ... ... ... ... ... ...
339 Gentoo Biscoe NaN NaN NaN NaN NaN
340 Gentoo Biscoe 46.8 14.3 215.0 4850.0 Female
341 Gentoo Biscoe 50.4 15.7 222.0 5750.0 Male
342 Gentoo Biscoe 45.2 14.8 212.0 5200.0 Female
343 Gentoo Biscoe 49.9 16.1 213.0 5400.0 Male

344 rows × 7 columns

# List the column names available for encoding.
penguin.columns
Index(['species', 'island', 'bill_length_mm', 'bill_depth_mm',
       'flipper_length_mm', 'body_mass_g', 'sex'],
      dtype='object')
# Scatter plot of bill length (x) against flipper length (y), colored by species.
# The x scale opts out of including zero; the y scale is pinned to 160-240.
c1 = (
    alt.Chart(penguin)
    .mark_circle()
    .encode(
        x=alt.X("bill_length_mm", scale=alt.Scale(zero=False)),
        y=alt.Y("flipper_length_mm", scale=alt.Scale(domain=(160, 240))),
        color=alt.Color("species"),
    )
)

# Inspect which chart class Altair produced.
type(c1)
altair.vegalite.v4.api.Chart
# Display the scatter plot on its own.
c1

# Interval selection shared by the charts in this section.
brush = alt.selection_interval()

# Preview the scatter plot with the selection attached; c1 is not rebound here.
c1.add_selection(brush)

# Bar chart counting the rows for each species.
c2 = (
    alt.Chart(penguin)
    .mark_bar()
    .encode(
        x=alt.X("species"),
        y=alt.Y("count()"),
        color=alt.Color("species"),
    )
)
c2

# Keep the selection-enabled scatter plot under the name c1.
c1 = c1.add_selection(brush)

# Distinct species values.
penguin["species"].unique()
array(['Adelie', 'Chinstrap', 'Gentoo'], dtype=object)
# Rebuild the bar chart with fixed domains on both axes (all species on x,
# 0-160 on y) and filter its data by the brush selection.
c2 = (
    alt.Chart(penguin)
    .mark_bar()
    .encode(
        x=alt.X("species", scale=alt.Scale(domain=penguin["species"].unique())),
        y=alt.Y("count()", scale=alt.Scale(domain=(0, 160))),
        color=alt.Color("species"),
    )
    .transform_filter(brush)
)

# Scatter plot and filtered bar chart, concatenated horizontally.
c1 | c2

pd.to_datetime

import pandas as pd
import altair as alt
import seaborn as sns
# Load seaborn's "taxis" example dataset and preview the first six rows.
taxis = sns.load_dataset("taxis")
taxis.head(6)
pickup dropoff passengers distance fare tip tolls total color payment pickup_zone dropoff_zone pickup_borough dropoff_borough
0 2019-03-23 20:21:09 2019-03-23 20:27:24 1 1.60 7.0 2.15 0.0 12.95 yellow credit card Lenox Hill West UN/Turtle Bay South Manhattan Manhattan
1 2019-03-04 16:11:55 2019-03-04 16:19:00 1 0.79 5.0 0.00 0.0 9.30 yellow cash Upper West Side South Upper West Side South Manhattan Manhattan
2 2019-03-27 17:53:01 2019-03-27 18:00:25 1 1.37 7.5 2.36 0.0 14.16 yellow credit card Alphabet City West Village Manhattan Manhattan
3 2019-03-10 01:23:59 2019-03-10 01:49:51 1 7.70 27.0 6.15 0.0 36.95 yellow credit card Hudson Sq Yorkville West Manhattan Manhattan
4 2019-03-30 13:27:42 2019-03-30 13:37:14 3 2.16 9.0 1.10 0.0 13.40 yellow credit card Midtown East Yorkville West Manhattan Manhattan
5 2019-03-11 10:37:23 2019-03-11 10:47:31 1 0.49 7.5 2.16 0.0 12.96 yellow credit card Times Sq/Theatre District Midtown East Manhattan Manhattan
# Plot every 10th row: pickup against distance, with no encoding type given
# for the pickup column.
(
    alt.Chart(taxis.iloc[::10])
    .mark_circle()
    .encode(
        x=alt.X("pickup"),
        y=alt.Y("distance"),
    )
)

# Check the dtype pandas assigned to each column.
taxis.dtypes
pickup              object
dropoff             object
passengers           int64
distance           float64
fare               float64
tip                float64
tolls              float64
total              float64
color               object
payment             object
pickup_zone         object
dropoff_zone        object
pickup_borough      object
dropoff_borough     object
dtype: object
# Look up the "pickup" entry in the row labeled 10.
taxis.loc[10,"pickup"]
'2019-03-16 10:02:25'
# Check the Python type of that single "pickup" entry.
type(taxis.loc[10,"pickup"])
str
# Same plot, but ":T" declares pickup as temporal for both the x-axis and
# the tooltip.
(
    alt.Chart(taxis.iloc[::10])
    .mark_circle()
    .encode(
        x=alt.X("pickup:T"),
        y=alt.Y("distance"),
        tooltip=alt.Tooltip("pickup:T"),
    )
)

# The column itself is unchanged by the chart's type annotation.
taxis["pickup"]
0       2019-03-23 20:21:09
1       2019-03-04 16:11:55
2       2019-03-27 17:53:01
3       2019-03-10 01:23:59
4       2019-03-30 13:27:42
               ...         
6428    2019-03-31 09:51:53
6429    2019-03-31 17:38:00
6430    2019-03-23 22:55:18
6431    2019-03-04 10:09:25
6432    2019-03-13 19:31:22
Name: pickup, Length: 6433, dtype: object
# Convert the "pickup" column with pd.to_datetime; the result is only
# displayed here, not stored back into taxis.
pd.to_datetime(taxis["pickup"])
0      2019-03-23 20:21:09
1      2019-03-04 16:11:55
2      2019-03-27 17:53:01
3      2019-03-10 01:23:59
4      2019-03-30 13:27:42
               ...        
6428   2019-03-31 09:51:53
6429   2019-03-31 17:38:00
6430   2019-03-23 22:55:18
6431   2019-03-04 10:09:25
6432   2019-03-13 19:31:22
Name: pickup, Length: 6433, dtype: datetime64[ns]
# Take the converted value in the row labeled 0 and ask for its day name.
pd.to_datetime(taxis["pickup"]).loc[0].day_name()
'Saturday'
# First three rows of taxis, before the pickup column is reassigned.
taxis.head(3)
pickup dropoff passengers distance fare tip tolls total color payment pickup_zone dropoff_zone pickup_borough dropoff_borough
0 2019-03-23 20:21:09 2019-03-23 20:27:24 1 1.60 7.0 2.15 0.0 12.95 yellow credit card Lenox Hill West UN/Turtle Bay South Manhattan Manhattan
1 2019-03-04 16:11:55 2019-03-04 16:19:00 1 0.79 5.0 0.00 0.0 9.30 yellow cash Upper West Side South Upper West Side South Manhattan Manhattan
2 2019-03-27 17:53:01 2019-03-27 18:00:25 1 1.37 7.5 2.36 0.0 14.16 yellow credit card Alphabet City West Village Manhattan Manhattan
# Untyped pickup encoding on every 10th row, drawn before the column is converted.
(
    alt.Chart(taxis.iloc[::10])
    .mark_circle()
    .encode(
        x=alt.X("pickup"),
        y=alt.Y("distance"),
        tooltip=alt.Tooltip("pickup"),
    )
)

# Replace the pickup column with its pd.to_datetime conversion.
taxis["pickup"] = pd.to_datetime(taxis["pickup"])

# First three rows after the reassignment.
taxis.head(3)
pickup dropoff passengers distance fare tip tolls total color payment pickup_zone dropoff_zone pickup_borough dropoff_borough
0 2019-03-23 20:21:09 2019-03-23 20:27:24 1 1.60 7.0 2.15 0.0 12.95 yellow credit card Lenox Hill West UN/Turtle Bay South Manhattan Manhattan
1 2019-03-04 16:11:55 2019-03-04 16:19:00 1 0.79 5.0 0.00 0.0 9.30 yellow cash Upper West Side South Upper West Side South Manhattan Manhattan
2 2019-03-27 17:53:01 2019-03-27 18:00:25 1 1.37 7.5 2.36 0.0 14.16 yellow credit card Alphabet City West Village Manhattan Manhattan
# Same untyped encoding as before, now that pickup has been reassigned from
# pd.to_datetime.
(
    alt.Chart(taxis.iloc[::10])
    .mark_circle()
    .encode(
        x=alt.X("pickup"),
        y=alt.Y("distance"),
        tooltip=alt.Tooltip("pickup"),
    )
)

# The same chart built from the first 5000 rows instead of every 10th row.
(
    alt.Chart(taxis.iloc[:5000])
    .mark_circle()
    .encode(
        x=alt.X("pickup"),
        y=alt.Y("distance"),
        tooltip=alt.Tooltip("pickup"),
    )
)

map and lambda functions

import pandas as pd
import seaborn as sns
# Small example frame: integer column "a" and float column "b".
df = pd.DataFrame(dict(a=[3, 2, 1, 4], b=[4.3, 8.1, -2.9, 1.8]))
df
a b
0 3 4.3
1 2 8.1
2 1 -2.9
3 4 1.8
# Square every entry of column "b" with Series arithmetic.
df["b"]**2
0    18.49
1    65.61
2     8.41
3     3.24
Name: b, dtype: float64
# Apply the built-in round function to each entry of column "b" via map.
df["b"].map(round)
0    4
1    8
2   -3
3    2
Name: b, dtype: int64
def square(x):
    """Return x raised to the second power."""
    return pow(x, 2)
# Apply the user-defined square function to each entry of column "b".
df["b"].map(square)
0    18.49
1    65.61
2     8.41
3     3.24
Name: b, dtype: float64
# Same squaring as above, written as an anonymous function passed to map.
df["b"].map(lambda value: value ** 2)
0    18.49
1    65.61
2     8.41
3     3.24
Name: b, dtype: float64
# Reload the "taxis" dataset and count the missing values in "dropoff_zone".
taxis = sns.load_dataset("taxis")
taxis["dropoff_zone"].isna().sum()
45
# Replace missing "dropoff_zone" values with the empty string so every entry
# is a str, then re-count the missing values.
taxis["dropoff_zone"] = taxis["dropoff_zone"].fillna("")
taxis["dropoff_zone"].isna().sum()
0
# Exact equality test: True only where the zone is exactly "Upper West Side".
taxis["dropoff_zone"] == "Upper West Side"
0       False
1       False
2       False
3       False
4       False
        ...  
6428    False
6429    False
6430    False
6431    False
6432    False
Name: dropoff_zone, Length: 6433, dtype: bool
# Inspect the "dropoff_zone" entry in the row labeled 1.
taxis.loc[1,"dropoff_zone"]
'Upper West Side South'
"Upper West Side" in taxis.loc[1,"dropoff_zone"]
True
# Run the substring test on every "dropoff_zone" entry, giving a boolean Series.
taxis["dropoff_zone"].map(lambda zone: "Upper West Side" in zone)
0       False
1        True
2       False
3       False
4       False
        ...  
6428    False
6429    False
6430    False
6431    False
6432    False
Name: dropoff_zone, Length: 6433, dtype: bool
# Count the rows whose "dropoff_zone" contains "Upper West Side" (True sums as 1).
taxis["dropoff_zone"].map(lambda zone: "Upper West Side" in zone).sum()
265
# Boolean indexing: keep only the rows whose "dropoff_zone" contains "Upper West Side".
taxis[taxis["dropoff_zone"].map(lambda zone: "Upper West Side" in zone)]
pickup dropoff passengers distance fare tip tolls total color payment pickup_zone dropoff_zone pickup_borough dropoff_borough
1 2019-03-04 16:11:55 2019-03-04 16:19:00 1 0.79 5.0 0.00 0.0 9.30 yellow cash Upper West Side South Upper West Side South Manhattan Manhattan
39 2019-03-25 22:50:56 2019-03-25 22:59:51 1 2.47 10.0 2.76 0.0 16.56 yellow credit card Garment District Upper West Side South Manhattan Manhattan
49 2019-03-04 21:35:00 2019-03-04 21:53:42 1 4.87 17.5 0.00 0.0 21.30 yellow cash West Village Upper West Side South Manhattan Manhattan
72 2019-03-12 15:44:53 2019-03-12 16:02:05 1 1.98 11.5 2.96 0.0 17.76 yellow credit card Yorkville West Upper West Side North Manhattan Manhattan
94 2019-03-06 20:00:28 2019-03-06 20:11:50 3 2.60 11.0 4.40 0.0 19.20 yellow credit card Upper East Side South Upper West Side North Manhattan Manhattan
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
6250 2019-03-17 14:00:01 2019-03-17 14:06:24 5 1.18 6.5 1.46 0.0 8.76 green credit card East Harlem South Upper West Side North Manhattan Manhattan
6285 2019-03-28 20:54:51 2019-03-28 21:00:54 1 1.12 6.5 0.78 0.0 8.58 green credit card Morningside Heights Upper West Side North Manhattan Manhattan
6349 2019-03-12 16:08:29 2019-03-12 16:16:39 1 1.92 8.0 0.00 0.0 12.55 green cash East Harlem South Upper West Side North Manhattan Manhattan
6420 2019-03-16 15:39:23 2019-03-16 15:46:18 2 1.20 7.0 0.00 0.0 7.80 green cash Central Harlem Upper West Side North Manhattan Manhattan
6426 2019-03-28 08:04:47 2019-03-28 08:07:46 1 0.71 4.5 0.50 0.0 5.80 green credit card Central Park Upper West Side North Manhattan Manhattan

265 rows × 14 columns