Week 2 Videos

Introduction to Altair

import pandas as pd
import altair as alt

# Small toy DataFrame with two numeric columns, "A" and "B", used for the first charts.
df = pd.DataFrame({'A': [0,1,1,3,4], 'B': [5,-2,6,20,2]})
# A Chart that has data but no mark: displaying it raises the
# SchemaValidationError shown below ("'mark' is a required property").
alt.Chart(df)
---------------------------------------------------------------------------
SchemaValidationError                     Traceback (most recent call last)
~/miniconda3/envs/math10s22/lib/python3.7/site-packages/altair/vegalite/v4/api.py in to_dict(self, *args, **kwargs)
   2018             copy.data = core.InlineData(values=[{}])
   2019             return super(Chart, copy).to_dict(*args, **kwargs)
-> 2020         return super().to_dict(*args, **kwargs)
   2021 
   2022     def add_selection(self, *selections):

~/miniconda3/envs/math10s22/lib/python3.7/site-packages/altair/vegalite/v4/api.py in to_dict(self, *args, **kwargs)
    391         if dct is None:
    392             kwargs["validate"] = "deep"
--> 393             dct = super(TopLevelMixin, copy).to_dict(*args, **kwargs)
    394 
    395         # TODO: following entries are added after validation. Should they be validated?

~/miniconda3/envs/math10s22/lib/python3.7/site-packages/altair/utils/schemapi.py in to_dict(self, validate, ignore, context)
    338                 self.validate(result)
    339             except jsonschema.ValidationError as err:
--> 340                 raise SchemaValidationError(self, err)
    341         return result
    342 

SchemaValidationError: Invalid specification

        altair.vegalite.v4.api.Chart, validating 'required'

        'mark' is a required property
        
alt.Chart(...)
# Supplying a mark (circles here) provides the 'mark' property that the
# validation error above reported as required.
alt.Chart(df).mark_circle()
# Display the toy DataFrame.
df
A B
0 0 5
1 1 -2
2 1 6
3 3 20
4 4 2
# Step 1: encode column "A" on the x channel only.
alt.Chart(df).mark_circle().encode(
    x="A"
)
# Step 2: additionally encode column "B" on the y channel.
alt.Chart(df).mark_circle().encode(
    x="A",
    y="B"
)
# Step 3: also map column "B" to the size channel.
alt.Chart(df).mark_circle().encode(
    x="A",
    y="B",
    size="B"
)
# Step 4: also map column "A" to the color channel, with no explicit
# encoding type given for it.
alt.Chart(df).mark_circle().encode(
    x="A",
    y="B",
    size="B",
    color="A"
)
# Step 5: same chart, but the ":N" shorthand explicitly declares "A" as a
# nominal (categorical) field for the color channel.
alt.Chart(df).mark_circle().encode(
    x="A",
    y="B",
    size="B",
    color="A:N"
)

Using Altair with a real dataset

# Load the cars dataset; the path is relative, so it is resolved against the
# current working directory. This rebinds `df`, replacing the toy DataFrame.
df = pd.read_csv("../Data/cars.csv")
# Display the DataFrame (the output below is truncated to the first and last rows).
df
Name Miles_per_Gallon Cylinders Displacement Horsepower Weight_in_lbs Acceleration Year Origin
0 chevrolet chevelle malibu 18.0 8 307.0 130.0 3504 12.0 1970-01-01 USA
1 buick skylark 320 15.0 8 350.0 165.0 3693 11.5 1970-01-01 USA
2 plymouth satellite 18.0 8 318.0 150.0 3436 11.0 1970-01-01 USA
3 amc rebel sst 16.0 8 304.0 150.0 3433 12.0 1970-01-01 USA
4 ford torino 17.0 8 302.0 140.0 3449 10.5 1970-01-01 USA
... ... ... ... ... ... ... ... ... ...
401 ford mustang gl 27.0 4 140.0 86.0 2790 15.6 1982-01-01 USA
402 vw pickup 44.0 4 97.0 52.0 2130 24.6 1982-01-01 Europe
403 dodge rampage 32.0 4 135.0 84.0 2295 11.6 1982-01-01 USA
404 ford ranger 28.0 4 120.0 79.0 2625 18.6 1982-01-01 USA
405 chevy s-10 31.0 4 119.0 82.0 2720 19.4 1982-01-01 USA

406 rows × 9 columns

# One-dimensional chart: only the x channel is encoded, using "Horsepower".
alt.Chart(df).mark_circle().encode(
    x="Horsepower"
)
# Summary statistics for the numeric columns. In the output below, the
# "count" row is lower than 406 for Miles_per_Gallon and Horsepower,
# i.e. those columns have fewer non-missing entries than the table has rows.
df.describe()
Miles_per_Gallon Cylinders Displacement Horsepower Weight_in_lbs Acceleration
count 398.000000 406.000000 406.000000 400.000000 406.000000 406.000000
mean 23.514573 5.475369 194.779557 105.082500 2979.413793 15.519704
std 7.815984 1.712160 104.922458 38.768779 847.004328 2.803359
min 9.000000 3.000000 68.000000 46.000000 1613.000000 8.000000
25% 17.500000 4.000000 105.000000 75.750000 2226.500000 13.700000
50% 23.000000 4.000000 151.000000 95.000000 2822.500000 15.500000
75% 29.000000 8.000000 302.000000 130.000000 3618.250000 17.175000
max 46.600000 8.000000 455.000000 230.000000 5140.000000 24.800000
# Scatter plot: "Horsepower" on the x axis against "Weight_in_lbs" on the y axis.
alt.Chart(df).mark_circle().encode(
    x="Horsepower",
    y="Weight_in_lbs"
)
# Count the rows with Horsepower above 200 AND weight below 3500 lbs.
# Each comparison yields a Boolean Series; `&` combines them elementwise
# (the parentheses are required because `&` binds tighter than `>` / `<`),
# and .sum() counts the True entries.
((df["Horsepower"]>200) & (df["Weight_in_lbs"]<3500)).sum()
1
# Boolean indexing: use the combined Boolean Series as a mask to retrieve the
# matching row(s) themselves rather than just their count.
df[(df["Horsepower"]>200) & (df["Weight_in_lbs"]<3500)]
Name Miles_per_Gallon Cylinders Displacement Horsepower Weight_in_lbs Acceleration Year Origin
19 buick estate wagon (sw) 14.0 8 455.0 225.0 3086 10.0 1970-01-01 USA
# Same scatter plot, now with the "Name" column attached to the tooltip channel.
alt.Chart(df).mark_circle().encode(
    x="Horsepower",
    y="Weight_in_lbs",
    tooltip="Name"
)
# The tooltip channel also accepts a list of column names.
alt.Chart(df).mark_circle().encode(
    x="Horsepower",
    y="Weight_in_lbs",
    tooltip=["Year","Origin","Name"]
)
# Additionally map the "Origin" column to the color channel.
alt.Chart(df).mark_circle().encode(
    x="Horsepower",
    y="Weight_in_lbs",
    tooltip=["Year","Origin","Name"],
    color="Origin"
)
# List the distinct values that occur in the "Origin" column.
df.Origin.unique()
array(['USA', 'Europe', 'Japan'], dtype=object)

Encoding Data Types in Altair

Reference: Encoding Data Types in the Altair documentation

# Reload the cars dataset so this section can be run on its own.
df = pd.read_csv("../Data/cars.csv")
# Baseline: "Origin" on the color channel with no explicit encoding type.
alt.Chart(df).mark_circle().encode(
    x="Horsepower",
    y="Weight_in_lbs",
    tooltip=["Year","Origin","Name"],
    color="Origin"
)
# ":N" explicitly declares "Origin" as nominal (unordered categories).
alt.Chart(df).mark_circle().encode(
    x="Horsepower",
    y="Weight_in_lbs",
    tooltip=["Year","Origin","Name"],
    color="Origin:N"
)
# ":O" declares "Origin" as ordinal (ordered categories).
alt.Chart(df).mark_circle().encode(
    x="Horsepower",
    y="Weight_in_lbs",
    tooltip=["Year","Origin","Name"],
    color="Origin:O"
)
# ":Q" declares "Origin" as quantitative. The column holds strings
# ('USA', 'Europe', 'Japan'), so this is presumably included to show what
# happens when the declared type does not fit the data.
alt.Chart(df).mark_circle().encode(
    x="Horsepower",
    y="Weight_in_lbs",
    tooltip=["Year","Origin","Name"],
    color="Origin:Q"
)
# Now vary the type on the x channel instead: "Horsepower" declared
# explicitly as quantitative.
alt.Chart(df).mark_circle().encode(
    x="Horsepower:Q",
    y="Weight_in_lbs",
    tooltip=["Year","Origin","Name"],
    color="Origin"
)
# "Horsepower" declared as ordinal on the x channel.
alt.Chart(df).mark_circle().encode(
    x="Horsepower:O",
    y="Weight_in_lbs",
    tooltip=["Year","Origin","Name"],
    color="Origin"
)
# Same encodings as the previous chart, drawn with bar marks instead of circles.
alt.Chart(df).mark_bar().encode(
    x="Horsepower:O",
    y="Weight_in_lbs",
    tooltip=["Year","Origin","Name"],
    color="Origin"
)
# Select the rows whose Horsepower is exactly 52.
df[df["Horsepower"]==52]
Name Miles_per_Gallon Cylinders Displacement Horsepower Weight_in_lbs Acceleration Year Origin
151 toyota corona 31.0 4 76.0 52.0 1649 16.5 1974-01-01 Japan
202 chevrolet chevette 29.0 4 85.0 52.0 2035 22.2 1976-01-01 USA
253 mazda glc deluxe 32.8 4 78.0 52.0 1985 19.4 1978-01-01 Japan
402 vw pickup 44.0 4 97.0 52.0 2130 24.6 1982-01-01 Europe

Interactive bar chart

# Reload the cars dataset so this section can be run on its own.
df = pd.read_csv("../Data/cars.csv")
# Interval selection object; it is attached to the scatter plot below.
brush = alt.selection_interval()

# c1: scatter plot colored by "Origin", with the interval selection attached.
# NOTE(review): add_selection is the Altair 4 API (the traceback earlier in
# this notebook shows altair/vegalite/v4); newer Altair releases rename it to
# add_params — confirm before upgrading.
c1 = alt.Chart(df).mark_circle().encode(
    x="Horsepower",
    y="Weight_in_lbs",
    tooltip=["Year","Origin","Name"],
    color="Origin"
).add_selection(brush)
# c2: bar chart with one bar per "Origin" value; "count()" is an aggregate
# shorthand giving the number of rows in each group.
c2 = alt.Chart(df).mark_bar().encode(
    x="Origin",
    y="count()",
)
# Display the bar chart on its own.
c2
# The same per-origin counts computed with pandas, for comparison with the bars.
df.Origin.value_counts()
USA       254
Japan      79
Europe     73
Name: Origin, dtype: int64
# `|` places the two charts side by side. At this point c2 is not yet
# connected to the brush.
c1|c2
# Redefine c2 so that it reacts to the brush:
#   - transform_filter(brush) restricts the bar chart's data using the
#     selection made on c1;
#   - the y scale is given a fixed domain of (0, 300), which covers the
#     largest count in the value_counts output (254) — presumably so the axis
#     stays fixed while the selection changes.
c2 = alt.Chart(df).mark_bar().encode(
    x="Origin",
    y=alt.Y("count()",scale=alt.Scale(domain=(0,300))),
).transform_filter(brush)
# Display the linked charts side by side.
c1|c2