Week 2 Videos
Contents
Week 2 Videos
Introduction to Altair
import pandas as pd
import altair as alt
# Small example DataFrame with two numeric columns, "A" and "B".
df = pd.DataFrame({'A': [0,1,1,3,4], 'B': [5,-2,6,20,2]})
# Displaying a Chart that has no mark type fails schema validation
# ("'mark' is a required property"); a mark such as mark_circle() is needed.
alt.Chart(df)
---------------------------------------------------------------------------
SchemaValidationError Traceback (most recent call last)
~/miniconda3/envs/math10s22/lib/python3.7/site-packages/altair/vegalite/v4/api.py in to_dict(self, *args, **kwargs)
2018 copy.data = core.InlineData(values=[{}])
2019 return super(Chart, copy).to_dict(*args, **kwargs)
-> 2020 return super().to_dict(*args, **kwargs)
2021
2022 def add_selection(self, *selections):
~/miniconda3/envs/math10s22/lib/python3.7/site-packages/altair/vegalite/v4/api.py in to_dict(self, *args, **kwargs)
391 if dct is None:
392 kwargs["validate"] = "deep"
--> 393 dct = super(TopLevelMixin, copy).to_dict(*args, **kwargs)
394
395 # TODO: following entries are added after validation. Should they be validated?
~/miniconda3/envs/math10s22/lib/python3.7/site-packages/altair/utils/schemapi.py in to_dict(self, validate, ignore, context)
338 self.validate(result)
339 except jsonschema.ValidationError as err:
--> 340 raise SchemaValidationError(self, err)
341 return result
342
SchemaValidationError: Invalid specification
altair.vegalite.v4.api.Chart, validating 'required'
'mark' is a required property
alt.Chart(...)
# Choosing a mark supplies the required 'mark' property; no encodings are
# set yet, so no column is mapped to a visual channel.
alt.Chart(df).mark_circle()
# Show the DataFrame so the plotted values can be compared with the charts.
df
 | A | B |
---|---|---|
0 | 0 | 5 |
1 | 1 | -2 |
2 | 1 | 6 |
3 | 3 | 20 |
4 | 4 | 2 |
# Build the chart up one encoding channel at a time.
# Step 1: column "A" on the x axis only.
alt.Chart(df).mark_circle().encode(
x="A"
)
# Step 2: add column "B" on the y axis.
alt.Chart(df).mark_circle().encode(
x="A",
y="B"
)
# Step 3: also map "B" to the size of each circle.
alt.Chart(df).mark_circle().encode(
x="A",
y="B",
size="B"
)
# Step 4: also map "A" to color; no type suffix is given, so Altair picks
# the encoding type itself.
alt.Chart(df).mark_circle().encode(
x="A",
y="B",
size="B",
color="A"
)
# Step 5: the ":N" suffix asks for "A" to be treated as nominal
# (unordered categories) in the color channel.
alt.Chart(df).mark_circle().encode(
x="A",
y="B",
size="B",
color="A:N"
)
Using Altair with a real dataset
# Load the cars dataset from a CSV file (relative path) and display it.
df = pd.read_csv("../Data/cars.csv")
df
 | Name | Miles_per_Gallon | Cylinders | Displacement | Horsepower | Weight_in_lbs | Acceleration | Year | Origin |
---|---|---|---|---|---|---|---|---|---|
0 | chevrolet chevelle malibu | 18.0 | 8 | 307.0 | 130.0 | 3504 | 12.0 | 1970-01-01 | USA |
1 | buick skylark 320 | 15.0 | 8 | 350.0 | 165.0 | 3693 | 11.5 | 1970-01-01 | USA |
2 | plymouth satellite | 18.0 | 8 | 318.0 | 150.0 | 3436 | 11.0 | 1970-01-01 | USA |
3 | amc rebel sst | 16.0 | 8 | 304.0 | 150.0 | 3433 | 12.0 | 1970-01-01 | USA |
4 | ford torino | 17.0 | 8 | 302.0 | 140.0 | 3449 | 10.5 | 1970-01-01 | USA |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
401 | ford mustang gl | 27.0 | 4 | 140.0 | 86.0 | 2790 | 15.6 | 1982-01-01 | USA |
402 | vw pickup | 44.0 | 4 | 97.0 | 52.0 | 2130 | 24.6 | 1982-01-01 | Europe |
403 | dodge rampage | 32.0 | 4 | 135.0 | 84.0 | 2295 | 11.6 | 1982-01-01 | USA |
404 | ford ranger | 28.0 | 4 | 120.0 | 79.0 | 2625 | 18.6 | 1982-01-01 | USA |
405 | chevy s-10 | 31.0 | 4 | 119.0 | 82.0 | 2720 | 19.4 | 1982-01-01 | USA |
406 rows × 9 columns
# Single encoding channel: Horsepower along the x axis.
alt.Chart(df).mark_circle().encode(
x="Horsepower"
)
# Summary statistics for the numeric columns. A "count" below the total
# number of rows means that column has missing values.
df.describe()
 | Miles_per_Gallon | Cylinders | Displacement | Horsepower | Weight_in_lbs | Acceleration |
---|---|---|---|---|---|---|
count | 398.000000 | 406.000000 | 406.000000 | 400.000000 | 406.000000 | 406.000000 |
mean | 23.514573 | 5.475369 | 194.779557 | 105.082500 | 2979.413793 | 15.519704 |
std | 7.815984 | 1.712160 | 104.922458 | 38.768779 | 847.004328 | 2.803359 |
min | 9.000000 | 3.000000 | 68.000000 | 46.000000 | 1613.000000 | 8.000000 |
25% | 17.500000 | 4.000000 | 105.000000 | 75.750000 | 2226.500000 | 13.700000 |
50% | 23.000000 | 4.000000 | 151.000000 | 95.000000 | 2822.500000 | 15.500000 |
75% | 29.000000 | 8.000000 | 302.000000 | 130.000000 | 3618.250000 | 17.175000 |
max | 46.600000 | 8.000000 | 455.000000 | 230.000000 | 5140.000000 | 24.800000 |
# Scatter plot: Horsepower on x, Weight_in_lbs on y.
alt.Chart(df).mark_circle().encode(
x="Horsepower",
y="Weight_in_lbs"
)
# Count the cars with Horsepower above 200 and weight below 3500 lbs:
# combine the two Boolean Series elementwise with & and sum the True values.
((df["Horsepower"]>200) & (df["Weight_in_lbs"]<3500)).sum()
1
# Boolean indexing: keep only the rows where both conditions hold
# (Horsepower above 200 and Weight_in_lbs below 3500).
df[(df["Horsepower"]>200) & (df["Weight_in_lbs"]<3500)]
 | Name | Miles_per_Gallon | Cylinders | Displacement | Horsepower | Weight_in_lbs | Acceleration | Year | Origin |
---|---|---|---|---|---|---|---|---|---|
19 | buick estate wagon (sw) | 14.0 | 8 | 455.0 | 225.0 | 3086 | 10.0 | 1970-01-01 | USA |
# Add a tooltip so hovering over a point shows the car's Name.
alt.Chart(df).mark_circle().encode(
x="Horsepower",
y="Weight_in_lbs",
tooltip="Name"
)
# A list of column names puts several fields in the tooltip.
alt.Chart(df).mark_circle().encode(
x="Horsepower",
y="Weight_in_lbs",
tooltip=["Year","Origin","Name"]
)
# Color the points by the Origin column.
alt.Chart(df).mark_circle().encode(
x="Horsepower",
y="Weight_in_lbs",
tooltip=["Year","Origin","Name"],
color="Origin"
)
# List the distinct values in the Origin column (the categories being colored).
df.Origin.unique()
array(['USA', 'Europe', 'Japan'], dtype=object)
Encoding Data Types in Altair
Reference: Encoding Data Types in the Altair documentation
# Reload the cars dataset so this section can be run on its own.
df = pd.read_csv("../Data/cars.csv")
# Baseline chart: no type suffix on any field, so Altair chooses the
# encoding types itself.
alt.Chart(df).mark_circle().encode(
x="Horsepower",
y="Weight_in_lbs",
tooltip=["Year","Origin","Name"],
color="Origin"
)
# ":N" = nominal (unordered categories) for the color channel.
alt.Chart(df).mark_circle().encode(
x="Horsepower",
y="Weight_in_lbs",
tooltip=["Year","Origin","Name"],
color="Origin:N"
)
# ":O" = ordinal (ordered categories) for the color channel.
alt.Chart(df).mark_circle().encode(
x="Horsepower",
y="Weight_in_lbs",
tooltip=["Year","Origin","Name"],
color="Origin:O"
)
# ":Q" = quantitative. Origin holds strings, so this is presumably shown
# as an example of a type that does not suit the data.
alt.Chart(df).mark_circle().encode(
x="Horsepower",
y="Weight_in_lbs",
tooltip=["Year","Origin","Name"],
color="Origin:Q"
)
# Explicitly quantitative x axis for the numeric Horsepower column.
alt.Chart(df).mark_circle().encode(
x="Horsepower:Q",
y="Weight_in_lbs",
tooltip=["Year","Origin","Name"],
color="Origin"
)
# Ordinal x axis: Horsepower values are treated as discrete ordered
# categories rather than positions on a continuous scale.
alt.Chart(df).mark_circle().encode(
x="Horsepower:O",
y="Weight_in_lbs",
tooltip=["Year","Origin","Name"],
color="Origin"
)
# Same encodings as the previous chart, drawn with bars instead of circles.
alt.Chart(df).mark_bar().encode(
x="Horsepower:O",
y="Weight_in_lbs",
tooltip=["Year","Origin","Name"],
color="Origin"
)
# Look at the rows sharing a single Horsepower value (52), i.e. the cars
# that fall into that one x category.
df[df["Horsepower"]==52]
 | Name | Miles_per_Gallon | Cylinders | Displacement | Horsepower | Weight_in_lbs | Acceleration | Year | Origin |
---|---|---|---|---|---|---|---|---|---|
151 | toyota corona | 31.0 | 4 | 76.0 | 52.0 | 1649 | 16.5 | 1974-01-01 | Japan |
202 | chevrolet chevette | 29.0 | 4 | 85.0 | 52.0 | 2035 | 22.2 | 1976-01-01 | USA |
253 | mazda glc deluxe | 32.8 | 4 | 78.0 | 52.0 | 1985 | 19.4 | 1978-01-01 | Japan |
402 | vw pickup | 44.0 | 4 | 97.0 | 52.0 | 2130 | 24.6 | 1982-01-01 | Europe |
Interactive bar chart
# Reload the cars dataset so this section can be run on its own.
df = pd.read_csv("../Data/cars.csv")
# Interval selection object; it is attached to the scatter plot below.
brush = alt.selection_interval()
# c1: scatter plot of weight against horsepower, colored by Origin, with
# the brush selection added to it.
c1 = alt.Chart(df).mark_circle().encode(
x="Horsepower",
y="Weight_in_lbs",
tooltip=["Year","Origin","Name"],
color="Origin"
).add_selection(brush)
# c2: bar chart with one bar per Origin; "count()" is the number of rows.
c2 = alt.Chart(df).mark_bar().encode(
x="Origin",
y="count()",
)
# Display the bar chart on its own.
c2
# Cross-check the bar heights against the per-Origin row counts from pandas.
df.Origin.value_counts()
USA 254
Japan 79
Europe 73
Name: Origin, dtype: int64
# Place the scatter plot and the bar chart side by side.
c1|c2
# Redefine c2 so it reacts to the brush: transform_filter(brush) restricts
# the counted rows to the points selected in c1. The y domain is fixed at
# (0, 300) so the axis does not rescale as the selection changes.
c2 = alt.Chart(df).mark_bar().encode(
x="Origin",
y=alt.Y("count()",scale=alt.Scale(domain=(0,300))),
).transform_filter(brush)
# Show the linked charts: brushing in c1 updates the counts in c2.
c1|c2