Week 3 Videos
Contents
Week 3 Videos#
Which species in the penguins dataset has the longest median bill length?
import pandas as pd
import altair as alt
import seaborn as sns
df = sns.load_dataset("penguins")
Median length using groupby#
df.head(3)
| species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex |
---|---|---|---|---|---|---|---|
0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | Male |
1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | Female |
2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | Female |
df.groupby("species")
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f813784ae20>
df.groupby("species").median()
| bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g |
---|---|---|---|---|
species | ||||
Adelie | 38.80 | 18.40 | 190.0 | 3700.0 |
Chinstrap | 49.55 | 18.45 | 196.0 | 3700.0 |
Gentoo | 47.30 | 15.00 | 216.0 | 5000.0 |
df.groupby("species").median()["bill_length_mm"]
species
Adelie 38.80
Chinstrap 49.55
Gentoo 47.30
Name: bill_length_mm, dtype: float64
df.groupby("species").median()["bill_length_mm"].sort_values(ascending=False)
species
Chinstrap 49.55
Gentoo 47.30
Adelie 38.80
Name: bill_length_mm, dtype: float64
df.groupby("species").median()["bill_length_mm"].sort_values(ascending=False).index
Index(['Chinstrap', 'Gentoo', 'Adelie'], dtype='object', name='species')
df.groupby("species").median()["bill_length_mm"].sort_values(ascending=False).index[0]
'Chinstrap'
df.groupby("species").median()["bill_length_mm"]
species
Adelie 38.80
Chinstrap 49.55
Gentoo 47.30
Name: bill_length_mm, dtype: float64
df.groupby("species").median()["bill_length_mm"].idxmax()
'Chinstrap'
Bar charts in Altair#
alt.Chart(df).mark_bar().encode(
x="species",
y="bill_length_mm"
)
df.sample(3)
| species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex |
---|---|---|---|---|---|---|---|
302 | Gentoo | Biscoe | 47.4 | 14.6 | 212.0 | 4725.0 | Female |
319 | Gentoo | Biscoe | 51.1 | 16.5 | 225.0 | 5250.0 | Male |
24 | Adelie | Biscoe | 38.8 | 17.2 | 180.0 | 3800.0 | Male |
df.groupby("species").max()
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In [32], line 1
----> 1 df.groupby("species").max()
File /shared-libs/python3.9/py/lib/python3.9/site-packages/pandas/core/groupby/groupby.py:1676, in GroupBy.max(self, numeric_only, min_count)
1673 @final
1674 @doc(_groupby_agg_method_template, fname="max", no=False, mc=-1)
1675 def max(self, numeric_only: bool = False, min_count: int = -1):
-> 1676 return self._agg_general(
1677 numeric_only=numeric_only, min_count=min_count, alias="max", npfunc=np.max
1678 )
File /shared-libs/python3.9/py/lib/python3.9/site-packages/pandas/core/groupby/groupby.py:1024, in BaseGroupBy._agg_general(self, numeric_only, min_count, alias, npfunc)
1022 result = None
1023 try:
-> 1024 result = self._cython_agg_general(
1025 how=alias,
1026 alt=npfunc,
1027 numeric_only=numeric_only,
1028 min_count=min_count,
1029 )
1030 except DataError:
1031 pass
File /shared-libs/python3.9/py/lib/python3.9/site-packages/pandas/core/groupby/generic.py:1015, in DataFrameGroupBy._cython_agg_general(self, how, alt, numeric_only, min_count)
1012 def _cython_agg_general(
1013 self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1
1014 ) -> DataFrame:
-> 1015 agg_mgr = self._cython_agg_blocks(
1016 how, alt=alt, numeric_only=numeric_only, min_count=min_count
1017 )
1018 return self._wrap_agged_blocks(agg_mgr.blocks, items=agg_mgr.items)
File /shared-libs/python3.9/py/lib/python3.9/site-packages/pandas/core/groupby/generic.py:1118, in DataFrameGroupBy._cython_agg_blocks(self, how, alt, numeric_only, min_count)
1113 return cast_agg_result(result, bvalues, how)
1115 # TypeError -> we may have an exception in trying to aggregate
1116 # continue and exclude the block
1117 # NotImplementedError -> "ohlc" with wrong dtype
-> 1118 new_mgr = data.apply(blk_func, ignore_failures=True)
1120 if not len(new_mgr):
1121 raise DataError("No numeric types to aggregate")
File /shared-libs/python3.9/py/lib/python3.9/site-packages/pandas/core/internals/managers.py:425, in BlockManager.apply(self, f, align_keys, ignore_failures, **kwargs)
423 try:
424 if callable(f):
--> 425 applied = b.apply(f, **kwargs)
426 else:
427 applied = getattr(b, f)(**kwargs)
File /shared-libs/python3.9/py/lib/python3.9/site-packages/pandas/core/internals/blocks.py:380, in Block.apply(self, func, **kwargs)
377 with np.errstate(all="ignore"):
378 result = func(self.values, **kwargs)
--> 380 return self._split_op_result(result)
File /shared-libs/python3.9/py/lib/python3.9/site-packages/pandas/core/internals/blocks.py:416, in Block._split_op_result(self, result)
413 return nbs
415 if not isinstance(result, Block):
--> 416 result = self.make_block(result)
418 return [result]
File /shared-libs/python3.9/py/lib/python3.9/site-packages/pandas/core/internals/blocks.py:286, in Block.make_block(self, values, placement)
283 if self.is_extension:
284 values = _block_shape(values, ndim=self.ndim)
--> 286 return make_block(values, placement=placement, ndim=self.ndim)
File /shared-libs/python3.9/py/lib/python3.9/site-packages/pandas/core/internals/blocks.py:2751, in make_block(values, placement, klass, ndim, dtype)
2746 elif klass is DatetimeTZBlock and not is_datetime64tz_dtype(values.dtype):
2747 # TODO: This is no longer hit internally; does it need to be retained
2748 # for e.g. pyarrow?
2749 values = DatetimeArray._simple_new(values, dtype=dtype)
-> 2751 return klass(values, ndim=ndim, placement=placement)
File /shared-libs/python3.9/py/lib/python3.9/site-packages/pandas/core/internals/blocks.py:142, in Block.__init__(self, values, placement, ndim)
139 self.values = self._maybe_coerce_values(values)
141 if self._validate_ndim and self.ndim and len(self.mgr_locs) != len(self.values):
--> 142 raise ValueError(
143 f"Wrong number of items passed {len(self.values)}, "
144 f"placement implies {len(self.mgr_locs)}"
145 )
ValueError: Wrong number of items passed 1, placement implies 2
df.groupby("species").max(numeric_only=True)
| bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g |
---|---|---|---|---|
species | ||||
Adelie | 46.0 | 21.5 | 210.0 | 4775.0 |
Chinstrap | 58.0 | 20.8 | 212.0 | 4800.0 |
Gentoo | 59.6 | 17.3 | 231.0 | 6300.0 |
alt.Chart(df).mark_bar().encode(
x="bill_depth_mm",
y="bill_length_mm",
color="species",
tooltip=["bill_length_mm"]
)
alt.Chart(df).mark_circle().encode(
x="bill_depth_mm",
y="bill_length_mm",
color="species"
)
Median values using Altair#
alt.Chart(df).mark_bar().encode(
x="species",
y="bill_length_mm"
)
alt.Chart(df).mark_bar().encode(
x="species",
y="median(bill_length_mm)",
tooltip=["median(bill_length_mm)"]
)
alt.Chart(df).mark_bar().encode(
x=alt.X("species", sort="y"),
y="median(bill_length_mm)",
tooltip=["median(bill_length_mm)"]
)
alt.Chart(df).mark_bar().encode(
x=alt.X("species", sort="-y"),
y="median(bill_length_mm)",
tooltip=["median(bill_length_mm)"]
)
my_index = df.groupby("species").median()["bill_length_mm"].sort_values(ascending=False).index
my_index
Index(['Chinstrap', 'Gentoo', 'Adelie'], dtype='object', name='species')
alt.Chart(df).mark_bar().encode(
x=alt.X("species", scale=alt.Scale(domain=my_index)),
y="median(bill_length_mm)",
tooltip=["median(bill_length_mm)"]
)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In [44], line 2
1 alt.Chart(df).mark_bar().encode(
----> 2 x=alt.X("species", scale=alt.Scale(domain=my_index)),
3 y="median(bill_length_mm)",
4 tooltip=["median(bill_length_mm)"]
5 )
File /shared-libs/python3.9/py/lib/python3.9/site-packages/altair/vegalite/v4/schema/core.py:13244, in Scale.__init__(self, align, base, bins, clamp, constant, domain, domainMax, domainMid, domainMin, exponent, interpolate, nice, padding, paddingInner, paddingOuter, range, rangeMax, rangeMin, reverse, round, scheme, type, zero, **kwds)
13238 def __init__(self, align=Undefined, base=Undefined, bins=Undefined, clamp=Undefined,
13239 constant=Undefined, domain=Undefined, domainMax=Undefined, domainMid=Undefined,
13240 domainMin=Undefined, exponent=Undefined, interpolate=Undefined, nice=Undefined,
13241 padding=Undefined, paddingInner=Undefined, paddingOuter=Undefined, range=Undefined,
13242 rangeMax=Undefined, rangeMin=Undefined, reverse=Undefined, round=Undefined,
13243 scheme=Undefined, type=Undefined, zero=Undefined, **kwds):
> 13244 super(Scale, self).__init__(align=align, base=base, bins=bins, clamp=clamp, constant=constant,
13245 domain=domain, domainMax=domainMax, domainMid=domainMid,
13246 domainMin=domainMin, exponent=exponent, interpolate=interpolate,
13247 nice=nice, padding=padding, paddingInner=paddingInner,
13248 paddingOuter=paddingOuter, range=range, rangeMax=rangeMax,
13249 rangeMin=rangeMin, reverse=reverse, round=round, scheme=scheme,
13250 type=type, zero=zero, **kwds)
File /shared-libs/python3.9/py/lib/python3.9/site-packages/altair/utils/schemapi.py:177, in SchemaBase.__init__(self, *args, **kwds)
174 object.__setattr__(self, "_kwds", kwds)
176 if DEBUG_MODE and self._class_is_valid_at_instantiation:
--> 177 self.to_dict(validate=True)
File /shared-libs/python3.9/py/lib/python3.9/site-packages/altair/utils/schemapi.py:338, in SchemaBase.to_dict(self, validate, ignore, context)
336 if validate:
337 try:
--> 338 self.validate(result)
339 except jsonschema.ValidationError as err:
340 raise SchemaValidationError(self, err)
File /shared-libs/python3.9/py/lib/python3.9/site-packages/altair/utils/schemapi.py:443, in SchemaBase.validate(cls, instance, schema)
441 schema = cls._schema
442 resolver = jsonschema.RefResolver.from_schema(cls._rootschema or cls._schema)
--> 443 return jsonschema.validate(
444 instance, schema, cls=cls._validator, resolver=resolver
445 )
File /shared-libs/python3.9/py-core/lib/python3.9/site-packages/jsonschema/validators.py:932, in validate(instance, schema, cls, *args, **kwargs)
930 cls.check_schema(schema)
931 validator = cls(schema, *args, **kwargs)
--> 932 error = exceptions.best_match(validator.iter_errors(instance))
933 if error is not None:
934 raise error
File /shared-libs/python3.9/py-core/lib/python3.9/site-packages/jsonschema/exceptions.py:367, in best_match(errors, key)
329 """
330 Try to find an error that appears to be the best match among given errors.
331
(...)
364 set of inputs from version to version if better heuristics are added.
365 """
366 errors = iter(errors)
--> 367 best = next(errors, None)
368 if best is None:
369 return
File /shared-libs/python3.9/py-core/lib/python3.9/site-packages/jsonschema/validators.py:328, in create.<locals>.Validator.iter_errors(self, instance, _schema)
325 continue
327 errors = validator(self, v, instance, _schema) or ()
--> 328 for error in errors:
329 # set details if not already set by the called fn
330 error._set(
331 validator=k,
332 validator_value=v,
333 instance=instance,
334 schema=_schema,
335 )
336 if k != u"$ref":
File /shared-libs/python3.9/py-core/lib/python3.9/site-packages/jsonschema/_validators.py:263, in ref(validator, ref, instance, schema)
260 validator.resolver.push_scope(scope)
262 try:
--> 263 for error in validator.descend(instance, resolved):
264 yield error
265 finally:
File /shared-libs/python3.9/py-core/lib/python3.9/site-packages/jsonschema/validators.py:344, in create.<locals>.Validator.descend(self, instance, schema, path, schema_path)
343 def descend(self, instance, schema, path=None, schema_path=None):
--> 344 for error in self.iter_errors(instance, schema):
345 if path is not None:
346 error.path.appendleft(path)
File /shared-libs/python3.9/py-core/lib/python3.9/site-packages/jsonschema/validators.py:328, in create.<locals>.Validator.iter_errors(self, instance, _schema)
325 continue
327 errors = validator(self, v, instance, _schema) or ()
--> 328 for error in errors:
329 # set details if not already set by the called fn
330 error._set(
331 validator=k,
332 validator_value=v,
333 instance=instance,
334 schema=_schema,
335 )
336 if k != u"$ref":
File /shared-libs/python3.9/py-core/lib/python3.9/site-packages/jsonschema/_validators.py:282, in properties(validator, properties, instance, schema)
280 for property, subschema in iteritems(properties):
281 if property in instance:
--> 282 for error in validator.descend(
283 instance[property],
284 subschema,
285 path=property,
286 schema_path=property,
287 ):
288 yield error
File /shared-libs/python3.9/py-core/lib/python3.9/site-packages/jsonschema/validators.py:344, in create.<locals>.Validator.descend(self, instance, schema, path, schema_path)
343 def descend(self, instance, schema, path=None, schema_path=None):
--> 344 for error in self.iter_errors(instance, schema):
345 if path is not None:
346 error.path.appendleft(path)
File /shared-libs/python3.9/py-core/lib/python3.9/site-packages/jsonschema/validators.py:328, in create.<locals>.Validator.iter_errors(self, instance, _schema)
325 continue
327 errors = validator(self, v, instance, _schema) or ()
--> 328 for error in errors:
329 # set details if not already set by the called fn
330 error._set(
331 validator=k,
332 validator_value=v,
333 instance=instance,
334 schema=_schema,
335 )
336 if k != u"$ref":
File /shared-libs/python3.9/py-core/lib/python3.9/site-packages/jsonschema/_validators.py:322, in anyOf(validator, anyOf, instance, schema)
320 all_errors = []
321 for index, subschema in enumerate(anyOf):
--> 322 errs = list(validator.descend(instance, subschema, schema_path=index))
323 if not errs:
324 break
File /shared-libs/python3.9/py-core/lib/python3.9/site-packages/jsonschema/validators.py:344, in create.<locals>.Validator.descend(self, instance, schema, path, schema_path)
343 def descend(self, instance, schema, path=None, schema_path=None):
--> 344 for error in self.iter_errors(instance, schema):
345 if path is not None:
346 error.path.appendleft(path)
File /shared-libs/python3.9/py-core/lib/python3.9/site-packages/jsonschema/validators.py:328, in create.<locals>.Validator.iter_errors(self, instance, _schema)
325 continue
327 errors = validator(self, v, instance, _schema) or ()
--> 328 for error in errors:
329 # set details if not already set by the called fn
330 error._set(
331 validator=k,
332 validator_value=v,
333 instance=instance,
334 schema=_schema,
335 )
336 if k != u"$ref":
File /shared-libs/python3.9/py-core/lib/python3.9/site-packages/jsonschema/_validators.py:106, in const(validator, const, instance, schema)
105 def const(validator, const, instance, schema):
--> 106 if not equal(instance, const):
107 yield ValidationError("%r was expected" % (const,))
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
my_list = list(my_index)
my_list
['Chinstrap', 'Gentoo', 'Adelie']
alt.Chart(df).mark_bar().encode(
x=alt.X("species", scale=alt.Scale(domain=my_list)),
y="median(bill_length_mm)",
tooltip=["median(bill_length_mm)"]
)