Week 3 Videos

  • Which species in the penguins dataset has the longest median bill length?

import pandas as pd
import altair as alt
import seaborn as sns
# Load the penguins example dataset through seaborn; every cell below reads this DataFrame as `df`.
df = sns.load_dataset("penguins")

Median length using groupby

# Peek at the first three rows to see which columns are available.
df.head(n=3)
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 Male
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 Female
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 Female
# Grouping alone only builds a GroupBy object; an aggregation still has to be
# applied to it before any values come back.
df.groupby(by="species")
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f813784ae20>
# Per-species median of every numeric column.  numeric_only=True is spelled
# out because newer pandas releases no longer drop the string columns
# (island, sex) implicitly and raise a TypeError instead.
df.groupby("species").median(numeric_only=True)
bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
species
Adelie 38.80 18.40 190.0 3700.0
Chinstrap 49.55 18.45 196.0 3700.0
Gentoo 47.30 15.00 216.0 5000.0
# Select the column before aggregating: only bill_length_mm is reduced, and
# the call no longer depends on pandas skipping the non-numeric columns.
df.groupby("species")["bill_length_mm"].median()
species
Adelie       38.80
Chinstrap    49.55
Gentoo       47.30
Name: bill_length_mm, dtype: float64
# Median bill length per species, longest first.  The column is selected
# before aggregating so the string columns never reach median().
df.groupby("species")["bill_length_mm"].median().sort_values(ascending=False)
species
Chinstrap    49.55
Gentoo       47.30
Adelie       38.80
Name: bill_length_mm, dtype: float64
# The index of the sorted Series holds the species names, longest median
# first.  The column is selected before aggregating so the string columns
# never reach median().
df.groupby("species")["bill_length_mm"].median().sort_values(ascending=False).index
Index(['Chinstrap', 'Gentoo', 'Adelie'], dtype='object', name='species')
# First entry of the descending-sorted index = species with the longest
# median bill.  The column is selected before aggregating so the string
# columns never reach median().
df.groupby("species")["bill_length_mm"].median().sort_values(ascending=False).index[0]
'Chinstrap'
# Back to the unsorted per-species medians.  The column is selected before
# aggregating so the string columns never reach median().
df.groupby("species")["bill_length_mm"].median()
species
Adelie       38.80
Chinstrap    49.55
Gentoo       47.30
Name: bill_length_mm, dtype: float64
# idxmax returns the index label (the species) of the largest median
# directly, with no sorting needed.  The column is selected before
# aggregating so the string columns never reach median().
df.groupby("species")["bill_length_mm"].median().idxmax()
'Chinstrap'

Bar charts in Altair

# Bar chart straight from the raw rows: species on x, bill_length_mm on y,
# with no aggregation requested for the y channel.
alt.Chart(df).mark_bar().encode(
    alt.X("species"),
    alt.Y("bill_length_mm"),
)
# Draw three random rows (a different selection on each run).
df.sample(n=3)
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
302 Gentoo Biscoe 47.4 14.6 212.0 4725.0 Female
319 Gentoo Biscoe 51.1 16.5 225.0 5250.0 Male
24 Adelie Biscoe 38.8 17.2 180.0 3800.0 Male
df.groupby("species").max()  # raises ValueError on this pandas version (traceback below); max(numeric_only=True) avoids it
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In [32], line 1
----> 1 df.groupby("species").max()

File /shared-libs/python3.9/py/lib/python3.9/site-packages/pandas/core/groupby/groupby.py:1676, in GroupBy.max(self, numeric_only, min_count)
   1673 @final
   1674 @doc(_groupby_agg_method_template, fname="max", no=False, mc=-1)
   1675 def max(self, numeric_only: bool = False, min_count: int = -1):
-> 1676     return self._agg_general(
   1677         numeric_only=numeric_only, min_count=min_count, alias="max", npfunc=np.max
   1678     )

File /shared-libs/python3.9/py/lib/python3.9/site-packages/pandas/core/groupby/groupby.py:1024, in BaseGroupBy._agg_general(self, numeric_only, min_count, alias, npfunc)
   1022 result = None
   1023 try:
-> 1024     result = self._cython_agg_general(
   1025         how=alias,
   1026         alt=npfunc,
   1027         numeric_only=numeric_only,
   1028         min_count=min_count,
   1029     )
   1030 except DataError:
   1031     pass

File /shared-libs/python3.9/py/lib/python3.9/site-packages/pandas/core/groupby/generic.py:1015, in DataFrameGroupBy._cython_agg_general(self, how, alt, numeric_only, min_count)
   1012 def _cython_agg_general(
   1013     self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1
   1014 ) -> DataFrame:
-> 1015     agg_mgr = self._cython_agg_blocks(
   1016         how, alt=alt, numeric_only=numeric_only, min_count=min_count
   1017     )
   1018     return self._wrap_agged_blocks(agg_mgr.blocks, items=agg_mgr.items)

File /shared-libs/python3.9/py/lib/python3.9/site-packages/pandas/core/groupby/generic.py:1118, in DataFrameGroupBy._cython_agg_blocks(self, how, alt, numeric_only, min_count)
   1113     return cast_agg_result(result, bvalues, how)
   1115 # TypeError -> we may have an exception in trying to aggregate
   1116 #  continue and exclude the block
   1117 # NotImplementedError -> "ohlc" with wrong dtype
-> 1118 new_mgr = data.apply(blk_func, ignore_failures=True)
   1120 if not len(new_mgr):
   1121     raise DataError("No numeric types to aggregate")

File /shared-libs/python3.9/py/lib/python3.9/site-packages/pandas/core/internals/managers.py:425, in BlockManager.apply(self, f, align_keys, ignore_failures, **kwargs)
    423 try:
    424     if callable(f):
--> 425         applied = b.apply(f, **kwargs)
    426     else:
    427         applied = getattr(b, f)(**kwargs)

File /shared-libs/python3.9/py/lib/python3.9/site-packages/pandas/core/internals/blocks.py:380, in Block.apply(self, func, **kwargs)
    377 with np.errstate(all="ignore"):
    378     result = func(self.values, **kwargs)
--> 380 return self._split_op_result(result)

File /shared-libs/python3.9/py/lib/python3.9/site-packages/pandas/core/internals/blocks.py:416, in Block._split_op_result(self, result)
    413     return nbs
    415 if not isinstance(result, Block):
--> 416     result = self.make_block(result)
    418 return [result]

File /shared-libs/python3.9/py/lib/python3.9/site-packages/pandas/core/internals/blocks.py:286, in Block.make_block(self, values, placement)
    283 if self.is_extension:
    284     values = _block_shape(values, ndim=self.ndim)
--> 286 return make_block(values, placement=placement, ndim=self.ndim)

File /shared-libs/python3.9/py/lib/python3.9/site-packages/pandas/core/internals/blocks.py:2751, in make_block(values, placement, klass, ndim, dtype)
   2746 elif klass is DatetimeTZBlock and not is_datetime64tz_dtype(values.dtype):
   2747     # TODO: This is no longer hit internally; does it need to be retained
   2748     #  for e.g. pyarrow?
   2749     values = DatetimeArray._simple_new(values, dtype=dtype)
-> 2751 return klass(values, ndim=ndim, placement=placement)

File /shared-libs/python3.9/py/lib/python3.9/site-packages/pandas/core/internals/blocks.py:142, in Block.__init__(self, values, placement, ndim)
    139 self.values = self._maybe_coerce_values(values)
    141 if self._validate_ndim and self.ndim and len(self.mgr_locs) != len(self.values):
--> 142     raise ValueError(
    143         f"Wrong number of items passed {len(self.values)}, "
    144         f"placement implies {len(self.mgr_locs)}"
    145     )

ValueError: Wrong number of items passed 1, placement implies 2
# Per-species maximum, restricted to the numeric columns.
(
    df
    .groupby("species")
    .max(numeric_only=True)
)
bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
species
Adelie 46.0 21.5 210.0 4775.0
Chinstrap 58.0 20.8 212.0 4800.0
Gentoo 59.6 17.3 231.0 6300.0
# The same x / y / color encodings drawn twice: first with bar marks (plus a
# bill-length tooltip), then with circle marks.
alt.Chart(df).mark_bar().encode(
    alt.X("bill_depth_mm"),
    alt.Y("bill_length_mm"),
    alt.Color("species"),
    tooltip=["bill_length_mm"],
)
alt.Chart(df).mark_circle().encode(
    alt.X("bill_depth_mm"),
    alt.Y("bill_length_mm"),
    alt.Color("species"),
)

Median values using Altair

# 1) Unaggregated bars, for comparison with the aggregated versions below.
alt.Chart(df).mark_bar().encode(
    alt.X("species"),
    alt.Y("bill_length_mm"),
)
# 2) Let Altair compute the median per species; the tooltip shows the value.
alt.Chart(df).mark_bar().encode(
    alt.X("species"),
    alt.Y("median(bill_length_mm)"),
    tooltip=["median(bill_length_mm)"],
)
# 3) Same chart, with the x categories ordered by the y encoding (ascending).
alt.Chart(df).mark_bar().encode(
    alt.X("species", sort="y"),
    alt.Y("median(bill_length_mm)"),
    tooltip=["median(bill_length_mm)"],
)
# 4) Same chart, ordered by the y encoding in descending order.
alt.Chart(df).mark_bar().encode(
    alt.X("species", sort="-y"),
    alt.Y("median(bill_length_mm)"),
    tooltip=["median(bill_length_mm)"],
)
# Species ordered from longest to shortest median bill length.  The column is
# selected before aggregating so only bill_length_mm is reduced and the string
# columns (island, sex) never reach median().
my_index = (
    df.groupby("species")["bill_length_mm"]
    .median()
    .sort_values(ascending=False)
    .index
)
my_index
Index(['Chinstrap', 'Gentoo', 'Adelie'], dtype='object', name='species')
alt.Chart(df).mark_bar().encode(
    x=alt.X("species", scale=alt.Scale(domain=my_index)),  # a pandas Index as the domain fails schema validation here (traceback below)
    y="median(bill_length_mm)",
    tooltip=["median(bill_length_mm)"]
)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In [44], line 2
      1 alt.Chart(df).mark_bar().encode(
----> 2     x=alt.X("species", scale=alt.Scale(domain=my_index)),
      3     y="median(bill_length_mm)",
      4     tooltip=["median(bill_length_mm)"]
      5 )

File /shared-libs/python3.9/py/lib/python3.9/site-packages/altair/vegalite/v4/schema/core.py:13244, in Scale.__init__(self, align, base, bins, clamp, constant, domain, domainMax, domainMid, domainMin, exponent, interpolate, nice, padding, paddingInner, paddingOuter, range, rangeMax, rangeMin, reverse, round, scheme, type, zero, **kwds)
  13238 def __init__(self, align=Undefined, base=Undefined, bins=Undefined, clamp=Undefined,
  13239              constant=Undefined, domain=Undefined, domainMax=Undefined, domainMid=Undefined,
  13240              domainMin=Undefined, exponent=Undefined, interpolate=Undefined, nice=Undefined,
  13241              padding=Undefined, paddingInner=Undefined, paddingOuter=Undefined, range=Undefined,
  13242              rangeMax=Undefined, rangeMin=Undefined, reverse=Undefined, round=Undefined,
  13243              scheme=Undefined, type=Undefined, zero=Undefined, **kwds):
> 13244     super(Scale, self).__init__(align=align, base=base, bins=bins, clamp=clamp, constant=constant,
  13245                                 domain=domain, domainMax=domainMax, domainMid=domainMid,
  13246                                 domainMin=domainMin, exponent=exponent, interpolate=interpolate,
  13247                                 nice=nice, padding=padding, paddingInner=paddingInner,
  13248                                 paddingOuter=paddingOuter, range=range, rangeMax=rangeMax,
  13249                                 rangeMin=rangeMin, reverse=reverse, round=round, scheme=scheme,
  13250                                 type=type, zero=zero, **kwds)

File /shared-libs/python3.9/py/lib/python3.9/site-packages/altair/utils/schemapi.py:177, in SchemaBase.__init__(self, *args, **kwds)
    174 object.__setattr__(self, "_kwds", kwds)
    176 if DEBUG_MODE and self._class_is_valid_at_instantiation:
--> 177     self.to_dict(validate=True)

File /shared-libs/python3.9/py/lib/python3.9/site-packages/altair/utils/schemapi.py:338, in SchemaBase.to_dict(self, validate, ignore, context)
    336 if validate:
    337     try:
--> 338         self.validate(result)
    339     except jsonschema.ValidationError as err:
    340         raise SchemaValidationError(self, err)

File /shared-libs/python3.9/py/lib/python3.9/site-packages/altair/utils/schemapi.py:443, in SchemaBase.validate(cls, instance, schema)
    441     schema = cls._schema
    442 resolver = jsonschema.RefResolver.from_schema(cls._rootschema or cls._schema)
--> 443 return jsonschema.validate(
    444     instance, schema, cls=cls._validator, resolver=resolver
    445 )

File /shared-libs/python3.9/py-core/lib/python3.9/site-packages/jsonschema/validators.py:932, in validate(instance, schema, cls, *args, **kwargs)
    930 cls.check_schema(schema)
    931 validator = cls(schema, *args, **kwargs)
--> 932 error = exceptions.best_match(validator.iter_errors(instance))
    933 if error is not None:
    934     raise error

File /shared-libs/python3.9/py-core/lib/python3.9/site-packages/jsonschema/exceptions.py:367, in best_match(errors, key)
    329 """
    330 Try to find an error that appears to be the best match among given errors.
    331 
   (...)
    364     set of inputs from version to version if better heuristics are added.
    365 """
    366 errors = iter(errors)
--> 367 best = next(errors, None)
    368 if best is None:
    369     return

File /shared-libs/python3.9/py-core/lib/python3.9/site-packages/jsonschema/validators.py:328, in create.<locals>.Validator.iter_errors(self, instance, _schema)
    325     continue
    327 errors = validator(self, v, instance, _schema) or ()
--> 328 for error in errors:
    329     # set details if not already set by the called fn
    330     error._set(
    331         validator=k,
    332         validator_value=v,
    333         instance=instance,
    334         schema=_schema,
    335     )
    336     if k != u"$ref":

File /shared-libs/python3.9/py-core/lib/python3.9/site-packages/jsonschema/_validators.py:263, in ref(validator, ref, instance, schema)
    260 validator.resolver.push_scope(scope)
    262 try:
--> 263     for error in validator.descend(instance, resolved):
    264         yield error
    265 finally:

File /shared-libs/python3.9/py-core/lib/python3.9/site-packages/jsonschema/validators.py:344, in create.<locals>.Validator.descend(self, instance, schema, path, schema_path)
    343 def descend(self, instance, schema, path=None, schema_path=None):
--> 344     for error in self.iter_errors(instance, schema):
    345         if path is not None:
    346             error.path.appendleft(path)

File /shared-libs/python3.9/py-core/lib/python3.9/site-packages/jsonschema/validators.py:328, in create.<locals>.Validator.iter_errors(self, instance, _schema)
    325     continue
    327 errors = validator(self, v, instance, _schema) or ()
--> 328 for error in errors:
    329     # set details if not already set by the called fn
    330     error._set(
    331         validator=k,
    332         validator_value=v,
    333         instance=instance,
    334         schema=_schema,
    335     )
    336     if k != u"$ref":

File /shared-libs/python3.9/py-core/lib/python3.9/site-packages/jsonschema/_validators.py:282, in properties(validator, properties, instance, schema)
    280 for property, subschema in iteritems(properties):
    281     if property in instance:
--> 282         for error in validator.descend(
    283             instance[property],
    284             subschema,
    285             path=property,
    286             schema_path=property,
    287         ):
    288             yield error

File /shared-libs/python3.9/py-core/lib/python3.9/site-packages/jsonschema/validators.py:344, in create.<locals>.Validator.descend(self, instance, schema, path, schema_path)
    343 def descend(self, instance, schema, path=None, schema_path=None):
--> 344     for error in self.iter_errors(instance, schema):
    345         if path is not None:
    346             error.path.appendleft(path)

File /shared-libs/python3.9/py-core/lib/python3.9/site-packages/jsonschema/validators.py:328, in create.<locals>.Validator.iter_errors(self, instance, _schema)
    325     continue
    327 errors = validator(self, v, instance, _schema) or ()
--> 328 for error in errors:
    329     # set details if not already set by the called fn
    330     error._set(
    331         validator=k,
    332         validator_value=v,
    333         instance=instance,
    334         schema=_schema,
    335     )
    336     if k != u"$ref":

File /shared-libs/python3.9/py-core/lib/python3.9/site-packages/jsonschema/_validators.py:322, in anyOf(validator, anyOf, instance, schema)
    320 all_errors = []
    321 for index, subschema in enumerate(anyOf):
--> 322     errs = list(validator.descend(instance, subschema, schema_path=index))
    323     if not errs:
    324         break

File /shared-libs/python3.9/py-core/lib/python3.9/site-packages/jsonschema/validators.py:344, in create.<locals>.Validator.descend(self, instance, schema, path, schema_path)
    343 def descend(self, instance, schema, path=None, schema_path=None):
--> 344     for error in self.iter_errors(instance, schema):
    345         if path is not None:
    346             error.path.appendleft(path)

File /shared-libs/python3.9/py-core/lib/python3.9/site-packages/jsonschema/validators.py:328, in create.<locals>.Validator.iter_errors(self, instance, _schema)
    325     continue
    327 errors = validator(self, v, instance, _schema) or ()
--> 328 for error in errors:
    329     # set details if not already set by the called fn
    330     error._set(
    331         validator=k,
    332         validator_value=v,
    333         instance=instance,
    334         schema=_schema,
    335     )
    336     if k != u"$ref":

File /shared-libs/python3.9/py-core/lib/python3.9/site-packages/jsonschema/_validators.py:106, in const(validator, const, instance, schema)
    105 def const(validator, const, instance, schema):
--> 106     if not equal(instance, const):
    107         yield ValidationError("%r was expected" % (const,))

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
# Turn the pandas Index into a plain Python list of species names.
my_list = my_index.tolist()
my_list
['Chinstrap', 'Gentoo', 'Adelie']
# Median bill length per species, with the x-axis order fixed explicitly by
# passing the species list as the scale domain.
alt.Chart(df).mark_bar().encode(
    alt.X("species", scale=alt.Scale(domain=my_list)),
    alt.Y("median(bill_length_mm)"),
    tooltip=["median(bill_length_mm)"],
)