Midterm Review
import numpy as np
import pandas as pd
import altair as alt
from sklearn.linear_model import LinearRegression
# Create a NumPy random Generator. No seed is passed, so the values differ on every run.
rng = np.random.default_rng()
# Random integers from 0 up to (but not including) 10.  size=(1,5) gives a 2-D array
# with 1 row and 5 columns -- note the double brackets in the output.
A = rng.integers(0,10,size=(1,5))
A
array([[1, 6, 4, 5, 4]])
# (5) is just the integer 5 -- parentheses alone do not make a tuple -- so this
# produces a 1-D array of length 5 (single brackets in the output).
A = rng.integers(0,10,size=(5))
A
array([3, 3, 8, 6, 5])
# size=(2,5): a 2-D array with 2 rows and 5 columns.
A = rng.integers(0,10,size=(2,5))
A
array([[5, 9, 6, 0, 9],
[4, 2, 1, 2, 8]])
# 5 rows and 10 columns of random integers between 0 and 9.
A = rng.integers(0,10,size=(5,10))
# Question: how many columns of A contain at least one 6?
# Step 1: the elementwise comparison gives a Boolean array with the same shape as A.
B = (A == 6)
type(B)
numpy.ndarray
# Summing along axis=0 collapses the 5 rows, leaving one total per column (10 values).
B.sum(axis=0).shape
(10,)
A
array([[8, 8, 8, 6, 5, 1, 3, 6, 1, 7],
[8, 8, 9, 5, 1, 7, 5, 6, 4, 7],
[2, 7, 1, 9, 5, 9, 8, 8, 6, 2],
[8, 2, 4, 1, 0, 1, 9, 0, 5, 7],
[9, 5, 0, 2, 9, 4, 8, 7, 9, 9]])
B = (A == 6)
# Row 2, columns 0 through 3 (the stop index 4 is excluded).
B[2,0:4]
array([False, False, False, False])
# Row 2, every second column starting at column 1: columns 1, 3, 5, 7, 9.
B[2,1::2]
array([False, False, False, False, False])
# True counts as 1 when summed, so C[j] is the number of 6s in column j of A.
C = (B.sum(axis=0))
C
array([0, 0, 0, 1, 0, 0, 0, 2, 1, 0])
# A column contains a 6 exactly when its count is positive; count how many such columns there are.
(C > 0).sum()
3
A
array([[8, 8, 8, 6, 5, 1, 3, 6, 1, 7],
[8, 8, 9, 5, 1, 7, 5, 6, 4, 7],
[2, 7, 1, 9, 5, 9, 8, 8, 6, 2],
[8, 2, 4, 1, 0, 1, 9, 0, 5, 7],
[9, 5, 0, 2, 9, 4, 8, 7, 9, 9]])
# Wrap the array in a DataFrame.  No labels are given, so rows and columns are numbered from 0.
df = pd.DataFrame(A)
df
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 8 | 8 | 8 | 6 | 5 | 1 | 3 | 6 | 1 | 7 |
1 | 8 | 8 | 9 | 5 | 1 | 7 | 5 | 6 | 4 | 7 |
2 | 2 | 7 | 1 | 9 | 5 | 9 | 8 | 8 | 6 | 2 |
3 | 8 | 2 | 4 | 1 | 0 | 1 | 9 | 0 | 5 | 7 |
4 | 9 | 5 | 0 | 2 | 9 | 4 | 8 | 7 | 9 | 9 |
def has6(c):
    """Return True if the number 6 is one of the values in ``c``.

    Parameters
    ----------
    c : list, NumPy array, pandas Series, or other container
        The collection to search.

    Returns
    -------
    bool
        True when 6 occurs among the values of ``c``, otherwise False.
    """
    # For a pandas Series, ``6 in c`` tests the index labels rather than the
    # data, so unwrap the Series to its underlying values first.
    if isinstance(c, pd.Series):
        c = c.to_numpy()
    b = (6 in c)
    return b
def has6(c):
    """Report whether 6 occurs among the items obtained by iterating over ``c``.

    ``c`` is first copied into a plain list so that the membership test looks
    at the items themselves (for a pandas Series: its values, not its index
    labels).
    """
    items = [*c]
    return 6 in items
has6([2,5,6,3])
True
df
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 8 | 8 | 8 | 6 | 5 | 1 | 3 | 6 | 1 | 7 |
1 | 8 | 8 | 9 | 5 | 1 | 7 | 5 | 6 | 4 | 7 |
2 | 2 | 7 | 1 | 9 | 5 | 9 | 8 | 8 | 6 | 2 |
3 | 8 | 2 | 4 | 1 | 0 | 1 | 9 | 0 | 5 | 7 |
4 | 9 | 5 | 0 | 2 | 9 | 4 | 8 | 7 | 9 | 9 |
# axis=0 passes each column of df to has6 as a Series; the result is indexed by column label.
df.apply(has6,axis=0)
0 False
1 False
2 False
3 True
4 False
5 False
6 False
7 True
8 True
9 False
dtype: bool
df == 6
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | False | False | False | True | False | False | False | True | False | False |
1 | False | False | False | False | False | False | False | True | False | False |
2 | False | False | False | False | False | False | False | False | True | False |
3 | False | False | False | False | False | False | False | False | False | False |
4 | False | False | False | False | False | False | False | False | False | False |
# any(axis=0) reduces over the rows: True for each column that contains at least one 6.
(df == 6).any(axis=0)
0 False
1 False
2 False
3 True
4 False
5 False
6 False
7 True
8 True
9 False
dtype: bool
# any(axis=1) reduces over the columns: True for each row that contains at least one 6.
(df == 6).any(axis=1)
0 True
1 True
2 True
3 False
4 False
dtype: bool
df
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 8 | 8 | 8 | 6 | 5 | 1 | 3 | 6 | 1 | 7 |
1 | 8 | 8 | 9 | 5 | 1 | 7 | 5 | 6 | 4 | 7 |
2 | 2 | 7 | 1 | 9 | 5 | 9 | 8 | 8 | 6 | 2 |
3 | 8 | 2 | 4 | 1 | 0 | 1 | 9 | 0 | 5 | 7 |
4 | 9 | 5 | 0 | 2 | 9 | 4 | 8 | 7 | 9 | 9 |
help(df.apply)
Help on method apply in module pandas.core.frame:
apply(func: 'AggFuncType', axis: 'Axis' = 0, raw: 'bool' = False, result_type=None, args=(), **kwargs) method of pandas.core.frame.DataFrame instance
Apply a function along an axis of the DataFrame.
Objects passed to the function are Series objects whose index is
either the DataFrame's index (``axis=0``) or the DataFrame's columns
(``axis=1``). By default (``result_type=None``), the final return type
is inferred from the return type of the applied function. Otherwise,
it depends on the `result_type` argument.
Parameters
----------
func : function
Function to apply to each column or row.
axis : {0 or 'index', 1 or 'columns'}, default 0
Axis along which the function is applied:
* 0 or 'index': apply function to each column.
* 1 or 'columns': apply function to each row.
raw : bool, default False
Determines if row or column is passed as a Series or ndarray object:
* ``False`` : passes each row or column as a Series to the
function.
* ``True`` : the passed function will receive ndarray objects
instead.
If you are just applying a NumPy reduction function this will
achieve much better performance.
result_type : {'expand', 'reduce', 'broadcast', None}, default None
These only act when ``axis=1`` (columns):
* 'expand' : list-like results will be turned into columns.
* 'reduce' : returns a Series if possible rather than expanding
list-like results. This is the opposite of 'expand'.
* 'broadcast' : results will be broadcast to the original shape
of the DataFrame, the original index and columns will be
retained.
The default behaviour (None) depends on the return value of the
applied function: list-like results will be returned as a Series
of those. However if the apply function returns a Series these
are expanded to columns.
args : tuple
Positional arguments to pass to `func` in addition to the
array/series.
**kwargs
Additional keyword arguments to pass as keywords arguments to
`func`.
Returns
-------
Series or DataFrame
Result of applying ``func`` along the given axis of the
DataFrame.
See Also
--------
DataFrame.applymap: For elementwise operations.
DataFrame.aggregate: Only perform aggregating type operations.
DataFrame.transform: Only perform transforming type operations.
Notes
-----
Functions that mutate the passed object can produce unexpected
behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
for more details.
Examples
--------
>>> df = pd.DataFrame([[4, 9]] * 3, columns=['A', 'B'])
>>> df
A B
0 4 9
1 4 9
2 4 9
Using a numpy universal function (in this case the same as
``np.sqrt(df)``):
>>> df.apply(np.sqrt)
A B
0 2.0 3.0
1 2.0 3.0
2 2.0 3.0
Using a reducing function on either axis
>>> df.apply(np.sum, axis=0)
A 12
B 27
dtype: int64
>>> df.apply(np.sum, axis=1)
0 13
1 13
2 13
dtype: int64
Returning a list-like will result in a Series
>>> df.apply(lambda x: [1, 2], axis=1)
0 [1, 2]
1 [1, 2]
2 [1, 2]
dtype: object
Passing ``result_type='expand'`` will expand list-like results
to columns of a Dataframe
>>> df.apply(lambda x: [1, 2], axis=1, result_type='expand')
0 1
0 1 2
1 1 2
2 1 2
Returning a Series inside the function is similar to passing
``result_type='expand'``. The resulting column names
will be the Series index.
>>> df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1)
foo bar
0 1 2
1 1 2
2 1 2
Passing ``result_type='broadcast'`` will ensure the same shape
result, whether list-like or scalar is returned by the function,
and broadcast it along the axis. The resulting column names will
be the originals.
>>> df.apply(lambda x: [1, 2], axis=1, result_type='broadcast')
A B
0 1 2
1 1 2
2 1 2
# axis=1 passes each row as a Series indexed by the column labels, so row[2] and
# row[3] are the entries in the columns labeled 2 and 3.
df.apply(lambda row: row[2]-row[3], axis=1)
0 2
1 4
2 -8
3 3
4 -2
dtype: int64
df
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 8 | 8 | 8 | 6 | 5 | 1 | 3 | 6 | 1 | 7 |
1 | 8 | 8 | 9 | 5 | 1 | 7 | 5 | 6 | 4 | 7 |
2 | 2 | 7 | 1 | 9 | 5 | 9 | 8 | 8 | 6 | 2 |
3 | 8 | 2 | 4 | 1 | 0 | 1 | 9 | 0 | 5 | 7 |
4 | 9 | 5 | 0 | 2 | 9 | 4 | 8 | 7 | 9 | 9 |
df.max(axis=0)
0 9
1 8
2 9
3 9
4 9
5 9
6 9
7 8
8 9
9 9
dtype: int64
# Same result as df.max(axis=0): Python's built-in max is called on each column.
df.apply(max, axis=0)
0 9
1 8
2 9
3 9
4 9
5 9
6 9
7 8
8 9
9 9
dtype: int64
# With axis=0 each column is passed, indexed by the row labels, so this computes
# (entry in row 2) - (entry in row 3) for every column.
df.apply(lambda col: col[2]-col[3], axis=0)
0 -6
1 5
2 -3
3 8
4 5
5 8
6 -1
7 8
8 1
9 -5
dtype: int64
# len of each row: every row has 10 entries, one per column.
df.apply(len, axis=1)
0 10
1 10
2 10
3 10
4 10
dtype: int64
# applymap applies the function to every individual entry of the DataFrame.
# NOTE(review): pandas 2.1+ deprecates DataFrame.applymap in favor of DataFrame.map --
# confirm against the installed pandas version.
df.applymap(lambda x: 0 if x < 5 else 1000)
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1000 | 1000 | 1000 | 1000 | 1000 | 0 | 0 | 1000 | 0 | 1000 |
1 | 1000 | 1000 | 1000 | 1000 | 0 | 1000 | 1000 | 1000 | 0 | 1000 |
2 | 0 | 1000 | 0 | 1000 | 1000 | 1000 | 1000 | 1000 | 1000 | 0 |
3 | 1000 | 0 | 0 | 0 | 0 | 0 | 1000 | 0 | 1000 | 1000 |
4 | 1000 | 1000 | 0 | 0 | 1000 | 0 | 1000 | 1000 | 1000 | 1000 |
df.applymap(lambda x: x+10)
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 18 | 18 | 18 | 16 | 15 | 11 | 13 | 16 | 11 | 17 |
1 | 18 | 18 | 19 | 15 | 11 | 17 | 15 | 16 | 14 | 17 |
2 | 12 | 17 | 11 | 19 | 15 | 19 | 18 | 18 | 16 | 12 |
3 | 18 | 12 | 14 | 11 | 10 | 11 | 19 | 10 | 15 | 17 |
4 | 19 | 15 | 10 | 12 | 19 | 14 | 18 | 17 | 19 | 19 |
# Building a Series from a dict: the keys become the index and the values become the data.
s = pd.Series({"a":10,"b":10,"c":3})
s
a 10
b 10
c 3
dtype: int64
t = pd.Series({"a":1,"b":1,"c":2})
t
a 1
b 1
c 2
dtype: int64
s.max()
10
# Deliberate error: unlike a Series, a plain dict has no max method (AttributeError below).
{"a":10,"b":10,"c":3}.max()
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_83319/1825081433.py in <module>
----> 1 {"a":10,"b":10,"c":3}.max()
AttributeError: 'dict' object has no attribute 'max'
# Series.map applies the function to each value of the Series.
s.map(lambda x: x+3)
a 13
b 13
c 6
dtype: int64
# In the pandas version used here, DataFrame has no map method, so this raises
# AttributeError; the elementwise method for a DataFrame is applymap.
# NOTE(review): pandas 2.1+ adds DataFrame.map, so this call succeeds there -- confirm
# against the installed pandas version.
df.map(lambda x: x+3)
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_83319/3670777216.py in <module>
----> 1 df.map(lambda x: x+3)
~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/core/generic.py in __getattr__(self, name)
5485 ):
5486 return self[name]
-> 5487 return object.__getattribute__(self, name)
5488
5489 def __setattr__(self, name: str, value) -> None:
AttributeError: 'DataFrame' object has no attribute 'map'
# Conversely, a Series has no applymap method (AttributeError below); use s.map for a Series.
s.applymap(lambda x: x+3)
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_83319/2831778842.py in <module>
----> 1 s.applymap(lambda x: x+3)
~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/core/generic.py in __getattr__(self, name)
5485 ):
5486 return self[name]
-> 5487 return object.__getattribute__(self, name)
5488
5489 def __setattr__(self, name: str, value) -> None:
AttributeError: 'Series' object has no attribute 'applymap'