pandas examples

import pandas as pd
import numpy as np
rng = np.random.default_rng()

Making a DataFrame

Topics:

  • Constructing a DataFrame from a dictionary;

  • from a NumPy array.

  • read_csv

  • Naming the rows and columns.

  • An empty DataFrame.

  • dtypes.

d = {"A": [3,1,4,1], "C": [2,8,7,1]}
d["A"]
[3, 1, 4, 1]
pd.DataFrame(d)
A C
0 3 2
1 1 8
2 4 7
3 1 1

Can you tell what is causing this error?

d2 = {"A": [3,1,4,1], "C": [2,8,7,1], "G": "chris", 7: [3,3,4]}
pd.DataFrame(d2)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_83273/3680709340.py in <module>
      1 d2 = {"A": [3,1,4,1], "C": [2,8,7,1], "G": "chris", 7: [3,3,4]}
----> 2 pd.DataFrame(d2)

~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)
    612         elif isinstance(data, dict):
    613             # GH#38939 de facto copy defaults to False only in non-dict cases
--> 614             mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
    615         elif isinstance(data, ma.MaskedArray):
    616             import numpy.ma.mrecords as mrecords

~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/core/internals/construction.py in dict_to_mgr(data, index, columns, dtype, typ, copy)
    462         # TODO: can we get rid of the dt64tz special case above?
    463 
--> 464     return arrays_to_mgr(
    465         arrays, data_names, index, columns, dtype=dtype, typ=typ, consolidate=copy
    466     )

~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/core/internals/construction.py in arrays_to_mgr(arrays, arr_names, index, columns, dtype, verify_integrity, typ, consolidate)
    117         # figure out the index, if necessary
    118         if index is None:
--> 119             index = _extract_index(arrays)
    120         else:
    121             index = ensure_index(index)

~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/core/internals/construction.py in _extract_index(data)
    633             lengths = list(set(raw_lengths))
    634             if len(lengths) > 1:
--> 635                 raise ValueError("All arrays must be of the same length")
    636 
    637             if have_dicts:

ValueError: All arrays must be of the same length
d2 = {"A": [3,1,4,1], "C": [2,8,7,1], "G": "chris", 7: [3,3,4,3.1]}
pd.DataFrame(d2)
A C G 7
0 3 2 chris 3.0
1 1 8 chris 3.0
2 4 7 chris 4.0
3 1 1 chris 3.1
A = rng.random(size=(10,4))
A
array([[0.59008082, 0.02860251, 0.92401521, 0.67658537],
       [0.39107826, 0.71027222, 0.66003209, 0.9162059 ],
       [0.36012957, 0.01039311, 0.36182634, 0.34900878],
       [0.18094549, 0.4245525 , 0.26039557, 0.57658844],
       [0.39373595, 0.38649149, 0.92519635, 0.33716329],
       [0.72006158, 0.48791468, 0.39723519, 0.28245805],
       [0.79592556, 0.54922316, 0.07904228, 0.80273698],
       [0.27802416, 0.55483903, 0.74845139, 0.26473047],
       [0.88508451, 0.17906794, 0.70485172, 0.52653599],
       [0.42038575, 0.10287395, 0.69269369, 0.68827117]])
DataFrame(A)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_83273/1874637228.py in <module>
----> 1 DataFrame(A)

NameError: name 'DataFrame' is not defined
pd.DataFrame(A)
0 1 2 3
0 0.590081 0.028603 0.924015 0.676585
1 0.391078 0.710272 0.660032 0.916206
2 0.360130 0.010393 0.361826 0.349009
3 0.180945 0.424553 0.260396 0.576588
4 0.393736 0.386491 0.925196 0.337163
5 0.720062 0.487915 0.397235 0.282458
6 0.795926 0.549223 0.079042 0.802737
7 0.278024 0.554839 0.748451 0.264730
8 0.885085 0.179068 0.704852 0.526536
9 0.420386 0.102874 0.692694 0.688271

The way we will most often get a pandas DataFrame is by reading in data from an external file, usually a csv file. We will do this using pd.read_csv. For now, we are working with these randomly generated DataFrames instead.

A
array([[0.59008082, 0.02860251, 0.92401521, 0.67658537],
       [0.39107826, 0.71027222, 0.66003209, 0.9162059 ],
       [0.36012957, 0.01039311, 0.36182634, 0.34900878],
       [0.18094549, 0.4245525 , 0.26039557, 0.57658844],
       [0.39373595, 0.38649149, 0.92519635, 0.33716329],
       [0.72006158, 0.48791468, 0.39723519, 0.28245805],
       [0.79592556, 0.54922316, 0.07904228, 0.80273698],
       [0.27802416, 0.55483903, 0.74845139, 0.26473047],
       [0.88508451, 0.17906794, 0.70485172, 0.52653599],
       [0.42038575, 0.10287395, 0.69269369, 0.68827117]])
pd.DataFrame(A,columns=list("abcd"),index=range(0,22,2))
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_83273/4048147504.py in <module>
----> 1 pd.DataFrame(A,columns=list("abcd"),index=range(0,22,2))

~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)
    670                 )
    671             else:
--> 672                 mgr = ndarray_to_mgr(
    673                     data,
    674                     index,

~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/core/internals/construction.py in ndarray_to_mgr(values, index, columns, dtype, copy, typ)
    322     )
    323 
--> 324     _check_values_indices_shape_match(values, index, columns)
    325 
    326     if typ == "array":

~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/core/internals/construction.py in _check_values_indices_shape_match(values, index, columns)
    391         passed = values.shape
    392         implied = (len(index), len(columns))
--> 393         raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}")
    394 
    395 

ValueError: Shape of passed values is (10, 4), indices imply (11, 4)
df = pd.DataFrame(A,columns=list("abcd"),index=range(0,20,2))
df
a b c d
0 0.590081 0.028603 0.924015 0.676585
2 0.391078 0.710272 0.660032 0.916206
4 0.360130 0.010393 0.361826 0.349009
6 0.180945 0.424553 0.260396 0.576588
8 0.393736 0.386491 0.925196 0.337163
10 0.720062 0.487915 0.397235 0.282458
12 0.795926 0.549223 0.079042 0.802737
14 0.278024 0.554839 0.748451 0.264730
16 0.885085 0.179068 0.704852 0.526536
18 0.420386 0.102874 0.692694 0.688271

Accessing columns of a DataFrame

Topics:

  • Accessing one column

  • Accessing multiple columns

  • Using loc (by label) and iloc (by position)

  • Making a new column

# Motivation: this means: look for a column named 2
df[2]
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   3360             try:
-> 3361                 return self._engine.get_loc(casted_key)
   3362             except KeyError as err:

~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 2

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_83273/946357898.py in <module>
      1 # Motivation: this means: look for a column named 2
----> 2 df[2]

~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/core/frame.py in __getitem__(self, key)
   3456             if self.columns.nlevels > 1:
   3457                 return self._getitem_multilevel(key)
-> 3458             indexer = self.columns.get_loc(key)
   3459             if is_integer(indexer):
   3460                 indexer = [indexer]

~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   3361                 return self._engine.get_loc(casted_key)
   3362             except KeyError as err:
-> 3363                 raise KeyError(key) from err
   3364 
   3365         if is_scalar(key) and isna(key) and not self.hasnans:

KeyError: 2
my_series = df["c"] # pandas Series
my_series
0     0.924015
2     0.660032
4     0.361826
6     0.260396
8     0.925196
10    0.397235
12    0.079042
14    0.748451
16    0.704852
18    0.692694
Name: c, dtype: float64
my_series[3]
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/core/indexes/range.py in get_loc(self, key, method, tolerance)
    384                 try:
--> 385                     return self._range.index(new_key)
    386                 except ValueError as err:

ValueError: 3 is not in range

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_83273/1108723751.py in <module>
----> 1 my_series[3]

~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/core/series.py in __getitem__(self, key)
    940 
    941         elif key_is_scalar:
--> 942             return self._get_value(key)
    943 
    944         if is_hashable(key):

~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/core/series.py in _get_value(self, label, takeable)
   1049 
   1050         # Similar to Index.get_value, but we do not fall back to positional
-> 1051         loc = self.index.get_loc(label)
   1052         return self.index._get_values_for_loc(self, loc, label)
   1053 

~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/core/indexes/range.py in get_loc(self, key, method, tolerance)
    385                     return self._range.index(new_key)
    386                 except ValueError as err:
--> 387                     raise KeyError(key) from err
    388             raise KeyError(key)
    389         return super().get_loc(key, method=method, tolerance=tolerance)

KeyError: 3
my_series[6]
0.26039556692520305
my_series.sum()
5.753739828566685
# multiple columns
df[["c","a","c"]]
c a c
0 0.924015 0.590081 0.924015
2 0.660032 0.391078 0.660032
4 0.361826 0.360130 0.361826
6 0.260396 0.180945 0.260396
8 0.925196 0.393736 0.925196
10 0.397235 0.720062 0.397235
12 0.079042 0.795926 0.079042
14 0.748451 0.278024 0.748451
16 0.704852 0.885085 0.704852
18 0.692694 0.420386 0.692694
df.loc[:,"c"]
0     0.924015
2     0.660032
4     0.361826
6     0.260396
8     0.925196
10    0.397235
12    0.079042
14    0.748451
16    0.704852
18    0.692694
Name: c, dtype: float64
df.iloc[:,2]
0     0.924015
2     0.660032
4     0.361826
6     0.260396
8     0.925196
10    0.397235
12    0.079042
14    0.748451
16    0.704852
18    0.692694
Name: c, dtype: float64
# Keeps right endpoint!
df.loc[:,"a":"c"]
a b c
0 0.590081 0.028603 0.924015
2 0.391078 0.710272 0.660032
4 0.360130 0.010393 0.361826
6 0.180945 0.424553 0.260396
8 0.393736 0.386491 0.925196
10 0.720062 0.487915 0.397235
12 0.795926 0.549223 0.079042
14 0.278024 0.554839 0.748451
16 0.885085 0.179068 0.704852
18 0.420386 0.102874 0.692694
# Doesn't!
df.iloc[:,0:2]
a b
0 0.590081 0.028603
2 0.391078 0.710272
4 0.360130 0.010393
6 0.180945 0.424553
8 0.393736 0.386491
10 0.720062 0.487915
12 0.795926 0.549223
14 0.278024 0.554839
16 0.885085 0.179068
18 0.420386 0.102874

Aside: Can set values in the same way

df = pd.DataFrame(A,columns=list("abcd"),index=range(0,20,2))
df
a b c d
0 0.590081 0.028603 0.924015 0.676585
2 0.391078 0.710272 0.660032 0.916206
4 0.360130 0.010393 0.361826 0.349009
6 0.180945 0.424553 0.260396 0.576588
8 0.393736 0.386491 0.925196 0.337163
10 0.720062 0.487915 0.397235 0.282458
12 0.795926 0.549223 0.079042 0.802737
14 0.278024 0.554839 0.748451 0.264730
16 0.885085 0.179068 0.704852 0.526536
18 0.420386 0.102874 0.692694 0.688271
df.loc[:,"c"] = "chris"
df
a b c d
0 0.590081 0.028603 chris 0.676585
2 0.391078 0.710272 chris 0.916206
4 0.360130 0.010393 chris 0.349009
6 0.180945 0.424553 chris 0.576588
8 0.393736 0.386491 chris 0.337163
10 0.720062 0.487915 chris 0.282458
12 0.795926 0.549223 chris 0.802737
14 0.278024 0.554839 chris 0.264730
16 0.885085 0.179068 chris 0.526536
18 0.420386 0.102874 chris 0.688271
df.loc[:,"c"] = rng.integers(-5,5,size=10)
df
a b c d
0 0.590081 0.028603 4 0.676585
2 0.391078 0.710272 -2 0.916206
4 0.360130 0.010393 -3 0.349009
6 0.180945 0.424553 -3 0.576588
8 0.393736 0.386491 -2 0.337163
10 0.720062 0.487915 -4 0.282458
12 0.795926 0.549223 -5 0.802737
14 0.278024 0.554839 -3 0.264730
16 0.885085 0.179068 -3 0.526536
18 0.420386 0.102874 -5 0.688271
df.d = df.a + 12
df
a b c d
0 0.590081 0.028603 4 12.590081
2 0.391078 0.710272 -2 12.391078
4 0.360130 0.010393 -3 12.360130
6 0.180945 0.424553 -3 12.180945
8 0.393736 0.386491 -2 12.393736
10 0.720062 0.487915 -4 12.720062
12 0.795926 0.549223 -5 12.795926
14 0.278024 0.554839 -3 12.278024
16 0.885085 0.179068 -3 12.885085
18 0.420386 0.102874 -5 12.420386

Aside: using copy() to reduce warnings

df
a b c d
0 0.590081 0.028603 4 12.590081
2 0.391078 0.710272 -2 12.391078
4 0.360130 0.010393 -3 12.360130
6 0.180945 0.424553 -3 12.180945
8 0.393736 0.386491 -2 12.393736
10 0.720062 0.487915 -4 12.720062
12 0.795926 0.549223 -5 12.795926
14 0.278024 0.554839 -3 12.278024
16 0.885085 0.179068 -3 12.885085
18 0.420386 0.102874 -5 12.420386
df2 = df[:4]
df2
a b c d
0 0.590081 0.028603 4 12.590081
2 0.391078 0.710272 -2 12.391078
4 0.360130 0.010393 -3 12.360130
6 0.180945 0.424553 -3 12.180945
df2["c"] = 3
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_83273/3169448060.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2["c"] = 3
# resolve with copy()
df2 = df[:4].copy()
df2["c"] = 3
df2
a b c d
0 0.590081 0.028603 3 12.590081
2 0.391078 0.710272 3 12.391078
4 0.360130 0.010393 3 12.360130
6 0.180945 0.424553 3 12.180945

Accessing rows of a DataFrame

Topics:

  • Using loc and iloc

  • Slicing

  • Boolean indexing

df
a b c d
0 0.590081 0.028603 3 12.590081
2 0.391078 0.710272 3 12.391078
4 0.360130 0.010393 3 12.360130
6 0.180945 0.424553 3 12.180945
8 0.393736 0.386491 -2 12.393736
10 0.720062 0.487915 -4 12.720062
12 0.795926 0.549223 -5 12.795926
14 0.278024 0.554839 -3 12.278024
16 0.885085 0.179068 -3 12.885085
18 0.420386 0.102874 -5 12.420386
df.loc[[2,10,4,2]]
a b c d
2 0.391078 0.710272 3 12.391078
10 0.720062 0.487915 -4 12.720062
4 0.360130 0.010393 3 12.360130
2 0.391078 0.710272 3 12.391078
df.iloc[[2,4,2]]
a b c d
4 0.360130 0.010393 3 12.360130
8 0.393736 0.386491 -2 12.393736
4 0.360130 0.010393 3 12.360130
# slicing
df[:4]
a b c d
0 0.590081 0.028603 3 12.590081
2 0.391078 0.710272 3 12.391078
4 0.360130 0.010393 3 12.360130
6 0.180945 0.424553 3 12.180945

Column (or row, they are very similar)

Topics:

  • Data type

  • Similarity to a dictionary

  • What elements occur in a column? (unique, set, value_counts)

  • Sorting (also of a DataFrame)

  • What three elements occur most often?

df = pd.DataFrame(rng.integers(0,40,size=(20,4)))
df
0 1 2 3
0 30 35 31 39
1 16 14 15 27
2 26 34 23 39
3 20 38 13 34
4 24 27 21 23
5 26 29 6 14
6 19 13 29 16
7 18 31 2 35
8 32 14 13 32
9 14 25 37 23
10 32 29 5 37
11 6 32 0 7
12 11 10 38 21
13 4 7 21 33
14 30 18 2 9
15 3 6 14 14
16 17 28 28 39
17 34 27 3 25
18 30 18 31 19
19 27 25 37 23

Longer Example

Make a 10 million by 10 DataFrame of uniformly distributed random numbers between 0 and 1. Find the rows where the zero-th entry is bigger than the sum of all other 9 elements. (It’s possible that no rows will satisfy this condition. If that happens, try generating new random numbers.)

n = 10**7
A = rng.random(size=(n,10))
df = pd.DataFrame(A)
df.head()
0 1 2 3 4 5 6 7 8 9
0 0.317219 0.419843 0.442733 0.720758 0.640555 0.579006 0.575995 0.922113 0.908004 0.618323
1 0.297504 0.852839 0.149783 0.082658 0.135045 0.041880 0.631482 0.193168 0.213594 0.723869
2 0.581295 0.796233 0.278778 0.372116 0.499024 0.034230 0.998523 0.135782 0.895571 0.773005
3 0.426469 0.740665 0.061880 0.013312 0.544289 0.839926 0.904654 0.583338 0.626046 0.010888
4 0.788545 0.430254 0.283768 0.051350 0.672452 0.789541 0.443963 0.860221 0.982655 0.586919
# make a new column containing the sum of all entries other than the 0 column
df["my sum"] = df.iloc[:,1:].sum(axis=1)
df.head()
0 1 2 3 4 5 6 7 8 9 my sum
0 0.317219 0.419843 0.442733 0.720758 0.640555 0.579006 0.575995 0.922113 0.908004 0.618323 5.827331
1 0.297504 0.852839 0.149783 0.082658 0.135045 0.041880 0.631482 0.193168 0.213594 0.723869 3.024317
2 0.581295 0.796233 0.278778 0.372116 0.499024 0.034230 0.998523 0.135782 0.895571 0.773005 4.783262
3 0.426469 0.740665 0.061880 0.013312 0.544289 0.839926 0.904654 0.583338 0.626046 0.010888 4.324998
4 0.788545 0.430254 0.283768 0.051350 0.672452 0.789541 0.443963 0.860221 0.982655 0.586919 5.101123
df["bigger?"] = (df[0] > df["my sum"])
df
0 1 2 3 4 5 6 7 8 9 my sum bigger?
0 0.317219 0.419843 0.442733 0.720758 0.640555 0.579006 0.575995 0.922113 0.908004 0.618323 5.827331 False
1 0.297504 0.852839 0.149783 0.082658 0.135045 0.041880 0.631482 0.193168 0.213594 0.723869 3.024317 False
2 0.581295 0.796233 0.278778 0.372116 0.499024 0.034230 0.998523 0.135782 0.895571 0.773005 4.783262 False
3 0.426469 0.740665 0.061880 0.013312 0.544289 0.839926 0.904654 0.583338 0.626046 0.010888 4.324998 False
4 0.788545 0.430254 0.283768 0.051350 0.672452 0.789541 0.443963 0.860221 0.982655 0.586919 5.101123 False
... ... ... ... ... ... ... ... ... ... ... ... ...
9999995 0.491812 0.703367 0.607673 0.713924 0.227431 0.643039 0.640853 0.843779 0.231256 0.860354 5.471676 False
9999996 0.144890 0.874357 0.557466 0.029911 0.228275 0.093766 0.773229 0.020584 0.524753 0.930682 4.033024 False
9999997 0.888912 0.539455 0.242572 0.883598 0.070645 0.408915 0.113122 0.120895 0.646624 0.810465 3.836291 False
9999998 0.441192 0.982065 0.531535 0.612471 0.078160 0.097779 0.286849 0.141411 0.862393 0.687830 4.280492 False
9999999 0.225582 0.322104 0.203558 0.676575 0.253310 0.606301 0.174267 0.094030 0.111350 0.496382 2.937877 False

10000000 rows × 12 columns

# called Boolean indexing
# Keep only the rows, where True appears in the "bigger?" column
df[df["bigger?"]]
0 1 2 3 4 5 6 7 8 9 my sum bigger?
3530582 0.988493 0.093486 0.039140 0.124765 0.078407 0.133746 0.082342 0.133572 0.059588 0.058145 0.803191 True
3534459 0.997310 0.147616 0.019514 0.037724 0.004714 0.037830 0.362006 0.215426 0.057665 0.097678 0.980174 True
8805526 0.807755 0.064672 0.047094 0.067381 0.145109 0.092274 0.104757 0.121462 0.123704 0.024139 0.790593 True
9130026 0.990936 0.075488 0.186168 0.234095 0.049748 0.021457 0.081330 0.006716 0.057796 0.210335 0.923134 True
df["my sum"].mean()
4.499632839990039

Counting

For the same DataFrame, how many elements in the 5th column are bigger than 0.6? We would expect about 4 million. Compute this using sum in two different ways, and time them using %%timeit