pandas examples¶
import pandas as pd
import numpy as np
rng = np.random.default_rng()
Making a DataFrame¶
Topics:
Constructing a DataFrame from a dictionary;
from a NumPy array.
read_csv
Naming the rows and columns.
An empty DataFrame.
dtypes.
d = {"A": [3,1,4,1], "C": [2,8,7,1]}
d["A"]
[3, 1, 4, 1]
pd.DataFrame(d)
A | C | |
---|---|---|
0 | 3 | 2 |
1 | 1 | 8 |
2 | 4 | 7 |
3 | 1 | 1 |
Can you tell what is causing this error?
d2 = {"A": [3,1,4,1], "C": [2,8,7,1], "G": "chris", 7: [3,3,4]}
pd.DataFrame(d2)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_83273/3680709340.py in <module>
1 d2 = {"A": [3,1,4,1], "C": [2,8,7,1], "G": "chris", 7: [3,3,4]}
----> 2 pd.DataFrame(d2)
~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)
612 elif isinstance(data, dict):
613 # GH#38939 de facto copy defaults to False only in non-dict cases
--> 614 mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
615 elif isinstance(data, ma.MaskedArray):
616 import numpy.ma.mrecords as mrecords
~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/core/internals/construction.py in dict_to_mgr(data, index, columns, dtype, typ, copy)
462 # TODO: can we get rid of the dt64tz special case above?
463
--> 464 return arrays_to_mgr(
465 arrays, data_names, index, columns, dtype=dtype, typ=typ, consolidate=copy
466 )
~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/core/internals/construction.py in arrays_to_mgr(arrays, arr_names, index, columns, dtype, verify_integrity, typ, consolidate)
117 # figure out the index, if necessary
118 if index is None:
--> 119 index = _extract_index(arrays)
120 else:
121 index = ensure_index(index)
~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/core/internals/construction.py in _extract_index(data)
633 lengths = list(set(raw_lengths))
634 if len(lengths) > 1:
--> 635 raise ValueError("All arrays must be of the same length")
636
637 if have_dicts:
ValueError: All arrays must be of the same length
d2 = {"A": [3,1,4,1], "C": [2,8,7,1], "G": "chris", 7: [3,3,4,3.1]}
pd.DataFrame(d2)
A | C | G | 7 | |
---|---|---|---|---|
0 | 3 | 2 | chris | 3.0 |
1 | 1 | 8 | chris | 3.0 |
2 | 4 | 7 | chris | 4.0 |
3 | 1 | 1 | chris | 3.1 |
A = rng.random(size=(10,4))
A
array([[0.59008082, 0.02860251, 0.92401521, 0.67658537],
[0.39107826, 0.71027222, 0.66003209, 0.9162059 ],
[0.36012957, 0.01039311, 0.36182634, 0.34900878],
[0.18094549, 0.4245525 , 0.26039557, 0.57658844],
[0.39373595, 0.38649149, 0.92519635, 0.33716329],
[0.72006158, 0.48791468, 0.39723519, 0.28245805],
[0.79592556, 0.54922316, 0.07904228, 0.80273698],
[0.27802416, 0.55483903, 0.74845139, 0.26473047],
[0.88508451, 0.17906794, 0.70485172, 0.52653599],
[0.42038575, 0.10287395, 0.69269369, 0.68827117]])
DataFrame(A)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_83273/1874637228.py in <module>
----> 1 DataFrame(A)
NameError: name 'DataFrame' is not defined
pd.DataFrame(A)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | 0.590081 | 0.028603 | 0.924015 | 0.676585 |
1 | 0.391078 | 0.710272 | 0.660032 | 0.916206 |
2 | 0.360130 | 0.010393 | 0.361826 | 0.349009 |
3 | 0.180945 | 0.424553 | 0.260396 | 0.576588 |
4 | 0.393736 | 0.386491 | 0.925196 | 0.337163 |
5 | 0.720062 | 0.487915 | 0.397235 | 0.282458 |
6 | 0.795926 | 0.549223 | 0.079042 | 0.802737 |
7 | 0.278024 | 0.554839 | 0.748451 | 0.264730 |
8 | 0.885085 | 0.179068 | 0.704852 | 0.526536 |
9 | 0.420386 | 0.102874 | 0.692694 | 0.688271 |
The way we will most often get a pandas DataFrame is by reading in data from an external file, usually a csv file. We will do this using pd.read_csv
. For now, we are working with these randomly generated DataFrames instead.
A
array([[0.59008082, 0.02860251, 0.92401521, 0.67658537],
[0.39107826, 0.71027222, 0.66003209, 0.9162059 ],
[0.36012957, 0.01039311, 0.36182634, 0.34900878],
[0.18094549, 0.4245525 , 0.26039557, 0.57658844],
[0.39373595, 0.38649149, 0.92519635, 0.33716329],
[0.72006158, 0.48791468, 0.39723519, 0.28245805],
[0.79592556, 0.54922316, 0.07904228, 0.80273698],
[0.27802416, 0.55483903, 0.74845139, 0.26473047],
[0.88508451, 0.17906794, 0.70485172, 0.52653599],
[0.42038575, 0.10287395, 0.69269369, 0.68827117]])
pd.DataFrame(A,columns=list("abcd"),index=range(0,22,2))
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_83273/4048147504.py in <module>
----> 1 pd.DataFrame(A,columns=list("abcd"),index=range(0,22,2))
~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)
670 )
671 else:
--> 672 mgr = ndarray_to_mgr(
673 data,
674 index,
~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/core/internals/construction.py in ndarray_to_mgr(values, index, columns, dtype, copy, typ)
322 )
323
--> 324 _check_values_indices_shape_match(values, index, columns)
325
326 if typ == "array":
~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/core/internals/construction.py in _check_values_indices_shape_match(values, index, columns)
391 passed = values.shape
392 implied = (len(index), len(columns))
--> 393 raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}")
394
395
ValueError: Shape of passed values is (10, 4), indices imply (11, 4)
df = pd.DataFrame(A,columns=list("abcd"),index=range(0,20,2))
df
a | b | c | d | |
---|---|---|---|---|
0 | 0.590081 | 0.028603 | 0.924015 | 0.676585 |
2 | 0.391078 | 0.710272 | 0.660032 | 0.916206 |
4 | 0.360130 | 0.010393 | 0.361826 | 0.349009 |
6 | 0.180945 | 0.424553 | 0.260396 | 0.576588 |
8 | 0.393736 | 0.386491 | 0.925196 | 0.337163 |
10 | 0.720062 | 0.487915 | 0.397235 | 0.282458 |
12 | 0.795926 | 0.549223 | 0.079042 | 0.802737 |
14 | 0.278024 | 0.554839 | 0.748451 | 0.264730 |
16 | 0.885085 | 0.179068 | 0.704852 | 0.526536 |
18 | 0.420386 | 0.102874 | 0.692694 | 0.688271 |
Accessing columns of a DataFrame¶
Topics:
Accessing one column
Accessing multiple columns
Using loc (by label) and iloc (by position)
Making a new column
# Motivation: this means: look for a column named 2
df[2]
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
3360 try:
-> 3361 return self._engine.get_loc(casted_key)
3362 except KeyError as err:
~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 2
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_83273/946357898.py in <module>
1 # Motivation: this means: look for a column named 2
----> 2 df[2]
~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/core/frame.py in __getitem__(self, key)
3456 if self.columns.nlevels > 1:
3457 return self._getitem_multilevel(key)
-> 3458 indexer = self.columns.get_loc(key)
3459 if is_integer(indexer):
3460 indexer = [indexer]
~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
3361 return self._engine.get_loc(casted_key)
3362 except KeyError as err:
-> 3363 raise KeyError(key) from err
3364
3365 if is_scalar(key) and isna(key) and not self.hasnans:
KeyError: 2
my_series = df["c"] # pandas Series
my_series
0 0.924015
2 0.660032
4 0.361826
6 0.260396
8 0.925196
10 0.397235
12 0.079042
14 0.748451
16 0.704852
18 0.692694
Name: c, dtype: float64
my_series[3]
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/core/indexes/range.py in get_loc(self, key, method, tolerance)
384 try:
--> 385 return self._range.index(new_key)
386 except ValueError as err:
ValueError: 3 is not in range
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_83273/1108723751.py in <module>
----> 1 my_series[3]
~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/core/series.py in __getitem__(self, key)
940
941 elif key_is_scalar:
--> 942 return self._get_value(key)
943
944 if is_hashable(key):
~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/core/series.py in _get_value(self, label, takeable)
1049
1050 # Similar to Index.get_value, but we do not fall back to positional
-> 1051 loc = self.index.get_loc(label)
1052 return self.index._get_values_for_loc(self, loc, label)
1053
~/opt/anaconda3/envs/book/lib/python3.8/site-packages/pandas/core/indexes/range.py in get_loc(self, key, method, tolerance)
385 return self._range.index(new_key)
386 except ValueError as err:
--> 387 raise KeyError(key) from err
388 raise KeyError(key)
389 return super().get_loc(key, method=method, tolerance=tolerance)
KeyError: 3
my_series[6]
0.26039556692520305
my_series.sum()
5.753739828566685
# multiple columns
df[["c","a","c"]]
c | a | c | |
---|---|---|---|
0 | 0.924015 | 0.590081 | 0.924015 |
2 | 0.660032 | 0.391078 | 0.660032 |
4 | 0.361826 | 0.360130 | 0.361826 |
6 | 0.260396 | 0.180945 | 0.260396 |
8 | 0.925196 | 0.393736 | 0.925196 |
10 | 0.397235 | 0.720062 | 0.397235 |
12 | 0.079042 | 0.795926 | 0.079042 |
14 | 0.748451 | 0.278024 | 0.748451 |
16 | 0.704852 | 0.885085 | 0.704852 |
18 | 0.692694 | 0.420386 | 0.692694 |
df.loc[:,"c"]
0 0.924015
2 0.660032
4 0.361826
6 0.260396
8 0.925196
10 0.397235
12 0.079042
14 0.748451
16 0.704852
18 0.692694
Name: c, dtype: float64
df.iloc[:,2]
0 0.924015
2 0.660032
4 0.361826
6 0.260396
8 0.925196
10 0.397235
12 0.079042
14 0.748451
16 0.704852
18 0.692694
Name: c, dtype: float64
# Keeps right endpoint!
df.loc[:,"a":"c"]
a | b | c | |
---|---|---|---|
0 | 0.590081 | 0.028603 | 0.924015 |
2 | 0.391078 | 0.710272 | 0.660032 |
4 | 0.360130 | 0.010393 | 0.361826 |
6 | 0.180945 | 0.424553 | 0.260396 |
8 | 0.393736 | 0.386491 | 0.925196 |
10 | 0.720062 | 0.487915 | 0.397235 |
12 | 0.795926 | 0.549223 | 0.079042 |
14 | 0.278024 | 0.554839 | 0.748451 |
16 | 0.885085 | 0.179068 | 0.704852 |
18 | 0.420386 | 0.102874 | 0.692694 |
# Doesn't!
df.iloc[:,0:2]
a | b | |
---|---|---|
0 | 0.590081 | 0.028603 |
2 | 0.391078 | 0.710272 |
4 | 0.360130 | 0.010393 |
6 | 0.180945 | 0.424553 |
8 | 0.393736 | 0.386491 |
10 | 0.720062 | 0.487915 |
12 | 0.795926 | 0.549223 |
14 | 0.278024 | 0.554839 |
16 | 0.885085 | 0.179068 |
18 | 0.420386 | 0.102874 |
Aside: Can set values in the same way¶
df = pd.DataFrame(A,columns=list("abcd"),index=range(0,20,2))
df
a | b | c | d | |
---|---|---|---|---|
0 | 0.590081 | 0.028603 | 0.924015 | 0.676585 |
2 | 0.391078 | 0.710272 | 0.660032 | 0.916206 |
4 | 0.360130 | 0.010393 | 0.361826 | 0.349009 |
6 | 0.180945 | 0.424553 | 0.260396 | 0.576588 |
8 | 0.393736 | 0.386491 | 0.925196 | 0.337163 |
10 | 0.720062 | 0.487915 | 0.397235 | 0.282458 |
12 | 0.795926 | 0.549223 | 0.079042 | 0.802737 |
14 | 0.278024 | 0.554839 | 0.748451 | 0.264730 |
16 | 0.885085 | 0.179068 | 0.704852 | 0.526536 |
18 | 0.420386 | 0.102874 | 0.692694 | 0.688271 |
df.loc[:,"c"] = "chris"
df
a | b | c | d | |
---|---|---|---|---|
0 | 0.590081 | 0.028603 | chris | 0.676585 |
2 | 0.391078 | 0.710272 | chris | 0.916206 |
4 | 0.360130 | 0.010393 | chris | 0.349009 |
6 | 0.180945 | 0.424553 | chris | 0.576588 |
8 | 0.393736 | 0.386491 | chris | 0.337163 |
10 | 0.720062 | 0.487915 | chris | 0.282458 |
12 | 0.795926 | 0.549223 | chris | 0.802737 |
14 | 0.278024 | 0.554839 | chris | 0.264730 |
16 | 0.885085 | 0.179068 | chris | 0.526536 |
18 | 0.420386 | 0.102874 | chris | 0.688271 |
df.loc[:,"c"] = rng.integers(-5,5,size=10)
df
a | b | c | d | |
---|---|---|---|---|
0 | 0.590081 | 0.028603 | 4 | 0.676585 |
2 | 0.391078 | 0.710272 | -2 | 0.916206 |
4 | 0.360130 | 0.010393 | -3 | 0.349009 |
6 | 0.180945 | 0.424553 | -3 | 0.576588 |
8 | 0.393736 | 0.386491 | -2 | 0.337163 |
10 | 0.720062 | 0.487915 | -4 | 0.282458 |
12 | 0.795926 | 0.549223 | -5 | 0.802737 |
14 | 0.278024 | 0.554839 | -3 | 0.264730 |
16 | 0.885085 | 0.179068 | -3 | 0.526536 |
18 | 0.420386 | 0.102874 | -5 | 0.688271 |
df.d = df.a + 12
df
a | b | c | d | |
---|---|---|---|---|
0 | 0.590081 | 0.028603 | 4 | 12.590081 |
2 | 0.391078 | 0.710272 | -2 | 12.391078 |
4 | 0.360130 | 0.010393 | -3 | 12.360130 |
6 | 0.180945 | 0.424553 | -3 | 12.180945 |
8 | 0.393736 | 0.386491 | -2 | 12.393736 |
10 | 0.720062 | 0.487915 | -4 | 12.720062 |
12 | 0.795926 | 0.549223 | -5 | 12.795926 |
14 | 0.278024 | 0.554839 | -3 | 12.278024 |
16 | 0.885085 | 0.179068 | -3 | 12.885085 |
18 | 0.420386 | 0.102874 | -5 | 12.420386 |
Aside: using copy()
to reduce warnings¶
df
a | b | c | d | |
---|---|---|---|---|
0 | 0.590081 | 0.028603 | 4 | 12.590081 |
2 | 0.391078 | 0.710272 | -2 | 12.391078 |
4 | 0.360130 | 0.010393 | -3 | 12.360130 |
6 | 0.180945 | 0.424553 | -3 | 12.180945 |
8 | 0.393736 | 0.386491 | -2 | 12.393736 |
10 | 0.720062 | 0.487915 | -4 | 12.720062 |
12 | 0.795926 | 0.549223 | -5 | 12.795926 |
14 | 0.278024 | 0.554839 | -3 | 12.278024 |
16 | 0.885085 | 0.179068 | -3 | 12.885085 |
18 | 0.420386 | 0.102874 | -5 | 12.420386 |
df2 = df[:4]
df2
a | b | c | d | |
---|---|---|---|---|
0 | 0.590081 | 0.028603 | 4 | 12.590081 |
2 | 0.391078 | 0.710272 | -2 | 12.391078 |
4 | 0.360130 | 0.010393 | -3 | 12.360130 |
6 | 0.180945 | 0.424553 | -3 | 12.180945 |
df2["c"] = 3
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_83273/3169448060.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df2["c"] = 3
# resolve with copy()
df2 = df[:4].copy()
df2["c"] = 3
df2
a | b | c | d | |
---|---|---|---|---|
0 | 0.590081 | 0.028603 | 3 | 12.590081 |
2 | 0.391078 | 0.710272 | 3 | 12.391078 |
4 | 0.360130 | 0.010393 | 3 | 12.360130 |
6 | 0.180945 | 0.424553 | 3 | 12.180945 |
Accessing rows of a DataFrame¶
Topics:
Using loc and iloc
Slicing
Boolean indexing
df
a | b | c | d | |
---|---|---|---|---|
0 | 0.590081 | 0.028603 | 3 | 12.590081 |
2 | 0.391078 | 0.710272 | 3 | 12.391078 |
4 | 0.360130 | 0.010393 | 3 | 12.360130 |
6 | 0.180945 | 0.424553 | 3 | 12.180945 |
8 | 0.393736 | 0.386491 | -2 | 12.393736 |
10 | 0.720062 | 0.487915 | -4 | 12.720062 |
12 | 0.795926 | 0.549223 | -5 | 12.795926 |
14 | 0.278024 | 0.554839 | -3 | 12.278024 |
16 | 0.885085 | 0.179068 | -3 | 12.885085 |
18 | 0.420386 | 0.102874 | -5 | 12.420386 |
df.loc[[2,10,4,2]]
a | b | c | d | |
---|---|---|---|---|
2 | 0.391078 | 0.710272 | 3 | 12.391078 |
10 | 0.720062 | 0.487915 | -4 | 12.720062 |
4 | 0.360130 | 0.010393 | 3 | 12.360130 |
2 | 0.391078 | 0.710272 | 3 | 12.391078 |
df.iloc[[2,4,2]]
a | b | c | d | |
---|---|---|---|---|
4 | 0.360130 | 0.010393 | 3 | 12.360130 |
8 | 0.393736 | 0.386491 | -2 | 12.393736 |
4 | 0.360130 | 0.010393 | 3 | 12.360130 |
# slicing
df[:4]
a | b | c | d | |
---|---|---|---|---|
0 | 0.590081 | 0.028603 | 3 | 12.590081 |
2 | 0.391078 | 0.710272 | 3 | 12.391078 |
4 | 0.360130 | 0.010393 | 3 | 12.360130 |
6 | 0.180945 | 0.424553 | 3 | 12.180945 |
Column (or row, they are very similar)¶
Topics:
Data type
Similarity to a dictionary
What elements occur in a column? (unique, set, value_counts)
Sorting (also of a DataFrame)
What three elements occur most often?
df = pd.DataFrame(rng.integers(0,40,size=(20,4)))
df
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | 30 | 35 | 31 | 39 |
1 | 16 | 14 | 15 | 27 |
2 | 26 | 34 | 23 | 39 |
3 | 20 | 38 | 13 | 34 |
4 | 24 | 27 | 21 | 23 |
5 | 26 | 29 | 6 | 14 |
6 | 19 | 13 | 29 | 16 |
7 | 18 | 31 | 2 | 35 |
8 | 32 | 14 | 13 | 32 |
9 | 14 | 25 | 37 | 23 |
10 | 32 | 29 | 5 | 37 |
11 | 6 | 32 | 0 | 7 |
12 | 11 | 10 | 38 | 21 |
13 | 4 | 7 | 21 | 33 |
14 | 30 | 18 | 2 | 9 |
15 | 3 | 6 | 14 | 14 |
16 | 17 | 28 | 28 | 39 |
17 | 34 | 27 | 3 | 25 |
18 | 30 | 18 | 31 | 19 |
19 | 27 | 25 | 37 | 23 |
Longer Example¶
Make a 10 million by 10 DataFrame of uniformly distributed random numbers between 0 and 1. Find the rows where the zero-th entry is bigger than the sum of all other 9 elements. (It’s possible that no rows will satisfy this condition. If that happens, try generating new random numbers.)
n = 10**7
A = rng.random(size=(n,10))
df = pd.DataFrame(A)
df.head()
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0.317219 | 0.419843 | 0.442733 | 0.720758 | 0.640555 | 0.579006 | 0.575995 | 0.922113 | 0.908004 | 0.618323 |
1 | 0.297504 | 0.852839 | 0.149783 | 0.082658 | 0.135045 | 0.041880 | 0.631482 | 0.193168 | 0.213594 | 0.723869 |
2 | 0.581295 | 0.796233 | 0.278778 | 0.372116 | 0.499024 | 0.034230 | 0.998523 | 0.135782 | 0.895571 | 0.773005 |
3 | 0.426469 | 0.740665 | 0.061880 | 0.013312 | 0.544289 | 0.839926 | 0.904654 | 0.583338 | 0.626046 | 0.010888 |
4 | 0.788545 | 0.430254 | 0.283768 | 0.051350 | 0.672452 | 0.789541 | 0.443963 | 0.860221 | 0.982655 | 0.586919 |
# make a new column containing the sum of all entries other than the 0 column
df["my sum"] = df.iloc[:,1:].sum(axis=1)
df.head()
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | my sum | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.317219 | 0.419843 | 0.442733 | 0.720758 | 0.640555 | 0.579006 | 0.575995 | 0.922113 | 0.908004 | 0.618323 | 5.827331 |
1 | 0.297504 | 0.852839 | 0.149783 | 0.082658 | 0.135045 | 0.041880 | 0.631482 | 0.193168 | 0.213594 | 0.723869 | 3.024317 |
2 | 0.581295 | 0.796233 | 0.278778 | 0.372116 | 0.499024 | 0.034230 | 0.998523 | 0.135782 | 0.895571 | 0.773005 | 4.783262 |
3 | 0.426469 | 0.740665 | 0.061880 | 0.013312 | 0.544289 | 0.839926 | 0.904654 | 0.583338 | 0.626046 | 0.010888 | 4.324998 |
4 | 0.788545 | 0.430254 | 0.283768 | 0.051350 | 0.672452 | 0.789541 | 0.443963 | 0.860221 | 0.982655 | 0.586919 | 5.101123 |
df["bigger?"] = (df[0] > df["my sum"])
df
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | my sum | bigger? | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.317219 | 0.419843 | 0.442733 | 0.720758 | 0.640555 | 0.579006 | 0.575995 | 0.922113 | 0.908004 | 0.618323 | 5.827331 | False |
1 | 0.297504 | 0.852839 | 0.149783 | 0.082658 | 0.135045 | 0.041880 | 0.631482 | 0.193168 | 0.213594 | 0.723869 | 3.024317 | False |
2 | 0.581295 | 0.796233 | 0.278778 | 0.372116 | 0.499024 | 0.034230 | 0.998523 | 0.135782 | 0.895571 | 0.773005 | 4.783262 | False |
3 | 0.426469 | 0.740665 | 0.061880 | 0.013312 | 0.544289 | 0.839926 | 0.904654 | 0.583338 | 0.626046 | 0.010888 | 4.324998 | False |
4 | 0.788545 | 0.430254 | 0.283768 | 0.051350 | 0.672452 | 0.789541 | 0.443963 | 0.860221 | 0.982655 | 0.586919 | 5.101123 | False |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
9999995 | 0.491812 | 0.703367 | 0.607673 | 0.713924 | 0.227431 | 0.643039 | 0.640853 | 0.843779 | 0.231256 | 0.860354 | 5.471676 | False |
9999996 | 0.144890 | 0.874357 | 0.557466 | 0.029911 | 0.228275 | 0.093766 | 0.773229 | 0.020584 | 0.524753 | 0.930682 | 4.033024 | False |
9999997 | 0.888912 | 0.539455 | 0.242572 | 0.883598 | 0.070645 | 0.408915 | 0.113122 | 0.120895 | 0.646624 | 0.810465 | 3.836291 | False |
9999998 | 0.441192 | 0.982065 | 0.531535 | 0.612471 | 0.078160 | 0.097779 | 0.286849 | 0.141411 | 0.862393 | 0.687830 | 4.280492 | False |
9999999 | 0.225582 | 0.322104 | 0.203558 | 0.676575 | 0.253310 | 0.606301 | 0.174267 | 0.094030 | 0.111350 | 0.496382 | 2.937877 | False |
10000000 rows × 12 columns
# called Boolean indexing
# Keep only the rows, where True appears in the "bigger?" column
df[df["bigger?"]]
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | my sum | bigger? | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
3530582 | 0.988493 | 0.093486 | 0.039140 | 0.124765 | 0.078407 | 0.133746 | 0.082342 | 0.133572 | 0.059588 | 0.058145 | 0.803191 | True |
3534459 | 0.997310 | 0.147616 | 0.019514 | 0.037724 | 0.004714 | 0.037830 | 0.362006 | 0.215426 | 0.057665 | 0.097678 | 0.980174 | True |
8805526 | 0.807755 | 0.064672 | 0.047094 | 0.067381 | 0.145109 | 0.092274 | 0.104757 | 0.121462 | 0.123704 | 0.024139 | 0.790593 | True |
9130026 | 0.990936 | 0.075488 | 0.186168 | 0.234095 | 0.049748 | 0.021457 | 0.081330 | 0.006716 | 0.057796 | 0.210335 | 0.923134 | True |
df["my sum"].mean()
4.499632839990039
Counting¶
For the same DataFrame, how many elements in the 5th column are bigger than 0.6? We would expect about 4 million. Compute this using sum in two different ways, and time them using %%timeit