Linear and Polynomial Regression with the taxis dataset
import pandas as pd
import altair as alt
# Raise Altair's default row limit so it can plot the full dataset.
alt.data_transformers.enable('default', max_rows=10000)
import seaborn as sns
# Load the taxis dataset and drop rows with missing values.
df = sns.load_dataset("taxis").dropna()
Linear regression
Fit a linear regression model to the taxis dataset, using multiple input variables (also called features or predictors) and with “total” as the output variable (the target).
df.head()
 | pickup | dropoff | passengers | distance | fare | tip | tolls | total | color | payment | pickup_zone | dropoff_zone | pickup_borough | dropoff_borough
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2019-03-23 20:21:09 | 2019-03-23 20:27:24 | 1 | 1.60 | 7.0 | 2.15 | 0.0 | 12.95 | yellow | credit card | Lenox Hill West | UN/Turtle Bay South | Manhattan | Manhattan |
1 | 2019-03-04 16:11:55 | 2019-03-04 16:19:00 | 1 | 0.79 | 5.0 | 0.00 | 0.0 | 9.30 | yellow | cash | Upper West Side South | Upper West Side South | Manhattan | Manhattan |
2 | 2019-03-27 17:53:01 | 2019-03-27 18:00:25 | 1 | 1.37 | 7.5 | 2.36 | 0.0 | 14.16 | yellow | credit card | Alphabet City | West Village | Manhattan | Manhattan |
3 | 2019-03-10 01:23:59 | 2019-03-10 01:49:51 | 1 | 7.70 | 27.0 | 6.15 | 0.0 | 36.95 | yellow | credit card | Hudson Sq | Yorkville West | Manhattan | Manhattan |
4 | 2019-03-30 13:27:42 | 2019-03-30 13:37:14 | 3 | 2.16 | 9.0 | 1.10 | 0.0 | 13.40 | yellow | credit card | Midtown East | Yorkville West | Manhattan | Manhattan |
Which rows have the largest values in the “tolls” column?
df.sort_values("tolls", ascending=False)
 | pickup | dropoff | passengers | distance | fare | tip | tolls | total | color | payment | pickup_zone | dropoff_zone | pickup_borough | dropoff_borough
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
5364 | 2019-03-17 16:59:17 | 2019-03-17 18:04:08 | 2 | 36.70 | 150.00 | 0.00 | 24.02 | 174.82 | yellow | cash | JFK Airport | JFK Airport | Queens | Queens |
2122 | 2019-03-08 00:40:32 | 2019-03-08 01:11:53 | 1 | 15.51 | 44.00 | 16.27 | 17.28 | 81.35 | yellow | credit card | TriBeCa/Civic Center | West Brighton | Manhattan | Staten Island |
3640 | 2019-03-22 07:54:09 | 2019-03-22 09:05:13 | 1 | 16.42 | 52.00 | 0.00 | 12.50 | 67.80 | yellow | cash | JFK Airport | Murray Hill | Queens | Manhattan |
5911 | 2019-03-09 12:27:51 | 2019-03-09 13:11:18 | 1 | 11.40 | 39.00 | 0.00 | 11.52 | 51.32 | green | credit card | Windsor Terrace | Clinton East | Brooklyn | Manhattan |
5728 | 2019-03-01 17:07:09 | 2019-03-01 18:05:41 | 1 | 21.27 | 65.59 | 0.00 | 11.52 | 77.61 | green | credit card | Cambria Heights | Morningside Heights | Queens | Manhattan |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2203 | 2019-03-03 00:24:50 | 2019-03-03 00:56:17 | 1 | 4.72 | 22.00 | 5.16 | 0.00 | 30.96 | yellow | credit card | Meatpacking/West Village West | Williamsburg (South Side) | Manhattan | Brooklyn |
2202 | 2019-03-14 22:32:33 | 2019-03-14 22:49:39 | 1 | 2.90 | 13.00 | 3.36 | 0.00 | 20.16 | yellow | credit card | East Chelsea | East Village | Manhattan | Manhattan |
2201 | 2019-03-18 21:16:42 | 2019-03-18 21:27:49 | 1 | 3.00 | 11.50 | 3.06 | 0.00 | 18.36 | yellow | credit card | Clinton East | Upper East Side North | Manhattan | Manhattan |
2200 | 2019-03-03 07:21:40 | 2019-03-03 07:39:12 | 1 | 7.20 | 22.00 | 7.55 | 0.00 | 32.85 | yellow | credit card | Midtown Center | World Trade Center | Manhattan | Manhattan |
6432 | 2019-03-13 19:31:22 | 2019-03-13 19:48:02 | 1 | 3.85 | 15.00 | 3.36 | 0.00 | 20.16 | green | credit card | Boerum Hill | Windsor Terrace | Brooklyn | Brooklyn |
6341 rows × 14 columns
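As an aside, when we only need the few largest values, pandas also provides DataFrame.nlargest, which is equivalent here; a minimal sketch:

# Get only the five rows with the largest "tolls" values.
df.nlargest(5, "tolls")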
Let’s try to use the following columns as the inputs for our linear regression.
cols = ["distance", "tip", "tolls", "pickup_borough"]
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
This doesn’t work, because the values in the “pickup_borough” column are strings, not numbers.
reg.fit(df[cols],df["total"])
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_15733/3118760667.py in <module>
----> 1 reg.fit(df[cols],df["total"])
~/miniconda3/envs/math10s22/lib/python3.7/site-packages/sklearn/linear_model/_base.py in fit(self, X, y, sample_weight)
661
662 X, y = self._validate_data(
--> 663 X, y, accept_sparse=accept_sparse, y_numeric=True, multi_output=True
664 )
665
~/miniconda3/envs/math10s22/lib/python3.7/site-packages/sklearn/base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
579 y = check_array(y, **check_y_params)
580 else:
--> 581 X, y = check_X_y(X, y, **check_params)
582 out = X, y
583
~/miniconda3/envs/math10s22/lib/python3.7/site-packages/sklearn/utils/validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
974 ensure_min_samples=ensure_min_samples,
975 ensure_min_features=ensure_min_features,
--> 976 estimator=estimator,
977 )
978
~/miniconda3/envs/math10s22/lib/python3.7/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
744 array = array.astype(dtype, casting="unsafe", copy=False)
745 else:
--> 746 array = np.asarray(array, order=order, dtype=dtype)
747 except ComplexWarning as complex_warning:
748 raise ValueError(
~/miniconda3/envs/math10s22/lib/python3.7/site-packages/pandas/core/generic.py in __array__(self, dtype)
1991
1992 def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
-> 1993 return np.asarray(self._values, dtype=dtype)
1994
1995 def __array_wrap__(
ValueError: could not convert string to float: 'Manhattan'
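As an aside, the general fix for a categorical column like this is one-hot encoding, which pandas provides via get_dummies; a minimal sketch (below we instead build a single indicator column by hand):

# One indicator column per distinct pickup borough.
pd.get_dummies(df["pickup_borough"])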
Let’s make a new column called “Manhattan”. This will contain 1 for the rows where the pickup borough is “Manhattan”, and 0 for all the other rows.
df["Manhattan"] = 0
df["pickup_borough"] == "Manhattan"
0 True
1 True
2 True
3 True
4 True
...
6428 True
6429 False
6430 False
6431 False
6432 False
Name: pickup_borough, Length: 6341, dtype: bool
(I think we could also store Boolean values directly in this new “Manhattan” column, but it’s less confusing to have 0 and 1.)
# Put a 1 (for True) where the value is Manhattan
df.loc[df["pickup_borough"] == "Manhattan", "Manhattan"] = 1
df
 | pickup | dropoff | passengers | distance | fare | tip | tolls | total | color | payment | pickup_zone | dropoff_zone | pickup_borough | dropoff_borough | Manhattan
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2019-03-23 20:21:09 | 2019-03-23 20:27:24 | 1 | 1.60 | 7.0 | 2.15 | 0.0 | 12.95 | yellow | credit card | Lenox Hill West | UN/Turtle Bay South | Manhattan | Manhattan | 1 |
1 | 2019-03-04 16:11:55 | 2019-03-04 16:19:00 | 1 | 0.79 | 5.0 | 0.00 | 0.0 | 9.30 | yellow | cash | Upper West Side South | Upper West Side South | Manhattan | Manhattan | 1 |
2 | 2019-03-27 17:53:01 | 2019-03-27 18:00:25 | 1 | 1.37 | 7.5 | 2.36 | 0.0 | 14.16 | yellow | credit card | Alphabet City | West Village | Manhattan | Manhattan | 1 |
3 | 2019-03-10 01:23:59 | 2019-03-10 01:49:51 | 1 | 7.70 | 27.0 | 6.15 | 0.0 | 36.95 | yellow | credit card | Hudson Sq | Yorkville West | Manhattan | Manhattan | 1 |
4 | 2019-03-30 13:27:42 | 2019-03-30 13:37:14 | 3 | 2.16 | 9.0 | 1.10 | 0.0 | 13.40 | yellow | credit card | Midtown East | Yorkville West | Manhattan | Manhattan | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
6428 | 2019-03-31 09:51:53 | 2019-03-31 09:55:27 | 1 | 0.75 | 4.5 | 1.06 | 0.0 | 6.36 | green | credit card | East Harlem North | Central Harlem North | Manhattan | Manhattan | 1 |
6429 | 2019-03-31 17:38:00 | 2019-03-31 18:34:23 | 1 | 18.74 | 58.0 | 0.00 | 0.0 | 58.80 | green | credit card | Jamaica | East Concourse/Concourse Village | Queens | Bronx | 0 |
6430 | 2019-03-23 22:55:18 | 2019-03-23 23:14:25 | 1 | 4.14 | 16.0 | 0.00 | 0.0 | 17.30 | green | cash | Crown Heights North | Bushwick North | Brooklyn | Brooklyn | 0 |
6431 | 2019-03-04 10:09:25 | 2019-03-04 10:14:29 | 1 | 1.12 | 6.0 | 0.00 | 0.0 | 6.80 | green | credit card | East New York | East Flatbush/Remsen Village | Brooklyn | Brooklyn | 0 |
6432 | 2019-03-13 19:31:22 | 2019-03-13 19:48:02 | 1 | 3.85 | 15.0 | 3.36 | 0.0 | 20.16 | green | credit card | Boerum Hill | Windsor Terrace | Brooklyn | Brooklyn | 0 |
6341 rows × 15 columns
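Equivalently, we could have built the whole column in one line by converting the Boolean Series to integers; a sketch:

# True/False becomes 1/0.
df["Manhattan"] = (df["pickup_borough"] == "Manhattan").astype(int)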
In our list of input columns, we now replace “pickup_borough” with the newly created “Manhattan” column.
cols = ['distance', 'tip', 'tolls', 'Manhattan']
reg.fit(df[cols],df["total"])
LinearRegression()
The goal of the fit method is to find the following coefficients, as well as the intercept.
reg.coef_
array([2.6294669 , 1.3306588 , 1.11487961, 1.55477579])
You should interpret the following as saying that the total cost of the taxi ride is modeled by a formula that includes 2.63 times the distance traveled; in other words, roughly $2.63 per mile.
pd.Series(reg.coef_, index=cols)
distance 2.629467
tip 1.330659
tolls 1.114880
Manhattan 1.554776
dtype: float64
reg.intercept_
6.170557177757704
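Putting the intercept together with the coefficients, the fitted model is approximately

\[
\text{total} \approx 6.17 + 2.63\cdot\text{distance} + 1.33\cdot\text{tip} + 1.11\cdot\text{tolls} + 1.55\cdot\text{Manhattan}.
\]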
df[:2]
 | pickup | dropoff | passengers | distance | fare | tip | tolls | total | color | payment | pickup_zone | dropoff_zone | pickup_borough | dropoff_borough | Manhattan
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2019-03-23 20:21:09 | 2019-03-23 20:27:24 | 1 | 1.60 | 7.0 | 2.15 | 0.0 | 12.95 | yellow | credit card | Lenox Hill West | UN/Turtle Bay South | Manhattan | Manhattan | 1 |
1 | 2019-03-04 16:11:55 | 2019-03-04 16:19:00 | 1 | 0.79 | 5.0 | 0.00 | 0.0 | 9.30 | yellow | cash | Upper West Side South | Upper West Side South | Manhattan | Manhattan | 1 |
reg.predict(df[:2][cols])
array([14.79339642, 9.80261182])
For example, we can view 14.8 as the predicted output for the 0th row. The predict method isn’t doing anything mysterious: it’s just evaluating this linear function on the given inputs. Here is the by-hand computation for the 0th row, using the rounded values of the coefficients and intercept.
2.63*1.6+1.33*2.15+1.11*0+1.55*1+6.17
14.787500000000001
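We can reproduce the prediction exactly with the full-precision values; a sketch, assuming reg and cols are still in scope:

# Dot product of the 0th row's inputs with the coefficients, plus the intercept.
reg.intercept_ + df.loc[0, cols] @ reg.coef_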
Polynomial regression
Last time, we fit a degree 9 polynomial model to this data, using “distance” as the (only) input variable and “total” as the output variable. Here is the new task:

Using 100 training points, adapt the code from last time to fit polynomial models of each degree from 1 to 24. Plot the resulting polynomials for \(0 \leq x \leq M\), where \(M\) is the maximum “distance” value within the training data.
A lot of this code was copied from last time, and then adjusted to the current goals.
from sklearn.model_selection import train_test_split
We’re using 100 training points instead of the 40 from last time, so we should expect slightly less overfitting.
df_train, df_test = train_test_split(df, train_size=100)
df_train.shape
(100, 15)
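Note that without a random_state argument, the split (and hence everything below) will differ on each run; a sketch of a reproducible version:

# Fix the random seed so the same 100 training rows are chosen every time.
df_train, df_test = train_test_split(df, train_size=100, random_state=0)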
c = alt.Chart(df_train).mark_circle().encode(
    x="distance",
    y="total"
)
c
df_train["distance"].max()
26.92
import numpy as np

# Distances from 0 to the training maximum in steps of 0.1, for plotting smooth curves.
df_plot = pd.DataFrame({"distance": np.arange(0, df_train["distance"].max()+0.1, 0.1)})
df_plot.head()
 | distance
---|---|
0 | 0.0 |
1 | 0.1 |
2 | 0.2 |
3 | 0.3 |
4 | 0.4 |
cols = []

# For each degree from 1 to 24, add a power-of-distance column
# to both the training data and the plotting data.
for deg in range(1, 25):
    col = f"d{deg}"
    cols.append(col)
    for x in [df_train, df_plot]:
        x[col] = x["distance"]**deg
df_plot.head()
 | distance | d1 | d2 | d3 | d4 | d5 | d6 | d7 | d8 | d9 | ... | d15 | d16 | d17 | d18 | d19 | d20 | d21 | d22 | d23 | d24
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 0.0 | 0.00 | 0.000 | 0.0000 | 0.00000 | 0.000000 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | ... | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
1 | 0.1 | 0.1 | 0.01 | 0.001 | 0.0001 | 0.00001 | 0.000001 | 1.000000e-07 | 1.000000e-08 | 1.000000e-09 | ... | 1.000000e-15 | 1.000000e-16 | 1.000000e-17 | 1.000000e-18 | 1.000000e-19 | 1.000000e-20 | 1.000000e-21 | 1.000000e-22 | 1.000000e-23 | 1.000000e-24 |
2 | 0.2 | 0.2 | 0.04 | 0.008 | 0.0016 | 0.00032 | 0.000064 | 1.280000e-05 | 2.560000e-06 | 5.120000e-07 | ... | 3.276800e-11 | 6.553600e-12 | 1.310720e-12 | 2.621440e-13 | 5.242880e-14 | 1.048576e-14 | 2.097152e-15 | 4.194304e-16 | 8.388608e-17 | 1.677722e-17 |
3 | 0.3 | 0.3 | 0.09 | 0.027 | 0.0081 | 0.00243 | 0.000729 | 2.187000e-04 | 6.561000e-05 | 1.968300e-05 | ... | 1.434891e-08 | 4.304672e-09 | 1.291402e-09 | 3.874205e-10 | 1.162261e-10 | 3.486784e-11 | 1.046035e-11 | 3.138106e-12 | 9.414318e-13 | 2.824295e-13 |
4 | 0.4 | 0.4 | 0.16 | 0.064 | 0.0256 | 0.01024 | 0.004096 | 1.638400e-03 | 6.553600e-04 | 2.621440e-04 | ... | 1.073742e-06 | 4.294967e-07 | 1.717987e-07 | 6.871948e-08 | 2.748779e-08 | 1.099512e-08 | 4.398047e-09 | 1.759219e-09 | 7.036874e-10 | 2.814750e-10 |
5 rows × 25 columns
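As an aside, scikit-learn can build these power columns for us; a sketch using PolynomialFeatures (not what we do below):

from sklearn.preprocessing import PolynomialFeatures

# degree=24 matches the d1, ..., d24 columns above; include_bias=False omits
# the constant column, since LinearRegression fits its own intercept.
poly = PolynomialFeatures(degree=24, include_bias=False)
arr = poly.fit_transform(df_train[["distance"]])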
cols
['d1',
'd2',
'd3',
'd4',
'd5',
'd6',
'd7',
'd8',
'd9',
'd10',
'd11',
'd12',
'd13',
'd14',
'd15',
'd16',
'd17',
'd18',
'd19',
'd20',
'd21',
'd22',
'd23',
'd24']
cols[:4]
['d1', 'd2', 'd3', 'd4']
chart_list = []

# For each degree, fit a polynomial model and make a line chart of its
# predictions, clipped to a fixed y-axis so the charts are comparable.
for deg in range(1, 25):
    subcols = cols[:deg]
    reg = LinearRegression()
    reg.fit(df_train[subcols], df_train["total"])
    df_plot[f"Pred{deg}"] = reg.predict(df_plot[subcols])
    c_temp = alt.Chart(df_plot).mark_line(color="red", clip=True).encode(
        x="distance",
        y=alt.Y(f"Pred{deg}", scale=alt.Scale(domain=(0, 200)))
    )
    chart_list.append(c_temp)
# Layer each polynomial curve on top of the training-data scatter plot c.
both_charts = [c + d for d in chart_list]
The input to alt.vconcat needs to be one or more Altair charts, not a list of Altair charts.
alt.vconcat(both_charts)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_15733/187346778.py in <module>
----> 1 alt.vconcat(both_charts)
~/miniconda3/envs/math10s22/lib/python3.7/site-packages/altair/vegalite/v4/api.py in vconcat(*charts, **kwargs)
2330 def vconcat(*charts, **kwargs):
2331 """Concatenate charts vertically"""
-> 2332 return VConcatChart(vconcat=charts, **kwargs)
2333
2334
~/miniconda3/envs/math10s22/lib/python3.7/site-packages/altair/vegalite/v4/api.py in __init__(self, data, vconcat, **kwargs)
2304 # TODO: move common data to top level?
2305 for spec in vconcat:
-> 2306 _check_if_valid_subspec(spec, "VConcatChart")
2307 super(VConcatChart, self).__init__(data=data, vconcat=list(vconcat), **kwargs)
2308 self.data, self.vconcat = _combine_subchart_data(self.data, self.vconcat)
~/miniconda3/envs/math10s22/lib/python3.7/site-packages/altair/vegalite/v4/api.py in _check_if_valid_subspec(spec, classname)
2072
2073 if not isinstance(spec, (core.SchemaBase, dict)):
-> 2074 raise ValueError("Only chart objects can be used in {0}.".format(classname))
2075 for attr in TOPLEVEL_ONLY_KEYS:
2076 if isinstance(spec, core.SchemaBase):
ValueError: Only chart objects can be used in VConcatChart.
So we use list unpacking (the * operator) to pass each chart as a separate argument. Notice how the overfitting gets more extreme as the degree of the polynomial gets higher.
alt.vconcat(*both_charts)
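To see the overfitting numerically instead of visually, we could compare training and test errors for each degree; a sketch, assuming df_train, df_test, and cols from above:

from sklearn.metrics import mean_squared_error

# Add the same power-of-distance columns to the test set.
for deg in range(1, 25):
    df_test[f"d{deg}"] = df_test["distance"]**deg

# Refit each degree and compare the error on the training and test sets.
# For an overfit model, the test error is much larger than the training error.
for deg in range(1, 25):
    subcols = cols[:deg]
    reg = LinearRegression()
    reg.fit(df_train[subcols], df_train["total"])
    train_mse = mean_squared_error(df_train["total"], reg.predict(df_train[subcols]))
    test_mse = mean_squared_error(df_test["total"], reg.predict(df_test[subcols]))
    print(f"degree {deg}: train MSE {train_mse:.1f}, test MSE {test_mse:.1f}")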