Linear and Polynomial Regression with the taxis dataset

Linear and Polynomial Regression with the taxis dataset

import pandas as pd
import altair as alt
alt.data_transformers.enable('default', max_rows=10000)
import seaborn as sns
df = sns.load_dataset("taxis").dropna()

Linear regression

  • Fit a linear regression model to the data from the taxis dataset, using multiple input variables (also called features, also called predictors), and with “total” as the output variable (the target).

df.head()
pickup dropoff passengers distance fare tip tolls total color payment pickup_zone dropoff_zone pickup_borough dropoff_borough
0 2019-03-23 20:21:09 2019-03-23 20:27:24 1 1.60 7.0 2.15 0.0 12.95 yellow credit card Lenox Hill West UN/Turtle Bay South Manhattan Manhattan
1 2019-03-04 16:11:55 2019-03-04 16:19:00 1 0.79 5.0 0.00 0.0 9.30 yellow cash Upper West Side South Upper West Side South Manhattan Manhattan
2 2019-03-27 17:53:01 2019-03-27 18:00:25 1 1.37 7.5 2.36 0.0 14.16 yellow credit card Alphabet City West Village Manhattan Manhattan
3 2019-03-10 01:23:59 2019-03-10 01:49:51 1 7.70 27.0 6.15 0.0 36.95 yellow credit card Hudson Sq Yorkville West Manhattan Manhattan
4 2019-03-30 13:27:42 2019-03-30 13:37:14 3 2.16 9.0 1.10 0.0 13.40 yellow credit card Midtown East Yorkville West Manhattan Manhattan

What are the rows with the biggest values in the “tolls” column?

df.sort_values("tolls", ascending=False)
pickup dropoff passengers distance fare tip tolls total color payment pickup_zone dropoff_zone pickup_borough dropoff_borough
5364 2019-03-17 16:59:17 2019-03-17 18:04:08 2 36.70 150.00 0.00 24.02 174.82 yellow cash JFK Airport JFK Airport Queens Queens
2122 2019-03-08 00:40:32 2019-03-08 01:11:53 1 15.51 44.00 16.27 17.28 81.35 yellow credit card TriBeCa/Civic Center West Brighton Manhattan Staten Island
3640 2019-03-22 07:54:09 2019-03-22 09:05:13 1 16.42 52.00 0.00 12.50 67.80 yellow cash JFK Airport Murray Hill Queens Manhattan
5911 2019-03-09 12:27:51 2019-03-09 13:11:18 1 11.40 39.00 0.00 11.52 51.32 green credit card Windsor Terrace Clinton East Brooklyn Manhattan
5728 2019-03-01 17:07:09 2019-03-01 18:05:41 1 21.27 65.59 0.00 11.52 77.61 green credit card Cambria Heights Morningside Heights Queens Manhattan
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2203 2019-03-03 00:24:50 2019-03-03 00:56:17 1 4.72 22.00 5.16 0.00 30.96 yellow credit card Meatpacking/West Village West Williamsburg (South Side) Manhattan Brooklyn
2202 2019-03-14 22:32:33 2019-03-14 22:49:39 1 2.90 13.00 3.36 0.00 20.16 yellow credit card East Chelsea East Village Manhattan Manhattan
2201 2019-03-18 21:16:42 2019-03-18 21:27:49 1 3.00 11.50 3.06 0.00 18.36 yellow credit card Clinton East Upper East Side North Manhattan Manhattan
2200 2019-03-03 07:21:40 2019-03-03 07:39:12 1 7.20 22.00 7.55 0.00 32.85 yellow credit card Midtown Center World Trade Center Manhattan Manhattan
6432 2019-03-13 19:31:22 2019-03-13 19:48:02 1 3.85 15.00 3.36 0.00 20.16 green credit card Boerum Hill Windsor Terrace Brooklyn Brooklyn

6341 rows × 14 columns

Let’s try to use the following columns as the inputs for our linear regression.

cols = ["distance", "tip", "tolls", "pickup_borough"]
from sklearn.linear_model import LinearRegression
reg = LinearRegression()

The following call to fit doesn’t work, because the values in the “pickup_borough” column are strings, not numbers.

reg.fit(df[cols],df["total"])
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_15733/3118760667.py in <module>
----> 1 reg.fit(df[cols],df["total"])

~/miniconda3/envs/math10s22/lib/python3.7/site-packages/sklearn/linear_model/_base.py in fit(self, X, y, sample_weight)
    661 
    662         X, y = self._validate_data(
--> 663             X, y, accept_sparse=accept_sparse, y_numeric=True, multi_output=True
    664         )
    665 

~/miniconda3/envs/math10s22/lib/python3.7/site-packages/sklearn/base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
    579                 y = check_array(y, **check_y_params)
    580             else:
--> 581                 X, y = check_X_y(X, y, **check_params)
    582             out = X, y
    583 

~/miniconda3/envs/math10s22/lib/python3.7/site-packages/sklearn/utils/validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
    974         ensure_min_samples=ensure_min_samples,
    975         ensure_min_features=ensure_min_features,
--> 976         estimator=estimator,
    977     )
    978 

~/miniconda3/envs/math10s22/lib/python3.7/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
    744                     array = array.astype(dtype, casting="unsafe", copy=False)
    745                 else:
--> 746                     array = np.asarray(array, order=order, dtype=dtype)
    747             except ComplexWarning as complex_warning:
    748                 raise ValueError(

~/miniconda3/envs/math10s22/lib/python3.7/site-packages/pandas/core/generic.py in __array__(self, dtype)
   1991 
   1992     def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
-> 1993         return np.asarray(self._values, dtype=dtype)
   1994 
   1995     def __array_wrap__(

ValueError: could not convert string to float: 'Manhattan'

Let’s make a new column called “Manhattan”. This will contain 1 for the “Manhattan” pickup borough rows, and contain 0 for all the other rows.

df["Manhattan"] = 0
df["pickup_borough"] == "Manhattan"
0        True
1        True
2        True
3        True
4        True
        ...  
6428     True
6429    False
6430    False
6431    False
6432    False
Name: pickup_borough, Length: 6341, dtype: bool

(We could also store Boolean values directly in this new “Manhattan” column, but I think it’s less confusing to have 0 and 1.)

# Put a 1 (for True) where the value is Manhattan
df.loc[df["pickup_borough"] == "Manhattan", "Manhattan"] = 1
df
pickup dropoff passengers distance fare tip tolls total color payment pickup_zone dropoff_zone pickup_borough dropoff_borough Manhattan
0 2019-03-23 20:21:09 2019-03-23 20:27:24 1 1.60 7.0 2.15 0.0 12.95 yellow credit card Lenox Hill West UN/Turtle Bay South Manhattan Manhattan 1
1 2019-03-04 16:11:55 2019-03-04 16:19:00 1 0.79 5.0 0.00 0.0 9.30 yellow cash Upper West Side South Upper West Side South Manhattan Manhattan 1
2 2019-03-27 17:53:01 2019-03-27 18:00:25 1 1.37 7.5 2.36 0.0 14.16 yellow credit card Alphabet City West Village Manhattan Manhattan 1
3 2019-03-10 01:23:59 2019-03-10 01:49:51 1 7.70 27.0 6.15 0.0 36.95 yellow credit card Hudson Sq Yorkville West Manhattan Manhattan 1
4 2019-03-30 13:27:42 2019-03-30 13:37:14 3 2.16 9.0 1.10 0.0 13.40 yellow credit card Midtown East Yorkville West Manhattan Manhattan 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
6428 2019-03-31 09:51:53 2019-03-31 09:55:27 1 0.75 4.5 1.06 0.0 6.36 green credit card East Harlem North Central Harlem North Manhattan Manhattan 1
6429 2019-03-31 17:38:00 2019-03-31 18:34:23 1 18.74 58.0 0.00 0.0 58.80 green credit card Jamaica East Concourse/Concourse Village Queens Bronx 0
6430 2019-03-23 22:55:18 2019-03-23 23:14:25 1 4.14 16.0 0.00 0.0 17.30 green cash Crown Heights North Bushwick North Brooklyn Brooklyn 0
6431 2019-03-04 10:09:25 2019-03-04 10:14:29 1 1.12 6.0 0.00 0.0 6.80 green credit card East New York East Flatbush/Remsen Village Brooklyn Brooklyn 0
6432 2019-03-13 19:31:22 2019-03-13 19:48:02 1 3.85 15.0 3.36 0.0 20.16 green credit card Boerum Hill Windsor Terrace Brooklyn Brooklyn 0

6341 rows × 15 columns

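In our list of input columns, we now replace the old “pickup_borough” entry with the newly created “Manhattan” column. (The “pickup_borough” column itself stays in the DataFrame; we just don’t use it as an input.)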

cols = ['distance', 'tip', 'tolls', 'Manhattan']
reg.fit(df[cols],df["total"])
LinearRegression()

The goal of the fit method is to find the following coefficients, as well as the intercept.

reg.coef_
array([2.6294669 , 1.3306588 , 1.11487961, 1.55477579])

You should interpret the following as saying that the total cost of the taxi ride will be modeled by a formula involving 2.63 times the distance traveled. This can be interpreted as $2.63 per mile.

pd.Series(reg.coef_, index=cols)
distance     2.629467
tip          1.330659
tolls        1.114880
Manhattan    1.554776
dtype: float64
reg.intercept_
6.170557177757704
df[:2]
pickup dropoff passengers distance fare tip tolls total color payment pickup_zone dropoff_zone pickup_borough dropoff_borough Manhattan
0 2019-03-23 20:21:09 2019-03-23 20:27:24 1 1.60 7.0 2.15 0.0 12.95 yellow credit card Lenox Hill West UN/Turtle Bay South Manhattan Manhattan 1
1 2019-03-04 16:11:55 2019-03-04 16:19:00 1 0.79 5.0 0.00 0.0 9.30 yellow cash Upper West Side South Upper West Side South Manhattan Manhattan 1
reg.predict(df[:2][cols])
array([14.79339642,  9.80261182])

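For example, we can view 14.8 as the predicted output for the 0th row. The predict method isn’t doing anything mysterious. It’s just evaluating this linear function on the given inputs. Here is the by-hand computation for the 0th row. (The result differs slightly from the 14.79 above only because we rounded the coefficients and the intercept to two decimal places.)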

2.63*1.6+1.33*2.15+1.11*0+1.55*1+6.17
14.787500000000001

Polynomial regression

Last time, we fit a degree 9 polynomial model to this data, using “distance” as the (only) input variable and using “total” as the output variable. The code from last time is below.

Using 100 training points, adapt the code from last time to fit models of different degrees, for each degree from 1 to 24. Plot the resulting polynomials for \(0 \leq x \leq M\), where \(M\) is the maximum “distance” value within the training data.

A lot of this code was copied from last time, and then adjusted to the current goals.

from sklearn.model_selection import train_test_split

We’re using 100 data points instead of the 40 from last time. With more data points, we should expect slightly less overfitting this time.

# train_size=100 keeps exactly 100 rows for training. No random_state is given,
# so the rows chosen (and the numbers shown below) vary from run to run.
df_train, df_test = train_test_split(df, train_size=100)
df_train.shape
(100, 15)
# Scatter plot of the training points (distance vs. total). The chart is kept
# in `c` so that it can be layered with each fitted curve later on.
c = alt.Chart(df_train).mark_circle().encode(
    x="distance",
    y="total"
)
c
df_train["distance"].max()
26.92
import numpy as np
df_plot = pd.DataFrame({"distance":np.arange(0,df_train["distance"].max()+0.1,0.1)})
df_plot.head()
distance
0 0.0
1 0.1
2 0.2
3 0.3
4 0.4
# Name the power columns d1, ..., d24 up front, then fill them in for both the
# training data and the plotting grid. A degree-k fit can then simply use the
# first k names in `cols`.
cols = [f"d{deg}" for deg in range(1, 25)]
for frame in (df_train, df_plot):
    for deg, col in enumerate(cols, start=1):
        frame[col] = frame["distance"]**deg
df_plot.head()
distance d1 d2 d3 d4 d5 d6 d7 d8 d9 ... d15 d16 d17 d18 d19 d20 d21 d22 d23 d24
0 0.0 0.0 0.00 0.000 0.0000 0.00000 0.000000 0.000000e+00 0.000000e+00 0.000000e+00 ... 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
1 0.1 0.1 0.01 0.001 0.0001 0.00001 0.000001 1.000000e-07 1.000000e-08 1.000000e-09 ... 1.000000e-15 1.000000e-16 1.000000e-17 1.000000e-18 1.000000e-19 1.000000e-20 1.000000e-21 1.000000e-22 1.000000e-23 1.000000e-24
2 0.2 0.2 0.04 0.008 0.0016 0.00032 0.000064 1.280000e-05 2.560000e-06 5.120000e-07 ... 3.276800e-11 6.553600e-12 1.310720e-12 2.621440e-13 5.242880e-14 1.048576e-14 2.097152e-15 4.194304e-16 8.388608e-17 1.677722e-17
3 0.3 0.3 0.09 0.027 0.0081 0.00243 0.000729 2.187000e-04 6.561000e-05 1.968300e-05 ... 1.434891e-08 4.304672e-09 1.291402e-09 3.874205e-10 1.162261e-10 3.486784e-11 1.046035e-11 3.138106e-12 9.414318e-13 2.824295e-13
4 0.4 0.4 0.16 0.064 0.0256 0.01024 0.004096 1.638400e-03 6.553600e-04 2.621440e-04 ... 1.073742e-06 4.294967e-07 1.717987e-07 6.871948e-08 2.748779e-08 1.099512e-08 4.398047e-09 1.759219e-09 7.036874e-10 2.814750e-10

5 rows × 25 columns

cols
['d1',
 'd2',
 'd3',
 'd4',
 'd5',
 'd6',
 'd7',
 'd8',
 'd9',
 'd10',
 'd11',
 'd12',
 'd13',
 'd14',
 'd15',
 'd16',
 'd17',
 'd18',
 'd19',
 'd20',
 'd21',
 'd22',
 'd23',
 'd24']
cols[:4]
['d1', 'd2', 'd3', 'd4']
# For every degree, fit a polynomial (a linear regression on the first `deg`
# power columns), store its predictions on the plotting grid, and draw them
# as a red curve.
chart_list = []

for deg in range(1, 25):
    powers = cols[:deg]
    reg = LinearRegression().fit(df_train[powers], df_train["total"])
    pred_name = f"Pred{deg}"
    df_plot[pred_name] = reg.predict(df_plot[powers])
    # The y-axis is fixed to 0-200; clip=True hides whatever falls outside it.
    y_axis = alt.Y(pred_name, scale=alt.Scale(domain=(0, 200)))
    curve = alt.Chart(df_plot).mark_line(color="red", clip=True).encode(
        x="distance",
        y=y_axis,
    )
    chart_list.append(curve)

# Layer the training scatter plot `c` with each fitted curve.
both_charts = [c + curve for curve in chart_list]

The input to alt.vconcat needs to be one or more Altair charts, not a list of Altair charts.

alt.vconcat(both_charts)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_15733/187346778.py in <module>
----> 1 alt.vconcat(both_charts)

~/miniconda3/envs/math10s22/lib/python3.7/site-packages/altair/vegalite/v4/api.py in vconcat(*charts, **kwargs)
   2330 def vconcat(*charts, **kwargs):
   2331     """Concatenate charts vertically"""
-> 2332     return VConcatChart(vconcat=charts, **kwargs)
   2333 
   2334 

~/miniconda3/envs/math10s22/lib/python3.7/site-packages/altair/vegalite/v4/api.py in __init__(self, data, vconcat, **kwargs)
   2304         # TODO: move common data to top level?
   2305         for spec in vconcat:
-> 2306             _check_if_valid_subspec(spec, "VConcatChart")
   2307         super(VConcatChart, self).__init__(data=data, vconcat=list(vconcat), **kwargs)
   2308         self.data, self.vconcat = _combine_subchart_data(self.data, self.vconcat)

~/miniconda3/envs/math10s22/lib/python3.7/site-packages/altair/vegalite/v4/api.py in _check_if_valid_subspec(spec, classname)
   2072 
   2073     if not isinstance(spec, (core.SchemaBase, dict)):
-> 2074         raise ValueError("Only chart objects can be used in {0}.".format(classname))
   2075     for attr in TOPLEVEL_ONLY_KEYS:
   2076         if isinstance(spec, core.SchemaBase):

ValueError: Only chart objects can be used in VConcatChart.

So we use list unpacking: the * in front of both_charts passes each chart in the list to alt.vconcat as a separate argument. Notice how the overfitting gets more extreme as the degree of the polynomial gets higher.

alt.vconcat(*both_charts)