Linear and Polynomial Regression with the taxis dataset¶

import pandas as pd
import altair as alt
alt.data_transformers.enable('default', max_rows=10000)
import seaborn as sns

# Load seaborn's "taxis" example dataset and drop every row that contains a missing value.
df = sns.load_dataset("taxis").dropna()

Linear regression¶

Fit a linear regression model to the data from the taxis dataset, using multiple input variables (also called features, also called predictors), and with “total” as the output variable (the target).

df.head()

	pickup	dropoff	passengers	distance	fare	tip	tolls	total	color	payment	pickup_zone	dropoff_zone	pickup_borough	dropoff_borough
0	2019-03-23 20:21:09	2019-03-23 20:27:24	1	1.60	7.0	2.15	0.0	12.95	yellow	credit card	Lenox Hill West	UN/Turtle Bay South	Manhattan	Manhattan
1	2019-03-04 16:11:55	2019-03-04 16:19:00	1	0.79	5.0	0.00	0.0	9.30	yellow	cash	Upper West Side South	Upper West Side South	Manhattan	Manhattan
2	2019-03-27 17:53:01	2019-03-27 18:00:25	1	1.37	7.5	2.36	0.0	14.16	yellow	credit card	Alphabet City	West Village	Manhattan	Manhattan
3	2019-03-10 01:23:59	2019-03-10 01:49:51	1	7.70	27.0	6.15	0.0	36.95	yellow	credit card	Hudson Sq	Yorkville West	Manhattan	Manhattan
4	2019-03-30 13:27:42	2019-03-30 13:37:14	3	2.16	9.0	1.10	0.0	13.40	yellow	credit card	Midtown East	Yorkville West	Manhattan	Manhattan

What are the rows with the biggest values in the “tolls” column?

df.sort_values("tolls", ascending=False)

	pickup	dropoff	passengers	distance	fare	tip	tolls	total	color	payment	pickup_zone	dropoff_zone	pickup_borough	dropoff_borough
5364	2019-03-17 16:59:17	2019-03-17 18:04:08	2	36.70	150.00	0.00	24.02	174.82	yellow	cash	JFK Airport	JFK Airport	Queens	Queens
2122	2019-03-08 00:40:32	2019-03-08 01:11:53	1	15.51	44.00	16.27	17.28	81.35	yellow	credit card	TriBeCa/Civic Center	West Brighton	Manhattan	Staten Island
3640	2019-03-22 07:54:09	2019-03-22 09:05:13	1	16.42	52.00	0.00	12.50	67.80	yellow	cash	JFK Airport	Murray Hill	Queens	Manhattan
5911	2019-03-09 12:27:51	2019-03-09 13:11:18	1	11.40	39.00	0.00	11.52	51.32	green	credit card	Windsor Terrace	Clinton East	Brooklyn	Manhattan
5728	2019-03-01 17:07:09	2019-03-01 18:05:41	1	21.27	65.59	0.00	11.52	77.61	green	credit card	Cambria Heights	Morningside Heights	Queens	Manhattan
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
2203	2019-03-03 00:24:50	2019-03-03 00:56:17	1	4.72	22.00	5.16	0.00	30.96	yellow	credit card	Meatpacking/West Village West	Williamsburg (South Side)	Manhattan	Brooklyn
2202	2019-03-14 22:32:33	2019-03-14 22:49:39	1	2.90	13.00	3.36	0.00	20.16	yellow	credit card	East Chelsea	East Village	Manhattan	Manhattan
2201	2019-03-18 21:16:42	2019-03-18 21:27:49	1	3.00	11.50	3.06	0.00	18.36	yellow	credit card	Clinton East	Upper East Side North	Manhattan	Manhattan
2200	2019-03-03 07:21:40	2019-03-03 07:39:12	1	7.20	22.00	7.55	0.00	32.85	yellow	credit card	Midtown Center	World Trade Center	Manhattan	Manhattan
6432	2019-03-13 19:31:22	2019-03-13 19:48:02	1	3.85	15.00	3.36	0.00	20.16	green	credit card	Boerum Hill	Windsor Terrace	Brooklyn	Brooklyn

6341 rows × 14 columns

Let’s try to use the following columns as the inputs for our linear regression.

# Candidate input columns. Note that "pickup_borough" holds strings (borough names), not numbers.
cols = ["distance", "tip", "tolls", "pickup_borough"]

from sklearn.linear_model import LinearRegression

# Create an unfitted linear regression model; fit is called on it in a later cell.
reg = LinearRegression()

The following call to fit doesn’t work, because the values in the “pickup_borough” column are strings, not numbers.

reg.fit(df[cols],df["total"])

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_15733/3118760667.py in <module>
----> 1 reg.fit(df[cols],df["total"])

~/miniconda3/envs/math10s22/lib/python3.7/site-packages/sklearn/linear_model/_base.py in fit(self, X, y, sample_weight)
    661 
    662         X, y = self._validate_data(
--> 663             X, y, accept_sparse=accept_sparse, y_numeric=True, multi_output=True
    664         )
    665 

~/miniconda3/envs/math10s22/lib/python3.7/site-packages/sklearn/base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
    579                 y = check_array(y, **check_y_params)
    580             else:
--> 581                 X, y = check_X_y(X, y, **check_params)
    582             out = X, y
    583 

~/miniconda3/envs/math10s22/lib/python3.7/site-packages/sklearn/utils/validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
    974         ensure_min_samples=ensure_min_samples,
    975         ensure_min_features=ensure_min_features,
--> 976         estimator=estimator,
    977     )
    978 

~/miniconda3/envs/math10s22/lib/python3.7/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
    744                     array = array.astype(dtype, casting="unsafe", copy=False)
    745                 else:
--> 746                     array = np.asarray(array, order=order, dtype=dtype)
    747             except ComplexWarning as complex_warning:
    748                 raise ValueError(

~/miniconda3/envs/math10s22/lib/python3.7/site-packages/pandas/core/generic.py in __array__(self, dtype)
   1991 
   1992     def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
-> 1993         return np.asarray(self._values, dtype=dtype)
   1994 
   1995     def __array_wrap__(

ValueError: could not convert string to float: 'Manhattan'

Let’s make a new column called “Manhattan”. This will contain 1 for the “Manhattan” pickup borough rows, and contain 0 for all the other rows.

df["Manhattan"] = 0

df["pickup_borough"] == "Manhattan"

0        True
1        True
2        True
3        True
4        True
        ...  
6428     True
6429    False
6430    False
6431    False
6432    False
Name: pickup_borough, Length: 6341, dtype: bool

(We could also store the Boolean values directly in this new “Manhattan” column, but I think it’s less confusing to have 0 and 1.)

# Boolean mask: True exactly on the rows whose pickup borough is Manhattan.
is_manhattan = df["pickup_borough"] == "Manhattan"
# Write a 1 into the "Manhattan" column on those rows; all other rows are left untouched.
df.loc[is_manhattan, "Manhattan"] = 1

df

	pickup	dropoff	passengers	distance	fare	tip	tolls	total	color	payment	pickup_zone	dropoff_zone	pickup_borough	dropoff_borough	Manhattan
0	2019-03-23 20:21:09	2019-03-23 20:27:24	1	1.60	7.0	2.15	0.0	12.95	yellow	credit card	Lenox Hill West	UN/Turtle Bay South	Manhattan	Manhattan	1
1	2019-03-04 16:11:55	2019-03-04 16:19:00	1	0.79	5.0	0.00	0.0	9.30	yellow	cash	Upper West Side South	Upper West Side South	Manhattan	Manhattan	1
2	2019-03-27 17:53:01	2019-03-27 18:00:25	1	1.37	7.5	2.36	0.0	14.16	yellow	credit card	Alphabet City	West Village	Manhattan	Manhattan	1
3	2019-03-10 01:23:59	2019-03-10 01:49:51	1	7.70	27.0	6.15	0.0	36.95	yellow	credit card	Hudson Sq	Yorkville West	Manhattan	Manhattan	1
4	2019-03-30 13:27:42	2019-03-30 13:37:14	3	2.16	9.0	1.10	0.0	13.40	yellow	credit card	Midtown East	Yorkville West	Manhattan	Manhattan	1
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
6428	2019-03-31 09:51:53	2019-03-31 09:55:27	1	0.75	4.5	1.06	0.0	6.36	green	credit card	East Harlem North	Central Harlem North	Manhattan	Manhattan	1
6429	2019-03-31 17:38:00	2019-03-31 18:34:23	1	18.74	58.0	0.00	0.0	58.80	green	credit card	Jamaica	East Concourse/Concourse Village	Queens	Bronx	0
6430	2019-03-23 22:55:18	2019-03-23 23:14:25	1	4.14	16.0	0.00	0.0	17.30	green	cash	Crown Heights North	Bushwick North	Brooklyn	Brooklyn	0
6431	2019-03-04 10:09:25	2019-03-04 10:14:29	1	1.12	6.0	0.00	0.0	6.80	green	credit card	East New York	East Flatbush/Remsen Village	Brooklyn	Brooklyn	0
6432	2019-03-13 19:31:22	2019-03-13 19:48:02	1	3.85	15.0	3.36	0.0	20.16	green	credit card	Boerum Hill	Windsor Terrace	Brooklyn	Brooklyn	0

6341 rows × 15 columns

In our list of input columns, we now replace “pickup_borough” with the newly created “Manhattan” column. (The “pickup_borough” column itself stays in the DataFrame.)

# Same inputs as before, but with the numeric 0/1 "Manhattan" column in place of "pickup_borough".
cols = ['distance', 'tip', 'tolls', 'Manhattan']

# Fit the model using these four input columns, with "total" as the target.
reg.fit(df[cols],df["total"])

LinearRegression()

The goal of the fit method is to find the following coefficients, as well as the intercept.

reg.coef_

array([2.6294669 , 1.3306588 , 1.11487961, 1.55477579])

The coefficient attached to “distance” is about 2.63. You should interpret this as saying that, with the other inputs held fixed, the predicted total cost of the taxi ride increases by about $2.63 for each additional mile traveled. The Series below labels each coefficient with its input column.

pd.Series(reg.coef_, index=cols)

distance     2.629467
tip          1.330659
tolls        1.114880
Manhattan    1.554776
dtype: float64

reg.intercept_

6.170557177757704

df[:2]

	pickup	dropoff	passengers	distance	fare	tip	tolls	total	color	payment	pickup_zone	dropoff_zone	pickup_borough	dropoff_borough	Manhattan
0	2019-03-23 20:21:09	2019-03-23 20:27:24	1	1.60	7.0	2.15	0.0	12.95	yellow	credit card	Lenox Hill West	UN/Turtle Bay South	Manhattan	Manhattan	1
1	2019-03-04 16:11:55	2019-03-04 16:19:00	1	0.79	5.0	0.00	0.0	9.30	yellow	cash	Upper West Side South	Upper West Side South	Manhattan	Manhattan	1

# Predict "total" for the first two rows, restricted to the model's input columns.
reg.predict(df[cols].head(2))

array([14.79339642,  9.80261182])

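For example, we can view 14.8 as the predicted output for the 0th row. The predict method isn’t doing anything mysterious. It’s just evaluating this linear function on the given inputs. Here is the by-hand computation for the 0th row. (The result differs slightly from the 14.793 above only because the coefficients and intercept have been rounded to two decimal places.)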

# (coefficient * input) for distance, tip, tolls and Manhattan in that order, plus the intercept.
# The coefficients and the intercept are rounded to two decimal places.
2.63*1.6+1.33*2.15+1.11*0+1.55*1+6.17

14.787500000000001

Polynomial regression¶

Last time, we fit a degree 9 polynomial model to this data, using “distance” as the (only) input variable and using “total” as the output variable. The code from last time is below.

Using 100 training points, adapt the code from last time to fit models of different degrees, for each degree from 1 to 24. Plot the resulting polynomials for $0 \leq x \leq M$, where $M$ is the maximum “distance” value within the training data.

A lot of this code was copied from last time, and then adjusted to the current goals.

from sklearn.model_selection import train_test_split

We’re using 100 data points, instead of the 40 from last time, so we should expect slightly less overfitting this time.

# Randomly choose 100 rows for training; the remaining rows form the test set.
# No random_state is passed, so the sample (and every value derived from it) changes from run to run.
df_train, df_test = train_test_split(df, train_size=100)
# New columns are assigned to df_train later on. Taking an explicit copy makes it an
# independent DataFrame, so those assignments do not trigger pandas' SettingWithCopyWarning.
df_train = df_train.copy()

df_train.shape

(100, 15)

# Scatter plot of the training rows: "distance" on the x-axis, "total" on the y-axis.
c = alt.Chart(df_train).mark_circle().encode(x="distance", y="total")

df_train["distance"].max()

26.92

import numpy as np

# Grid of "distance" values starting at 0 with step 0.1, reaching roughly the largest training distance.
max_distance = df_train["distance"].max()
df_plot = pd.DataFrame({"distance": np.arange(0, max_distance + 0.1, 0.1)})

df_plot.head()

	distance
0	0.0
1	0.1
2	0.2
3	0.3
4	0.4

# Names of the power columns: "d1", "d2", ..., "d24".
cols = [f"d{deg}" for deg in range(1, 25)]

# Column "d<k>" holds distance**k, both in the training data and in the plotting grid.
for deg, col in enumerate(cols, start=1):
    for frame in (df_train, df_plot):
        frame[col] = frame["distance"] ** deg

df_plot.head()

	distance	d1	d2	d3	d4	d5	d6	d7	d8	d9	...	d15	d16	d17	d18	d19	d20	d21	d22	d23	d24
0	0.0	0.0	0.00	0.000	0.0000	0.00000	0.000000	0.000000e+00	0.000000e+00	0.000000e+00	...	0.000000e+00	0.000000e+00	0.000000e+00	0.000000e+00	0.000000e+00	0.000000e+00	0.000000e+00	0.000000e+00	0.000000e+00	0.000000e+00
1	0.1	0.1	0.01	0.001	0.0001	0.00001	0.000001	1.000000e-07	1.000000e-08	1.000000e-09	...	1.000000e-15	1.000000e-16	1.000000e-17	1.000000e-18	1.000000e-19	1.000000e-20	1.000000e-21	1.000000e-22	1.000000e-23	1.000000e-24
2	0.2	0.2	0.04	0.008	0.0016	0.00032	0.000064	1.280000e-05	2.560000e-06	5.120000e-07	...	3.276800e-11	6.553600e-12	1.310720e-12	2.621440e-13	5.242880e-14	1.048576e-14	2.097152e-15	4.194304e-16	8.388608e-17	1.677722e-17
3	0.3	0.3	0.09	0.027	0.0081	0.00243	0.000729	2.187000e-04	6.561000e-05	1.968300e-05	...	1.434891e-08	4.304672e-09	1.291402e-09	3.874205e-10	1.162261e-10	3.486784e-11	1.046035e-11	3.138106e-12	9.414318e-13	2.824295e-13
4	0.4	0.4	0.16	0.064	0.0256	0.01024	0.004096	1.638400e-03	6.553600e-04	2.621440e-04	...	1.073742e-06	4.294967e-07	1.717987e-07	6.871948e-08	2.748779e-08	1.099512e-08	4.398047e-09	1.759219e-09	7.036874e-10	2.814750e-10

5 rows × 25 columns

cols

['d1',
 'd2',
 'd3',
 'd4',
 'd5',
 'd6',
 'd7',
 'd8',
 'd9',
 'd10',
 'd11',
 'd12',
 'd13',
 'd14',
 'd15',
 'd16',
 'd17',
 'd18',
 'd19',
 'd20',
 'd21',
 'd22',
 'd23',
 'd24']

cols[:4]

['d1', 'd2', 'd3', 'd4']

chart_list = []

# Fit one model per degree; the degree-k model uses the first k power columns d1, ..., dk.
for deg in range(1, 25):
    subcols = cols[:deg]
    pred_name = f"Pred{deg}"
    reg = LinearRegression().fit(df_train[subcols], df_train["total"])
    # Evaluate the fitted polynomial on the plotting grid.
    df_plot[pred_name] = reg.predict(df_plot[subcols])
    # Fix the y-axis to 0-200; clip=True on the mark hides the parts of the curve outside that range.
    y_encoding = alt.Y(pred_name, scale=alt.Scale(domain=(0, 200)))
    curve = alt.Chart(df_plot).mark_line(color="red", clip=True).encode(
        x="distance",
        y=y_encoding,
    )
    chart_list.append(curve)

# Layer the scatter plot of the training data (c) with each fitted curve, giving one combined chart per degree.
both_charts = [c+d for d in chart_list]

The input to alt.vconcat needs to be one or more Altair charts, not a list of Altair charts.

alt.vconcat(both_charts)

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_15733/187346778.py in <module>
----> 1 alt.vconcat(both_charts)

~/miniconda3/envs/math10s22/lib/python3.7/site-packages/altair/vegalite/v4/api.py in vconcat(*charts, **kwargs)
   2330 def vconcat(*charts, **kwargs):
   2331     """Concatenate charts vertically"""
-> 2332     return VConcatChart(vconcat=charts, **kwargs)
   2333 
   2334 

~/miniconda3/envs/math10s22/lib/python3.7/site-packages/altair/vegalite/v4/api.py in __init__(self, data, vconcat, **kwargs)
   2304         # TODO: move common data to top level?
   2305         for spec in vconcat:
-> 2306             _check_if_valid_subspec(spec, "VConcatChart")
   2307         super(VConcatChart, self).__init__(data=data, vconcat=list(vconcat), **kwargs)
   2308         self.data, self.vconcat = _combine_subchart_data(self.data, self.vconcat)

~/miniconda3/envs/math10s22/lib/python3.7/site-packages/altair/vegalite/v4/api.py in _check_if_valid_subspec(spec, classname)
   2072 
   2073     if not isinstance(spec, (core.SchemaBase, dict)):
-> 2074         raise ValueError("Only chart objects can be used in {0}.".format(classname))
   2075     for attr in TOPLEVEL_ONLY_KEYS:
   2076         if isinstance(spec, core.SchemaBase):

ValueError: Only chart objects can be used in VConcatChart.

So we use list unpacking. Notice how the overfitting gets more extreme as the degree of the polynomial gets higher.

alt.vconcat(*both_charts)

UC Irvine Math 10 S22

Linear and Polynomial Regression with the taxis dataset

Contents

Linear and Polynomial Regression with the taxis dataset¶

Linear regression¶

Polynomial regression¶