Week 8 Videos

import numpy as np
import pandas as pd
import altair as alt

Generating quadratic data

from sklearn.datasets import make_regression
X, y = make_regression(n_samples=1000, n_features=1, bias=15, noise=20, random_state=4)
pd.DataFrame({"x":X, "y":y})
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_6239/429307098.py in <module>
----> 1 pd.DataFrame({"x":X, "y":y})

~/miniconda3/envs/math10s22/lib/python3.7/site-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)
    612         elif isinstance(data, dict):
    613             # GH#38939 de facto copy defaults to False only in non-dict cases
--> 614             mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
    615         elif isinstance(data, ma.MaskedArray):
    616             import numpy.ma.mrecords as mrecords

~/miniconda3/envs/math10s22/lib/python3.7/site-packages/pandas/core/internals/construction.py in dict_to_mgr(data, index, columns, dtype, typ, copy)
    463 
    464     return arrays_to_mgr(
--> 465         arrays, data_names, index, columns, dtype=dtype, typ=typ, consolidate=copy
    466     )
    467 

~/miniconda3/envs/math10s22/lib/python3.7/site-packages/pandas/core/internals/construction.py in arrays_to_mgr(arrays, arr_names, index, columns, dtype, verify_integrity, typ, consolidate)
    122 
    123         # don't force copy because getting jammed in an ndarray anyway
--> 124         arrays = _homogenize(arrays, index, dtype)
    125 
    126     else:

~/miniconda3/envs/math10s22/lib/python3.7/site-packages/pandas/core/internals/construction.py in _homogenize(data, index, dtype)
    588 
    589             val = sanitize_array(
--> 590                 val, index, dtype=dtype, copy=False, raise_cast_failure=False
    591             )
    592 

~/miniconda3/envs/math10s22/lib/python3.7/site-packages/pandas/core/construction.py in sanitize_array(data, index, dtype, copy, raise_cast_failure, allow_2d)
    574                 subarr = maybe_infer_to_datetimelike(subarr)
    575 
--> 576     subarr = _sanitize_ndim(subarr, data, dtype, index, allow_2d=allow_2d)
    577 
    578     if isinstance(subarr, np.ndarray):

~/miniconda3/envs/math10s22/lib/python3.7/site-packages/pandas/core/construction.py in _sanitize_ndim(result, data, dtype, index, allow_2d)
    625             if allow_2d:
    626                 return result
--> 627             raise ValueError("Data must be 1-dimensional")
    628         if is_object_dtype(dtype) and isinstance(dtype, ExtensionDtype):
    629             # i.e. PandasDtype("O")

ValueError: Data must be 1-dimensional
X.shape
(1000, 1)
X[:10]
array([[-0.20735394],
       [ 0.18362632],
       [ 0.33825293],
       [ 0.28220666],
       [-1.3474603 ],
       [ 0.54950758],
       [ 0.06797219],
       [-0.79674267],
       [-2.29291305],
       [-1.16156742]])
type(X)
numpy.ndarray
X.reshape(-1).shape
(1000,)
df = pd.DataFrame({"x":X.reshape(-1), "y":y})
alt.Chart(df).mark_circle().encode(
    x="x",
    y="y"
)
df["y"] = df["y"]**2
alt.Chart(df).mark_circle().encode(
    x="x",
    y="y"
)
df["y"] *= 1/50
alt.Chart(df).mark_circle().encode(
    x="x",
    y="y"
)
16000/50
320.0

PolynomialFeatures

df
x y
0 -0.207354 8.283916
1 0.183626 2.290559
2 0.338253 45.708364
3 0.282207 0.594082
4 -1.347460 2.214989
... ... ...
995 0.614307 33.987890
996 0.233874 16.966095
997 0.050562 30.240024
998 2.360360 123.332288
999 1.319017 101.775461

1000 rows × 2 columns

max_deg = 6
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=max_deg, include_bias=False)
poly.fit(df[["x"]])
PolynomialFeatures(degree=6, include_bias=False)
poly.transform(df[["x"]]).shape
(1000, 6)
pd.DataFrame(poly.transform(df[["x"]])).head()
0 1 2 3 4 5
0 -0.207354 0.042996 -0.008915 0.001849 -0.000383 0.000079
1 0.183626 0.033719 0.006192 0.001137 0.000209 0.000038
2 0.338253 0.114415 0.038701 0.013091 0.004428 0.001498
3 0.282207 0.079641 0.022475 0.006343 0.001790 0.000505
4 -1.347460 1.815649 -2.446515 3.296582 -4.442014 5.985437
cols = [f"x{i}" for i in range(1,max_deg+1)]
df[cols] = poly.transform(df[["x"]])
df.head()
x y x1 x2 x3 x4 x5 x6
0 -0.207354 8.283916 -0.207354 0.042996 -0.008915 0.001849 -0.000383 0.000079
1 0.183626 2.290559 0.183626 0.033719 0.006192 0.001137 0.000209 0.000038
2 0.338253 45.708364 0.338253 0.114415 0.038701 0.013091 0.004428 0.001498
3 0.282207 0.594082 0.282207 0.079641 0.022475 0.006343 0.001790 0.000505
4 -1.347460 2.214989 -1.347460 1.815649 -2.446515 3.296582 -4.442014 5.985437

Evaluating polynomial regression, Part 1

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
X_train, X_test, y_train, y_test = train_test_split(df[cols], df["y"], train_size=20, random_state=0)
for i in range(1,max_deg+1):
    reg = LinearRegression()
    sub_cols = cols[:i]
    print(sub_cols)
['x1']
['x1', 'x2']
['x1', 'x2', 'x3']
['x1', 'x2', 'x3', 'x4']
['x1', 'x2', 'x3', 'x4', 'x5']
['x1', 'x2', 'x3', 'x4', 'x5', 'x6']
train_error_dict = {}
test_error_dict = {}

for i in range(1,max_deg+1):
    reg = LinearRegression()
    sub_cols = cols[:i]
    reg.fit(X_train[sub_cols], y_train)
    train_error_dict[i] = mean_squared_error(reg.predict(X_train[sub_cols]), y_train)
    test_error_dict[i] = mean_squared_error(reg.predict(X_test[sub_cols]), y_test)
train_error_dict
{1: 273.65221070646004,
 2: 174.4690132350938,
 3: 171.7311919316596,
 4: 171.3812547748483,
 5: 109.07647969349776,
 6: 106.63430810917487}
test_error_dict
{1: 1585.3912941201534,
 2: 1128.0874914175256,
 3: 1134.1637387874664,
 4: 1190.2283257927493,
 5: 13294.49407116664,
 6: 18948.061196962655}

Evaluating polynomial regression, Part 2

X_train, X_test, y_train, y_test = train_test_split(df[cols], df["y"], train_size=0.8, random_state=0)
X_train.shape
(800, 6)
train_error_dict2 = {}
test_error_dict2 = {}

for i in range(1,max_deg+1):
    reg = LinearRegression()
    sub_cols = cols[:i]
    reg.fit(X_train[sub_cols], y_train)
    train_error_dict2[i] = mean_squared_error(reg.predict(X_train[sub_cols]), y_train)
    test_error_dict2[i] = mean_squared_error(reg.predict(X_test[sub_cols]), y_test)
train_error_dict2
{1: 1403.2377764663738,
 2: 874.9828631207221,
 3: 873.8798463724188,
 4: 869.1090703099768,
 5: 868.526343606727,
 6: 866.5825551013062}
test_error_dict2
{1: 1168.3993370294302,
 2: 578.2755896285452,
 3: 575.291312202487,
 4: 586.276780022717,
 5: 595.300138775016,
 6: 595.6424078820787}

Plotting the test error curve

U-shaped test error Source: Introduction to Statistical Learning

df_train = pd.DataFrame({"y":train_error_dict, "type": "train"})
df_test = pd.DataFrame({"y":test_error_dict, "type": "test"})
df_test
y type
1 1585.391294 test
2 1128.087491 test
3 1134.163739 test
4 1190.228326 test
5 13294.494071 test
6 18948.061197 test
df_small = pd.concat([df_train, df_test]).reset_index()
df_small
index y type
0 1 273.652211 train
1 2 174.469013 train
2 3 171.731192 train
3 4 171.381255 train
4 5 109.076480 train
5 6 106.634308 train
6 1 1585.391294 test
7 2 1128.087491 test
8 3 1134.163739 test
9 4 1190.228326 test
10 5 13294.494071 test
11 6 18948.061197 test
alt.Chart(df_small).mark_line(clip=True).encode(
    x="index:O",
    y=alt.Y("y", scale=alt.Scale(domain=(0,2000))),
    color="type"
)
df_train2 = pd.DataFrame({"y":train_error_dict2, "type": "train"})
df_test2 = pd.DataFrame({"y":test_error_dict2, "type": "test"})
df_test2
y type
1 1168.399337 test
2 578.275590 test
3 575.291312 test
4 586.276780 test
5 595.300139 test
6 595.642408 test
df_big = pd.concat([df_train2, df_test2]).reset_index()
alt.Chart(df_big).mark_line(clip=True).encode(
    x="index:O",
    y=alt.Y("y", scale=alt.Scale(domain=(0,2000))),
    color="type"
)