Week 8 Videos

Contents

Week 8 Videos¶

import numpy as np
import pandas as pd
import altair as alt

Generating quadratic data¶

from sklearn.datasets import make_regression

# Simulate 1000 points from a noisy linear model with a single feature.
# bias is the intercept of the underlying line and noise is the standard deviation
# of the Gaussian noise added to y; random_state fixes the sample for reproducibility.
# X comes back 2-dimensional (one column per feature), which matters in the next cell.
X, y = make_regression(n_samples=1000, n_features=1, bias=15, noise=20, random_state=4)

# Deliberate failure: X has shape (1000, 1), but each value in a dict passed to
# pd.DataFrame must be 1-dimensional, so this raises the ValueError shown below.
pd.DataFrame({"x":X, "y":y})

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_6239/429307098.py in <module>
----> 1 pd.DataFrame({"x":X, "y":y})

~/miniconda3/envs/math10s22/lib/python3.7/site-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)
    612         elif isinstance(data, dict):
    613             # GH#38939 de facto copy defaults to False only in non-dict cases
--> 614             mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
    615         elif isinstance(data, ma.MaskedArray):
    616             import numpy.ma.mrecords as mrecords

~/miniconda3/envs/math10s22/lib/python3.7/site-packages/pandas/core/internals/construction.py in dict_to_mgr(data, index, columns, dtype, typ, copy)
    463 
    464     return arrays_to_mgr(
--> 465         arrays, data_names, index, columns, dtype=dtype, typ=typ, consolidate=copy
    466     )
    467 

~/miniconda3/envs/math10s22/lib/python3.7/site-packages/pandas/core/internals/construction.py in arrays_to_mgr(arrays, arr_names, index, columns, dtype, verify_integrity, typ, consolidate)
    122 
    123         # don't force copy because getting jammed in an ndarray anyway
--> 124         arrays = _homogenize(arrays, index, dtype)
    125 
    126     else:

~/miniconda3/envs/math10s22/lib/python3.7/site-packages/pandas/core/internals/construction.py in _homogenize(data, index, dtype)
    588 
    589             val = sanitize_array(
--> 590                 val, index, dtype=dtype, copy=False, raise_cast_failure=False
    591             )
    592 

~/miniconda3/envs/math10s22/lib/python3.7/site-packages/pandas/core/construction.py in sanitize_array(data, index, dtype, copy, raise_cast_failure, allow_2d)
    574                 subarr = maybe_infer_to_datetimelike(subarr)
    575 
--> 576     subarr = _sanitize_ndim(subarr, data, dtype, index, allow_2d=allow_2d)
    577 
    578     if isinstance(subarr, np.ndarray):

~/miniconda3/envs/math10s22/lib/python3.7/site-packages/pandas/core/construction.py in _sanitize_ndim(result, data, dtype, index, allow_2d)
    625             if allow_2d:
    626                 return result
--> 627             raise ValueError("Data must be 1-dimensional")
    628         if is_object_dtype(dtype) and isinstance(dtype, ExtensionDtype):
    629             # i.e. PandasDtype("O")

ValueError: Data must be 1-dimensional

X.shape

(1000, 1)

X[:10]

array([[-0.20735394],
       [ 0.18362632],
       [ 0.33825293],
       [ 0.28220666],
       [-1.3474603 ],
       [ 0.54950758],
       [ 0.06797219],
       [-0.79674267],
       [-2.29291305],
       [-1.16156742]])

type(X)

numpy.ndarray

X.reshape(-1).shape

(1000,)

# Flatten X from shape (1000, 1) to (1000,) so it can be used as a DataFrame column.
df = pd.DataFrame({"x":X.reshape(-1), "y":y})

alt.Chart(df).mark_circle().encode(
    x="x",
    y="y"
)

# Square the (linear-in-x) response so that y now depends quadratically on x.
df["y"] = df["y"]**2

alt.Chart(df).mark_circle().encode(
    x="x",
    y="y"
)

# Shrink y by a factor of 50 to bring the squared values back to a smaller range.
df["y"] *= 1/50

alt.Chart(df).mark_circle().encode(
    x="x",
    y="y"
)

16000/50

320.0

PolynomialFeatures¶

df

	x	y
0	-0.207354	8.283916
1	0.183626	2.290559
2	0.338253	45.708364
3	0.282207	0.594082
4	-1.347460	2.214989
...	...	...
995	0.614307	33.987890
996	0.233874	16.966095
997	0.050562	30.240024
998	2.360360	123.332288
999	1.319017	101.775461

1000 rows × 2 columns

max_deg = 6

from sklearn.preprocessing import PolynomialFeatures

# include_bias=False omits the constant (degree-0) column, so the transform yields
# exactly max_deg columns: x, x^2, ..., x^max_deg.
poly = PolynomialFeatures(degree=max_deg, include_bias=False)

poly.fit(df[["x"]])

PolynomialFeatures(degree=6, include_bias=False)

poly.transform(df[["x"]]).shape

(1000, 6)

pd.DataFrame(poly.transform(df[["x"]])).head()

	0	1	2	3	4	5
0	-0.207354	0.042996	-0.008915	0.001849	-0.000383	0.000079
1	0.183626	0.033719	0.006192	0.001137	0.000209	0.000038
2	0.338253	0.114415	0.038701	0.013091	0.004428	0.001498
3	0.282207	0.079641	0.022475	0.006343	0.001790	0.000505
4	-1.347460	1.815649	-2.446515	3.296582	-4.442014	5.985437

# One column name per power of x: "x1", "x2", ..., up to "x{max_deg}".
cols = [f"x{i}" for i in range(1,max_deg+1)]

# Store the powers of x as new columns of df (x1 is a copy of the original x column).
df[cols] = poly.transform(df[["x"]])

df.head()

	x	y	x1	x2	x3	x4	x5	x6
0	-0.207354	8.283916	-0.207354	0.042996	-0.008915	0.001849	-0.000383	0.000079
1	0.183626	2.290559	0.183626	0.033719	0.006192	0.001137	0.000209	0.000038
2	0.338253	45.708364	0.338253	0.114415	0.038701	0.013091	0.004428	0.001498
3	0.282207	0.594082	0.282207	0.079641	0.022475	0.006343	0.001790	0.000505
4	-1.347460	2.214989	-1.347460	1.815649	-2.446515	3.296582	-4.442014	5.985437

Evaluating polynomial regression, Part 1¶

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# An integer train_size is an absolute row count: only 20 of the 1000 rows are used
# for training and the rest form the test set. random_state=0 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(df[cols], df["y"], train_size=20, random_state=0)

# Dry run of the loop used in the next cell: print which feature columns
# a degree-i polynomial fit will use.
for i in range(1,max_deg+1):
    reg = LinearRegression()  # created but not fitted; this cell only previews the slices
    sub_cols = cols[:i]
    print(sub_cols)

['x1']
['x1', 'x2']
['x1', 'x2', 'x3']
['x1', 'x2', 'x3', 'x4']
['x1', 'x2', 'x3', 'x4', 'x5']
['x1', 'x2', 'x3', 'x4', 'x5', 'x6']

# Mean squared error on the training and test sets, keyed by polynomial degree.
train_error_dict = {}
test_error_dict = {}

for i in range(1,max_deg+1):
    # A degree-i polynomial fit is a linear regression on the first i power columns.
    reg = LinearRegression()
    sub_cols = cols[:i]
    reg.fit(X_train[sub_cols], y_train)
    # scikit-learn metrics expect (y_true, y_pred) in that order. MSE happens to be
    # symmetric, so the values are unchanged, but the documented order keeps this
    # correct if the metric is swapped for an asymmetric one (e.g. r2_score).
    train_error_dict[i] = mean_squared_error(y_train, reg.predict(X_train[sub_cols]))
    test_error_dict[i] = mean_squared_error(y_test, reg.predict(X_test[sub_cols]))

train_error_dict

{1: 273.65221070646004,
 2: 174.4690132350938,
 3: 171.7311919316596,
 4: 171.3812547748483,
 5: 109.07647969349776,
 6: 106.63430810917487}

test_error_dict

{1: 1585.3912941201534,
 2: 1128.0874914175256,
 3: 1134.1637387874664,
 4: 1190.2283257927493,
 5: 13294.49407116664,
 6: 18948.061196962655}

Evaluating polynomial regression, Part 2¶

# A float train_size is a fraction of the data: 80% of the rows (800 of 1000) are
# used for training this time, with the same random_state as before.
X_train, X_test, y_train, y_test = train_test_split(df[cols], df["y"], train_size=0.8, random_state=0)

X_train.shape

(800, 6)

# Mean squared error on the larger training split and its test set, keyed by degree.
train_error_dict2 = {}
test_error_dict2 = {}

for i in range(1,max_deg+1):
    # A degree-i polynomial fit is a linear regression on the first i power columns.
    reg = LinearRegression()
    sub_cols = cols[:i]
    reg.fit(X_train[sub_cols], y_train)
    # scikit-learn metrics expect (y_true, y_pred) in that order. MSE happens to be
    # symmetric, so the values are unchanged, but the documented order keeps this
    # correct if the metric is swapped for an asymmetric one (e.g. r2_score).
    train_error_dict2[i] = mean_squared_error(y_train, reg.predict(X_train[sub_cols]))
    test_error_dict2[i] = mean_squared_error(y_test, reg.predict(X_test[sub_cols]))

train_error_dict2

{1: 1403.2377764663738,
 2: 874.9828631207221,
 3: 873.8798463724188,
 4: 869.1090703099768,
 5: 868.526343606727,
 6: 866.5825551013062}

test_error_dict2

{1: 1168.3993370294302,
 2: 578.2755896285452,
 3: 575.291312202487,
 4: 586.276780022717,
 5: 595.300138775016,
 6: 595.6424078820787}

Plotting the test error curve¶

U-shaped test error. (Source: An Introduction to Statistical Learning)

# One frame per error dict: the dict keys (polynomial degree) become the index,
# and the scalar "train"/"test" label is repeated on every row.
df_train = pd.DataFrame({"y":train_error_dict, "type": "train"})
df_test = pd.DataFrame({"y":test_error_dict, "type": "test"})
df_test

	y	type
1	1585.391294	test
2	1128.087491	test
3	1134.163739	test
4	1190.228326	test
5	13294.494071	test
6	18948.061197	test

# Stack the two frames; reset_index moves the degree out of the index into a
# regular column named "index" so the chart can encode it.
df_small = pd.concat([df_train, df_test]).reset_index()

df_small

	index	y	type
0	1	273.652211	train
1	2	174.469013	train
2	3	171.731192	train
3	4	171.381255	train
4	5	109.076480	train
5	6	106.634308	train
6	1	1585.391294	test
7	2	1128.087491	test
8	3	1134.163739	test
9	4	1190.228326	test
10	5	13294.494071	test
11	6	18948.061197	test

# Train vs. test error by degree for the 20-row training split.
# "index:O" treats the degree as an ordinal axis. The y-axis is fixed to 0-2000 and
# clip=True cuts the line off outside that range, since the degree-5 and degree-6
# test errors (over 13000) would otherwise flatten the rest of the plot.
alt.Chart(df_small).mark_line(clip=True).encode(
    x="index:O",
    y=alt.Y("y", scale=alt.Scale(domain=(0,2000))),
    color="type"
)

# Same construction for the 800-row training split: degree as the index,
# with a constant "train"/"test" label on every row.
df_train2 = pd.DataFrame({"y":train_error_dict2, "type": "train"})
df_test2 = pd.DataFrame({"y":test_error_dict2, "type": "test"})
df_test2

	y	type
1	1168.399337	test
2	578.275590	test
3	575.291312	test
4	586.276780	test
5	595.300139	test
6	595.642408	test

# Stack the two frames; reset_index moves the degree out of the index into a
# regular column named "index" so the chart can encode it.
df_big = pd.concat([df_train2, df_test2]).reset_index()

# Train vs. test error by degree for the 800-row training split, drawn on the same
# fixed 0-2000 y-axis as the small-split chart so the two plots are directly comparable.
alt.Chart(df_big).mark_line(clip=True).encode(
    x="index:O",
    y=alt.Y("y", scale=alt.Scale(domain=(0,2000))),
    color="type"
)

previous

Midterm Review

next

Homework 7