Week 10 Videos

import pandas as pd
import altair as alt

Using a Pipeline for polynomial regression

# Read only the "x" and "y" columns from sim_data.csv.
df = pd.read_csv("../data/sim_data.csv", usecols=["x", "y"])

# Scatter plot of the raw points; later cells layer fitted curves on top of it.
c = alt.Chart(df).mark_circle().encode(x="x", y="y")

# Display the scatter plot.
c
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
# Two-step pipeline: expand x into the powers x, x**2, ..., x**10
# (include_bias=False omits the constant column), then fit an ordinary
# linear regression on those features.
pipe = Pipeline(
    steps=[
        ("poly", PolynomialFeatures(degree=10, include_bias=False)),
        ("reg", LinearRegression()),
    ]
)

# df[["x"]] (double brackets) passes a one-column DataFrame as the input.
pipe.fit(df[["x"]], df["y"])
Pipeline(steps=[('poly', PolynomialFeatures(degree=10, include_bias=False)),
                ('reg', LinearRegression())])
# Store the pipeline's predictions for the same x values it was fitted on.
df["pred"] = pipe.predict(df[["x"]])

# Fitted curve drawn as a black line.
c_pred = alt.Chart(df).mark_line(color="black").encode(x="x", y="pred")

# Layer the fitted curve over the scatter plot.
c + c_pred

Checking coefficient size from Pipeline

# Display the current pipeline object.
pipe
Pipeline(steps=[('poly', PolynomialFeatures(degree=10, include_bias=False)),
                ('reg', LinearRegression())])
# named_steps maps each step name ('poly', 'reg') to its estimator object.
pipe.named_steps
{'poly': PolynomialFeatures(degree=10, include_bias=False),
 'reg': LinearRegression()}
# Coefficients of the fitted regression step, one per polynomial feature.
pipe.named_steps["reg"].coef_
array([-1.15482804e+01, -1.17462844e-01,  2.72397298e+00,  2.56238375e-02,
       -1.61608397e-01, -1.88225667e-03,  4.12088929e-03,  1.52761153e-04,
       -3.77537167e-05, -2.36222277e-06])

Including StandardScaler in our Pipeline

from sklearn.preprocessing import StandardScaler
# Same polynomial regression as before, but with a StandardScaler step
# between the polynomial expansion and the regression.
pipe = Pipeline(
    steps=[
        ("poly", PolynomialFeatures(degree=10, include_bias=False)),
        ("scaler", StandardScaler()),
        ("reg", LinearRegression()),
    ]
)

# Refit on the same single input column and target.
pipe.fit(df[["x"]], df["y"])
Pipeline(steps=[('poly', PolynomialFeatures(degree=10, include_bias=False)),
                ('scaler', StandardScaler()), ('reg', LinearRegression())])
# Regression coefficients now that the polynomial features are standardized.
pipe.named_steps["reg"].coef_
array([-6.24520911e+01, -3.12148450e+00,  8.70308705e+02,  6.26136298e+01,
       -4.09488120e+03, -4.21029795e+02,  9.15190860e+03,  3.16215660e+03,
       -7.65508093e+03, -4.57323098e+03])

Using regularization to reduce overfitting

scikit-learn Ridge regression reference: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html

from sklearn.linear_model import Ridge
# Replace the final step with Ridge regression, starting at alpha=0.
# NOTE(review): with alpha=0 the ridge penalty term is zero, so this is
# expected to behave like the unregularized fit; scikit-learn's Ridge docs
# advise using LinearRegression instead of alpha=0 for numerical reasons.
pipe = Pipeline(
    steps=[
        ("poly", PolynomialFeatures(degree=10, include_bias=False)),
        ("scaler", StandardScaler()),
        ("reg", Ridge(alpha=0)),
    ]
)

# Fit on the same single input column and target.
pipe.fit(df[["x"]], df["y"])
Pipeline(steps=[('poly', PolynomialFeatures(degree=10, include_bias=False)),
                ('scaler', StandardScaler()), ('reg', Ridge(alpha=0))])
# Predictions from the alpha=0 ridge pipeline.
df["pred0"] = pipe.predict(df[["x"]])

# Fitted curve for alpha=0, drawn as a black line.
c0 = alt.Chart(df).mark_line(color="black").encode(x="x", y="pred0")

# Layer the curve over the scatter plot.
c + c0
# Same pipeline, now with ridge penalty strength alpha=10.
pipe = Pipeline(
    steps=[
        ("poly", PolynomialFeatures(degree=10, include_bias=False)),
        ("scaler", StandardScaler()),
        ("reg", Ridge(alpha=10)),
    ]
)

# Fit on the same single input column and target.
pipe.fit(df[["x"]], df["y"])
Pipeline(steps=[('poly', PolynomialFeatures(degree=10, include_bias=False)),
                ('scaler', StandardScaler()), ('reg', Ridge(alpha=10))])
# Predictions from the alpha=10 ridge pipeline.
df["pred10"] = pipe.predict(df[["x"]])

# Fitted curve for alpha=10, drawn as a black line.
c10 = alt.Chart(df).mark_line(color="black").encode(x="x", y="pred10")

# Layer the curve over the scatter plot.
c + c10

# Coefficients of the ridge step for alpha=10.
pipe.named_steps["reg"].coef_
array([14.84789507, 18.57306252,  5.84197442, 10.14676263,  0.66979349,
        5.00139616, -1.06236972,  2.59525635, -1.40458682,  1.52149585])
# Same pipeline with a much larger ridge penalty, alpha=10_000 (10**4).
pipe = Pipeline(
    steps=[
        ("poly", PolynomialFeatures(degree=10, include_bias=False)),
        ("scaler", StandardScaler()),
        ("reg", Ridge(alpha=10_000)),
    ]
)

# Fit on the same single input column and target.
pipe.fit(df[["x"]], df["y"])
Pipeline(steps=[('poly', PolynomialFeatures(degree=10, include_bias=False)),
                ('scaler', StandardScaler()), ('reg', Ridge(alpha=10000))])
# Predictions from the alpha=10000 ridge pipeline.
df["pred10000"] = pipe.predict(df[["x"]])

# Fitted curve for alpha=10000, drawn as a black line.
c10000 = alt.Chart(df).mark_line(color="black").encode(x="x", y="pred10000")

# Layer the curve over the scatter plot.
c + c10000