# Week 10 Videos

In [1]:
import pandas as pd
import altair as alt

## Using a Pipeline for polynomial regression

<iframe width="560" height="315" src="https://www.youtube.com/embed/DdgdtLCMLh4" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>

In [2]:
# Load the data file, keeping only the two columns used below: "x" and "y".
df = pd.read_csv(
    "../data/sim_data.csv",
    usecols=["x", "y"],
)

In [3]:
# Scatter plot of the raw data: one circle per row, "x" horizontal, "y" vertical.
c = (
    alt.Chart(df)
    .mark_circle()
    .encode(x="x", y="y")
)

# A bare last expression makes the notebook render the chart.
c

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

In [5]:
# Two-stage model: expand the input into polynomial features up to degree 10,
# then fit a linear regression on those features.
# include_bias=False leaves the constant column out of the polynomial features.
pipe = Pipeline(
    steps=[
        ("poly", PolynomialFeatures(degree=10, include_bias=False)),
        ("reg", LinearRegression()),
    ]
)

In [6]:
# Fit every step of the pipeline. The double brackets keep the input as a
# one-column DataFrame; the target is passed as a Series.
pipe.fit(X=df[["x"]], y=df["y"])

Pipeline(steps=[('poly', PolynomialFeatures(degree=10, include_bias=False)),
                ('reg', LinearRegression())])

In [7]:
# Store the fitted model's predictions alongside the data so they can be plotted.
df["pred"] = pipe.predict(X=df[["x"]])

In [8]:
# Black line tracing the degree-10 polynomial fit stored in the "pred" column.
c_pred = (
    alt.Chart(df)
    .mark_line(color="black")
    .encode(x="x", y="pred")
)

In [9]:
# Overlay the fitted curve on the scatter plot (same layering as `c + c_pred`).
alt.layer(c, c_pred)

## Checking coefficient size from Pipeline

<iframe width="560" height="315" src="https://www.youtube.com/embed/96Jb5Zbav_A" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>

In [10]:
# Display the current pipeline to recall which steps it contains.
pipe

Pipeline(steps=[('poly', PolynomialFeatures(degree=10, include_bias=False)),
                ('reg', LinearRegression())])

In [11]:
# named_steps gives access to each step's estimator by the name chosen when
# the Pipeline was built ("poly", "reg").
pipe.named_steps

{'poly': PolynomialFeatures(degree=10, include_bias=False),
 'reg': LinearRegression()}

In [12]:
# Fitted coefficients of the regression step, one per polynomial feature.
pipe.named_steps["reg"].coef_

array([-1.15482804e+01, -1.17462844e-01,  2.72397298e+00,  2.56238375e-02,
       -1.61608397e-01, -1.88225667e-03,  4.12088929e-03,  1.52761153e-04,
       -3.77537167e-05, -2.36222277e-06])

## Including StandardScaler in our Pipeline

<iframe width="560" height="315" src="https://www.youtube.com/embed/hJ4b6gM5oKM" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>

In [13]:
from sklearn.preprocessing import StandardScaler

In [14]:
# Same polynomial regression as before, but with a StandardScaler inserted
# between the feature expansion and the regression, so every polynomial
# column is standardized before the model sees it.
pipe = Pipeline(
    steps=[
        ("poly", PolynomialFeatures(degree=10, include_bias=False)),
        ("scaler", StandardScaler()),
        ("reg", LinearRegression()),
    ]
)

In [15]:
# Refit: the scaler's statistics and the regression are both learned from this data.
pipe.fit(X=df[["x"]], y=df["y"])

Pipeline(steps=[('poly', PolynomialFeatures(degree=10, include_bias=False)),
                ('scaler', StandardScaler()), ('reg', LinearRegression())])

In [16]:
# Coefficients of the regression step; these now apply to the standardized
# polynomial features rather than to the raw powers of x.
pipe.named_steps["reg"].coef_

array([-6.24520911e+01, -3.12148450e+00,  8.70308705e+02,  6.26136298e+01,
       -4.09488120e+03, -4.21029795e+02,  9.15190860e+03,  3.16215660e+03,
       -7.65508093e+03, -4.57323098e+03])

## Using regularization to reduce overfitting

<iframe width="560" height="315" src="https://www.youtube.com/embed/VIiT4UCV1DM" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>

[scikit-learn Ridge regression reference](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html)

In [17]:
from sklearn.linear_model import Ridge

In [18]:
# Swap the final step for Ridge regression. alpha is the penalty strength;
# alpha=0 applies no penalty, so this serves as the unregularized baseline.
# NOTE: the scikit-learn docs advise against alpha=0 in Ridge for numerical
# reasons and recommend LinearRegression instead; it is kept here so that
# only alpha changes between the fits that follow.
pipe = Pipeline(
    steps=[
        ("poly", PolynomialFeatures(degree=10, include_bias=False)),
        ("scaler", StandardScaler()),
        ("reg", Ridge(alpha=0)),
    ]
)

In [19]:
# Fit the alpha=0 ridge pipeline on the single input column.
pipe.fit(X=df[["x"]], y=df["y"])

Pipeline(steps=[('poly', PolynomialFeatures(degree=10, include_bias=False)),
                ('scaler', StandardScaler()), ('reg', Ridge(alpha=0))])

In [20]:
# Predictions of the alpha=0 model, kept in their own column for plotting.
df["pred0"] = pipe.predict(X=df[["x"]])

In [21]:
# Black line showing the alpha=0 predictions.
c0 = (
    alt.Chart(df)
    .mark_line(color="black")
    .encode(x="x", y="pred0")
)

In [22]:
# Overlay the alpha=0 curve on the scatter plot (same layering as `c + c0`).
alt.layer(c, c0)

In [23]:
# Same pipeline with a moderate ridge penalty (alpha=10).
pipe = Pipeline(
    steps=[
        ("poly", PolynomialFeatures(degree=10, include_bias=False)),
        ("scaler", StandardScaler()),
        ("reg", Ridge(alpha=10)),
    ]
)

In [24]:
# Fit the alpha=10 ridge pipeline on the single input column.
pipe.fit(X=df[["x"]], y=df["y"])

Pipeline(steps=[('poly', PolynomialFeatures(degree=10, include_bias=False)),
                ('scaler', StandardScaler()), ('reg', Ridge(alpha=10))])

In [25]:
# Predictions of the alpha=10 model, kept in their own column for plotting.
df["pred10"] = pipe.predict(X=df[["x"]])

In [26]:
# Black line showing the alpha=10 predictions.
c10 = (
    alt.Chart(df)
    .mark_line(color="black")
    .encode(x="x", y="pred10")
)

In [27]:
# Overlay the alpha=10 curve on the scatter plot (same layering as `c + c10`).
alt.layer(c, c10)

In [28]:
# Coefficients of the alpha=10 ridge step, one per standardized polynomial feature.
pipe.named_steps['reg'].coef_

array([14.84789507, 18.57306252,  5.84197442, 10.14676263,  0.66979349,
        5.00139616, -1.06236972,  2.59525635, -1.40458682,  1.52149585])

In [29]:
# Same pipeline with a very strong ridge penalty (alpha = 10,000).
pipe = Pipeline(
    steps=[
        ("poly", PolynomialFeatures(degree=10, include_bias=False)),
        ("scaler", StandardScaler()),
        ("reg", Ridge(alpha=10_000)),
    ]
)

In [30]:
# Fit the alpha=10000 ridge pipeline on the single input column.
pipe.fit(X=df[["x"]], y=df["y"])

Pipeline(steps=[('poly', PolynomialFeatures(degree=10, include_bias=False)),
                ('scaler', StandardScaler()), ('reg', Ridge(alpha=10000))])

In [31]:
# Predictions of the alpha=10000 model, kept in their own column for plotting.
df["pred10000"] = pipe.predict(X=df[["x"]])

In [32]:
# Black line showing the alpha=10000 predictions.
c10000 = (
    alt.Chart(df)
    .mark_line(color="black")
    .encode(x="x", y="pred10000")
)

In [33]:
# Overlay the alpha=10000 curve on the scatter plot (same layering as `c + c10000`).
alt.layer(c, c10000)