Week 10 Videos
Contents
Week 10 Videos
import pandas as pd
import altair as alt
Using a Pipeline for polynomial regression
# Read only the "x" and "y" columns of the simulated data set.
df = pd.read_csv("../data/sim_data.csv", usecols=["x", "y"])

# Scatter plot of the raw points; later cells layer fitted curves on top of it.
c = alt.Chart(df).mark_circle().encode(x="x", y="y")

# Bare expression so the notebook renders the chart.
c
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
# Two-step pipeline: expand "x" into polynomial features up to degree 10
# (include_bias=False leaves out the constant column), then fit an ordinary
# least-squares regression on those features.
pipe = Pipeline(
    steps=[
        ("poly", PolynomialFeatures(degree=10, include_bias=False)),
        ("reg", LinearRegression()),
    ]
)

# df[["x"]] (double brackets) keeps the input as a one-column DataFrame.
pipe.fit(df[["x"]], df["y"])
Pipeline(steps=[('poly', PolynomialFeatures(degree=10, include_bias=False)),
('reg', LinearRegression())])
# Store the pipeline's predictions alongside the data so they can be plotted.
df["pred"] = pipe.predict(df[["x"]])

# Fitted curve, drawn as a black line.
c_pred = alt.Chart(df).mark_line(color="black").encode(x="x", y="pred")

# Layer the fitted curve over the scatter plot of the raw points.
c + c_pred
Checking coefficient size from Pipeline
# Display the pipeline object; its repr lists the steps in order.
pipe
Pipeline(steps=[('poly', PolynomialFeatures(degree=10, include_bias=False)),
('reg', LinearRegression())])
# named_steps maps each step's name ('poly', 'reg') to its estimator object.
pipe.named_steps
{'poly': PolynomialFeatures(degree=10, include_bias=False),
'reg': LinearRegression()}
# Coefficients learned by the regression step, one per polynomial feature.
pipe.named_steps['reg'].coef_
array([-1.15482804e+01, -1.17462844e-01, 2.72397298e+00, 2.56238375e-02,
-1.61608397e-01, -1.88225667e-03, 4.12088929e-03, 1.52761153e-04,
-3.77537167e-05, -2.36222277e-06])
Including StandardScaler in our Pipeline
from sklearn.preprocessing import StandardScaler
# Same polynomial regression as before, with a StandardScaler inserted between
# the polynomial expansion and the regression so the regression sees
# standardized features.
pipe = Pipeline(
    steps=[
        ("poly", PolynomialFeatures(degree=10, include_bias=False)),
        ("scaler", StandardScaler()),
        ("reg", LinearRegression()),
    ]
)

# Refit on the same single-column input and target.
pipe.fit(df[["x"]], df["y"])
Pipeline(steps=[('poly', PolynomialFeatures(degree=10, include_bias=False)),
('scaler', StandardScaler()), ('reg', LinearRegression())])
# Regression coefficients after adding the scaler step to the pipeline.
pipe.named_steps['reg'].coef_
array([-6.24520911e+01, -3.12148450e+00, 8.70308705e+02, 6.26136298e+01,
-4.09488120e+03, -4.21029795e+02, 9.15190860e+03, 3.16215660e+03,
-7.65508093e+03, -4.57323098e+03])
Using regularization to reduce overfitting
See the scikit-learn Ridge regression reference: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html
from sklearn.linear_model import Ridge
# Ridge regression on scaled degree-10 polynomial features. alpha is the
# regularization strength; alpha=0 applies no penalty, so this fit serves as
# the unregularized baseline for the larger alpha values tried afterwards.
pipe = Pipeline(
    steps=[
        ("poly", PolynomialFeatures(degree=10, include_bias=False)),
        ("scaler", StandardScaler()),
        ("reg", Ridge(alpha=0)),
    ]
)

pipe.fit(df[["x"]], df["y"])
Pipeline(steps=[('poly', PolynomialFeatures(degree=10, include_bias=False)),
('scaler', StandardScaler()), ('reg', Ridge(alpha=0))])
# Predictions from the alpha=0 Ridge pipeline, kept in their own column.
df["pred0"] = pipe.predict(df[["x"]])

# Fitted curve for alpha=0 as a black line.
c0 = alt.Chart(df).mark_line(color="black").encode(x="x", y="pred0")

# Overlay the curve on the scatter plot of the raw data.
c + c0
# Same Ridge pipeline with the regularization strength raised to alpha=10.
pipe = Pipeline(
    steps=[
        ("poly", PolynomialFeatures(degree=10, include_bias=False)),
        ("scaler", StandardScaler()),
        ("reg", Ridge(alpha=10)),
    ]
)

pipe.fit(df[["x"]], df["y"])
Pipeline(steps=[('poly', PolynomialFeatures(degree=10, include_bias=False)),
('scaler', StandardScaler()), ('reg', Ridge(alpha=10))])
# Predictions from the alpha=10 Ridge pipeline, kept in their own column.
df["pred10"] = pipe.predict(df[["x"]])

# Fitted curve for alpha=10 as a black line.
c10 = alt.Chart(df).mark_line(color="black").encode(x="x", y="pred10")

# Overlay the curve on the scatter plot of the raw data.
c + c10
# Coefficients of the Ridge step fitted with alpha=10.
pipe.named_steps["reg"].coef_
array([14.84789507, 18.57306252, 5.84197442, 10.14676263, 0.66979349,
5.00139616, -1.06236972, 2.59525635, -1.40458682, 1.52149585])
# Same Ridge pipeline with a much larger regularization strength (alpha=10000).
pipe = Pipeline(
    steps=[
        ("poly", PolynomialFeatures(degree=10, include_bias=False)),
        ("scaler", StandardScaler()),
        ("reg", Ridge(alpha=10_000)),
    ]
)

pipe.fit(df[["x"]], df["y"])
Pipeline(steps=[('poly', PolynomialFeatures(degree=10, include_bias=False)),
('scaler', StandardScaler()), ('reg', Ridge(alpha=10000))])
# Predictions from the alpha=10000 Ridge pipeline, kept in their own column.
df["pred10000"] = pipe.predict(df[["x"]])

# Fitted curve for alpha=10000 as a black line.
c10000 = alt.Chart(df).mark_line(color="black").encode(x="x", y="pred10000")

# Overlay the curve on the scatter plot of the raw data.
c + c10000