Commit d8109de (1 parent: a0c36e7), showing 16 changed files with 328 additions and 30 deletions.
@@ -0,0 +1,94 @@
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# Set the random seed for reproducibility
np.random.seed(0)

# Define the true function and the number of datasets
true_function = lambda x: np.sin(x)
n_datasets = 100  # Number of datasets for training
n_samples = 100
n_test_samples = 10000
n_order = 8
lambdas = np.exp(np.linspace(-6, 7, 25))

# Generate polynomial features
poly = PolynomialFeatures(degree=n_order, include_bias=False)

# Initialize arrays to store the bias, variance, and error
bias_squared = np.zeros_like(lambdas)
variance = np.zeros_like(lambdas)
test_error = np.zeros_like(lambdas)

# Generate shared x values for all datasets
x_shared = np.random.uniform(0, 1, n_samples).reshape(-1, 1)
x_shared_poly = poly.fit_transform(x_shared)

# Generate test data
x_test = np.random.uniform(0, 1, n_test_samples).reshape(-1, 1)
y_test = true_function(x_test).reshape(-1, 1) + np.random.randn(n_test_samples, 1)
x_test_poly = poly.transform(x_test)

# Loop over the lambda values
for i, lambda_val in enumerate(lambdas):
    # Initialize array to store predictions for each model
    predictions = np.zeros((n_datasets, n_samples))

    # Train and predict with n_datasets models
    for j in range(n_datasets):
        # Generate new y values for each dataset
        epsilon = np.random.randn(n_samples, 1)
        y = true_function(x_shared) + epsilon

        # Fit Ridge regression model
        model = Ridge(alpha=lambda_val, fit_intercept=True)
        model.fit(x_shared_poly, y)
        predictions[j, :] = model.predict(x_shared_poly).flatten()

    # Calculate the average prediction for each x
    average_prediction = np.mean(predictions, axis=0)

    # Compute integrated bias^2 and variance via Monte Carlo
    bias_squared[i] = np.mean((average_prediction - true_function(x_shared).flatten()) ** 2)
    variance[i] = np.mean(np.var(predictions, axis=0))

# Train a final model on a new dataset and compute test error for each lambda
for i, lambda_val in enumerate(lambdas):
    # Generate new data for the final model
    x_train_final = np.random.uniform(0, 1, n_samples).reshape(-1, 1)
    y_train_final = true_function(x_train_final) + np.random.randn(n_samples, 1)
    x_train_final_poly = poly.transform(x_train_final)

    # Fit the final model
    model_final = Ridge(alpha=lambda_val, fit_intercept=True)
    model_final.fit(x_train_final_poly, y_train_final)

    # Predict on the test set and compute the test error
    y_test_pred_final = model_final.predict(x_test_poly).flatten()
    test_error[i] = mean_squared_error(y_test, y_test_pred_final)

# Plotting the results with two y-axes
fig, ax1 = plt.subplots(figsize=(12, 6))

# Plot bias^2 and variance on the primary y-axis
ax1.plot(np.log(lambdas), bias_squared, label='(bias)^2', color='red')
ax1.plot(np.log(lambdas), variance, label='variance', color='blue')
ax1.plot(np.log(lambdas), bias_squared + variance, label='(bias)^2 + variance', color='green')

ax1.set_xlabel('ln(λ)', fontsize=16)
ax1.set_ylabel('(bias)^2, variance', fontsize=16)
ax1.legend(loc='upper left')

# Create secondary y-axis for test error
ax2 = ax1.twinx()
ax2.plot(np.log(lambdas), test_error, label='test error', color='magenta', linestyle='--', alpha=.6)
ax2.set_ylabel('Test error on single dataset', fontsize=16)
ax2.legend(loc='upper right')

plt.title('Bias-Variance Tradeoff with L2 Regularization', fontsize=20)
plt.show()
@@ -0,0 +1,52 @@
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV
import numpy as np
import matplotlib.pyplot as plt

# Load wine dataset
X, y = load_wine(return_X_y=True)

# Generate standard normal noise features
np.random.seed(42)
noise_features = np.random.normal(size=(X.shape[0], 75))

# Add these noise features to the original dataset
X_extended = np.hstack((X, noise_features))

# Split the augmented dataset into training and test sets
X_train_ext, X_test_ext, y_train_ext, y_test_ext = train_test_split(
    X_extended, y, test_size=0.2, random_state=42
)

# Standardize the augmented dataset
scaler_ext = StandardScaler()
X_train_ext_scaled = scaler_ext.fit_transform(X_train_ext)
X_test_ext_scaled = scaler_ext.transform(X_test_ext)

# Define a range of lambda (alpha) values
lambda_values = np.logspace(-4, 4, 50)

# Perform Ridge regression with (leave-one-out) cross-validation on the extended dataset
# Note: newer scikit-learn versions rename store_cv_values / cv_values_ to store_cv_results / cv_results_
ridge_cv_ext = RidgeCV(alphas=lambda_values, store_cv_values=True)
ridge_cv_ext.fit(X_train_ext_scaled, y_train_ext)

# CV curve for the extended dataset: mean CV error per lambda
mean_cv_scores_ext = np.mean(ridge_cv_ext.cv_values_, axis=0)

# Find the lambda value with the minimum CV score
min_lambda_index = np.argmin(mean_cv_scores_ext)
min_lambda_value = lambda_values[min_lambda_index]

# Plot the CV curve with a vertical line at the minimum CV score
plt.figure(figsize=(8, 6))
plt.plot(lambda_values, mean_cv_scores_ext, marker='o', color='red')
plt.axvline(x=min_lambda_value, color='blue', linestyle='--', label=f'Min CV Score at λ={min_lambda_value:.4f}')
plt.xscale('log')
plt.xlabel('Lambda (Regularization strength)', fontsize=14)
plt.ylabel('Generalization error', fontsize=14)
#plt.title('Wine dataset with add. noise features', fontsize=12)
plt.title('Effect of L2 Regularization', fontsize=16)
plt.legend()
plt.show()
@@ -0,0 +1,59 @@
library(ggplot2)

lambda = 2

# L1-regularized quadratic objectives in a single parameter theta:
# the quadratic part plays the role of the unregularized risk,
# the absolute-value term is the L1 penalty.

# Unregularized minimum at 0: the regularized minimum stays at 0
fun1 <- function(x) {
  return(x^2 + 3 * abs(x) + 1)
}

# Unregularized minimum at +4: the regularized minimum is shifted to 4 - lambda
fun2 <- function(x) {
  return(0.5 * (x - 4)^2 + lambda * abs(x) + 1)
}

# Unregularized minimum at -4: the regularized minimum is shifted to -4 + lambda
fun3 <- function(x) {
  return(0.5 * (x + 4)^2 + lambda * abs(x) + 1)
}

p1 <- ggplot() +
  xlim(-7, 7) +
  geom_function(fun = fun1) +
  xlab(expression(theta)) +
  ylab(expression(R[reg])) +
  geom_vline(xintercept = 0,
             linetype = "dashed") +
  theme_bw(base_size = 20)

pdf("../figure/th_l1_zero.pdf")
print(p1)
dev.off()

p2 <- ggplot() +
  xlim(-7, 7) +
  geom_function(fun = fun2) +
  xlab(expression(theta)) +
  ylab(expression(R[reg])) +
  geom_vline(xintercept = 4 - lambda,
             linetype = "dashed") +
  theme_bw(base_size = 20)

pdf("../figure/th_l1_pos.pdf")
print(p2)
dev.off()

p3 <- ggplot() +
  xlim(-7, 7) +
  geom_function(fun = fun3) +
  xlab(expression(theta)) +
  ylab(expression(R[reg])) +
  geom_vline(xintercept = -4 + lambda,
             linetype = "dashed") +
  theme_bw(base_size = 20)

pdf("../figure/th_l1_neg.pdf")
print(p3)
dev.off()
@@ -0,0 +1,45 @@
\documentclass[11pt,compress,t,notes=noshow, xcolor=table]{beamer}
\input{../../style/preamble}
\input{../../latex-math/basic-math}
\input{../../latex-math/basic-ml}

\newcommand{\titlefigure}{figure_man/bias-variance-ridge.png}
\newcommand{\learninggoals}{
  \item Know alternative interpretations of Ridge regression
  \item Know the derivation of the bias-variance tradeoff for Ridge regression
}

\title{Introduction to Machine Learning}
\date{}

\begin{document}

\lecturechapter{Ridge Regression Deep-Dive}
\lecture{Introduction to Machine Learning}

\begin{vbframe}{Perspectives on $L2$ regularization}
We already saw that $L2$ regularization is equivalent to a constrained optimization problem:
\begin{eqnarray*}
\thetah_{\text{Ridge}} &=& \argmin_{\thetab} \sumin \left(\yi - \thetab^T \xi \right)^2 + \lambda \|\thetab\|_2^2 = ({\Xmat}^T \Xmat + \lambda \id)^{-1} \Xmat^T\yv\\
%&=& \argmin_{\thetab} \left(\yv - \Xmat \thetab\right)^\top \left(\yv - \Xmat \thetab\right) + \lambda \thetab^\top \thetab \\
&=& \argmin_{\thetab} \sumin \left(\yi - \fxit\right)^2 \,
\text{s.t. } \|\thetab\|_2^2 \leq t
\end{eqnarray*}
We can also recover the Ridge estimator by performing least squares on a \textbf{row-augmented} data set: Let \scriptsize{$\tilde{\Xmat}:= \begin{pmatrix} \Xmat \\ \sqrt{\lambda} \id_{p} \end{pmatrix}$ and $\tilde{\yv} := \begin{pmatrix}
\yv \\ \bm{0}_{p}
\end{pmatrix}$.} \normalsize{Using the augmented data, the least-squares objective becomes}
\small{
$$%\argmin_{\thetab}
\sum_{i=1}^{n+p} \left(\tilde{\yi} - \thetab^T \tilde{\xi} \right)^2 = %\argmin_{\thetab}
\sum_{i=1}^{n} \left(\yi - \thetab^T \xi \right)^2 + \sum_{j=1}^{p} \left(0 - \sqrt{\lambda} \theta_j \right)^2 %= \thetah_{\text{Ridge}}
=\sumin \left(\yi - \thetab^T \xi \right)^2 + \lambda \|\thetab\|_2^2
$$
}
\normalsize{Thus the least-squares solution $\thetah$ using $\tilde{\Xmat},\tilde{\yv}$ instead of $\Xmat, \yv$ is $\thetah_{\text{Ridge}}$.}
%$$\thetah_{\text{Ridge}} = ({\Xmat}^T \Xmat + \lambda \id)^{-1} \Xmat^T\yv$$
\end{vbframe}

\endlecture
\end{document}
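
The row-augmentation argument on this slide is easy to verify numerically. Below is a minimal sketch (added here for illustration, not part of the commit), assuming a no-intercept model on synthetic data; the variable names are hypothetical and chosen only for this check.

import numpy as np

# Synthetic data; no intercept term, matching the formula on the slide
rng = np.random.default_rng(0)
n, p, lam = 50, 5, 2.0
X = rng.normal(size=(n, p))
y = X @ rng.normal(size=p) + rng.normal(size=n)

# Ridge closed form: (X'X + lambda * I)^{-1} X'y
theta_ridge = np.linalg.solve(X.T @ X + lam * np.eye(p), X.T @ y)

# Least squares on the row-augmented data: X stacked on sqrt(lambda) * I, y padded with zeros
X_aug = np.vstack([X, np.sqrt(lam) * np.eye(p)])
y_aug = np.concatenate([y, np.zeros(p)])
theta_aug, *_ = np.linalg.lstsq(X_aug, y_aug, rcond=None)

print(np.allclose(theta_ridge, theta_aug))  # expected: True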