Regularized model-Ridge code
2024. 3. 19. 04:39
[Purpose]
1. Linear Regression
- Use plain linear regression to examine each variable's importance and direction
- Because the model is so simple, it is not well suited to large datasets
- Its big advantage, however, is explanatory power
2. Ridge Regression
- Use a regularized linear model to prevent overfitting
- Tune the hyperparameter lambda (alpha) both with a for loop and with cross-validated search (GridSearchCV)
3. For regularized linear models, scaling the X's is mandatory
[Process]
1. Define X's & Y
2. Split Train & Valid dataset
3. Modeling
4. Model interpretation
# Package
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
data = pd.read_csv('https://www4.stat.ncsu.edu/~boos/var.select/diabetes.tab.txt', sep='\t')
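This is the classic diabetes dataset of Efron et al. (442 patients): baseline variables AGE, SEX, BMI, BP and six blood serum measurements S1-S6, plus the target Y, a measure of disease progression one year after baseline.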
# X's & Y Split
Y = data['Y']
X = data.drop(columns=['Y'])
X = pd.get_dummies(X, columns=['SEX'])
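SEX is coded 1/2 in the raw file, so get_dummies replaces it with two indicator columns (SEX_1, SEX_2); everything else stays numeric.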
[Data Split]
- When splitting big data, extract indices and feed those to the model rather than materializing new arrays
- Creating separate train/valid copies of the data puts unnecessary pressure on memory
X.shape[0]
idx = list(range(X.shape[0]))
train_idx, valid_idx = train_test_split(idx, test_size=0.3, random_state=2023)
print(">>>> # of Train data : {}".format(len(train_idx)))
print(">>>> # of valid data : {}".format(len(valid_idx)))
# Linear Regression
results = LinearRegression().fit(X.iloc[train_idx], Y.iloc[train_idx])
import scipy.linalg
import scipy.stats
from sklearn import metrics
def sse(clf, X, y):
"""Calculate the standard squared error of the model.
Parameters
----------
clf : sklearn.linear_model
A scikit-learn linear model classifier with a `predict()` method.
X : numpy.ndarray
Training data used to fit the classifier.
y : numpy.ndarray
Target training values, of shape = [n_samples].
Returns
-------
float
The standard squared error of the model.
"""
y_hat = clf.predict(X)
sse = np.sum((y_hat - y) ** 2)
return sse / X.shape[0]
def adj_r2_score(clf, X, y):
"""Calculate the adjusted :math:`R^2` of the model.
Parameters
----------
    clf : sklearn.linear_model
        A fitted scikit-learn linear regression model with a `predict()` method.
X : numpy.ndarray
Training data used to fit the classifier.
y : numpy.ndarray
Target training values, of shape = [n_samples].
Returns
-------
float
The adjusted :math:`R^2` of the model.
"""
n = X.shape[0] # Number of observations
p = X.shape[1] # Number of features
r_squared = metrics.r2_score(y, clf.predict(X))
return 1 - (1 - r_squared) * ((n - 1) / (n - p - 1))
def coef_se(clf, X, y):
"""Calculate standard error for beta coefficients.
Parameters
----------
    clf : sklearn.linear_model
        A fitted scikit-learn linear regression model with a `predict()` method.
X : numpy.ndarray
Training data used to fit the classifier.
y : numpy.ndarray
Target training values, of shape = [n_samples].
Returns
-------
numpy.ndarray
An array of standard errors for the beta coefficients.
"""
    n = X.shape[0]
    # Prepend a column of ones for the intercept
    X1 = np.hstack((np.ones((n, 1)), np.asarray(X)))
    # SE(beta) = sqrt(diag(MSE * (X'X)^-1)), taken via a matrix square root
    se_matrix = scipy.linalg.sqrtm(
        metrics.mean_squared_error(y, clf.predict(X)) *
        np.linalg.inv(X1.T @ X1)
    )
    return np.diagonal(se_matrix)
def coef_tval(clf, X, y):
"""Calculate t-statistic for beta coefficients.
Parameters
----------
    clf : sklearn.linear_model
        A fitted scikit-learn linear regression model with a `predict()` method.
X : numpy.ndarray
Training data used to fit the classifier.
y : numpy.ndarray
Target training values, of shape = [n_samples].
Returns
-------
numpy.ndarray
An array of t-statistic values.
"""
a = np.array(clf.intercept_ / coef_se(clf, X, y)[0])
b = np.array(clf.coef_ / coef_se(clf, X, y)[1:])
return np.append(a, b)
def coef_pval(clf, X, y):
"""Calculate p-values for beta coefficients.
Parameters
----------
    clf : sklearn.linear_model
        A fitted scikit-learn linear regression model with a `predict()` method.
X : numpy.ndarray
Training data used to fit the classifier.
y : numpy.ndarray
Target training values, of shape = [n_samples].
Returns
-------
numpy.ndarray
An array of p-values.
"""
n = X.shape[0]
t = coef_tval(clf, X, y)
p = 2 * (1 - scipy.stats.t.cdf(abs(t), n - 1))
return p
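Note: the t-distribution above uses n - 1 degrees of freedom for simplicity; textbook OLS inference uses n - p - 1, so with few features relative to n the two are nearly identical.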
def summary(clf, X, y, xlabels=None):
"""
Output summary statistics for a fitted regression model.
Parameters
----------
    clf : sklearn.linear_model
        A fitted scikit-learn linear regression model with a `predict()` method.
X : numpy.ndarray
Training data used to fit the classifier.
y : numpy.ndarray
Target training values, of shape = [n_samples].
xlabels : list, tuple
The labels for the predictors.
"""
# Check and/or make xlabels
ncols = X.shape[1]
if xlabels is None:
xlabels = np.array(
['x{0}'.format(i) for i in range(1, ncols + 1)], dtype='str')
elif isinstance(xlabels, (tuple, list)):
xlabels = np.array(xlabels, dtype='str')
# Make sure dims of xlabels matches dims of X
if xlabels.shape[0] != ncols:
raise AssertionError(
"Dimension of xlabels {0} does not match "
"X {1}.".format(xlabels.shape, X.shape))
# Create data frame of coefficient estimates and associated stats
coef_df = pd.DataFrame(
index=['_intercept'] + list(xlabels),
columns=['Estimate', 'Std. Error', 't value', 'p value']
)
try:
coef_df['Estimate'] = np.concatenate(
(np.round(np.array([clf.intercept_]), 6), np.round((clf.coef_), 6)))
except Exception as e:
coef_df['Estimate'] = np.concatenate(
(
np.round(np.array([clf.intercept_]), 6),
np.round((clf.coef_), 6)
), axis = 1
)[0,:]
coef_df['Std. Error'] = np.round(coef_se(clf, X, y), 6)
coef_df['t value'] = np.round(coef_tval(clf, X, y), 4)
coef_df['p value'] = np.round(coef_pval(clf, X, y), 6)
# Output results
print('Coefficients:')
print(coef_df.to_string(index=True))
print('---')
print('R-squared: {0:.6f}, Adjusted R-squared: {1:.6f}, MSE: {2:.1f}'.format(
metrics.r2_score(y, clf.predict(X)), adj_r2_score(clf, X, y), sse(clf, X, y)))
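For reference, summary() computes the usual OLS statistics: SE(beta_j) = sqrt([sigma^2 * (X^T X)^{-1}]_jj) with sigma^2 estimated by the MSE, t_j = beta_j / SE(beta_j), and adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1), where X carries a leading column of ones for the intercept.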
# Scaling
scaler = MinMaxScaler().fit(X.iloc[train_idx])
X_scal = scaler.transform(X)
X_scal = pd.DataFrame(X_scal, columns=X.columns)
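Note that the scaler is fit on the training rows only and then applied to the full table, so no information from the validation rows leaks into the scaling.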
# Linear Regression
results = LinearRegression().fit(X_scal.iloc[train_idx], Y.iloc[train_idx])
summary(results, X_scal.iloc[valid_idx], Y.iloc[valid_idx], xlabels=X_scal.columns)
summary(results, X_scal.iloc[train_idx], Y.iloc[train_idx], xlabels=X_scal.columns)
Comparing the train and valid sets, the R² score is actually higher on the valid set → the model is not overfitting.
[Ridge Regression]
- Hyperparameter tuning using a for loop
- Hyperparameter tuning using GridSearchCV
[Ridge Regression Parameters]
- alpha : L2-norm penalty term
- when alpha = 0, it is just linear regression
- fit_intercept : whether to fit the intercept beta_0
- with centered data this can be set to False, since beta_0 is just a constant
- max_iter : maximum number of iterations
- ridge regression has a closed-form solution, but an iterative solver can also be used to reach it
- Objective : ||y - Xw||^2_2 + alpha * ||w||^2_2 (closed form shown below)
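For reference, the standard closed form behind this objective (textbook result; the intercept is fit separately and not penalized):
w_ridge = (X^T X + alpha * I)^{-1} X^T y
As alpha grows, the inverse is increasingly dominated by alpha * I, shrinking all coefficients toward zero.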
penalty = [0.00001, 0.00005, 0.0001, 0.001, 0.01, 0.1, 0.3, 0.5, 0.6, 0.7, 0.9, 1, 10]
# Using For Loop !!
# Ridge Regression
# select alpha by checking R2, MSE, RMSE
for a in penalty:
model = Ridge(alpha=a).fit(X_scal.iloc[train_idx], Y.iloc[train_idx])
score = model.score(X_scal.iloc[valid_idx], Y.iloc[valid_idx])
pred_y = model.predict(X_scal.iloc[valid_idx])
mse = mean_squared_error(Y.iloc[valid_idx], pred_y)
print("Alpha:{0:.5f}, R2:{1:.7f}, MSE:{2:.7f}, RMSE:{3:.7f}".format(a, score, mse, np.sqrt(mse)))
Pick alpha = 0.01 or 0.1.
model_best = Ridge(alpha=0.01).fit(X_scal.iloc[train_idx], Y.iloc[train_idx])
summary(model_best, X_scal.iloc[valid_idx], Y.iloc[valid_idx], xlabels = X_scal.columns)
# Using RidgeCV (built-in cross-validated alpha search)
ridge_cv = RidgeCV(alphas=penalty, cv=5)
model = ridge_cv.fit(X_scal.iloc[train_idx], Y.iloc[train_idx])
print("Best Alpha:{0:.5f}, R2:{1:.4f}".format(model.alpha_, model.best_score_))
# RidgeCV Result: refit with the best alpha and evaluate on the validation set
model_best = Ridge(alpha=model.alpha_).fit(X_scal.iloc[train_idx], Y.iloc[train_idx])
score = model_best.score(X_scal.iloc[valid_idx], Y.iloc[valid_idx])
pred_y = model_best.predict(X_scal.iloc[valid_idx])
mse = mean_squared_error(Y.iloc[valid_idx], pred_y)
print("Alpha:{0:.5f}, R2:{1:.7f}, MSE:{2:.7f}, RMSE:{3:.7f}".format(model.alpha_, score, mse, np.sqrt(mse)))
summary(model_best, X_scal.iloc[valid_idx], Y.iloc[valid_idx], xlabels=X_scal.columns)