--Simple regression: A method that predicts a target value from a single input variable.
Example: predict weight from height. y = w₀ + w₁x
--Multiple regression: A method that predicts a target value from two or more input variables.
Example: predict weight from height, waist size, and body fat. y = w₀ + w₁x₁ + w₂x₂ + w₃x₃ + …
--Now, I want to fit a regression to the data shown in the figure below. However, its shape clearly cannot be expressed by a straight line.
--So we try polynomial regression instead.
--Polynomial regression: A type of multiple regression analysis. In addition to the input x, its powers x², x³, … are added as new input features.
--The least squares method only minimizes the error on the training data, so it tends to produce overly complex models.
--Underfitting: A situation in which the model cannot fully express the training data. The loss remains high.
--Overfitting: A situation in which the model fits the training data too closely. Generalization performance is low.
--Regularization: A method that prevents overfitting by selecting appropriate coefficients or shrinking their magnitude. It identifies the coefficients that really matter.
--Variable selection (stepwise selection): Add or remove variables one at a time to maximize the goodness of fit.
--Shrinkage estimation: (1) Ridge regression: shrinks the absolute values of the coefficients. (2) Lasso regression: sets some coefficients to exactly 0.
import numpy as np
import matplotlib.pyplot as plt
# Number of noisy training samples.
data_size = 20
# Training inputs: data_size evenly spaced points covering 0 to 1.
X = np.linspace(0, 1, num=data_size)
# Noise: uniform draws from [-1.0, 1.0), scaled by 0.2.
noise = 0.2 * np.random.uniform(-1.0, 1.0, data_size)
# Targets: one period of a sine wave plus the noise.
y = np.sin(2.0 * np.pi * X) + noise
# Dense grid of 1000 points from 0 to 1, used to draw smooth curves.
X_line = np.linspace(0, 1, num=1000)
# Noise-free sine evaluated on the dense grid.
sin_X = np.sin(2.0 * np.pi * X_line)
def plot_sin():
    """Plot the noisy samples as a scatter and the true sine curve in red.

    Reads the module-level ``X``, ``y``, ``X_line`` and ``sin_X`` at call
    time, so it draws whatever those names are bound to when it is called.
    """
    plt.scatter(X, y)
    plt.plot(X_line, sin_X, "red")


plot_sin()
from sklearn.linear_model import LinearRegression
# Fit a straight line to the samples. X is reshaped to a single column
# because fit() is given one feature per sample.
lin_reg_model = LinearRegression()
lin_reg_model.fit(X.reshape(-1, 1), y)
# Intercept and slope of the fitted line. This is a bare expression: it has
# no effect when the file runs as a plain script.
lin_reg_model.intercept_, lin_reg_model.coef_
# Draw the fitted line over the dense grid, then the samples and sine curve.
plt.plot(X_line, lin_reg_model.intercept_ + X_line * lin_reg_model.coef_)
plot_sin()
from sklearn.preprocessing import PolynomialFeatures
# Expand x into the four columns [1, x, x^2, x^3] (0th to 3rd power),
# giving a 20-by-4 array for the 20 training inputs.
poly = PolynomialFeatures(degree=3)
X_poly_3 = poly.fit_transform(X.reshape(-1, 1))
lin_reg_3_model = LinearRegression().fit(X_poly_3, y)
# Reuse the transformer fitted on the training inputs: the plotting grid is
# only transformed, never used to refit it.
X_line_poly_3 = poly.transform(X_line.reshape(-1, 1))
plt.plot(X_line, lin_reg_3_model.predict(X_line_poly_3))
plot_sin()
# Fit polynomials of degree 5, 15 and 25 and draw each in its own subplot so
# the fits can be compared side by side.
fig, axes = plt.subplots(1, 3, figsize=(16, 4))
for degree, ax in zip([5, 15, 25], axes):
    poly = PolynomialFeatures(degree=degree)
    X_poly = poly.fit_transform(X.reshape(-1, 1))
    lin_reg = LinearRegression().fit(X_poly, y)
    # Transform (do not refit) the plotting grid with the transformer that
    # was fitted on the training inputs.
    X_line_poly = poly.transform(X_line.reshape(-1, 1))
    ax.plot(X_line, lin_reg.predict(X_line_poly))
    # Same overlay as plot_sin(), but drawn on this subplot's axes.
    ax.scatter(X, y)
    ax.plot(X_line, sin_X, "red")
# NOTE: after the loop, `poly` and `lin_reg` stay bound to the degree-25
# transformer and model.
import mglearn
import pandas as pd
from sklearn.model_selection import train_test_split
# Load the extended Boston dataset bundled with mglearn.
# NOTE: this rebinds the module-level X and y that plot_sin() reads.
X, y = mglearn.datasets.load_extended_boston()
df_X = pd.DataFrame(X)
# NOTE(review): `dy_y` is presumably a typo for `df_y`; the name is kept
# unchanged in case other code refers to it.
dy_y = pd.DataFrame(y)
# Split into training and test sets with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# Baseline: linear regression without regularization.
lin_reg_model = LinearRegression()
lin_reg_model.fit(X_train, y_train)
# score() is the model's R^2 (coefficient of determination), not a
# classification precision.
print(round(lin_reg_model.score(X_train, y_train), 3))  # training-set score
print(round(lin_reg_model.score(X_test, y_test), 3))  # test-set score
from sklearn.linear_model import Ridge,Lasso
# Ridge regression with the default regularization strength.
# NOTE(review): this model is fitted but never passed to print_score() in
# the visible code.
ridge_model = Ridge().fit(X_train, y_train)


def print_score(model):
    """Print ``model.score`` on the training split, then on the test split.

    Both values are rounded to 3 decimal places. The splits are the
    module-level ``X_train``/``y_train`` and ``X_test``/``y_test``, read at
    call time.

    Args:
        model: A fitted estimator exposing ``score(X, y)``.
    """
    print(round(model.score(X_train, y_train), 3))  # training-set score
    print(round(model.score(X_test, y_test), 3))  # test-set score
# A larger alpha shrinks the absolute values of the coefficients more
# strongly (the default alpha is 1).
ridge_10_model = Ridge(alpha=10)
ridge_10_model.fit(X_train, y_train)
print_score(ridge_10_model)
# A smaller alpha applies weaker shrinkage.
ridge_01_model = Ridge(alpha=0.1)
ridge_01_model.fit(X_train, y_train)
print_score(ridge_01_model)
# Side-by-side coefficients of the unregularized model and the two Ridge
# models, all fitted on X_train. The "lin_reg" column must come from
# lin_reg_model: the name `lin_reg` is the leftover degree-25 polynomial fit
# from the plotting loop, whose coefficient vector has a different length.
coefficients = pd.DataFrame(
    {
        "lin_reg": lin_reg_model.coef_,
        "ridge_10_model": ridge_10_model.coef_,
        "ridge_01_model": ridge_01_model.coef_,
    }
)
# Bare expression: has no effect when the file runs as a plain script.
coefficients
# Lasso with a small alpha; max_iter is raised to 10000 iterations.
lasso_001_model = Lasso(alpha=0.01, max_iter=10000).fit(X_train, y_train)
print_score(lasso_001_model)
# Compare Lasso's coefficients with the unregularized model fitted on the
# same X_train. The "lin_reg" column must come from lin_reg_model: the name
# `lin_reg` is the leftover degree-25 polynomial fit from the plotting loop.
coefficients_lasso = pd.DataFrame(
    {
        "lin_reg": lin_reg_model.coef_,
        "lasso_001_model": lasso_001_model.coef_,
    }
)
Recommended Posts