#Data processing / calculation / analysis library
import numpy as np
import pandas as pd
#Graph drawing library
import matplotlib.pyplot as plt
%matplotlib inline
#Machine learning library
import sklearn
from sklearn.linear_model import Ridge, Lasso #Class for regression model generation
#Module to make matplotlib support Japanese display
!pip install japanize-matplotlib
import japanize_matplotlib
# Location of the sample data set (50 explanatory variables plus a target column "y")
url = 'https://raw.githubusercontent.com/yumi-ito/sample_data/master/ridge_lasso_50variables.csv'
# Load the CSV into a DataFrame and display its contents
df = pd.read_csv(url)
print(df)
# Explanatory variables: every column except "y"
x = df.drop(columns='y')
# Objective (target) variable: the "y" column
y = df['y']
# Generate 50 candidate values of λ (alpha), evenly spaced on a log10
# scale between 10**-2 and 10**0.7
num_alphas = 50
alphas = np.logspace(-2, 0.7, num=num_alphas)
print(alphas)
`numpy.logspace()` is a function with a twist: it generates values that are **evenly spaced on a base-10 logarithmic scale**. Its arguments are (start exponent, stop exponent, number of values to generate), so if you actually take the logarithm of the result with `np.log10(alphas)`, you get an ordinary arithmetic progression.
`numpy.arange()` might seem good enough here, but the values are generated this way because a **logarithmic scale** is needed for the visualization later. On a logarithmic scale each tick multiplies the value by a constant factor; a graph that uses a log scale on the x-axis, the y-axis, or both is called a logarithmic graph.
Ordinary scales are called linear scales, but converting to a logarithmic scale compresses the axis along the number line, which makes it much easier to visually compare data points whose magnitudes differ by several orders of magnitude.
# Estimate a ridge regression for each candidate alpha and collect the
# fitted coefficient vector of every model (Ridge.fit returns the model,
# so the coefficients can be read off in a single expression)
ridge_coefs = [
    Ridge(alpha=a, fit_intercept=False).fit(x, y).coef_
    for a in alphas
]
Each iteration fits the model and appends the estimated regression coefficients to `ridge_coefs`. The `fit_intercept=False` argument of `Ridge()`, which creates the model object, specifies whether to estimate the intercept. When it is set to `False`, the intercept is not calculated — in other words, the fitted line always passes through the origin.
#Convert the accumulated regression coefficients to a numpy array
# Stack the per-alpha coefficient vectors into a 2-D numpy array
# (one row per alpha value, one column per explanatory variable)
ridge_coefs = np.asarray(ridge_coefs)
print(f"Array shape: {ridge_coefs.shape}")
print(ridge_coefs)
`log_alphas` is the logarithmic transformation of `alphas` with the sign flipped. The `plt.text()` function, which draws text inside a graph, takes (x, y, "str") as arguments to specify the coordinates and the string to display.
#Logarithmic conversion of alphas (-log10)
# Transform alphas onto a -log10 scale for the x-axis of the path plot
log_alphas = -np.log10(alphas)
# Specify the size of the graph area
plt.figure(figsize=(8, 6))
# Line graph with -log10(λ) on the x-axis and the coefficients on the y-axis
plt.plot(log_alphas, ridge_coefs)
# Label the path of the first explanatory variable; ridge_coefs was already
# converted to an ndarray above, so the redundant np.array() wrapper is removed
plt.text(max(log_alphas) + 0.1, ridge_coefs[0, 0], "x_1", fontsize=13)
# Specify the x-axis range (small margins on both sides)
plt.xlim([min(log_alphas) - 0.1, max(log_alphas) + 0.3])
# Axis labels
plt.xlabel("Regularization parameter λ(-log10)", fontsize=13)
plt.ylabel("Regression coefficient", fontsize=13)
# Grid lines
plt.grid()
# Estimate a lasso regression for each candidate alpha and collect the
# fitted coefficient vector of every model
lasso_coefs = [
    Lasso(alpha=a, fit_intercept=False).fit(x, y).coef_
    for a in alphas
]
# Convert the accumulated regression coefficients to a numpy array
# (one row per alpha value, one column per explanatory variable)
lasso_coefs = np.array(lasso_coefs)
print("Array shape:", lasso_coefs.shape)
print(lasso_coefs)
# Specify the size of the graph area
plt.figure(figsize=(8, 6))
# Line graph with -log10(λ) on the x-axis and the coefficients on the y-axis
plt.plot(log_alphas, lasso_coefs)
# Label the path of the first explanatory variable; lasso_coefs was already
# converted to an ndarray above, so the redundant np.array() wrapper is removed
plt.text(max(log_alphas) + 0.1, lasso_coefs[0, 0], "x_1", fontsize=13)
# Specify the x-axis range (small margins on both sides)
plt.xlim([min(log_alphas) - 0.1, max(log_alphas) + 0.3])
# Axis labels
plt.xlabel("Regularization parameter λ(-log10)", fontsize=13)
plt.ylabel("Regression coefficient", fontsize=13)
# Grid lines
plt.grid()