**Here, I would like to compare three multiple regression models, including lasso regression.**
#Data processing / calculation / analysis library
import numpy as np
import pandas as pd
#Graph drawing library
import matplotlib.pyplot as plt
%matplotlib inline
#Machine learning library
import sklearn
# URL of the CSV file to analyse
url = 'https://raw.githubusercontent.com/yumi-ito/datasets/master/datasets_auto_4variables_pre-processed.csv'
# Load it into a DataFrame; header=None makes pandas treat the first row as data, not as labels
df = pd.read_csv(url, header=None)
# Label the four columns
df.columns = ['width', 'height', 'horsepower', 'price']
print(df)
# Show the (rows, columns) shape
print(f'Data shape: {df.shape}')
# Show the total number of missing cells, summed over every column
print(f'Number of missing values:{df.isna().sum().sum()}\n')
# Show the dtype of each column
print(df.dtypes)
#Import for model building
from sklearn.linear_model import Ridge, Lasso, LinearRegression
#Import for data splitting
from sklearn.model_selection import train_test_split
Use the `drop()` function to remove the `price` column, so that only the explanatory variables are set to `x` and only `price` is set to `y`.
With the `train_test_split` method, the explanatory variables `x` and the objective variable `y` are each separated into training data (train) and test data (test).
#Set explanatory variables and objective variables
# Explanatory variables: every column except the target
x = df.drop(columns='price')
# Objective variable: the price column on its own
y = df['price']
# Hold out half of the rows as test data; the fixed seed keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    x,
    y,
    test_size=0.5,
    random_state=0,
)
Use a `for` statement to generate each model and calculate the correct answer rate (the value returned by `score()`) for the training data and for the test data at once.
#Initialize each class and store it in models of dict type variable
# Instantiate the three regressors with their default hyperparameters
# (alpha=1.0 for Ridge and Lasso) and key them by a short name.
models = {
    'linear': LinearRegression(),
    'ridge': Ridge(random_state=0),
    'lasso': Lasso(random_state=0),
}
# Maps (model name, 'train' or 'test') -> score.
scores = {}
# Fit each model on the training split, then score it on both splits.
for model_name, model in models.items():
    model.fit(X_train, Y_train)
    # For regressors, score() returns the coefficient of determination (R^2),
    # not a classification accuracy.
    scores[(model_name, 'train')] = model.score(X_train, Y_train)
    scores[(model_name, 'test')] = model.score(X_test, Y_test)
# A dict with tuple keys becomes a Series with a two-level (model, split) index.
print(pd.Series(scores))
Calculate each correct answer rate with the `score()` function, and store it using the pair of `model_name` and either `train` or `test` as the key.

| | Multiple regression | Ridge regression | Lasso regression |
|---|---|---|---|
| Correct answer rate of training data | 0.733358 | 0.733355 | 0.733358 |
| Test data accuracy rate | 0.737069 | 0.737768 | 0.737084 |
**So I would like to change the regularization parameter and compare.**
# Regularization strength passed to both models in this run.
# NOTE: scikit-learn scales the squared-error term by 1 / (2 * n_samples) in
# the Lasso objective but not in the Ridge objective, so the same alpha does
# not correspond to the same penalty strength for the two models.
alpha = 10.0
# Re-create only the regularized models with the chosen alpha.
models = {
    'ridge': Ridge(alpha=alpha, random_state=0),
    'lasso': Lasso(alpha=alpha, random_state=0),
}
# Maps (model name, 'train' or 'test') -> score.
scores = {}
# Fit each model on the training split, then score it on both splits.
for model_name, model in models.items():
    model.fit(X_train, Y_train)
    # score() on a regressor returns the coefficient of determination (R^2).
    scores[(model_name, 'train')] = model.score(X_train, Y_train)
    scores[(model_name, 'test')] = model.score(X_test, Y_test)
print(pd.Series(scores))
λ | Ridge(train) | Ridge(test) | Lasso(train) | Lasso(test) |
---|---|---|---|---|
1 | 0.733355 | 0.737768 | 0.733358 | 0.737084 |
10 | 0.733100 | 0.743506 | 0.733357 | 0.737372 |
100 | 0.721015 | 0.771022 | 0.733289 | 0.740192 |
200 | 0.705228 | 0.778607 | 0.733083 | 0.743195 |
400 | 0.680726 | 0.779004 | 0.732259 | 0.748795 |
500 | 0.671349 | 0.777338 | 0.731640 | 0.751391 |
1000 | 0.640017 | 0.767504 | 0.726479 | 0.762336 |