#Data processing / calculation / analysis library
import numpy as np
import pandas as pd
#Graph drawing library
import matplotlib.pyplot as plt
%matplotlib inline
#Machine learning library
import sklearn
#Location of the automobile dataset (raw CSV)
url = 'https://raw.githubusercontent.com/yumi-ito/datasets/master/datasets_auto.csv'
#Labels for the 26 columns, listed in the order they are applied to the DataFrame
column_labels = [
    'symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
    'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
    'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type',
    'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke',
    'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg',
    'price',
]
#Load the CSV as a DataFrame; header=None keeps the first line as data
df = pd.read_csv(url, header=None)
#Attach the labels (raises if the number of columns does not match)
df.columns = column_labels
| | Variable name | Free translation | Item (commentary) | Data type |
---|---|---|---|---|
0 | symboling | Insurance risk rating | -3, -2, -1, 0, 1, 2, 3.(3 is high risk and dangerous,-3 is low risk and safe) | int64 |
1 | normalized-losses | Normalized losses | 65~256 | object |
2 | make | Maker | alfa-romero, audi, bmw, ..., volkswagen, volvo. | object |
3 | fuel-type | Fuel type | diesel, gas. | object |
4 | aspiration | Intake type | std, turbo. | object |
5 | num-of-doors | Number of doors | four, two. | object |
6 | body-style | Body style | hardtop, wagon, sedan, hatchback, convertible. | object |
7 | drive-wheels | Drive wheels | 4wd, fwd, rwd. | object |
8 | engine-location | Engine position | front, rear. | object |
9 | wheel-base | Wheelbase | 86.6~120.9 | float64 |
10 | length | Overall length | 141.1~208.1 | float64 |
11 | width | Vehicle width | 60.3~72.3 | float64 |
12 | height | Vehicle height | 47.8~59.8 | float64 |
13 | curb-weight | Curb weight (vehicle weight without occupants or cargo) | 1488~4066 | int64 |
14 | engine-type | Engine type | dohc, dohcv, l, ohc, ohcf, ohcv, rotor. | object |
15 | num-of-cylinders | Number of cylinders | eight, five, four, six, three, twelve, two. | object |
16 | engine-size | Engine size | 61~326 | int64 |
17 | fuel-system | Fuel system | 1bbl, 2bbl, 4bbl, idi, mfi, mpfi, spdi, spfi. | object |
18 | bore | Engine cylinder inner diameter | 2.54~3.94 | object |
19 | stroke | Amount of movement of the piston | 2.07~4.17 | object |
20 | compression-ratio | Compression ratio | 7~23 | float64 |
21 | horsepower | horsepower | 48~288 | object |
22 | peak-rpm | Maximum output | 4150~6600 | object |
23 | city-mpg | City fuel economy | 13-49 (miles traveled per gallon of oil) | int64 |
24 | highway-mpg | Highway fuel economy | 16~54 | int64 |
25 | price | price | 5118~45400 | object |
#Count every NaN cell in the whole table (column sums, then the grand total)
n_missing = df.isnull().sum().sum()
#Show (rows, columns) followed by the missing-value count
print(df.shape)
print(f'Number of defects:{n_missing}')
#Preview the first 5 rows
df.head()
#Keep only the columns used in the analysis
target_columns = ['price', 'horsepower', 'width', 'height']
auto = df[target_columns]
#Count, per column, the cells that hold the '?' placeholder
auto.isin(['?']).sum()
#Turn every '?' into NaN, then drop the rows that contain NaN
auto = auto.replace('?', np.nan)
auto = auto.dropna()
#Shape that remains after the rows were dropped
auto.shape
#Data types before conversion
auto.dtypes
#Convert price and horsepower to numeric types in a single assign() call
auto = auto.assign(
    price=pd.to_numeric(auto.price),
    horsepower=pd.to_numeric(auto.horsepower),
)
#Data types after conversion
auto.dtypes
With the `assign()` function, if you specify `column name = value` as a keyword argument, the specified value is assigned to the existing column, and a new column is added if the column name is new.
The following uses the `corr()` function of pandas.
auto.corr()
#Print the auto DataFrame (after '?' removal and numeric conversion) to check its contents
print(auto)
**Using this data, perform model estimation for ridge regression and multiple regression analysis, and compare the accuracy of both.**
#Import for model building of ridge regression
from sklearn.linear_model import Ridge
#Import for model building of multiple regression analysis
from sklearn.linear_model import LinearRegression
#Import for data splitting (training data and test data)
from sklearn.model_selection import train_test_split
The `drop()` function removes the `price` column, so that only the explanatory variables are set to `x` and only `price` is set to `y`.
With the `train_test_split` method, the explanatory variable x and the objective variable y are each separated into training data (train) and test data (test).
#Set explanatory variables and objective variables
y = auto['price']  # objective variable: the price column
x = auto.drop(columns='price')  # explanatory variables: every column except price
#Split half of the rows into test data; random_state=0 fixes the shuffle
X_train, X_test, Y_train, Y_test = train_test_split(
    x, y, test_size=0.5, random_state=0
)
**First, build a model for multiple regression analysis and calculate the accuracy rate of training data and test data.**
#Create the multiple regression model and train it on the training data
linear = LinearRegression()
linear.fit(X_train, Y_train)
#Score both splits; format() stores each score as a str
#NOTE: for scikit-learn regressors, score() returns the coefficient of determination (R^2)
train_score_linear = format(linear.score(X_train, Y_train))
test_score_linear = format(linear.score(X_test, Y_test))
#Print each score with six digits after the decimal point
print('Correct answer rate of multiple regression analysis(train):',
      f'{float(train_score_linear):.6f}')
print('Correct answer rate of multiple regression analysis(test):',
      f'{float(test_score_linear):.6f}')
`'{:.Nf}'.format()` specifies the number of digits after the decimal point, where N is the number of digits.
Because `train_score_linear` and `test_score_linear` are str, they are converted to floating point numbers by `float()`.
**Next, build a model of ridge regression and calculate the accuracy rate of training data and test data.**
#Create the ridge regression model (default settings) and train it on the training data
ridge = Ridge()
ridge.fit(X_train, Y_train)
#Score both splits; format() stores each score as a str
#NOTE: for scikit-learn regressors, score() returns the coefficient of determination (R^2)
train_score_ridge = format(ridge.score(X_train, Y_train))
test_score_ridge = format(ridge.score(X_test, Y_test))
#Print each score with six digits after the decimal point
print('Correct answer rate of ridge regression(train):',
      f'{float(train_score_ridge):.6f}')
print('Correct answer rate of ridge regression(test):',
      f'{float(test_score_ridge):.6f}')
| | Multiple regression analysis(L) | Ridge regression(R) | Difference(L-R) |
---|---|---|---|
Correct answer rate of training data | 0.733358 | 0.733355 | 0.000003 |
Test data accuracy rate | 0.737069 | 0.737768 | -0.000699 |
Recommended Posts