from sklearn.datasets import load_boston
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the Boston housing data into a DataFrame, with the target as "PRICE".
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 (ethical concerns about the engineered "B" feature). On modern versions
# switch to fetch_openml(name="boston", version=1) or a different dataset.
boston = load_boston()
df = pd.DataFrame(boston["data"], columns=boston["feature_names"])
df["PRICE"] = boston["target"]
# FIX: outside a notebook a bare df.head() expression is silently discarded;
# print it so the script actually shows the preview.
print(df.head())
#How to use scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm
from sklearn.ensemble import GradientBoostingRegressor
#Input data: all columns except the target as features, PRICE as target.
X = df.drop("PRICE", axis=1)
Y = df["PRICE"]
#Divided into train data and test data (30% held out, fixed seed).
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
#Standardize values.
# BUG FIX: the original reused ONE StandardScaler -- it was fit on X_train and
# then immediately re-fit on Y_train, so the subsequent transform() calls on
# X_test AND Y_test both used the Y statistics (wrong mean/scale for X_test).
# Use a separate scaler per variable. Also fixes the `X_train__std` typo.
sc_x = StandardScaler()
sc_y = StandardScaler()
X_train_std = sc_x.fit_transform(X_train)
X_test_std = sc_x.transform(X_test)
Y_train_std = sc_y.fit_transform(Y_train.values.reshape(-1, 1))
Y_test_std = sc_y.transform(Y_test.values.reshape(-1, 1))
# --- Ordinary least-squares linear regression ---
print("***Linear regression***")
model_linear = LinearRegression()
model_linear.fit(X_train, Y_train)
# score() returns the coefficient of determination (R^2) on each split.
for split_name, features, target in (("training", X_train, Y_train),
                                     ("validation", X_test, Y_test)):
    print(f"Correlation coefficient of {split_name} data:",
          model_linear.score(features, target))
# Predicted-vs-actual scatter as a quick visual goodness-of-fit check.
Y_train_pred = model_linear.predict(X_train)
Y_test_pred = model_linear.predict(X_test)
for actual, predicted, tag in ((Y_train, Y_train_pred, "train_data"),
                               (Y_test, Y_test_pred, "test_data")):
    plt.scatter(actual, predicted, label=tag)
plt.legend()
plt.show()
# --- Support-vector regression with a linear kernel ---
print("***SVM regression***")
# Regularization strength C=1, linear kernel, epsilon-insensitive tube of 0.1.
model_svm = svm.SVR(C=1.0, kernel='linear', epsilon=0.1)
model_svm.fit(X_train, Y_train)
# R^2 on each split via score().
for split_name, features, target in (("training", X_train, Y_train),
                                     ("validation", X_test, Y_test)):
    print(f"Correlation coefficient of {split_name} data:",
          model_svm.score(features, target))
# Predicted-vs-actual scatter for both splits.
Y_train_pred = model_svm.predict(X_train)
Y_test_pred = model_svm.predict(X_test)
for actual, predicted, tag in ((Y_train, Y_train_pred, "train_data"),
                               (Y_test, Y_test_pred, "test_data")):
    plt.scatter(actual, predicted, label=tag)
plt.legend()
plt.show()
#Ridge regression (L2-penalized least squares).
print("***Ridge regression***")
# FIX: the `normalize` argument was deprecated in scikit-learn 1.0 and removed
# in 1.2; the original passed normalize=False, which was the default, so
# dropping it leaves behavior unchanged on every version.
model_ridge = Ridge(alpha=1.0, fit_intercept=True, copy_X=True,
                    max_iter=None, tol=0.001, random_state=0)
model_ridge.fit(X_train, Y_train)
print("Correlation coefficient of training data:", model_ridge.score(X_train, Y_train))
print("Correlation coefficient of validation data:", model_ridge.score(X_test, Y_test))
# Predicted vs. actual prices for both splits.
Y_train_pred = model_ridge.predict(X_train)
Y_test_pred = model_ridge.predict(X_test)
plt.scatter(Y_train, Y_train_pred, label="train_data")
plt.scatter(Y_test, Y_test_pred, label="test_data")
plt.legend()
plt.show()
#Lasso regression (L1-penalized least squares).
# FIX: the banner said "Lasso return" (a mistranslation of "Lasso regression");
# corrected to match the "***<name> regression***" style of the other sections.
print("***Lasso regression***")
# FIX: `normalize` was removed in scikit-learn 1.2; it was passed at its
# default (False), so dropping it is behavior-preserving.
model_lasso = Lasso(alpha=1.0, fit_intercept=True, copy_X=True,
                    max_iter=1000, tol=0.0001,
                    warm_start=False, positive=False,
                    random_state=None, selection="cyclic")
model_lasso.fit(X_train, Y_train)
print("Correlation coefficient of training data:", model_lasso.score(X_train, Y_train))
print("Correlation coefficient of validation data:", model_lasso.score(X_test, Y_test))
# Predicted vs. actual prices for both splits.
Y_train_pred = model_lasso.predict(X_train)
Y_test_pred = model_lasso.predict(X_test)
plt.scatter(Y_train, Y_train_pred, label="train_data")
plt.scatter(Y_test, Y_test_pred, label="test_data")
plt.legend()
plt.show()
#Elastic net regression (mixed L1/L2 penalty; l1_ratio=0.5 weights them equally).
print("***Elastic net regression***")
# FIX: `normalize` was removed in scikit-learn 1.2; it was passed at its
# default (False), so dropping it is behavior-preserving.
model_lasso_elasticnet = ElasticNet(alpha=1.0, l1_ratio=0.5,
                                    fit_intercept=True,
                                    max_iter=1000, copy_X=True,
                                    tol=0.0001, warm_start=False,
                                    positive=False, random_state=None,
                                    selection='cyclic')
model_lasso_elasticnet.fit(X_train, Y_train)
print("Correlation coefficient of training data:", model_lasso_elasticnet.score(X_train, Y_train))
print("Correlation coefficient of validation data:", model_lasso_elasticnet.score(X_test, Y_test))
# Predicted vs. actual prices for both splits.
Y_train_pred = model_lasso_elasticnet.predict(X_train)
Y_test_pred = model_lasso_elasticnet.predict(X_test)
plt.scatter(Y_train, Y_train_pred, label="train_data")
plt.scatter(Y_test, Y_test_pred, label="test_data")
plt.legend()
plt.show()
#Random forest regression.
print("***Random forest regression***")
# FIXES for modern scikit-learn (all behavior-preserving):
#  * criterion='mse' was renamed to 'squared_error' (alias removed in 1.2).
#  * min_impurity_split was removed in 1.0 (it was None, a no-op, here).
#  * max_features='auto' was removed in 1.3; for a regressor it meant
#    "use all features", i.e. max_features=1.0.
model_randomforest = RandomForestRegressor(
    n_estimators=10, criterion='squared_error',
    max_depth=None, max_features=1.0, max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_samples_leaf=1, min_samples_split=2,
    min_weight_fraction_leaf=0.0,
    bootstrap=True, n_jobs=-1,
    oob_score=False, random_state=2525, verbose=0, warm_start=False)
model_randomforest.fit(X_train, Y_train)
print("Correlation coefficient of training data:", model_randomforest.score(X_train, Y_train))
print("Correlation coefficient of validation data:", model_randomforest.score(X_test, Y_test))
# Predicted vs. actual prices for both splits.
Y_train_pred = model_randomforest.predict(X_train)
Y_test_pred = model_randomforest.predict(X_test)
plt.scatter(Y_train, Y_train_pred, label="train_data")
plt.scatter(Y_test, Y_test_pred, label="test_data")
plt.legend()
plt.show()
#Gradient boosting regression.
# FIX: the banner now matches the "***...***" style used by every other section.
print("***Gradient boosting regression***")
model_gbc = GradientBoostingRegressor(random_state=0)
model_gbc.fit(X_train, Y_train)
print("Correlation coefficient of training data:", model_gbc.score(X_train, Y_train))
print("Correlation coefficient of validation data:", model_gbc.score(X_test, Y_test))
# Predicted vs. actual prices for both splits.
Y_train_pred = model_gbc.predict(X_train)
Y_test_pred = model_gbc.predict(X_test)
plt.scatter(Y_train, Y_train_pred, label="train_data")
plt.scatter(Y_test, Y_test_pred, label="test_data")
plt.legend()
plt.show()
from sklearn.preprocessing import PolynomialFeatures
# Demo: expand a small frame with degree-2 polynomial / interaction terms.
df1 = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["col_a", "col_b", "col_c"])
print(df1)
pf = PolynomialFeatures(degree=2, include_bias=False)
# BUG FIX: the original referenced an undefined name `a`; the frame built
# above is `df1`. Also, get_feature_names() was deprecated in scikit-learn
# 1.0 and removed in 1.2 -> use get_feature_names_out().
df2 = pd.DataFrame(pf.fit_transform(df1), columns=pf.get_feature_names_out(df1.columns))
print(df2)
#Mean squared error (the MEAN of the squared residuals -- the original comment
#said "sum", which is only what it is before dividing by the sample count).
from sklearn.metrics import mean_squared_error
# BUG FIX: `y` and `y_pred` were undefined (NameError) and the results were
# discarded; evaluate and print the most recent test-set predictions instead.
print(mean_squared_error(Y_test, Y_test_pred))
#Coefficient of determination R^2: 1.0 is a perfect fit; it can be NEGATIVE
#when the model fits worse than always predicting the mean.
from sklearn.metrics import r2_score
print(r2_score(Y_test, Y_test_pred))
# If "f1" is specified for the scoring argument, cross-validation is evaluated by the F1 score.
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
# BUG FIX: RandomForestClassifier was used here but only imported further
# down; import it before first use.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
# BUG FIX: `y` was undefined, and the continuous PRICE target would make
# scoring='accuracy' fail anyway. Binarize the target (above/below the
# median price) so the classification demos actually run.
y_class = (Y > Y.median()).astype(int)
# 5-fold cross-validated accuracy of a default random forest classifier.
cv = KFold(5, shuffle=True)
model_rfc_1 = RandomForestClassifier()
print(cross_val_score(model_rfc_1, X, y_class, cv=cv, scoring='accuracy'))
# Exhaustive search over a small hyper-parameter grid, scored by accuracy.
param_grid = {'max_depth': [5, 10, 15], 'n_estimators': [10, 20, 30]}
model_rfc_2 = RandomForestClassifier()
grid_search = GridSearchCV(model_rfc_2, param_grid, cv=cv, scoring='accuracy')
grid_search.fit(X, y_class)
# Recommended Posts (footer left over from the article this script was copied from)