We will try 28 types of regression models from scikit-learn and generate a graph comparing their accuracy.
There are many wonderful and convenient tools for trying out lots of machine learning models at once, such as AutoML systems and, more recently, PyCaret, but sometimes I want to prepare the models myself, so I am leaving this as a memorandum.
First, import the required models and put all of the regression model instances together in a dictionary.
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor
from sklearn.linear_model import PassiveAggressiveRegressor, ARDRegression, RidgeCV
from sklearn.linear_model import TheilSenRegressor, RANSACRegressor, HuberRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.experimental import enable_hist_gradient_boosting  # noqa: required for scikit-learn < 1.0
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor, GradientBoostingRegressor, VotingRegressor, StackingRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.cross_decomposition import PLSRegression
reg_dict = {"LinearRegression": LinearRegression(),
"Ridge": Ridge(),
"Lasso": Lasso(),
"ElasticNet": ElasticNet(),
"Polynomial_deg2": Pipeline([('poly', PolynomialFeatures(degree=2)),('linear', LinearRegression())]),
"Polynomial_deg3": Pipeline([('poly', PolynomialFeatures(degree=3)),('linear', LinearRegression())]),
"Polynomial_deg4": Pipeline([('poly', PolynomialFeatures(degree=4)),('linear', LinearRegression())]),
"Polynomial_deg5": Pipeline([('poly', PolynomialFeatures(degree=5)),('linear', LinearRegression())]),
"KNeighborsRegressor": KNeighborsRegressor(n_neighbors=3),
"DecisionTreeRegressor": DecisionTreeRegressor(),
"RandomForestRegressor": RandomForestRegressor(),
"SVR": SVR(kernel='rbf', C=1e3, gamma=0.1, epsilon=0.1),
"GaussianProcessRegressor": GaussianProcessRegressor(),
"SGDRegressor": SGDRegressor(),
"MLPRegressor": MLPRegressor(hidden_layer_sizes=(10,10), max_iter=100, early_stopping=True, n_iter_no_change=5),
"ExtraTreesRegressor": ExtraTreesRegressor(n_estimators=100),
"PLSRegression": PLSRegression(n_components=10),
"PassiveAggressiveRegressor": PassiveAggressiveRegressor(max_iter=100, tol=1e-3),
"TheilSenRegressor": TheilSenRegressor(random_state=0),
"RANSACRegressor": RANSACRegressor(random_state=0),
"HistGradientBoostingRegressor": HistGradientBoostingRegressor(),
"AdaBoostRegressor": AdaBoostRegressor(random_state=0, n_estimators=100),
"BaggingRegressor": BaggingRegressor(base_estimator=SVR(), n_estimators=10),
"GradientBoostingRegressor": GradientBoostingRegressor(random_state=0),
"VotingRegressor": VotingRegressor([('lr', LinearRegression()), ('rf', RandomForestRegressor(n_estimators=10))]),
"StackingRegressor": StackingRegressor(estimators=[('lr', RidgeCV()), ('svr', LinearSVR())], final_estimator=RandomForestRegressor(n_estimators=10)),
"ARDRegression": ARDRegression(),
"HuberRegressor": HuberRegressor(),
}
** Caution 1 ** The arguments passed to each regression model above are fairly arbitrary; the code is only an example. When actually using these models, it is strongly recommended to set appropriate values for your data, or to combine them with hyperparameter optimization such as grid search (a sketch follows below).
** Caution 2 ** Polynomial_deg2-5 perform regression with polynomial functions of degree 2 to 5 (a small illustration follows the grid-search sketch). For descriptions of the other models, see the official scikit-learn documentation.
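As a minimal sketch of what Caution 1 means (not from the original post), one of the models could be tuned with GridSearchCV; the alpha grid and cv=5 below are illustrative assumptions, not recommended values.

from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Small synthetic dataset, used only for this demonstration
x_demo, y_demo = make_regression(n_samples=100, n_features=10, noise=10.0, random_state=0)

param_grid = {"alpha": [0.01, 0.1, 1.0, 10.0, 100.0]}  # hypothetical search space
search = GridSearchCV(Ridge(), param_grid, cv=5)
search.fit(x_demo, y_demo)
print(search.best_params_)      # best alpha found by cross-validation
print(search.best_estimator_)   # Ridge refit on the full data with that alpha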
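To make the Polynomial_degN pipelines concrete, here is a minimal sketch (a tiny hand-made array and degree 2 are my own assumptions) of what PolynomialFeatures produces before the LinearRegression step sees the data.

import numpy as np
from sklearn.preprocessing import PolynomialFeatures

x_small = np.array([[2.0, 3.0]])        # one sample with two features: x1=2, x2=3
poly = PolynomialFeatures(degree=2)
print(poly.fit_transform(x_small))      # [[1. 2. 3. 4. 6. 9.]] -> 1, x1, x2, x1^2, x1*x2, x2^2

The linear model in the pipeline is then fitted on these expanded features, which is what turns it into degree-2 (or 3, 4, 5) polynomial regression.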
Next, use sklearn's make_regression to generate data for the regression.
All models are then trained and evaluated 10 times each, and sorted in order of accuracy (MAPE is used here).
from sklearn.model_selection import train_test_split
import random
from sklearn.datasets import make_regression
import numpy as np

def mean_absolute_percentage_error(y_true, y_pred):
    """MAPE"""
    # Note: scikit-learn >= 0.24 also provides sklearn.metrics.mean_absolute_percentage_error
    # (it returns a fraction rather than a percentage).
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

test_size = 0.3  # Split ratio
N_trials = 10    # Number of trials

# Generate data to be regressed
x, y = make_regression(random_state=12,
                       n_samples=100,
                       n_features=10,
                       n_informative=4,
                       noise=10.0,
                       bias=0.0)

mape_dict = {reg_name: [] for reg_name in reg_dict.keys()}  # Stores the MAPE scores for each model

for i in range(N_trials):
    print(f"Trial {i+1}")
    random_state = random.randint(0, 1000)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=random_state)
    for reg_name, reg in reg_dict.items():
        reg.fit(x_train, y_train)
        y_pred = reg.predict(x_test)
        mape = mean_absolute_percentage_error(y_test, y_pred)  # Calculate MAPE
        mape_dict[reg_name].append(mape)                       # Store it

# Sort by average MAPE (descending, so the best models come last)
mape_dict_sorted = {key: value for key, value in reversed(sorted(mape_dict.items(), key=lambda x: np.mean(x[1])))}
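As an optional check not in the original code, you could also print the mean and standard deviation of MAPE per model before plotting:

# mape_dict_sorted runs from worst to best mean MAPE, so reverse it for a best-first listing
for reg_name in reversed(list(mape_dict_sorted.keys())):
    scores = mape_dict_sorted[reg_name]
    print(f"{reg_name}: mean MAPE = {np.mean(scores):.2f}, std = {np.std(scores):.2f}")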
In the loop above, MAPE was measured 10 times for every model, and the models were sorted by their average MAPE.
When judging the quality of a machine learning model, we want to look not only at the average accuracy but also at its dispersion (variance or a similar index).
Therefore, let's draw a box plot.
Smaller MAPE is better, so in the figure the models nearer the top are better, and the narrower a model's box, the more stable it is.
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.colors import Normalize

plt.rcParams["font.size"] = 18  # Increase the font size
scalarMap = cm.ScalarMappable(norm=Normalize(vmin=0, vmax=len(mape_dict)), cmap=plt.get_cmap('gist_rainbow_r'))

plt.figure(figsize=(15, 9))
box = plt.boxplot(mape_dict_sorted.values(), vert=False, patch_artist=True, labels=mape_dict_sorted.keys())
for i, patch in enumerate(box['boxes']):
    patch.set_facecolor(scalarMap.to_rgba(i))
plt.title("MAPE Box Plot")
plt.xlabel("MAPE")
plt.ylabel("Regressor Name")
plt.show()
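If you also want to pick out the single best model programmatically (a small addition not in the original post), remember that mape_dict_sorted runs from worst to best mean MAPE, so the last key is the winner:

best_name = list(mape_dict_sorted.keys())[-1]
print("Best model:", best_name, "with mean MAPE:", np.mean(mape_dict_sorted[best_name]))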