My previous job was software development (Java, C#). There, we conducted detailed code reviews, guided by the principle of writing beautiful code efficiently.
I am now in charge of data analysis and machine learning. I write the code in Python, but unless it is going into a production release there is no code review, and the prevailing attitude is that it only needs to work... Admittedly, the analysis target, model, and parameters change frequently, so I understand that results matter more than putting a lot of effort into the design.
However, I dislike code that defines multiple models (for example, multiple regression, SVR, Lasso, RandomForestRegressor) in a single Jupyter notebook and runs each of them in turn, and I also dislike splitting the notebooks up per model, because that means more files to manage, which is a hassle. So I leave the one notebook that runs everything (for checking the results) untouched, and have the model specified in a configuration file and executed from there, which makes the design a little object-oriented. I'm sure some people will say, "That is not how machine learning and notebooks are done."...
# Read the dotted class path of the model to run from the configuration
# (`parser` is created elsewhere; presumably path="regression" selects the
# [regression] section and key="class_path" the entry inside it).
clazz_path = parser.get(path="regression", key="class_path")
# Instantiate the configured model class by name, so that switching models
# only requires a configuration change rather than a notebook edit.
model = ClassCreator.create(clazz_path)
; class_path is "<module path>.<class name>" of the model class to instantiate.
[regression]
class_path = utility.SVRModel.SVRModel
import sys
class ClassCreator():
    """Factory that instantiates a class identified by a dotted path string."""

    @staticmethod
    def create(class_path):
        """Import the module named in *class_path* and instantiate the class.

        Args:
            class_path: Dotted path whose last component is the class name
                and whose leading components are the module path, e.g.
                ``"utility.SVRModel.SVRModel"``.

        Returns:
            A new instance obtained by calling the class with no arguments,
            or ``None`` when the import, the attribute lookup, or the
            constructor raises (the error details are printed to stdout).
        """
        # Function-scope import keeps this block self-contained.
        import importlib

        try:
            print("class:", class_path)
            # Everything before the last dot is the module, the rest is the class.
            package_name, _, class_name = str(class_path).rpartition('.')
            # import_module returns the leaf module itself, so no detour
            # through sys.modules is needed.
            module = importlib.import_module(package_name)
            cls = getattr(module, class_name)
            return cls()
        except Exception as e:
            # Best-effort factory: report the failure and hand back None
            # instead of propagating, so callers must check the result.
            print('===error contents===')
            print('type:' + str(type(e)))
            print('args:' + str(e.args))
            print('eself:' + str(e))
            return None
from abc import ABCMeta, abstractmethod
class AbstractModel(metaclass=ABCMeta):
    """Interface that every model class selectable from configuration implements.

    All methods are abstract; a concrete subclass must override each one
    before it can be instantiated.
    """

    @abstractmethod
    def run_grid_search(self, x_train_scaled, y_train):
        """Run a hyperparameter search on the given training data."""

    @abstractmethod
    def create_and_pred(self, x_train_scaled, y_train, x_test_scaled):
        """Fit a model on the training data and predict for the test inputs."""

    @abstractmethod
    def print_eval(self, x_train_scaled, x_test_scaled, y_train, y_test, df):
        """Print evaluation results for the training and test data."""

    @abstractmethod
    def get_score(self, x_test_scaled, y_test):
        """Return the model's score on the given test data."""

    @abstractmethod
    def get_rmse(self, y_test, pred):
        """Return the RMSE between the true targets and the predictions."""

    @abstractmethod
    def get_modeltype(self):
        """Return the type of the underlying model."""

    @abstractmethod
    def get_best_params(self):
        """Return the best parameters found by the hyperparameter search."""
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.metrics import mean_squared_error
from utility.AbstractModel import AbstractModel
class SVRModel(AbstractModel):
    """SVR regression model whose hyperparameters are chosen by grid search.

    Expected call order: ``run_grid_search`` first, then ``create_and_pred``,
    then the evaluation methods. Calling a method before its prerequisite
    raises ``Exception`` with a message naming the method to run first.
    """

    def __init__(self):
        self.clr = SVR()          # base estimator handed to the grid search
        self.regr = None          # final estimator; set by create_and_pred
        self.grid_search = None   # fitted GridSearchCV; set by run_grid_search

    def _require_grid_search(self):
        """Raise if run_grid_search has not been called yet."""
        if self.grid_search is None:
            raise Exception("needs to run 'run_grid_search' method before call this method.")

    def _require_regr(self):
        """Raise if create_and_pred has not been called yet."""
        if self.regr is None:
            raise Exception("needs to run 'create_and_pred' method before call this method.")

    def run_grid_search(self, x_train_scaled, y_train):
        """Grid-search ``C`` and ``epsilon`` with 5-fold cross-validation.

        Prints the best parameters and best score, stores the fitted search
        on ``self.grid_search`` and returns it.
        """
        param_grid = {
            'C': [0.005, 0.0075, 0.1, 0.25, 0.4, 0.5, 0.6, 0.75, 1],
            'epsilon': [0.000001, 0.00005, 0.00001, 0.0001, 0.0005, 0.001],
        }
        self.grid_search = GridSearchCV(self.clr, param_grid, cv=5)
        self.grid_search.fit(x_train_scaled, y_train)
        print("best param: {}".format(self.grid_search.best_params_))
        print("best score: {}".format(self.grid_search.best_score_))
        return self.grid_search

    def create_and_pred(self, x_train_scaled, y_train, x_test_scaled):
        """Fit an SVR with the best grid-search parameters and predict.

        Returns:
            Predictions of the newly fitted model for ``x_test_scaled``.

        Raises:
            Exception: if ``run_grid_search`` has not been called.
        """
        self._require_grid_search()
        # Pass the whole best-parameter dict so the final model stays in
        # sync with whatever parameters the grid above searches over.
        self.regr = SVR(**self.grid_search.best_params_)
        self.regr.fit(x_train_scaled, y_train)
        return self.regr.predict(x_test_scaled)

    def print_eval(self, x_train_scaled, x_test_scaled, y_train, y_test, df):
        """Print the train/test scores and the test RMSE of the fitted model.

        ``df`` is accepted for interface compatibility and is not used here.

        Raises:
            Exception: if ``create_and_pred`` has not been called.
        """
        self._require_regr()
        print("Fits test data")
        print("Accuracy of training data(Coefficient of determination r^2 score,correlation) =", self.regr.score(x_train_scaled, y_train))
        print("Test data accuracy(Coefficient of determination r^2 score,correlation) =", self.regr.score(x_test_scaled, y_test))
        pred = self.regr.predict(X=x_test_scaled)
        # Reuse get_rmse so the RMSE formula lives in one place.
        print("RMSE:", self.get_rmse(y_test, pred))

    def get_score(self, x_test_scaled, y_test):
        """Return the fitted model's ``score`` on the given test data.

        Raises:
            Exception: if ``create_and_pred`` has not been called.
        """
        self._require_regr()
        return self.regr.score(x_test_scaled, y_test)

    def get_rmse(self, y_test, pred):
        """Return the square root of the mean squared error of ``pred``."""
        return np.sqrt(mean_squared_error(y_test, pred))

    def get_modeltype(self):
        """Return the type of the base estimator."""
        return type(self.clr)

    def get_best_params(self):
        """Return the best parameters found by the grid search.

        Raises:
            Exception: if ``run_grid_search`` has not been called.
        """
        self._require_grid_search()
        return self.grid_search.best_params_
Of course, I'm sure there is a better design. In any case, unlike system development, this kind of code is rewritten over and over through trial and error, so I think it's important to write it in a way that you won't have forgotten how it works later.
Using this structure, I plan to build a model with sample data in a future post.
Recommended Posts