Boruta does not work out of the box with lightGBM or xgboost, so I partially rewrote it to make it work.

For the standard way of using Boruta with an sklearn estimator, see: https://qiita.com/studio_haneya/items/bdb25b19baaf43d867d7

Environment: Windows 10, Python 3.6.7, scikit-learn 0.21.3, lightgbm 2.3.0, xgboost 0.90

Boruta works with any sklearn estimator that exposes feature_importances_, so Random Forest and Gradient Boosting can be used as-is. The sklearn wrappers of lightGBM and xgboost look like sklearn estimators, but they differ in a few small ways and therefore do not work without changes. So I inherited the BorutaPy class and rewrote parts of it to make them work.
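For reference, the usual pattern with an sklearn estimator looks like this. This is only a minimal sketch on dummy data (the data and hyperparameters here are just for illustration), but it works without any modification:

```python
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy

# dummy data, just for illustration
X = np.random.rand(500, 20)
y = (X[:, 0] + X[:, 1] > 1).astype(int)

# any sklearn estimator that exposes feature_importances_ works as-is
rf = RandomForestClassifier(n_jobs=-1, max_depth=5)
feat_selector = BorutaPy(rf, n_estimators='auto', random_state=42)
feat_selector.fit(X, y)
print(feat_selector.support_)
```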
There are two differences that can be problematic when trying to use lightGBM with Boruta:
| | boruta (sklearn) | lgb / xgb |
|---|---|---|
| random_state | np.random.RandomState | int |
| no max_depth limit | None | -1 |
In sklearn, the seed can be passed as an np.random.RandomState instance, but lightGBM and xgboost do not accept that; the seed must be an int. Also, "no depth limit" is expressed as max_depth=None in sklearn but as max_depth=-1 in lightGBM, so the n_estimators value that Boruta computes from max_depth is no longer calculated correctly. Fixing these two points is enough to make it work.
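To see the second point concretely, here is a small sketch of the tree-count heuristic (the same formula as in the _get_tree_num override below; the helper name tree_num is just for illustration). If only None is treated as "no limit", which is roughly what the stock BorutaPy does, lightGBM's -1 slips through and the computed number of trees becomes negative:

```python
import numpy as np

def tree_num(depth, n_feat=500):
    # same formula as _get_tree_num below, but only None is special-cased here
    if depth is None:
        depth = 10
    return int(100 * (n_feat * 2) / (np.sqrt(n_feat * 2) * depth))

print(tree_num(None))  # sklearn-style "no limit" -> a sensible positive number
print(tree_num(-1))    # lightGBM-style "no limit" -> negative, which breaks set_params
```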
Below is sample code that runs lightGBM. I inherited BorutaPy and rewrote part of it. Since _fit() is written to replace random_state with an np.random.RandomState instance before calling self.estimator.fit(), I could not think of a way to avoid copying the whole method, so the code ends up quite long. Please let me know if there is a better way. (Fixed 2019-11-02: the code did not reflect the random_state value.)
```python
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from boruta import BorutaPy
import lightgbm as lgb
import xgboost as xgb
from sklearn.utils import check_random_state


class BorutaPyForLGB(BorutaPy):
    def __init__(self, estimator, n_estimators=1000, perc=100, alpha=0.05,
                 two_step=True, max_iter=100, random_state=None, verbose=0):
        super().__init__(estimator, n_estimators, perc, alpha,
                         two_step, max_iter, random_state, verbose)
        if random_state is None:
            # keep an int seed as well, since lightGBM/xgboost cannot take a
            # RandomState instance (the upper bound stays inside C int range)
            self.random_state_input = np.random.randint(0, 2**31 - 1)
        elif isinstance(random_state, int):
            self.random_state_input = random_state
        else:
            raise TypeError('random_state must be int or None')
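
    # _get_tree_num is overridden so that lightGBM/xgboost's max_depth=-1
    # ("no depth limit") is handled like sklearn's None when the number of
    # trees is estimated for n_estimators='auto'.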
    def _get_tree_num(self, n_feat):
        depth = self.estimator.get_params()['max_depth']
        if (depth == None) or (depth <= 0):
            depth = 10
        f_repr = 100
        multi = ((n_feat * 2) / (np.sqrt(n_feat * 2) * depth))
        n_estimators = int(multi * f_repr)
        return n_estimators
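
    # _fit below is copied from BorutaPy._fit almost verbatim; the only
    # functional change is that the estimator is reseeded with the int seed
    # stored in __init__ (random_state_input) instead of an
    # np.random.RandomState instance.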
    def _fit(self, X, y):
        # check input params
        self._check_params(X, y)
        self.random_state = check_random_state(self.random_state)

        # setup variables for Boruta
        n_sample, n_feat = X.shape
        _iter = 1
        # holds the decision about each feature:
        # 0  - default state = tentative in original code
        # 1  - accepted in original code
        # -1 - rejected in original code
        dec_reg = np.zeros(n_feat, dtype=np.int)
        # counts how many times a given feature was more important than
        # the best of the shadow features
        hit_reg = np.zeros(n_feat, dtype=np.int)
        # these record the history of the iterations
        imp_history = np.zeros(n_feat, dtype=np.float)
        sha_max_history = []

        # set n_estimators
        if self.n_estimators != 'auto':
            self.estimator.set_params(n_estimators=self.n_estimators)

        # main feature selection loop
        while np.any(dec_reg == 0) and _iter < self.max_iter:
            # find optimal number of trees and depth
            if self.n_estimators == 'auto':
                # number of features that aren't rejected
                not_rejected = np.where(dec_reg >= 0)[0].shape[0]
                n_tree = self._get_tree_num(not_rejected)
                self.estimator.set_params(n_estimators=n_tree)
            # make sure we start with a new tree in each iteration
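            # NOTE: unlike the stock BorutaPy, the int seed kept from __init__
            # is passed here, because the lightGBM/xgboost wrappers need an int
            # seed rather than an np.random.RandomState instance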
            self.estimator.set_params(random_state=self.random_state_input)

            # add shadow attributes, shuffle them and train estimator, get imps
            cur_imp = self._add_shadows_get_imps(X, y, dec_reg)

            # get the threshold of shadow importances we will use for rejection
            imp_sha_max = np.percentile(cur_imp[1], self.perc)

            # record importance history
            sha_max_history.append(imp_sha_max)
            imp_history = np.vstack((imp_history, cur_imp[0]))

            # register which feature is more imp than the max of shadows
            hit_reg = self._assign_hits(hit_reg, cur_imp, imp_sha_max)

            # based on hit_reg we check if a feature is doing better than
            # expected by chance
            dec_reg = self._do_tests(dec_reg, hit_reg, _iter)

            # print out confirmed features
            if self.verbose > 0 and _iter < self.max_iter:
                self._print_results(dec_reg, _iter, 0)
            if _iter < self.max_iter:
                _iter += 1

        # we automatically apply R package's rough fix for tentative ones
        confirmed = np.where(dec_reg == 1)[0]
        tentative = np.where(dec_reg == 0)[0]
        # ignore the first row of zeros
        tentative_median = np.median(imp_history[1:, tentative], axis=0)
        # which tentative to keep
        tentative_confirmed = np.where(tentative_median
                                       > np.median(sha_max_history))[0]
        tentative = tentative[tentative_confirmed]

        # basic result variables
        self.n_features_ = confirmed.shape[0]
        self.support_ = np.zeros(n_feat, dtype=np.bool)
        self.support_[confirmed] = 1
        self.support_weak_ = np.zeros(n_feat, dtype=np.bool)
        self.support_weak_[tentative] = 1

        # ranking, confirmed variables are rank 1
        self.ranking_ = np.ones(n_feat, dtype=np.int)
        # tentative variables are rank 2
        self.ranking_[tentative] = 2

        # selected = confirmed and tentative
        selected = np.hstack((confirmed, tentative))
        # all rejected features are sorted by importance history
        not_selected = np.setdiff1d(np.arange(n_feat), selected)
        # large importance values should rank higher = lower ranks -> *(-1)
        imp_history_rejected = imp_history[1:, not_selected] * -1

        # update rank for not_selected features
        if not_selected.shape[0] > 0:
            # calculate ranks in each iteration, then median of ranks across feats
            iter_ranks = self._nanrankdata(imp_history_rejected, axis=1)
            rank_medians = np.nanmedian(iter_ranks, axis=0)
            ranks = self._nanrankdata(rank_medians, axis=0)

            # set smallest rank to 3 if there are tentative feats
            if tentative.shape[0] > 0:
                ranks = ranks - np.min(ranks) + 3
            else:
                # and 2 otherwise
                ranks = ranks - np.min(ranks) + 2
            self.ranking_[not_selected] = ranks
        else:
            # all are selected, thus we set feature supports to True
            self.support_ = np.ones(n_feat, dtype=np.bool)

        # notify user
        if self.verbose > 0:
            self._print_results(dec_reg, _iter, 1)
        return self
```
Now that we are ready, let's run it. The code below is based on this notebook: https://github.com/masakiaota/blog/blob/master/boruta/Madalon_Data_Set.ipynb
```python
def main():
    # Read the data
    data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_train.data'
    label_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_train.labels'
    X_data = pd.read_csv(data_url, sep=" ", header=None)
    y_data = pd.read_csv(label_url, sep=" ", header=None)
    data = X_data.iloc[:, 0:500]
    data['target'] = y_data[0]
    y = data['target']
    X = data.drop(columns='target')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

    # Learning with the whole data
    model = lgb.LGBMClassifier(objective='binary',
                               num_leaves=23,
                               learning_rate=0.1,
                               n_estimators=100)
    model.fit(X_train.values, y_train.values)
    y_test_pred = model.predict(X_test.values)
    print(confusion_matrix(y_test.values, y_test_pred, labels=model.classes_), '\n')
    print('SCORE with ALL Features: %1.2f\n' % accuracy_score(y_test, y_test_pred))

    # Feature selection with Boruta (use the partially rewritten BorutaPy)
    model = lgb.LGBMClassifier(objective='binary',
                               num_leaves=23,
                               learning_rate=0.1,
                               n_estimators=100)
    feat_selector = BorutaPyForLGB(model, n_estimators='auto', two_step=False, verbose=2, random_state=42)
    feat_selector.fit(X_train.values, y_train.values)
    print(X_train.columns[feat_selector.support_])

    # Extract the selected features
    X_train_selected = X_train.iloc[:, feat_selector.support_]
    X_test_selected = X_test.iloc[:, feat_selector.support_]
    print(X_test_selected.head())

    # Learn with the selected features
    model = lgb.LGBMClassifier(objective='binary',
                               num_leaves=23,
                               learning_rate=0.1,
                               n_estimators=100)
    model.fit(X_train_selected.values, y_train.values)
    y_test_pred = model.predict(X_test_selected.values)
    print(confusion_matrix(y_test.values, y_test_pred, labels=model.classes_), '\n')
    print('SCORE with selected Features: %1.2f\n' % accuracy_score(y_test, y_test_pred))


if __name__ == '__main__':
    main()
```
The result of running it is as follows. The features seem to be selected properly.

```text
[[192 57]
 [ 49 202]]
SCORE with ALL Features: 0.79
Index([48, 105, 153, 241, 318, 336, 338, 378, 442, 453, 472, 475], dtype='object')
[[212 37]
 [ 34 217]]
SCORE with selected Features: 0.86
```
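As a side note, besides support_, the fitted selector also has support_weak_ (tentative features) and ranking_, which are set in the _fit() shown above. A minimal sketch of inspecting them, assuming feat_selector and X_train from main() above:

```python
# assumes feat_selector has been fitted on X_train as in main() above
print('confirmed:', X_train.columns[feat_selector.support_].tolist())
print('tentative:', X_train.columns[feat_selector.support_weak_].tolist())
print('ranking (first 10):', feat_selector.ranking_[:10])  # 1 = confirmed, 2 = tentative
```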
The BorutaPyForLGB() class created above can also be used with xgboost.
```python
def main():
    # Read the data
    data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_train.data'
    label_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_train.labels'
    X_data = pd.read_csv(data_url, sep=" ", header=None)
    y_data = pd.read_csv(label_url, sep=" ", header=None)
    data = X_data.iloc[:, 0:500]
    data['target'] = y_data[0]
    y = data['target']
    X = data.drop(columns='target')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

    # Learning with the whole data
    # (xgboost sklearn wrapper; the hyperparameters here are illustrative,
    # since the exact ones used for the result below are not shown)
    model = xgb.XGBClassifier(objective='binary:logistic',
                              learning_rate=0.1,
                              n_estimators=100)
    model.fit(X_train.values, y_train.values)
    y_test_pred = model.predict(X_test.values)
    print(confusion_matrix(y_test.values, y_test_pred, labels=model.classes_), '\n')
    print('SCORE with ALL Features: %1.2f\n' % accuracy_score(y_test, y_test_pred))

    # Feature selection with Boruta (use the partially rewritten BorutaPy)
    model = xgb.XGBClassifier(objective='binary:logistic',
                              learning_rate=0.1,
                              n_estimators=100)
    feat_selector = BorutaPyForLGB(model, n_estimators='auto', two_step=False, verbose=2, random_state=42)
    feat_selector.fit(X_train.values, y_train.values)
    print(X_train.columns[feat_selector.support_])

    # Extract the selected features
    X_train_selected = X_train.iloc[:, feat_selector.support_]
    X_test_selected = X_test.iloc[:, feat_selector.support_]
    print(X_test_selected.head())

    # Learn with the selected features
    model = xgb.XGBClassifier(objective='binary:logistic',
                              learning_rate=0.1,
                              n_estimators=100)
    model.fit(X_train_selected.values, y_train.values)
    y_test_pred = model.predict(X_test_selected.values)
    print(confusion_matrix(y_test.values, y_test_pred, labels=model.classes_), '\n')
    print('SCORE with selected Features: %1.2f\n' % accuracy_score(y_test, y_test_pred))


if __name__ == '__main__':
    main()
```
It runs, but the result is not good: the accuracy has dropped. It seems that too many features were removed, but I am not sure why.
```text
[[182 67]
 [ 75 176]]
SCORE with ALL Features: 0.72
Index([28, 378, 451, 475], dtype='object')
[[148 101]
 [109 142]]
SCORE with selected Features: 0.58
```
That's all. If you know a shorter way to write this, please let me know. Give it a try!