Posts of Hisabisa
There are category_encoders and pandas get_dummies() for one-hot encoding, but I find them difficult to use in several respects.
They never quite scratched the itch: the small conveniences I wanted were missing.
So I made my own one-hot encoder to solve this problem.
class BaseEncoder():
    """Minimal interface shared by the encoders in this file.

    The base class holds no state. Every method below only raises
    ``NotImplementedError``; concrete encoders are expected to override
    ``fit``, ``transform`` and ``fit_transform``.
    """

    def __init__(self):
        pass

    def fit(self, Xs=None):
        """Learn the encoding from ``Xs``; must be overridden.

        ``Xs`` is optional only so that the historical zero-argument call
        keeps working; subclasses in this file take the data as ``Xs``.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError('not implemented')

    def transform(self, Xs=None):
        """Encode ``Xs`` with a fitted encoder; must be overridden.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError('not implemented')

    def fit_transform(self, Xs=None):
        """Fit on ``Xs`` and return its encoding; must be overridden.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError('not implemented')
class OneHotEncoder(BaseEncoder):
    """One-hot encoder for a single categorical column.

    Library requirements: ``pandas``; ``mojimoji`` is imported only when
    ``force_hankaku`` is True.

    Every value is converted to ``str`` before encoding, so missing values
    are handled through their string forms ``"nan"`` and ``"None"``.

    Args:
        col_name: Prefix of the output column names. When None it is taken
            from the name of the Series passed to ``fit``; if that is also
            None, ``"onehotEncode"`` is used and a warning is printed.
        categories: Optional list of categories to encode. When given, the
            categories are not learned from the data passed to ``fit``:
            columns follow the order of this list, and ``col_order`` and a
            numeric ``handle_rare`` threshold are not applied (no
            frequencies are computed).
        handle_unknown: ``"summarize"`` puts categories that were not seen
            in ``fit`` into one unknown-category column; ``"ignore"``
            produces no such column.
        handle_nan: ``"onehot"`` gives nan its own column (only when nan is
            among the fitted categories); ``"ignore"`` produces no nan
            column.
        col_order: Order of the learned categories: ``"name"`` (sorted by
            category name), ``"count_asc"`` or ``"count_des"`` (sorted by
            ascending / descending frequency).
        col_name_type: ``"category"`` names columns
            ``<col_name>_<category>``; ``"index"`` names them
            ``<col_name>_<index>`` with the special suffixes rare: -1,
            nan: -2, unknown: -3.
        force_hankaku: When True, values are passed through
            ``mojimoji.zen_to_han`` in both ``fit`` and ``transform``.
        return_type: ``"df"`` returns a ``pandas.DataFrame``; ``"np"``
            returns the underlying numpy array.
        handle_rare: A number is a relative-frequency threshold: categories
            whose frequency is <= the threshold share one rare column.
            A list names the categories that share the rare column.
            None disables rare handling.
        dummy: True drops the first (non-rare, non-nan) category so that it
            gets no column; a str drops the category with that name.
            None or False keeps every category.

    Attributes:
        encode_map: dict mapping category -> output column name
            (``"DUMMY_CATEGORY"`` for the dropped category).
        encode_map_inv: dict mapping output column name -> list of
            categories; set by ``fit``.
        new_cols: list of output column names; set by ``fit``.
        categories: list of known categories after ``fit``.
        unknown_categories: unknown categories met so far in ``transform``
            (only collected when ``handle_unknown="summarize"``).
        dummy_category: the dropped category, or None.

    Raises:
        TypeError: ``categories``, ``force_hankaku``, ``handle_rare`` or
            ``dummy`` has an unsupported type.
        ValueError: one of the string options has an unsupported value.
    """

    # String forms that missing values (NaN / None) take once cast to str.
    _NAN_TOKENS = ("nan", "None")
    # Placeholder recorded in encode_map for the dropped (dummy) category:
    # it marks the category as known without giving it a column.
    _DUMMY_KEY = "DUMMY_CATEGORY"

    def __init__(self,
                 col_name=None,
                 categories=None,
                 handle_unknown="summarize",
                 handle_nan="onehot",
                 col_order="name",
                 col_name_type="category",
                 force_hankaku=True,
                 return_type="df",
                 handle_rare=None,
                 dummy=None,
                 ):
        super().__init__()
        self.col_name = col_name

        # The original check was inverted and rejected every valid list.
        if categories is not None and not isinstance(categories, list):
            raise TypeError(f"[Error] argument categories is invalid , shuold be list, but>> {categories}")
        self.categories = categories
        # fit() overwrites self.categories with the fitted categories, so the
        # user's list is kept separately; otherwise a second fit() would
        # mistake the fitted categories for user input.
        self._user_categories = None if categories is None else list(categories)

        self.handle_unknown = self._check_choice("handle_unknown", handle_unknown, ["summarize", "ignore"])
        self.handle_nan = self._check_choice("handle_nan", handle_nan, ["onehot", "ignore"])
        self.col_order = self._check_choice("col_order", col_order, ["name", "count_asc", "count_des"])
        self.col_name_type = self._check_choice("col_name_type", col_name_type, ["category", "index"])

        checks = [bool]
        if not isinstance(force_hankaku, bool):
            raise TypeError(f"[Error] argument force_hankaku should be {checks} type , but {force_hankaku}")
        self.force_hankaku = force_hankaku

        self.return_type = self._check_choice("return_type", return_type, ["df", "np"])

        checks = [int, float, list]
        # bool is a subclass of int but is not a meaningful threshold.
        is_number = isinstance(handle_rare, (int, float)) and not isinstance(handle_rare, bool)
        if handle_rare is not None and not is_number and not isinstance(handle_rare, list):
            raise TypeError(f"[Error] argument handle_rare should be {checks} type or None, but {handle_rare}")
        if is_number and (handle_rare >= 1 or handle_rare <= 0):
            print(f"[Warning] handle_rare may be meaningless value >> {handle_rare}")
        # -1 can never be reached by a relative frequency, i.e. "no rare".
        self.handle_rare = handle_rare if handle_rare is not None else -1.

        checks = [str, bool]
        if dummy is not None and not isinstance(dummy, (str, bool)):
            raise TypeError(f"[Error] argument dummy should be {checks} type or None , but {dummy}")
        self.dummy = dummy

        self.encode_map = {}
        self.unknown_categories = []
        self.dummy_category = None

    @staticmethod
    def _check_choice(arg_name, value, checks):
        """Return ``value`` if it is one of ``checks``, else raise ValueError."""
        if value not in checks:
            raise ValueError(f"[Error] argument {arg_name} is invalid , shuold be {checks}, but {value}")
        return value

    def _normalise(self, values):
        """Return ``values`` as a str Series, half-width if force_hankaku."""
        import pandas as pd

        ser = pd.Series(values).astype(str)
        # NOTE(review): newer pandas versions may keep missing values as NA
        # through astype(str); map them to the "nan" token so the rest of
        # the class sees the same strings either way. This is a no-op when
        # astype(str) already produced "nan"/"None".
        missing = ser.isna()
        if missing.any():
            ser = ser.astype(object)
            ser[missing] = "nan"
        if self.force_hankaku:
            import mojimoji
            ser = ser.apply(mojimoji.zen_to_han)
        return ser

    def _col(self, label, index):
        """Build an output column name from a label or its index number."""
        suffix = label if self.col_name_type == "category" else index
        return f"{self.col_name}_{suffix}"

    def _register(self, category, column, new_cols):
        """Map ``category`` to ``column`` and list the column once."""
        self.encode_map[category] = column
        if column not in new_cols:
            new_cols.append(column)

    def fit(self, Xs):
        """Learn the category -> column mapping from ``Xs``.

        Sets ``encode_map``, ``encode_map_inv``, ``new_cols``,
        ``categories`` and ``dummy_category`` (and ``col_name`` if it was
        None). Returns None.
        """
        values = self._normalise(Xs)

        # get column name
        if self.col_name is None:
            self.col_name = values.name
            if self.col_name is None:
                self.col_name = "onehotEncode"
                print(f"[Warning] column name is {self.col_name}")

        # Start from a clean state so a re-fit does not keep stale entries.
        self.encode_map = {}
        self.dummy_category = None

        if self._user_categories is not None:
            # Categories supplied by the caller: keep their order, drop
            # duplicates; no frequencies are available.
            observed = list(dict.fromkeys(self._normalise(self._user_categories)))
            freq = None
        else:
            freq = values.value_counts(dropna=False, normalize=True)
            if self.col_order == "name":
                freq = freq.sort_index()
            elif self.col_order == "count_asc":
                freq = freq.sort_values(ascending=True)
            elif self.col_order == "count_des":
                freq = freq.sort_values(ascending=False)
            observed = list(freq.index)

        # rare categories
        if isinstance(self.handle_rare, list):
            # Listed categories are rare even if absent from the training
            # data; normalise them the same way as the data so they match.
            rare = list(dict.fromkeys(self._normalise(self.handle_rare)))
        elif freq is not None:
            low = set(freq.index[(freq <= self.handle_rare).to_numpy()])
            rare = [c for c in observed if c in low and c not in self._NAN_TOKENS]
        else:
            rare = []
        rare_set = set(rare)
        regular = [c for c in observed
                   if c not in self._NAN_TOKENS and c not in rare_set]

        # A str dummy names the category to drop; compare it in the same
        # normalised form as the data.
        dummy_name = None
        if isinstance(self.dummy, str):
            dummy_name = self._normalise([self.dummy]).iloc[0]

        new_cols = []
        for position, cat in enumerate(regular):
            # `is True` (not truthiness): a str dummy must not drop the
            # first category.
            if (self.dummy is True and position == 0) or (dummy_name is not None and cat == dummy_name):
                self.dummy_category = cat
                self.encode_map[cat] = self._DUMMY_KEY
                continue
            self._register(cat, self._col(cat, position), new_cols)

        for cat in rare:
            self._register(cat, self._col("rareCategory", -1), new_cols)

        # handle nan
        if self.handle_nan == "onehot":
            for token in self._NAN_TOKENS:
                if token in observed:
                    self._register(token, self._col("nan", -2), new_cols)

        # handle unknown
        if self.handle_unknown == "summarize":
            new_cols.append(self._col("unknownCategory", -3))

        encode_map_inv = {}
        for cat, column in self.encode_map.items():
            encode_map_inv.setdefault(column, []).append(cat)

        self.new_cols = new_cols
        self.categories = list(self.encode_map)
        self.encode_map_inv = encode_map_inv

    def transform(self, Xs):
        """Encode ``Xs`` with the mapping learned by ``fit``.

        Returns a DataFrame with a fresh 0..n-1 index (or its numpy array
        when ``return_type="np"``) holding one 0/1 column per fitted output
        column, plus the unknown-category column when
        ``handle_unknown="summarize"``. Unknown categories met here are
        appended to ``unknown_categories``.
        """
        import pandas as pd

        values = self._normalise(Xs)

        # Masks are turned into plain arrays so that an input Series with a
        # non-default index does not have to align with the output index.
        columns = {}
        for column, cats in self.encode_map_inv.items():
            if column == self._DUMMY_KEY:
                continue
            columns[column] = values.isin(cats).to_numpy().astype("int64")

        # handle unknown
        if self.handle_unknown == "summarize":
            # Copy: extending self.categories in place would grow it on
            # every call.
            known_cats = list(self.categories)
            if self.handle_nan == "ignore":
                known_cats += list(self._NAN_TOKENS)
            is_unknown = ~values.isin(known_cats)
            # Same name as the one fit() put into new_cols.
            columns[self._col("unknownCategory", -3)] = is_unknown.to_numpy().astype("int64")
            for cat in values[is_unknown].unique():
                if cat not in self.unknown_categories:
                    self.unknown_categories.append(cat)

        res_df = pd.DataFrame(columns, index=range(len(values)))

        # return type redefine
        if self.return_type == "np":
            return res_df.to_numpy()
        return res_df

    def fit_transform(self, Xs):
        """Fit on ``Xs`` and return the encoding of the same ``Xs``."""
        self.fit(Xs)
        return self.transform(Xs)
Create sample data for training and testing as follows
The test data contains categories (elephant, bird, etc.) that do not appear in the training data (the encoder also has a function that puts these into an unknown category).
# generate sample category
# Builds two small DataFrames (train_df / test_df) with an 'animal' and a
# 'feature' column; both contain np.nan and None entries, and the test lists
# contain some values that are absent from the train lists.
import random

import numpy as np
import pandas as pd

random.seed(42)  # fixed seed so the shuffled order is reproducible
vals1 = ['salamander'] * 10 + ['snake'] * 8 + ['cameleon'] * 5 + ['rizard'] * 7 + ['frog'] * 2 + ['jellyfish'] * 3 + [np.nan] * 3 + [None] * 2
vals2 = ['cute'] * 4 + ['cool'] * 12 + ['colurful'] * 3 + ['nice'] * 2 + ['Wonderful'] * 3 + ['foooo'] * 3 + ['Excellent'] * 3 + [np.nan] * 6 + [None] * 4
vals3 = ['salamander'] * 13 + ['snake'] * 5 + ['cameleon'] * 7 + ['rizard'] * 5 + ['turtle'] * 3 + ['bird'] * 1 + ['elephant'] * 1 + ["jellyfish"] * 2 + [np.nan] * 1 + [None] * 2
vals4 = ['cute'] * 4 + ['cool'] * 12 + ['colorful'] * 3 + ['nice'] * 2 + ['Wonderful'] * 3 + ['foooo'] * 3 + ['Excellent'] * 1 + ['good'] * 1 + ['OK'] * 1 + [np.nan] * 3 + [None] * 7
# Shuffle each list in place; the order of these calls matters for the
# seeded result.
random.shuffle(vals1)
random.shuffle(vals2)
random.shuffle(vals3)
random.shuffle(vals4)
train_df = pd.DataFrame({'animal' : vals1, 'feature' : vals2})
test_df = pd.DataFrame({'animal' : vals3, 'feature' : vals4})
Try one-hot encoding the animal column
# Create an encoder instance with every option left at its default.
ohe = OneHotEncoder()
# fit on the train data
Train (fit) the encoder on the training data
# fit() learns the category -> column-name mapping from the 'animal' column
# and stores it on the encoder (ohe.encode_map).
ohe.fit(train_df['animal'])
# Actually encode: pass the training column to transform().
ohe.transform(train_df['animal'])
Concat the original data and see the result
# Put the encoded columns next to the original training data (axis=1 joins
# column-wise).
pd.concat([train_df, ohe.transform(train_df['animal'])], axis=1)
NaN values are properly one-hot encoded into their own column, and a column for unknown categories is also created.
Let's look at the test data
# Same view for the test data, encoded with the encoder fitted on train_df.
pd.concat([test_df, ohe.transform(test_df['animal'])], axis=1)
Since the same column as train is prepared, it can be used as it is with light gbm or elastic net
Few encoders offer this, so I implemented it: ohe.encode_map returns the category-to-column mapping as a dict
# dict: category (as a string) -> name of the output column it is encoded to.
ohe.encode_map
You can also see the reverse version
# Reverse mapping: output column name -> list of categories encoded to it.
ohe.encode_map_inv
# List of output column names collected during fit().
ohe.new_cols
Specify handle_nan = "ignore"
# handle_nan="ignore": fit without creating a dedicated nan column.
ohe = OneHotEncoder(handle_nan="ignore")
ohe.fit(train_df['animal'])
nan column is gone
You can have infrequent categories grouped into a rare category by specifying a threshold such as handle_rare = 0.1 (0.1 is a relative frequency, i.e. 10%)
See how often animals appear
try to encode
# Float handle_rare: categories whose relative frequency in the fitted data
# is <= 0.1 are mapped to one shared rare-category column.
ohe = OneHotEncoder(handle_rare=0.1)
ohe.fit(train_df['animal'])
rareCategory has been added
If you look at encode_map, you can see what became rare
In addition, if you put a list of categories in handle_rare, the entered categories will be encoded in rareCategory.
# List handle_rare: the listed categories are mapped to one shared
# rare-category column.
ohe = OneHotEncoder(handle_rare=["cameleon", "frog"])
ohe.fit(train_df['animal'])
handle_unknown = "ignore", unknown is not encoded
# handle_unknown="ignore": no unknown-category column is produced.
ohe = OneHotEncoder(handle_unknown="ignore")
ohe.fit(train_df['animal'])
If col_name_type = "index" is set, it becomes an index (same as category_encoders)
# col_name_type="index": output columns are suffixed with an index number
# instead of the category name.
ohe = OneHotEncoder(col_name_type="index")
ohe.fit(train_df['animal'])
By default, the column name of dataframe is prefix, but you can change it with col_name = "XXXX"
(If you enter a numpy value instead of a dataframe, onehotEncode will be a prefix)
# col_name sets the prefix of the output column names instead of taking it
# from the fitted Series' name.
ohe = OneHotEncoder(col_name="new_col")
ohe.fit(train_df['animal'])
If you want dummy encoding (encoding that reduces the number of features by not making one category a column) set dummy = True
# dummy=True: one category is picked automatically and gets no column of
# its own.
ohe = OneHotEncoder(dummy=True)
ohe.fit(train_df['animal'])
Dummy categories can be retrieved with ohe.dummy_category
If dummy = "xxx", the category will be dummy
I would like to create a category-encoding library that scratches exactly where it itches, i.e. one that covers the small needs other libraries miss.
I would be glad to hear your impressions after trying the above.
There are other detailed features besides the above, but I'm tired of writing, so once this gets more likes I plan to turn it into a library and publish it, together with usage notes, on something like git.