Messages de Hisabisa
Il existe category_encoders et pandas get_dummies() pour l'encodage one-hot, mais je les trouve peu pratiques sur plusieurs points.
Ils ne couvraient pas exactement les détails dont j'avais besoin, ce qui était frustrant.
J'ai donc créé mon propre encodeur one-hot pour résoudre ce problème.
class BaseEncoder():
    """Common interface for encoders: ``fit``, ``transform``, ``fit_transform``.

    Every method here only raises; concrete encoders are expected to
    override all three.
    """

    def __init__(self):
        pass

    def fit(self, Xs=None):
        """Learn the encoding from ``Xs``.

        Raises:
            NotImplementedError: always, in this base class.
        """
        # NotImplementedError is the conventional signal for an abstract
        # method and is still an ``Exception`` subclass, so existing
        # ``except Exception`` handlers keep working.
        raise NotImplementedError('not implemented')

    def transform(self, Xs=None):
        """Encode ``Xs``.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError('not implemented')

    def fit_transform(self, Xs=None):
        """Learn the encoding from ``Xs`` and encode it.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError('not implemented')
class OneHotEncoder(BaseEncoder):
    """One-hot encoder for a single categorical column.

    Values are compared as strings: the input is cast to ``str`` before
    fitting and transforming, so missing values show up as the strings
    ``"nan"`` / ``"None"`` and a real category with one of those names
    cannot be told apart from a missing value.

    Requires pandas. ``mojimoji`` is required only when
    ``force_hankaku=True`` (the default).

    Args:
        col_name: Prefix of the output column names. When ``None`` it is
            taken from the name of the Series passed to ``fit``; if that is
            also ``None`` the prefix ``"onehotEncode"`` is used.
        categories: Optional explicit list of categories to encode. When
            given, categories are not learned from the data, and
            ``col_order``, ``col_name_type``, ``handle_rare`` and ``dummy``
            are not applied.
        handle_unknown: ``"summarize"`` adds one column that flags values
            not known to the encoder; ``"ignore"`` adds no such column.
        handle_nan: ``"onehot"`` gives missing values their own column (only
            if a missing value is present among the fitted categories);
            ``"ignore"`` gives them no column.
        col_order: Order of the learned categories: ``"name"`` (sorted by
            category), ``"count_asc"`` or ``"count_des"`` (by frequency).
        col_name_type: ``"category"`` suffixes columns with the category
            name; ``"index"`` suffixes them with the category position
            (rare: -1, nan: -2, unknown: -3).
        force_hankaku: When ``True``, every value is passed through
            ``mojimoji.zen_to_han`` before comparison.
        return_type: ``"df"`` returns a ``pandas.DataFrame``; ``"np"``
            returns its ``.values`` array.
        handle_rare: A number is a threshold on the normalized frequency:
            categories at or below it share one "rare" column. A list names
            the categories to put in the "rare" column. ``None`` disables
            rare handling.
        dummy: ``True`` drops the first learned category (no column of its
            own); a string drops the category with that name. ``None`` or
            ``False`` keeps every category.

    Attributes set by ``fit``:
        encode_map: dict mapping category string -> output column name
            (``"DUMMY_CATEGORY"`` for the dropped dummy category).
        encode_map_inv: dict mapping output column name -> list of
            categories.
        new_cols: output column names, in output order.
        categories: list of the category strings known to the encoder.
        dummy_category: the dropped category, or ``None``.

    ``unknown_categories`` accumulates the unknown values met by
    ``transform``.
    """

    # String forms that missing values take once the input is cast to str.
    _NAN_TOKENS = ("nan", "None")
    # Placeholder stored in ``encode_map`` for the dummy (dropped) category.
    _DUMMY_MARK = "DUMMY_CATEGORY"

    def __init__(self,
                 col_name=None,
                 categories=None,
                 handle_unknown="summarize",
                 handle_nan="onehot",
                 col_order="name",
                 col_name_type="category",
                 force_hankaku=True,
                 return_type="df",
                 handle_rare=None,
                 dummy=None,
                 ):
        """Validate and store the options.

        Raises:
            Exception: if an argument has an unsupported value or type.
            ImportError: if ``force_hankaku`` is true and ``mojimoji`` is
                not installed.
        """
        super().__init__()
        self.col_name = col_name

        # Only a list (or None) is a valid ``categories`` value.
        if categories is not None and not isinstance(categories, list):
            raise Exception(f"[Error] argument categories is invalid , shuold be list, but>> {categories}")
        # Keep the caller's list separately: ``self.categories`` is
        # overwritten by ``fit`` and must not decide the branch on a re-fit.
        self._given_categories = categories
        self.categories = categories

        self.handle_unknown = self._check_choice(
            "handle_unknown", handle_unknown, ["summarize", "ignore"])
        self.handle_nan = self._check_choice(
            "handle_nan", handle_nan, ["onehot", "ignore"])
        self.col_order = self._check_choice(
            "col_order", col_order, ["name", "count_asc", "count_des"])
        self.col_name_type = self._check_choice(
            "col_name_type", col_name_type, ["category", "index"])

        checks = [bool]
        if not isinstance(force_hankaku, bool):
            raise Exception(f"[Error] argument force_hankaku should be {checks} type , but {force_hankaku}")
        if force_hankaku:
            # Fail at construction time, not in the middle of fit().
            import mojimoji  # noqa: F401
        self.force_hankaku = force_hankaku

        self.return_type = self._check_choice(
            "return_type", return_type, ["df", "np"])

        checks = [int, float, list]
        is_number = (isinstance(handle_rare, (int, float))
                     and not isinstance(handle_rare, bool))
        if (handle_rare is not None and not is_number
                and not isinstance(handle_rare, list)):
            raise Exception(f"[Error] argument handle_rare should be {checks} type or None, but {handle_rare}")
        if is_number:
            if handle_rare >= 1 or handle_rare <= 0:
                print(f"[Warning] handle_rare may be meaningless value >> {handle_rare}")
            # Ints are thresholds too; store every threshold as a float.
            handle_rare = float(handle_rare)
        # -1.0 is a threshold no frequency can be at or below.
        self.handle_rare = handle_rare if handle_rare is not None else -1.

        checks = [str, bool]
        if dummy is not None and not isinstance(dummy, (str, bool)):
            raise Exception(f"[Error] argument dummy should be {checks} type or None , but {dummy}")
        self.dummy = dummy

        self.encode_map = {}
        self.unknown_categories = []
        self.dummy_category = None

    @staticmethod
    def _check_choice(arg_name, value, choices):
        """Return ``value`` if it is one of ``choices``, else raise."""
        if value not in choices:
            raise Exception(f"[Error] argument {arg_name} is invalid , shuold be {choices}, but {value}")
        return value

    def _to_str_series(self, Xs):
        """Return ``Xs`` as a positionally indexed Series of strings."""
        import pandas as pd

        strs = pd.Series(Xs).astype(str).astype(object)
        # NOTE(review): recent pandas versions are documented to keep
        # missing values missing under astype(str); map any survivor to
        # "nan" so missing values are always plain strings here.
        missing = strs.isna().to_numpy()
        if missing.any():
            strs = strs.mask(missing, "nan")
        # Work positionally: the result frame uses a fresh 0..n-1 index.
        strs = strs.reset_index(drop=True)
        if self.force_hankaku:
            import mojimoji
            strs = strs.apply(mojimoji.zen_to_han)
        return strs

    def _column_name(self, label, index_label):
        """Build an output column name according to ``col_name_type``."""
        suffix = label if self.col_name_type == "category" else index_label
        return f"{self.col_name}_{suffix}"

    def _map_given_categories(self):
        """Fill ``encode_map`` from the explicit list; return column names."""
        cats = self._to_str_series(self._given_categories).tolist()
        new_cols = []
        for c in cats:
            if c in self._NAN_TOKENS:
                continue
            onehot_name = f"{self.col_name}_{c}"
            self.encode_map[c] = onehot_name
            if onehot_name not in new_cols:
                new_cols.append(onehot_name)
        if self.handle_nan == "onehot":
            nan_col = f"{self.col_name}_nan"
            for nan_v in self._NAN_TOKENS:
                if nan_v in cats:
                    self.encode_map[nan_v] = nan_col
                    if nan_col not in new_cols:
                        new_cols.append(nan_col)
        return new_cols

    def _map_observed_categories(self, values):
        """Fill ``encode_map`` from the data; return column names."""
        vc = values.value_counts(dropna=False, normalize=True)
        if self.col_order == "name":
            vc = vc.sort_index()
        elif self.col_order == "count_asc":
            vc = vc.sort_values(ascending=True)
        elif self.col_order == "count_des":
            vc = vc.sort_values(ascending=False)

        observed = [(c, freq) for c, freq in vc.items()
                    if c not in self._NAN_TOKENS]
        if isinstance(self.handle_rare, list):
            # Every listed category is rare, whether it was observed or not.
            rare = [str(c) for c in self.handle_rare]
            listed = set(rare)
            regular = [c for c, _ in observed if c not in listed]
        else:
            rare = [c for c, freq in observed if freq <= self.handle_rare]
            regular = [c for c, freq in observed if freq > self.handle_rare]

        new_cols = []
        for c_ind, c in enumerate(regular):
            drop_first = self.dummy is True and c_ind == 0
            drop_named = isinstance(self.dummy, str) and self.dummy == c
            if drop_first or drop_named:
                # Keep the dummy in encode_map so transform() treats it as
                # a known category rather than an unknown one.
                self.dummy_category = c
                self.encode_map[c] = self._DUMMY_MARK
                continue
            onehot_name = self._column_name(c, c_ind)
            self.encode_map[c] = onehot_name
            new_cols.append(onehot_name)

        rare_col = self._column_name("rareCategory", -1)
        for c in rare:
            self.encode_map[c] = rare_col
            if rare_col not in new_cols:
                new_cols.append(rare_col)

        if self.handle_nan == "onehot":
            nan_col = self._column_name("nan", -2)
            for nan_v in self._NAN_TOKENS:
                if nan_v in vc.index:
                    self.encode_map[nan_v] = nan_col
                    if nan_col not in new_cols:
                        new_cols.append(nan_col)
        return new_cols

    def fit(self, Xs):
        """Learn the category -> column mapping from ``Xs``.

        Args:
            Xs: one-dimensional data (anything ``pandas.Series`` accepts).

        Returns:
            None. Results are stored on the instance (see class docstring).
        """
        values = self._to_str_series(Xs)

        # get column name
        if self.col_name is None:
            self.col_name = values.name
            if self.col_name is None:
                self.col_name = "onehotEncode"
                print(f"[Warning] column name is {self.col_name}")

        # Start from a clean state so a re-fit does not inherit old entries.
        self.encode_map = {}
        self.dummy_category = None

        if self._given_categories is not None:
            new_cols = self._map_given_categories()
            unknown_col = f"{self.col_name}_unknownCategory"
        else:
            new_cols = self._map_observed_categories(values)
            unknown_col = self._column_name("unknownCategory", -3)

        # Remember the exact unknown-column name so transform() emits the
        # same column that is listed in new_cols.
        if self.handle_unknown == "summarize":
            self._unknown_col = unknown_col
            new_cols.append(unknown_col)
        else:
            self._unknown_col = None

        encode_map_inv = {}
        for category, column in self.encode_map.items():
            encode_map_inv.setdefault(column, []).append(category)

        self.new_cols = new_cols
        self.categories = list(self.encode_map)
        self.encode_map_inv = encode_map_inv

    def transform(self, Xs):
        """One-hot encode ``Xs`` with the mapping learned by ``fit``.

        Args:
            Xs: one-dimensional data (anything ``pandas.Series`` accepts).

        Returns:
            A DataFrame indexed 0..n-1 with one 0/1 column per entry of
            ``new_cols``, or its ``.values`` when ``return_type="np"``.

        Raises:
            AttributeError: if called before ``fit``.
        """
        import pandas as pd

        values = self._to_str_series(Xs)

        columns = {}
        for col in self.new_cols:
            if col == self._unknown_col:
                continue  # filled in below
            members = self.encode_map_inv.get(col, [])
            columns[col] = values.isin(members).to_numpy().astype("int64")

        # handle unknown
        if self.handle_unknown == "summarize":
            # Copy: the fitted category list must not grow on each call.
            known_cats = list(self.categories)
            if self.handle_nan == "ignore":
                known_cats.extend(self._NAN_TOKENS)
            is_unknown = ~values.isin(known_cats)
            columns[self._unknown_col] = is_unknown.to_numpy().astype("int64")
            for cat in values[is_unknown].unique():
                if cat not in self.unknown_categories:
                    self.unknown_categories.append(cat)

        res_df = pd.DataFrame(columns, index=range(len(values)))

        # return type redefine
        if self.return_type == "np":
            return res_df.values
        return res_df

    def fit_transform(self, Xs):
        """Fit on ``Xs``, then return ``transform(Xs)``."""
        self.fit(Xs)
        return self.transform(Xs)
Créons des exemples de données pour l'entraînement et le test comme suit.
Les données de test contiennent des catégories (elephant, bird, etc.) absentes des données d'entraînement (l'encodeur créé sait aussi les regrouper dans une catégorie « inconnue »).
# Generate sample categorical data for training and testing.
# The snippet uses np.nan and pd.DataFrame, so it imports both itself.
import random

import numpy as np
import pandas as pd

random.seed(42)  # fixed seed so the shuffles below are reproducible

# Training columns (40 values each), including np.nan and None entries.
vals1 = ['salamander'] * 10 + ['snake'] * 8 + ['cameleon'] * 5 + ['rizard'] * 7 + ['frog'] * 2 + ['jellyfish'] * 3 + [np.nan] * 3 + [None] * 2
vals2 = ['cute'] * 4 + ['cool'] * 12 + ['colurful'] * 3 + ['nice'] * 2 + ['Wonderful'] * 3 + ['foooo'] * 3 + ['Excellent'] * 3 + [np.nan] * 6 + [None] * 4
# Test columns (40 values each). They contain categories that are absent
# from the training columns (e.g. 'turtle', 'bird', 'elephant', 'good', 'OK').
vals3 = ['salamander'] * 13 + ['snake'] * 5 + ['cameleon'] * 7 + ['rizard'] * 5 + ['turtle'] * 3 + ['bird'] * 1 + ['elephant'] * 1 + ["jellyfish"] * 2 + [np.nan] * 1 + [None] * 2
vals4 = ['cute'] * 4 + ['cool'] * 12 + ['colorful'] * 3 + ['nice'] * 2 + ['Wonderful'] * 3 + ['foooo'] * 3 + ['Excellent'] * 1 + ['good'] * 1 + ['OK'] * 1 + [np.nan] * 3 + [None] * 7

# Shuffle each column independently, in place.
random.shuffle(vals1)
random.shuffle(vals2)
random.shuffle(vals3)
random.shuffle(vals4)

train_df = pd.DataFrame({'animal' : vals1, 'feature' : vals2})
test_df = pd.DataFrame({'animal' : vals3, 'feature' : vals4})
Essayons l'encodage one-hot de la colonne animal.
# Create an encoder instance with the default options.
ohe = OneHotEncoder()
# Fit the encoder on the training data.
ohe.fit(train_df['animal'])
# Now actually encode: pass the training data to transform.
ohe.transform(train_df['animal'])
Concaténons le résultat avec les données d'origine pour le visualiser.
# Put the encoded columns beside the original training columns (axis=1 concatenates column-wise).
pd.concat([train_df, ohe.transform(train_df['animal'])], axis=1)
Les NaN sont correctement encodés en one-hot, et une colonne pour les catégories inconnues est également présente.
Regardons maintenant les données de test.
# Transform the test column with `ohe` (no re-fit here) and show it beside the original test columns.
pd.concat([test_df, ohe.transform(test_df['animal'])], axis=1)
Comme les mêmes colonnes que pour l'entraînement sont produites, le résultat peut être utilisé tel quel avec LightGBM ou Elastic Net.
Cette fonctionnalité manquait curieusement ailleurs, je l'ai donc implémentée : ohe.encode_map renvoie le dictionnaire de correspondance.
# Mapping built by fit: category string -> output column name.
ohe.encode_map
Vous pouvez également voir la version inversée
# Inverse mapping built by fit: output column name -> list of categories encoded into it.
ohe.encode_map_inv
# Names of the output columns collected during fit.
ohe.new_cols
Spécifions handle_nan="ignore".
# handle_nan="ignore": per the constructor notes, NaN is ignored rather than one-hot encoded.
ohe = OneHotEncoder(handle_nan="ignore")
ohe.fit(train_df['animal'])
La colonne nan a disparu.
Vous pouvez faire traiter les catégories peu fréquentes comme des catégories rares avec, par exemple, handle_rare=0.1 (0.1 est une proportion, soit 10 %).
Voyons à quelle fréquence chaque animal apparaît.
Essayons d'encoder.
# A float handle_rare is a threshold on the normalized frequency:
# categories at or below it are mapped to the shared rareCategory column.
ohe = OneHotEncoder(handle_rare=0.1)
ohe.fit(train_df['animal'])
La colonne rareCategory a été ajoutée.
En regardant encode_map, on peut voir quelles catégories ont été classées comme rares.
De plus, si vous passez une liste de catégories à handle_rare, les catégories indiquées seront encodées en rareCategory.
# A list handle_rare names the categories to map to the shared rareCategory column.
ohe = OneHotEncoder(handle_rare=["cameleon", "frog"])
ohe.fit(train_df['animal'])
Avec handle_unknown="ignore", les catégories inconnues ne sont pas encodées.
# handle_unknown="ignore": fit adds no unknownCategory column.
ohe = OneHotEncoder(handle_unknown="ignore")
ohe.fit(train_df['animal'])
Si col_name_type = "index" est défini, il devient un index (identique à category_encoders)
# col_name_type="index": fit suffixes column names with an index number instead of the category name.
ohe = OneHotEncoder(col_name_type="index")
ohe.fit(train_df['animal'])
Par défaut, le nom de colonne du DataFrame sert de préfixe, mais vous pouvez le changer avec col_name="XXXX".
(Si vous passez un tableau numpy au lieu d'une colonne de DataFrame, le préfixe devient onehotEncode.)
# col_name sets the prefix of the output column names; fit only falls back to the Series name when it is None.
ohe = OneHotEncoder(col_name="new_col")
ohe.fit(train_df['animal'])
Si vous voulez un encodage factice (dummy encoding, qui réduit le nombre de variables en ne créant pas de colonne pour l'une des catégories), définissez dummy=True.
# dummy=True: the first category in column order gets no column of its own;
# fit records it in ohe.dummy_category.
ohe = OneHotEncoder(dummy=True)
ohe.fit(train_df['animal'])
La catégorie factice se trouve dans ohe.dummy_category.
Avec dummy="xxx", c'est la catégorie « xxx » qui devient la catégorie factice.
J'aimerais créer une bibliothèque d'encodage catégoriel qui réponde précisément à ce genre de besoins.
Je serais heureux d'avoir vos impressions si vous utilisez le code ci-dessus.
Il existe d'autres fonctions détaillées en plus de celles présentées, mais je suis fatigué d'écrire ; quand j'aurai plus de likes, je prévois d'en faire une bibliothèque et de rassembler les exemples d'utilisation sur git, par exemple.
Recommended Posts