This article walks through building a natural language classification model with BERT, LightGBM, and Optuna, using the livedoor news corpus as the dataset.
All of the code used in this article is available at https://github.com/kazuki-hayakawa/bert_lightgbm_model. If you want to run it yourself, clone the repository and follow along.
Download the livedoor news corpus into the data/raw directory. The download steps are collected in src/data/download_livedoor_news.sh; a rough Python equivalent is sketched below.
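If you prefer to stay in Python, the following is a minimal sketch of the same step. It assumes the corpus is still distributed as RONDHUIT's ldcc-20140209.tar.gz archive, so verify the URL against the shell script before relying on it.

# Rough Python equivalent of download_livedoor_news.sh (the archive URL is an assumption;
# check it against the shell script). Run from the repository root.
import os
import tarfile
import urllib.request

os.makedirs('data/raw', exist_ok=True)
url = 'https://www.rondhuit.com/download/ldcc-20140209.tar.gz'
archive_path = 'data/raw/ldcc-20140209.tar.gz'
urllib.request.urlretrieve(url, archive_path)

with tarfile.open(archive_path, 'r:gz') as tar:
    # extracts to data/raw/text/<media>/*.txt, one file per article
    tar.extractall('data/raw')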
Then run src/data/preprocess.py to preprocess the data and save it split into training and test sets.
src/data/preprocess.py
import os
import glob
from tqdm import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split


def read_text(text_filepath):
    """Read only the body text from the 4th line onward, following the livedoor news file format."""
    with open(text_filepath, 'r') as f:
        lines = f.readlines()
    lines = lines[3:]
    text = ' '.join(lines)
    # Remove full-width spaces and newlines
    text = text.replace('\u3000', '').replace('\n', '')
    return text


def main():
    # Run download_livedoor_news.sh beforehand to fetch the data
    exclude_files = ['CHANGES.txt', 'README.txt', 'LICENSE.txt']
    all_file_paths = glob.glob('../../data/raw/text/**/*.txt', recursive=True)
    all_file_paths = [p for p in all_file_paths
                      if os.path.basename(p) not in exclude_files]

    # The parent directory name of each article is the media (news site) name
    rows = []
    for idx, filepath in enumerate(tqdm(all_file_paths)):
        media = os.path.dirname(filepath).replace('../../data/raw/text/', '')
        text = read_text(filepath)
        rows.append({'id': idx + 1, 'media': media, 'text': text})
    # Build the DataFrame in one go (DataFrame.append was removed in pandas 2.x)
    df_processed = pd.DataFrame(rows, columns=['id', 'media', 'text'])

    # Stratified 90/10 split so both sets keep the media class balance
    df_train, df_test, _, _ = train_test_split(
        df_processed, df_processed['media'], test_size=0.1, random_state=0,
        stratify=df_processed['media']
    )
    df_train.to_csv('../../data/processed/train_dataset.csv', index=False)
    df_test.to_csv('../../data/processed/test_dataset.csv', index=False)


if __name__ == '__main__':
    main()
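A quick way to sanity-check the result is to load the two CSVs and look at the split sizes and the class balance. A minimal sketch, assuming it is run from the repository root:

# Sanity check of the processed datasets (paths relative to the repository root)
import pandas as pd

df_train = pd.read_csv('data/processed/train_dataset.csv')
df_test = pd.read_csv('data/processed/test_dataset.csv')

print(len(df_train), len(df_test))       # roughly a 90% / 10% split
print(df_train['media'].value_counts())  # 9 media, proportions preserved by the stratified split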
For the setup procedure, refer to "Creating a Japanese BERT sentence embedding calculation server using bert-as-service". Create the models/bert_jp directory and download the pretrained Japanese BERT model into it.
Rename the files so that they can be loaded by bert-as-service:
mv model.ckpt-1400000.index bert_model.ckpt.index
mv model.ckpt-1400000.meta bert_model.ckpt.meta
mv model.ckpt-1400000.data-00000-of-00001 bert_model.ckpt.data-00000-of-00001
Creating a vocabulary file
cut -f1 wiki-ja.vocab | sed -e "1 s/<unk>/[UNK]/g" > vocab.txt
Creating a BERT configuration file
bert_jp/bert_config.json
{
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 32000
}
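As a quick consistency check, vocab_size in bert_config.json should equal the number of lines in the vocab.txt created above. A minimal sketch, assuming both files are in models/bert_jp/:

# Check that the vocabulary file and the BERT config agree (paths assumed as above)
import json

with open('models/bert_jp/bert_config.json') as f:
    config = json.load(f)

with open('models/bert_jp/vocab.txt') as f:
    n_vocab = sum(1 for _ in f)

print(config['vocab_size'], n_vocab)  # both should be 32000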
Start the container by running docker-compose up -d (see the GitHub repository for the Dockerfile and docker-compose.yml). Then run docker-compose exec analytics /bin/bash to enter the container.
The Bert class that wraps the BERT client is implemented as follows.
src/features/bert.py
import sentencepiece as spm
from bert_serving.client import BertClient


class Bert():
    """Bert model client.

    Before use, the bert-as-service server must be running.
    """

    def __init__(self, bert_model_path, client_ip='0.0.0.0'):
        self.bert_client = BertClient(ip=client_ip)
        self.spm_model = spm.SentencePieceProcessor()
        self.spm_model.load(bert_model_path + 'wiki-ja.model')

    def _parse(self, text):
        # Tokenize with SentencePiece so the tokens match the pretrained model's vocabulary
        text = str(text).lower()
        encoded_texts = self.spm_model.EncodeAsPieces(text)
        encoded_texts = [t for t in encoded_texts if t.strip()]
        return encoded_texts

    def text2vec(self, texts):
        """
        Args:
            texts (list): list of Japanese strings

        Returns:
            numpy array: tensor of sentence embeddings for the input texts
        """
        parsed_texts = list(map(self._parse, texts))
        tensor = self.bert_client.encode(parsed_texts, is_tokenized=True)
        return tensor
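A usage sketch, assuming a bert-as-service server is already running against models/bert_jp (build_features.py below starts one itself):

# Usage sketch: encode a couple of sentences (requires a running bert-serving server)
from bert import Bert

bert = Bert(bert_model_path='../../models/bert_jp/', client_ip='0.0.0.0')
vectors = bert.text2vec(['今日はいい天気です。', 'サッカーの試合を見た。'])
print(vectors.shape)  # (2, 768): one 768-dimensional vector per sentence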
This class is used to convert the text into vectors. At the same time, the media name, which is the target variable, is encoded as an integer label. Run src/features/build_features.py.
src/features/build_features.py
import subprocess
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from bert import Bert


def build_features(df, bert_client):
    vectors = bert_client.text2vec(df['text'])
    le = LabelEncoder()
    targets = le.fit_transform(df['media'])
    return vectors, targets


def main():
    BERT_MODEL_PATH = '../../models/bert_jp/'

    # start bert server
    commands = ['bert-serving-start', '-model_dir',
                BERT_MODEL_PATH, '-num_worker=1', '-cpu']
    p = subprocess.Popen(commands, shell=False,
                         stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

    # start bert client
    bert = Bert(bert_model_path=BERT_MODEL_PATH, client_ip='0.0.0.0')

    # build train features
    train_dataset = pd.read_csv('../../data/processed/train_dataset.csv')
    train_vectors, train_targets = build_features(train_dataset, bert)
    np.save('../../data/features/train_vectors', train_vectors)
    np.save('../../data/features/train_targets', train_targets)

    # build test features
    test_dataset = pd.read_csv('../../data/processed/test_dataset.csv')
    test_vectors, test_targets = build_features(test_dataset, bert)
    np.save('../../data/features/test_vectors', test_vectors)
    np.save('../../data/features/test_targets', test_targets)

    p.terminate()


if __name__ == '__main__':
    main()
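After the script finishes, the saved features can be checked like this. Note that LabelEncoder assigns the integer labels in sorted order of the media names:

# Quick check of the saved feature files (run from src/features, like the scripts above)
import numpy as np

train_vectors = np.load('../../data/features/train_vectors.npy')
train_targets = np.load('../../data/features/train_targets.npy')

print(train_vectors.shape)       # (n_train, 768): one BERT vector per article
print(np.unique(train_targets))  # [0 1 ... 8]: the 9 media, label-encoded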
The MediaClassifier class, which defines the model that classifies news media, is implemented as follows.
src/models/classifier.py
import os
import uuid
import pickle
import numpy as np
import lightgbm as lgb
import optuna
from datetime import datetime, timedelta, timezone
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


class MediaClassifier():
    """Multi-class classification model for the livedoor news corpus."""

    def __init__(self, output_dir, use_gpu=False):
        # Save each training run under a JST-timestamped directory
        JST = timezone(timedelta(hours=+9), 'JST')
        dt_now = datetime.now(JST)
        training_date = dt_now.strftime("%Y%m%d_%H%M%S")
        self.output_dir = os.path.join(output_dir, training_date)
        os.makedirs(self.output_dir, exist_ok=True)

        self.device = 'gpu' if use_gpu else 'cpu'

    def train(self, features, targets):
        X_train, X_test, y_train, y_test = train_test_split(
            features, targets, test_size=0.2, random_state=0)

        def objectives(trial):
            trial_uuid = str(uuid.uuid4())
            trial.set_user_attr("trial_uuid", trial_uuid)

            # Parameter and callback settings
            params = {
                # The livedoor news corpus has 9 media, so this is a 9-class classification
                'objective': 'multiclass',
                'num_class': 9,
                'metric': 'multi_logloss',
                'num_leaves': trial.suggest_int("num_leaves", 10, 500),
                'feature_fraction': trial.suggest_uniform("feature_fraction", 0.0, 1.0),
                'class_weight': 'balanced',
                'device': self.device,
                'verbose': -1
            }
            pruning_callback = optuna.integration.LightGBMPruningCallback(
                trial, "multi_logloss")

            # training
            lgb_model = lgb.train(params, lgb.Dataset(X_train, y_train), num_boost_round=100,
                                  valid_sets=[lgb.Dataset(X_test, y_test)],
                                  callbacks=[pruning_callback])

            y_pred_train = np.argmax(lgb_model.predict(X_train), axis=1)
            y_pred_test = np.argmax(lgb_model.predict(X_test), axis=1)
            accuracy_train = accuracy_score(y_train, y_pred_train)
            accuracy_test = accuracy_score(y_test, y_pred_test)
            trial.set_user_attr("accuracy_train", accuracy_train)
            trial.set_user_attr("accuracy_test", accuracy_test)

            # Save the model trained in this trial
            output_file = os.path.join(self.output_dir, f"{trial_uuid}.pkl")
            with open(output_file, "wb") as fp:
                pickle.dump(lgb_model, fp)

            # Optuna minimizes the objective by default, so return the error rate
            return 1.0 - accuracy_test

        study = optuna.create_study()
        study.optimize(objectives, n_trials=100)

        result_df = study.trials_dataframe()
        result_csv = os.path.join(self.output_dir, "result.csv")
        result_df.to_csv(result_csv, index=False)

        return study.best_trial.user_attrs
Train the model defined above with the following script.
src/models/train_model.py
import numpy as np
from classifier import MediaClassifier


def main():
    train_vectors = np.load('../../data/features/train_vectors.npy')
    train_targets = np.load('../../data/features/train_targets.npy')

    model = MediaClassifier(output_dir='../../models/training_models',
                            use_gpu=False)
    best_result = model.train(train_vectors, train_targets)
    print('best result \n', best_result)


if __name__ == '__main__':
    main()
After execution completes, the trial UUID and scores of the best-performing model are printed as follows; make a note of them.
{'trial_uuid': 'BEST_MODEL_TRIAL_UUID', 'accuracy_train': 1.0, 'accuracy_test': 0.7398190045248869}
(In the actual output, the BEST_MODEL_TRIAL_UUID part is a real UUID.)
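The same information can also be pulled from the result.csv that MediaClassifier.train writes; its value column is the objective (1 - accuracy_test), so smaller is better. A sketch, where TRAINING_DATE stands for the timestamped run directory and the user-attribute column names may differ slightly between Optuna versions:

# Find the best trial in result.csv ('TRAINING_DATE' is a placeholder for the run directory)
import pandas as pd

results = pd.read_csv('../../models/training_models/TRAINING_DATE/result.csv')
best = results.sort_values('value').iloc[0]  # 'value' = 1 - accuracy_test, so lower is better
print(best)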
Evaluation on the test data is performed as follows.
src/models/predict_model.py
import argparse
import pickle
import numpy as np
from sklearn.metrics import accuracy_score


def main(args):
    test_vectors = np.load('../../data/features/test_vectors.npy')
    test_targets = np.load('../../data/features/test_targets.npy')

    with open(args.best_model, 'rb') as f:
        model = pickle.load(f)

    pred_targets = np.argmax(model.predict(test_vectors), axis=1)
    accuracy = accuracy_score(test_targets, pred_targets)
    print('test accuracy : {:.2f}'.format(accuracy))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--best_model', help='best model pickle file path.')
    args = parser.parse_args()
    main(args)
A timestamped directory is created automatically when training starts, so pass the path to the saved best model inside it. An execution example:
$ cd src/models
$ python predict_model.py --best_model='../../models/training_models/TRAINING_DATE/BEST_MODEL_TRIAL_UUID.pkl'
test accuracy : 0.73
The model I actually trained achieved a test accuracy of 0.73. That is not especially high, but I think it could be raised somewhat by adding more data and refining how the text is tokenized.
Exit the container with `exit`, then shut it down with `docker-compose down`.