Learning Japanese text categories with tf-idf and Random Forest ~ livedoor news seems to have room for tuning, so I will do my best.
The code below is the same as in the previous article.
```python
import glob
import random
import numpy as np
from natto import MeCab
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier


def load_livedoor_news_corpus():
    # Map each livedoor news category directory to a numeric label
    category = {
        'dokujo-tsushin': 1,
        'it-life-hack': 2,
        'kaden-channel': 3,
        'livedoor-homme': 4,
        'movie-enter': 5,
        'peachy': 6,
        'smax': 7,
        'sports-watch': 8,
        'topic-news': 9,
    }
    docs = []
    labels = []

    for c_name, c_id in category.items():
        files = glob.glob("./text/{c_name}/{c_name}*.txt".format(c_name=c_name))

        text = ''
        for file in files:
            with open(file, 'r') as f:
                # Each article file: URL, datetime, subject, then the body
                lines = f.read().splitlines()

                url = lines[0]
                datetime = lines[1]
                subject = lines[2]
                body = "\n".join(lines[3:])
                text = subject + "\n" + body

            docs.append(text)
            labels.append(c_id)

    return docs, labels
```
```python
docs, labels = load_livedoor_news_corpus()

# Shuffle the document indices and hold out everything after the
# first 7,000 documents as the test set
random.seed()
indices = list(range(len(docs)))
random.shuffle(indices)

split_size = 7000
train_data = [docs[i] for i in indices[0:split_size]]
train_labels = [labels[i] for i in indices[0:split_size]]
test_data = [docs[i] for i in indices[split_size:]]
test_labels = [labels[i] for i in indices[split_size:]]
```
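As a quick sanity check (the standard distribution of the livedoor news corpus contains 7,367 articles, so a few hundred documents should be left over for testing):

```python
# Sanity check: sizes of the train/test split and the label spread
from collections import Counter

print(len(train_data), len(test_data))   # expected: 7000 and the remainder
print(Counter(train_labels))             # rough per-category counts
```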
The tokenizer keeps only nouns, using MeCab via natto:

```python
def tokenize(text):
    tokens = []
    with MeCab('-F%f[0],%f[6]') as nm:
        for n in nm.parse(text, as_nodes=True):
            # ignore any end-of-sentence nodes
            if not n.is_eos() and n.is_nor():
                klass, word = n.feature.split(',', 1)
                if klass in ['名詞']:  # keep nouns only
                    tokens.append(word)
    return tokens
```
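As a quick check of what the tokenizer returns (assuming MeCab with the IPA dictionary is installed; the sample sentence is only for illustration):

```python
# Should return only the nouns, in their base forms,
# e.g. something like ['庭', '二', '羽', 'ニワトリ']
print(tokenize('庭には二羽ニワトリがいる。'))
```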
Vectorize with tf-idf and train the baseline Random Forest:

```python
vectorizer = TfidfVectorizer(tokenizer=tokenize)
train_matrix = vectorizer.fit_transform(train_data)
test_matrix = vectorizer.transform(test_data)

clf2 = RandomForestClassifier(n_estimators=100)
clf2.fit(train_matrix, train_labels)

print(clf2.score(train_matrix, train_labels))
print(clf2.score(test_matrix, test_labels))
```
Dataset | Score |
---|---|
Training | 1.0 |
Test | 0.901 |
Since the accuracy on the training data is already 100%, it seems better to narrow the parts of speech kept by the tokenizer to nouns only and reduce the number of features.
```python
def tokenize(text):
    tokens = []
    with MeCab('-F%f[0],%f[6]') as nm:
        for n in nm.parse(text, as_nodes=True):
            # ignore any end-of-sentence nodes
            if not n.is_eos() and n.is_nor():
                klass, word = n.feature.split(',', 1)
                if klass in ['名詞']:  # keep nouns only
                    tokens.append(word)
    return tokens
```
Dataset | Score |
---|---|
Training | 1.0 |
Test | 0.918 |
The accuracy on the test data improved to 91.8%.
Measured with TfidfVectorizer's get_feature_names method, the number of features is 31,258.
RandomForestClassifier's max_features defaults to sqrt, which comes out to 176 here. That feels too small, so I will increase it a bit.
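For reference, a minimal way to check these numbers (on newer scikit-learn versions the method is get_feature_names_out instead):

```python
num_terms = len(vectorizer.get_feature_names())  # 31258 in this run
print(num_terms)
print(int(np.sqrt(num_terms)))  # the default max_features='sqrt', about 176
```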
Passing max_features explicitly:

```python
clf2 = RandomForestClassifier(n_estimators=100, max_features=num_features)
clf2.fit(train_matrix, train_labels)
```
With num_features = 1000:
Dataset | Score |
---|---|
Training | 1.0 |
Test | 0.931 |
With num_features = 3000:
Dataset | Score |
---|---|
Training | 1.0 |
Test | 0.937 |
The accuracy on the test data improved to 93.7%.
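The two settings above were run one at a time; a small sketch of sweeping several candidate values in a loop (the values here are only examples):

```python
# Try a few candidate max_features values and compare train/test accuracy
for num_features in [176, 1000, 3000]:
    clf = RandomForestClassifier(n_estimators=100, max_features=num_features)
    clf.fit(train_matrix, train_labels)
    print(num_features,
          clf.score(train_matrix, train_labels),
          clf.score(test_matrix, test_labels))
```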
The scikit-learn API documentation describes oob_score only as "Whether to use out-of-bag samples to estimate the generalization error," and I can't really tell how it behaves from that alone.
I will just try it and see.

```python
clf2 = RandomForestClassifier(n_estimators=100, max_features=3000, oob_score=True)
clf2.fit(train_matrix, train_labels)
```
Dataset | Score |
---|---|
Training | 1.0 |
Test | 0.948 |
In this case, tuning with oob_score=True gave the best result.
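With oob_score=True, the fitted model also exposes an oob_score_ attribute, an accuracy estimate computed from the bootstrap samples each tree did not see; a minimal sketch of reading it with the model fitted above:

```python
# Out-of-bag accuracy estimate (no separate validation split needed)
print(clf2.oob_score_)
# Held-out test accuracy, for comparison
print(clf2.score(test_matrix, test_labels))
```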
Before tuning, the test accuracy was 90.1%; after tuning, it eventually improved to 94.8%.