When human-labeled data is used as training data for text classification, inconsistencies introduced by the human annotators turned out to be harmful, so I built a text classifier that needs no training labels (an unsupervised classifier).
In hindsight, it would have been better to classify with a graph algorithm instead of k-means. The reason is that Doc2Vec returns similarity scores for only a limited number of documents per query. With k-means you first have to build a sparse similarity matrix, whereas with a graph algorithm you would not have to waste memory on it.
I apologize that the comments are a mix of Japanese and English and that the code is rather messy.
# coding: utf-8
from gensim import corpora, models
import numpy as np
from numpy import random
from scipy.cluster.vq import vq, kmeans, whiten
from sklearn.decomposition import TruncatedSVD
from collections import defaultdict
from separatewords import MecabTokenize #Call out a morphological analyzer that suits your purpose
class MyTexts:
    """Iterable that tokenizes raw text lines lazily.

    Every element of ``text_list`` is right-stripped, decoded as UTF-8 and
    passed to ``MecabTokenize.tokenize``.  Elements equal to
    ``b'Not entered'`` are skipped and yield nothing.
    """

    def __init__(self, text_list):
        self.text_list = text_list

    def __iter__(self):
        for raw_line in self.text_list:
            # Only rows that are not the 'Not entered' marker get tokenized.
            if raw_line != b'Not entered':
                decoded = raw_line.rstrip().decode('utf-8')
                yield MecabTokenize.tokenize(decoded)
class LabeledLineSentence(object):
    """Iterable that wraps token lists as labelled Doc2Vec sentences.

    The n-th item of ``texts_words`` (counting from zero) is yielded as a
    ``models.doc2vec.LabeledSentence`` carrying the single label
    ``'SENT_<n>'``.
    """

    def __init__(self, texts_words):
        self.texts_words = texts_words

    def __iter__(self):
        position = 0
        for tokens in self.texts_words:
            tag = 'SENT_%s' % position
            yield models.doc2vec.LabeledSentence(tokens, labels=[tag])
            position += 1
def create_sim_vec(model, n_sent):
    """Collect sentence-to-sentence similarities and related words.

    For every sentence index ``i`` in ``range(n_sent)`` the model is queried
    with ``model.most_similar('SENT_<i>')``.  Each returned ``(name, value)``
    pair is handled as follows:

    * if ``name`` starts with ``'SENT_'``, ``value`` is stored in row ``i`` of
      the similarity matrix, in the column given by the numeric suffix;
    * otherwise ``name`` is recorded as a word related to sentence ``i``.

    Columns for sentences that the query does not return stay ``0``.

    Args:
        model: object with a ``most_similar(key)`` method returning
            ``(name, similarity)`` pairs (a trained Doc2Vec model in this
            script).
        n_sent: number of sentences, i.e. the number of rows and columns.

    Returns:
        A tuple ``(sim_matrix, word_matrix)``.  ``sim_matrix`` is a list of
        ``n_sent`` NumPy vectors of length ``n_sent``; ``word_matrix`` is a
        list of ``n_sent`` lists of words.
    """
    base_name = 'SENT_'
    sim_matrix = []
    word_matrix = []
    for i_sent in range(n_sent):
        sim_vec = np.zeros(n_sent)
        word_list = []
        try:
            neighbours = model.most_similar(base_name + str(i_sent))
        except KeyError:
            # The sentence label may not exist in the model; keep an all-zero
            # row and an empty word list so row indices stay aligned.
            neighbours = []
        for word, sim_val in neighbours:
            if word.startswith(base_name):
                sim_vec[int(word[len(base_name):])] = sim_val
            else:
                word_list.append(word)
        sim_matrix.append(sim_vec)
        word_matrix.append(word_list)
    return sim_matrix, word_matrix
def sent_integrate(sim_matrix, n_class):
    """Group similar documents with k-means and return one label per row.

    The observations are first whitened (each column divided by its standard
    deviation) so that every dimension has uniform variance.  The whitened
    data is then used both to fit the centroids and to assign the rows to
    them, which keeps the two steps in the same coordinate system.

    Args:
        sim_matrix: 2-D array-like, one row per document.
        n_class: number of clusters requested from k-means.

    Returns:
        1-D NumPy array holding the index of the nearest centroid for every
        row of ``sim_matrix``.
    """
    # whiten() returns a new array, so the caller's data is left untouched.
    observations = whiten(np.asarray(sim_matrix, dtype=float))
    # iter=100: k-means is run 100 times and the codebook with the lowest
    # distortion is kept.
    centroid, _distortion = kmeans(observations, n_class, iter=100, thresh=1e-05)
    labels, _dist = vq(observations, centroid)
    return labels
def count_class(labels):
    """Count how many times each label occurs.

    Args:
        labels: iterable of hashable labels.

    Returns:
        ``defaultdict(int)`` mapping each label to its number of occurrences.
    """
    tally = defaultdict(int)
    for cluster_id in labels:
        tally[cluster_id] = tally[cluster_id] + 1
    return tally
def count_labeled_data(label_data, labels):
    """Sum the human-assigned flag vectors of the documents in each cluster.

    Args:
        label_data: iterable of strings, each a whitespace-separated list of
            integer flags (e.g. ``'1 0 1 0 0 0 0'``).
        labels: iterable of cluster labels, paired positionally with
            ``label_data``.

    Returns:
        dict mapping each cluster label to an ``int64`` NumPy array holding
        the element-wise sum of the flag vectors assigned to that cluster.
    """
    result_dict = {}
    for orig_labels, label in zip(label_data, labels):
        flags = np.array(orig_labels.split(), dtype=np.int64)
        if label in result_dict:
            result_dict[label] += flags
        else:
            # First vector of this cluster: store it once.  Adding it again
            # here would count it twice.
            result_dict[label] = flags
    return result_dict
if __name__ == '__main__':
    ifname = './out_data.csv'
    model_basename = './D2V/doc2vec_result/model'
    topic_result_basename = './D2V/doc2vec_result/topic'  # not used below

    # Read the input file.  Every non-blank line holds three tab-separated
    # fields: a single flag, the space-separated label flags, and the comment.
    comment_data = []
    label_data = []
    with open(ifname, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines instead of failing to unpack
            single_flag, label_flags, comment = line.split('\t')
            comment_data.append(comment.strip())
            label_data.append(label_flags.strip())

    # NOTE(review): MyTexts skips rows equal to 'Not entered', so the SENT_<n>
    # labels number only the remaining rows, while n_sent and label_data
    # below cover every row.  Confirm that the indices line up for your data.
    texts = MyTexts(comment_data)
    sentences = LabeledLineSentence(texts)
    model = models.Doc2Vec(alpha=0.025, min_alpha=0.025)  # use fixed learning rate
    model.build_vocab(sentences)

    # Train with a manually decayed learning rate.  train(sentences) follows
    # the gensim API generation that provides LabeledSentence; newer gensim
    # releases need extra arguments.
    epoch = 10
    for _ in range(epoch):
        model.train(sentences)
        model.alpha -= 0.002  # decrease the learning rate
        model.min_alpha = model.alpha  # fix the learning rate, no decay
    print('done training')

    # store the model to mmap-able files (after training, so that the stored
    # file contains the trained weights)
    model.save(model_basename + '.d2v')
    # load the model back; the loaded copy is not used further in this script
    model_loaded = models.Doc2Vec.load(model_basename + '.d2v')

    # Build the sentence similarity matrix
    n_sent = len(comment_data)
    sent_matrix, word_matrix = create_sim_vec(model, n_sent)
    print('done get sent_matrix')

    ## Put together similar documents
    # Data compression with SVD (produces dense data)
    np.savetxt('./D2V/sent_matrix', np.array(sent_matrix))
    dimension = 100
    lsa = TruncatedSVD(dimension)
    info_matrix = lsa.fit_transform(sent_matrix)
    np.savetxt('./D2V/info_matrix', np.array(info_matrix))

    # Run k-means
    n_class = 7
    labels = sent_integrate(np.array(info_matrix), n_class)
    np.savetxt('./D2V/sent_labels.csv', labels, delimiter=',', fmt='%d')
    print(count_class(labels))

    # Comparison with the labels assigned by humans
    print(count_labeled_data(label_data, labels))
Each line of the input data has the form (0\t1 0 1 0 0 0 0\txxxx), i.e. three tab-separated fields. The labels attached by humans are given as seven 0/1 flags separated by spaces.
Because no experimental results are attached, the following is not very credible. From a visual check of 2,000 items, about 80% were classified in a way that felt natural. In contrast, when I looked at the labels attached by humans, about 30% of them did not make sense to me. (Perhaps it is a difference in sensibility...)
Recently I have been publishing articles without explaining the method or posting experimental data. In the future I want to write articles without cutting corners (I hope I can). Sorry for the trouble, but I would appreciate it if you could point out any mistakes.
