pip install gensim
pip install janome
#Import required libraries
from janome.tokenizer import Tokenizer
from gensim.models import word2vec
import re
# Read the whole text file as raw bytes.
# Binary mode ("rb") is required: read() must return bytes so that the data
# can be decoded explicitly with .decode() later on. The with-statement closes
# the file handle as soon as the contents have been read.
with open("kazeno_matasaburo.txt", "rb") as fp:
    binarydata = fp.read()
# Aside: checking the type at each step, one print at a time.
# open() in binary mode returns a buffered reader object, not the data itself.
with open("kazeno_matasaburo.txt", "rb") as fp:
    print(type(fp))
    # Execution result: <class '_io.BufferedReader'>
    # read() pulls the entire file contents out of the reader as bytes.
    binarydata = fp.read()
print(type(binarydata))
# Execution result: <class 'bytes'>
# Convert the raw bytes to str by decoding them as Shift_JIS.
text = binarydata.decode('shift_jis')
# Remove unnecessary data.
# Split on rules of five or more hyphens and keep the third segment.
# NOTE(review): presumably segments 0 and 1 are the title block and the
# notation legend of an Aozora Bunko file - confirm against the input file.
text = re.split(r'\-{5,}', text)[2]
# Drop everything from the "底本：" (source edition) marker onwards, i.e. the
# bibliographic footer. The marker has to be the Japanese string that occurs
# in the text; an English rendering of it never matches.
text = re.split(r'底本：', text)[0]
text = text.strip()
# Perform morphological analysis: turn every line of `text` into a
# space-separated string of selected words and collect the strings in `results`.
t = Tokenizer()
results = []
# Parts of speech to keep: noun, adjective, verb, symbol. Janome reports
# part-of-speech names in Japanese, so the filter has to use Japanese labels.
target_hinshi = {'名詞', '形容詞', '動詞', '記号'}
lines = text.split("\r\n")  # Separated by line
for line in lines:
    s = line
    # Strip markup before tokenizing: the full-width bar, anything enclosed
    # in 《...》, and anything enclosed in ［＃...］.
    # NOTE(review): these look like Aozora Bunko ruby / editor-note markers -
    # confirm against the input file.
    s = s.replace('｜', '')
    s = re.sub(r'《.+?》', '', s)
    s = re.sub(r'［＃.+?］', '', s)
    tokens = t.tokenize(s)  # Tokens produced by the analysis of this line
    r = []
    for token in tokens:
        # Prefer the dictionary (base) form; fall back to the surface form
        # when the base form is reported as "*".
        if token.base_form == "*":
            w = token.surface
        else:
            w = token.base_form
        # part_of_speech is a comma-separated string; the first field is the
        # top-level category.
        ps = token.part_of_speech
        hinshi = ps.split(',')[0]
        if hinshi in target_hinshi:
            r.append(w)
    rl = " ".join(r).strip()
    results.append(rl)
    print(rl)
# Create the output file and store the analyzed lines in it, one entry of
# `results` per line, joined by newlines (no trailing newline is added).
wakachigaki_file = "matasaburo.wakati"
with open(wakachigaki_file, mode='w', encoding='utf-8') as fp:
    wakati_text = '\n'.join(results)
    fp.write(wakati_text)
# Analysis start
# LineSentence iterates over the file, treating each line as one sentence of
# whitespace-separated words.
data = word2vec.LineSentence(wakachigaki_file)
model = word2vec.Word2Vec(
    data,
    vector_size=200,  # dimensionality of the vectors (named `size` before gensim 4.0)
    window=10,
    hs=1,
    min_count=2,
    sg=1,
)
model.save('matasaburo.model')
# Try using the model: look up the words most similar to "学校" (school).
# The query must be a word from the Japanese corpus, and since gensim 4.0 the
# similarity queries live on the KeyedVectors object `model.wv`.
print(model.wv.most_similar(positive=['学校']))
① Get the text you want to analyze. ② Trim it so that only the body text remains, removing things like the bibliography at the end. ③ Take the text out line by line with a for statement and remove the unnecessary parts. ④ Perform morphological analysis with the tokenizer and put the results in a list. ⑤ Write the resulting list to a file. ⑥ Create a model from the morphologically analyzed file.
Recommended Posts