Let's read **Ryunosuke Akutagawa's "The Nose" (鼻)** from Aozora Bunko.
The file's character encoding is **shift_jis**.

# Load the entire text file into memory (decoded as Shift_JIS) and echo it.
with open('/hana.txt', mode='r', encoding='shift_jis') as source_file:
    nose_hana = source_file.read()
print(nose_hana)

#Data preprocessing
import re
import pickle
# Strip ruby readings of the form 《...》 first, so the reading text is gone
# before individual characters are removed.
nose = re.sub('《[^》]+》', '', nose_hana)
# Remove, in a single pass: the ruby-base marker (the ASCII bar and the
# full-width bar U+FF5C that Aozora Bunko texts use), the horizontal bar '―',
# corner brackets 「」, line breaks, and half-/full-width (U+3000) spaces.
nose = re.sub('[|\uff5c―「」\n \u3000]', '', nose)
sentense_end = '。'
# split() leaves whatever follows the final '。' as the last fragment; that
# fragment is not a complete sentence, so it is dropped, and the delimiter is
# re-attached to every remaining sentence.
nose_list = [x + sentense_end for x in nose.split(sentense_end)[:-1]]
print(nose_list)

# Tokenizer is a class inside the janome.tokenizer module, so it has to be
# imported by name; importing only the module leaves `Tokenizer` undefined.
from janome.tokenizer import Tokenizer

s = Tokenizer()
for sentence in nose_list:
    # wakati=True yields surface strings only. Recent janome releases (0.4+)
    # return a generator from tokenize(), so materialize it before printing
    # to show the actual words rather than a generator object.
    print(list(s.tokenize(sentence, wakati=True)))

# Count how often each word appears, using collections.Counter.
import collections

# Tokenizer must be imported by name from janome.tokenizer before it can be
# instantiated.
from janome.tokenizer import Tokenizer

s = Tokenizer()  # Instantiation
words = []
for sentence in nose_list:
    # extend() consumes the tokens whether tokenize() returns a list or a
    # generator.
    words.extend(s.tokenize(sentence, wakati=True))
c = collections.Counter(words)
print(c)
Reference
Recommended Posts