# Trying out a Japanese word cloud with Python in JupyterLab.
# Main web pages used as references:
# https://github.com/amueller/word_cloud/blob/master/examples/masked.py
# https://note.nkmk.me/python-janome-tutorial/
# https://quest.signate.jp/quests/10031 "Data preprocessing dojo"
from os import path
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import os
from wordcloud import WordCloud, STOPWORDS
# Import "re" (standard library) to do string replacement
import re
# Resolve the data directory. os.getcwd() is the fallback for environments
# where __file__ is not defined (e.g. a generated IPython/Jupyter notebook).
d = path.dirname(__file__) if "__file__" in locals() else os.getcwd()
#----
# "janome" is used because it can be installed with pip alone.
# It is used here to extract only the nouns from the text.
from janome.tokenizer import Tokenizer

# Read the source text (decoded as Shift_JIS). The "with" block makes sure the
# file handle is closed as soon as the text has been read.
with open(path.join(d, 'chumonno_oi_ryoriten.txt'), encoding='shift_jis') as text_file:
    s = text_file.read()

t = Tokenizer()
# Tokenize once and reuse the result for both the debug print and the filter.
tokens = t.tokenize(s)
print(type(tokens))

# janome reports part-of-speech tags in Japanese (e.g. '名詞,一般,*,*'), so
# nouns have to be matched with '名詞'; an English 'noun' prefix never matches
# and would leave the word cloud with no words at all.
nouns = [token.surface for token in tokens
         if token.part_of_speech.startswith('名詞')]
# WordCloud splits on whitespace, so join the nouns with single spaces.
text_wakati = " ".join(nouns)

# Blank out words that are tokenized as nouns but are only frequent noise.
# NOTE(review): these three patterns look like machine-translated versions of
# Japanese words from the original script; confirm the intended words against
# the input text before relying on this filter.
for noise_word in ("Character", "Lower", "here"):
    text_wakati = re.sub(noise_word, ' ', text_wakati)

# Uncomment to inspect the generated string; keep it commented out otherwise.
#print(text_wakati)

# Read the mask image. The file name must not carry trailing whitespace,
# otherwise the file is not found.
alice_mask = np.array(Image.open(path.join(d, "alice_mask.png")))

stopwords = set(STOPWORDS)
stopwords.add("said")

# A font that contains Japanese glyphs is required; without one every
# character is rendered as a box (□□□□). The font file is expected in the same
# directory as the text and mask files, so resolve it against "d" as well.
font_patha_a = path.join(d, 'gomarice_mukasi_mukasi.ttf')

wc = WordCloud(
    background_color="white",
    max_words=2000,
    mask=alice_mask,
    stopwords=stopwords,
    contour_width=3,
    contour_color='steelblue',
    font_path=font_patha_a,
)

# generate word cloud
wc.generate(text_wakati)

# Store to file. The output format is taken from the file extension, so the
# name must end in exactly ".png" (no trailing whitespace).
wc.to_file(path.join(d, "alice.png"))

# Show the word cloud, then the mask image in a second figure.
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.figure()
plt.imshow(alice_mask, cmap=plt.cm.gray, interpolation='bilinear')
plt.axis("off")
plt.show()
# Advance preparation
# Prepare the "text file", "image file" and "font file" in the same directory as this script.
# Main changes from the original sample
# Text file: uses "The Restaurant of Many Orders" from Aozora Bunko, saved as "chumonno_oi_ryoriten.txt" and placed in the same directory.
# import re: used to replace character strings.
# Word classification: "janome" is used because it can be installed with pip alone. It is used to extract only the nouns from the text.
# <Font> (I first thought the problem was caused by docker, so I was only looking at articles about Japanese support ... The □□□ output turned out to be caused by the font.) (gomarice_mukasi_mukasi.ttf is a freely available font.)
# If a font that can display Japanese is not used, the text is rendered as □□□□.
# Since I am using Python installed with docker this time, it was troublesome to put the font in the system font folder, so the font file is placed in the same directory.
# Added "font_path=font_patha_a" to the WordCloud arguments.
# Recommended Posts (web-page footer text left over from the source article; not part of the script)