# Trying out a Japanese word cloud with Python in JupyterLab.


# Trying out a Japanese word cloud with Python in JupyterLab.

# Main web pages used as references
# https://github.com/amueller/word_cloud/blob/master/examples/masked.py
# https://note.nkmk.me/python-janome-tutorial/
# https://quest.signate.jp/quests/10031 "Data preprocessing dojo"

from os import path
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import os
from wordcloud import WordCloud, STOPWORDS


# Import "re" to use its string replacement functions
import re

# Resolve the data directory: the folder containing this script when __file__
# is defined, otherwise the current working directory (needed when the example
# runs inside a generated IPython notebook, where __file__ does not exist).
if "__file__" in locals():
    d = path.dirname(__file__)
else:
    d = os.getcwd()


#----
# Use "janome" because it can be installed with pip alone. It is used to extract only the nouns from the text.
from janome.tokenizer import Tokenizer

# Load the source text (Shift_JIS encoded). The context manager closes the
# file handle as soon as the content has been read.
with open(path.join(d, 'chumonno_oi_ryoriten.txt'), encoding='shift_jis') as source_file:
    text_wakati = source_file.read()
t = Tokenizer()

s = text_wakati

# Debug output: show what kind of object tokenize() returns.
print(type(t.tokenize(s)))

# Keep only the surface form of noun tokens. janome reports part_of_speech as
# a comma-separated Japanese string (e.g. '名詞,一般,*,*'), so the prefix to
# test for is '名詞' (noun); the English word 'noun' never matches and would
# leave the list empty.
text_wakati = [token.surface for token in t.tokenize(s)
               if token.part_of_speech.startswith('名詞')]

# Join the nouns into one string, separated by full-width spaces.
text_wakati = "　".join(map(str, text_wakati))

# Some words are recognized as nouns and show up as frequent words, so they
# are replaced with a space to exclude them from the word cloud.
# NOTE(review): the three patterns below are English words and will not match
# anything in Japanese text. They look like machine-translated versions of the
# Japanese words the author intended to remove -- confirm the intended
# patterns and restore them.
text_wakati = re.sub("Character", ' ', text_wakati)
text_wakati = re.sub("Lower", ' ', text_wakati)
text_wakati = re.sub("here", ' ', text_wakati)

# Print the generated string to check it. Keep this commented out in the final version so it does not affect the output.
#print(text_wakati)

# Read the mask image that defines the shape of the word cloud.
# The file name must not contain trailing whitespace, and the image is opened
# in a context manager so the file handle is closed once the pixel data has
# been copied into the array.
with Image.open(path.join(d, "alice_mask.png")) as mask_image:
    alice_mask = np.array(mask_image)

stopwords = set(STOPWORDS)
stopwords.add("said")

# Font file, looked up in the data directory like the text and mask files.
font_patha_a = path.join(d, 'gomarice_mukasi_mukasi.ttf')


# A font that can display Japanese is required; otherwise the words are
# rendered as □□□□. The font file is placed next to the script instead of
# being installed into a system font folder, and is passed to WordCloud
# through font_path.
wc = WordCloud(background_color="white", max_words=2000, mask=alice_mask,
               stopwords=stopwords, contour_width=3, contour_color='steelblue',
               font_path=font_patha_a)

# generate word cloud
wc.generate(text_wakati)

# Store to file. The name has to end exactly in ".png": the image format is
# inferred from the file extension, and an extension with trailing whitespace
# is not recognized.
wc.to_file(path.join(d, "alice.png"))

# Show the word cloud first, then the mask image in a second figure.
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.figure()
plt.imshow(alice_mask, cmap=plt.cm.gray, interpolation='bilinear')
plt.axis("off")
plt.show()



# Preparation
# Put the text file, the image file, and the font file in the same directory.

# Main changes from the sample
# Text file: uses the text of "Chumon no Oi Ryoriten" (The Restaurant of Many Orders) from Aozora Bunko, saved as "chumonno_oi_ryoriten.txt" and placed in the same directory.
# import re: used to replace character strings.
# Word classification: uses "janome" because it can be installed with pip alone. It is used to extract only the nouns from the text.

# <Font> (I first assumed the problem was caused by Docker, so I only looked at articles about Japanese language support... It turned out that the □□□ output was caused by the font.) (gomarice_mukasi_mukasi.ttf is a free font.)

# If a font that can display Japanese is not used, the text is rendered as □□□□.
# Since I am using Python installed with Docker this time, installing the font into the font folder was troublesome, so the font file is placed in the same directory.
# Added "font_path=font_patha_a".