"""
36. Top 10 most frequent words
Display the 10 most frequent words and their frequencies in a graph (for example, a bar chart).
data:
[[{'surface': '', 'base': '*', 'pos': 'BOS/EOS', 'pos1': '*'},
  {'surface': 'one', 'base': 'one', 'pos': 'noun', 'pos1': 'number'},
  {'surface': '', 'base': '*', 'pos': 'BOS/EOS', 'pos1': '*'}],
 [{'surface': '', 'base': '*', 'pos': 'BOS/EOS', 'pos1': '*'},
  {'surface': 'I', 'base': 'I', 'pos': 'noun', 'pos1': '代noun'},
  {'surface': 'Is', 'base': 'Is', 'pos': 'Particle', 'pos1': '係Particle'},
  {'surface': 'Cat', 'base': 'Cat', 'pos': 'noun', 'pos1': 'General'},
  {'surface': 'so', 'base': 'Is', 'pos': 'Auxiliary verb', 'pos1': '*'},
  {'surface': 'is there', 'base': 'is there', 'pos': 'Auxiliary verb', 'pos1': '*'},
  {'surface': '。', 'base': '。', 'pos': 'symbol', 'pos1': 'Kuten'},
  {'surface': '', 'base': '*', 'pos': 'BOS/EOS', 'pos1': '*'}],
Memo:
    - Displaying Japanese text with matplotlib
"""
from collections import Counter
from typing import List
import matplotlib.pyplot as plt
import utils
# Apply the ggplot theme first, then select a font family with Japanese
# support so the term labels on the chart can be rendered.
plt.style.use("ggplot")
plt.rcParams.update({"font.family": "Hiragino Mincho ProN"})
def get_tf(sentence_list: List[List[dict]]) -> list:
    """Count how often each surface form occurs across all sentences.

    The first and last entry of every sentence are skipped (in the sample
    data shown in the module docstring these are the BOS/EOS markers).

    Args:
        sentence_list: Sentences, each a list of morpheme dicts that have a
            ``"surface"`` key.

    Returns:
        ``(surface, count)`` pairs ordered from most to least frequent.
    """
    frequencies = Counter()
    for sentence in sentence_list:
        # Drop the leading and trailing sentinel entries before counting.
        for morpheme in sentence[1:-1]:
            frequencies[morpheme["surface"]] += 1
    return frequencies.most_common()
def plot_tf(x: list, y: list) -> None:
    """Draw a bar chart of term frequencies and display it.

    Args:
        x: Terms used as tick labels, one per bar.
        y: Frequency of each term, in the same order as ``x``.
    """
    # Bars are placed at integer slots and labelled afterwards. Passing the
    # labels themselves to plt.bar would treat them as categories (merging
    # duplicate labels into one bar) or, for numeric labels, as coordinates
    # that no longer line up with the tick positions set below.
    x_pos = list(range(len(x)))
    plt.bar(x_pos, y)
    plt.xlabel("Term")
    plt.ylabel("Frequency")
    plt.title("TF Graph")
    plt.xticks(x_pos, x)
    plt.show()
# Load the parsed sentences from the JSON file and rank every surface form.
data = utils.read_json("30_neko_mecab.json")
counter = get_tf(data)
# Example of the ranked result: [('of', 9194), ('。', 7486)]
# Split the ten top-ranked (word, count) pairs into labels and bar heights.
x = [term for term, _ in counter[:10]]
y = [count for _, count in counter[:10]]
plot_tf(x, y)
# 
# Recommended Posts