"""
37. Top 10 words that frequently co-occur with "Chat"
Display the 10 words that most often co-occur with "Chat" (highest co-occurrence frequency) together with their frequencies in a chart (for example, a bar chart).
sentence_list:
[[{'surface': '', 'base': '*', 'pos': 'BOS/EOS', 'pos1': '*'},
{'surface': 'un', 'base': 'un', 'pos': 'nom', 'pos1': 'nombre'},
{'surface': '', 'base': '*', 'pos': 'BOS/EOS', 'pos1': '*'}],
[{'surface': '', 'base': '*', 'pos': 'BOS/EOS', 'pos1': '*'},
{'surface': 'je', 'base': 'je', 'pos': 'nom', 'pos1': '代nom'},
{'surface': 'Est', 'base': 'Est', 'pos': 'Particule', 'pos1': '係Particule'},
{'surface': 'Chat', 'base': 'Chat', 'pos': 'nom', 'pos1': 'Général'},
{'surface': 'alors', 'base': 'Est', 'pos': 'Verbe auxiliaire', 'pos1': '*'},
{'surface': 'y a-t-il', 'base': 'y a-t-il', 'pos': 'Verbe auxiliaire', 'pos1': '*'},
{'surface': '。', 'base': '。', 'pos': 'symbole', 'pos1': 'Phrase'},
{'surface': '', 'base': '*', 'pos': 'BOS/EOS', 'pos1': '*'}],
Notes:
- Co-occurrence frequency: https://www.jtp.co.jp/techport/2018-04-18-001/
"""
from collections import defaultdict
from typing import List
import matplotlib.pyplot as plt
import utils
# Apply the "ggplot" style sheet to every figure created by this script.
plt.style.use("ggplot")
# Japanese font support: the plotted terms can contain Japanese characters.
# NOTE(review): set after style.use(), presumably so the style sheet does not
# override the font family - confirm. The font must be installed locally.
plt.rcParams["font.family"] = "Hiragino Mincho ProN"
def get_co_occurrence(
    sentence_list: List[List[dict]], target: str = "Chat"
) -> list:
    """Count the words that appear in the same sentence as ``target``.

    Each sentence is a list of token dicts. The first and last token of every
    sentence (the BOS/EOS markers) are skipped, and the ``"surface"`` value of
    each remaining token is used as the word.

    Args:
        sentence_list: Sentences, each a list of token dicts that have a
            ``"surface"`` key.
        target: Surface form whose co-occurring words are counted.

    Returns:
        A list of ``(word, count)`` tuples sorted by descending count, with
        ``target`` itself excluded. Words with equal counts keep the order in
        which they were first seen. The list is empty when ``target`` never
        appears.
    """
    counter = defaultdict(int)
    for sent in sentence_list:
        # e.g. ['je', 'Est', 'Chat', 'alors', 'y a-t-il', '。']
        words = [word["surface"] for word in sent[1:-1]]
        if target not in words:
            continue
        for word in words:
            counter[word] += 1

    # pop() with a default instead of ``del``: deleting a missing key raises
    # KeyError (defaultdict only creates entries on lookup), which happened
    # whenever no sentence contained ``target``.
    counter.pop(target, None)

    return sorted(counter.items(), key=lambda item: item[1], reverse=True)
def plot_co_occurrence(x: list, y: list) -> None:
    """Show a bar chart of co-occurrence counts.

    Args:
        x: Bar labels (the co-occurring terms).
        y: Bar heights (the count for each term), in the same order as ``x``.
    """
    tick_positions = list(range(len(x)))

    plt.bar(x, y)
    plt.title("Co-occurrence with 'Chat'")
    plt.xlabel("Term")
    plt.ylabel("Frequency")
    # One tick per index 0..len(x)-1, labelled with the corresponding term.
    plt.xticks(tick_positions, x)
    plt.show()
# Load the parsed sentences, rank the words that share a sentence with "Chat",
# and plot the ten most frequent ones.
sentence_list = utils.read_json("30_neko_mecab.json")
counter = get_co_occurrence(sentence_list)
# Example result: [('de', 391), ('Est', 272), ('、', 252), ('À', 250), ('À', 232)]
top_ten = counter[:10]
x = [term for term, _ in top_ten]
y = [count for _, count in top_ten]
plot_co_occurrence(x, y)
# ![image-20200527193140109](https://raw.githubusercontent.com/LearnXu/images/master/imgs/image-20200527193140109.png)
# Recommended Posts