"""
35. Frequency of word occurrence
Find the words that appear in the sentence and their frequency of appearance, and arrange them in descending order of frequency of appearance.
[[{'surface': '', 'base': '*', 'pos': 'BOS/EOS', 'pos1': '*'},
{'surface': 'one', 'base': 'one', 'pos': 'noun', 'pos1': 'number'},
{'surface': '', 'base': '*', 'pos': 'BOS/EOS', 'pos1': '*'}],
[{'surface': '', 'base': '*', 'pos': 'BOS/EOS', 'pos1': '*'},
{'surface': 'I', 'base': 'I', 'pos': 'noun', 'pos1': '代noun'},
{'surface': 'Is', 'base': 'Is', 'pos': 'Particle', 'pos1': '係Particle'},
{'surface': 'Cat', 'base': 'Cat', 'pos': 'noun', 'pos1': 'General'},
{'surface': 'so', 'base': 'Is', 'pos': 'Auxiliary verb', 'pos1': '*'},
{'surface': 'is there', 'base': 'is there', 'pos': 'Auxiliary verb', 'pos1': '*'},
{'surface': '。', 'base': '。', 'pos': 'symbol', 'pos1': 'Kuten'},
{'surface': '', 'base': '*', 'pos': 'BOS/EOS', 'pos1': '*'}],
"""
from collections import Counter
from typing import List, Tuple

import utils
def get_tf(sentence_list: List[List[dict]]) -> List[Tuple[str, int]]:
    """Count surface-form frequencies across all parsed sentences.

    Args:
        sentence_list: Parsed sentences, each a list of morpheme dicts
            carrying at least a "surface" key. The first and last entry
            of every sentence are BOS/EOS sentinels and are skipped via
            the ``sent[1:-1]`` slice.

    Returns:
        ``(surface, count)`` pairs sorted by descending frequency
        (ties in insertion order), as produced by ``Counter.most_common``.
    """
    # Flatten all sentences into one word stream, dropping BOS/EOS markers.
    words = [word["surface"] for sent in sentence_list for word in sent[1:-1]]
    # Original annotation said ``dict``, but most_common() returns a list
    # of (word, count) tuples — the annotation is corrected accordingly.
    return Counter(words).most_common()
# Load the MeCab-parsed morpheme list produced by exercise 30
# (one list of morpheme dicts per sentence) and build the
# descending frequency table of surface forms.
data = utils.read_json("30_neko_mecab.json")
result = get_tf(data)
# [('of', 9194),
# ('。', 7486),
# ('hand', 6868),
# ('、', 6772),
# ('Is', 6420),
# ('To', 6243),
# ('To', 6071),
# ('When', 5508),
# ('But', 5337),
# ('Ta', 3988)]
# Recommended Posts (scraping residue — kept as a comment so the file parses)