100 natural language processing knocks Chapter 4 Morphological analysis (second half)

A record of solving the problems in the second half of Chapter 4. The target file is neko.txt as shown on the web page.

Use MeCab to morphologically analyze the text (neko.txt) of Natsume Soseki's novel "I am a cat" and save the result in a file called neko.txt.mecab. Use this file to implement a program that addresses the following questions. For problems 37, 38, and 39, use matplotlib or Gnuplot.

35. Noun concatenation

Extract the concatenation of nouns (nouns that appear consecutively) with the longest match.

# -*- coding: utf-8 -*-
__author__ = 'todoroki'

import problem30

def extract_seqs(sentences):
    """Collect maximal runs of two or more consecutive nouns.

    Args:
        sentences: iterable of sentences, each an iterable of morpheme
            mappings providing at least the 'pos' and 'surface' keys.

    Returns:
        A list of runs, each run being the list of surface strings of
        the consecutive nouns, in order of appearance.

    NOTE(review): assumes the reader labels nouns with the string "noun";
    raw MeCab (IPADIC) output uses a Japanese tag instead - confirm
    against problem30.mecab_reader.
    """
    seqs = []
    for sentence in sentences:
        # A run never spans two sentences, so start fresh for each one.
        seq = []
        for morpheme in sentence:
            if morpheme['pos'] == "noun":
                seq.append(morpheme['surface'])
                continue
            if len(seq) > 1:
                seqs.append(seq)
            seq = []
        # Flush a run that reaches the end of the sentence; without this
        # a sentence ending in consecutive nouns would lose its last run.
        if len(seq) > 1:
            seqs.append(seq)
    return seqs

if __name__ == "__main__":
    # Read the MeCab analysis and write one concatenated noun run per line.
    inputfile = 'neko.txt.mecab'
    outputfile = 'neko.mecab_sequences.txt'
    # Context managers close both files even if parsing or writing fails.
    with open(inputfile, "r") as f, open(outputfile, "w") as g:
        sentences = problem30.mecab_reader(f)
        for sequence in extract_seqs(sentences):
            g.write("".join(sequence) + '\n')

36. Frequency of word occurrence

Find the words that appear in the sentence and their frequency of appearance, and arrange them in descending order of frequency of appearance.

# -*- coding: utf-8 -*-
__author__ = 'todoroki'

import problem30
from collections import Counter

def count_words(sentences):
    """Tally how often each surface form occurs.

    Args:
        sentences: iterable of sentences, each an iterable of morpheme
            mappings providing a 'surface' key.

    Returns:
        A collections.Counter mapping each surface string to its count.
    """
    return Counter(
        token['surface']
        for line in sentences
        for token in line
    )

if __name__ == "__main__":
    # Write "<word> <count>" lines, most frequent word first.
    inputfile = "neko.txt.mecab"
    outputfile = "neko.mecab_words.txt"
    # Context managers close both files even if parsing or writing fails.
    with open(inputfile, 'r') as f, open(outputfile, 'w') as g:
        sentences = problem30.mecab_reader(f)
        counter = count_words(sentences)
        for word, count in counter.most_common():
            g.write("%s %s\n" % (word, count))

37. Top 10 most frequent words

Display the 10 words that appear frequently and their frequency of appearance in a graph (for example, a bar graph).

# -*- coding: utf-8 -*-
__author__ = 'todoroki'

import problem30
import problem36
import matplotlib.pyplot as plt

def plot_words(words, counts, file,
               font_path='/usr/local/Cellar/ricty/3.2.4/share/fonts/Ricty-Regular.ttf'):
    """Save a bar chart of word frequencies to an image file.

    Args:
        words: tick labels, one per bar.
        counts: bar heights, in the same order as ``words``.
        file: destination passed to ``plt.savefig``.
        font_path: font file used for the tick labels. The default is the
            path the script was originally written against; pass another
            font on machines where that file does not exist.
    """
    from matplotlib.font_manager import FontProperties
    fp = FontProperties(fname=font_path)
    # Size the axis from the data so fewer (or more) than ten words work.
    positions = list(range(len(counts)))
    # Start a fresh figure so repeated calls do not draw over each other.
    plt.figure()
    plt.bar(positions, counts, align='center')
    plt.xticks(positions, words, fontproperties=fp)
    plt.savefig(file)

if __name__ == '__main__':
    # Plot the ten most frequent words as a bar chart.
    inputfile = 'neko.txt.mecab'
    outputfile = 'neko.mecab_words.png'
    # The counter is fully built inside the block, so the file can be
    # closed before plotting.
    with open(inputfile, 'r') as f:
        sentences = problem30.mecab_reader(f)
        counter = problem36.count_words(sentences)
    words = []
    counts = []
    for word, count in counter.most_common(10):
        # Only byte strings need decoding; text strings have no usable
        # decode step and are passed through unchanged.
        words.append(word.decode('utf8') if isinstance(word, bytes) else word)
        counts.append(count)
    plot_words(words, counts, outputfile)

neko.mecab_words.png

38. Histogram

Draw a histogram of word frequencies as a bar graph: the horizontal axis represents the frequency of occurrence, and the vertical axis represents the number of distinct words that occur with that frequency.

# -*- coding: utf-8 -*-
__author__ = 'todoroki'

import problem30
import problem36
import pandas as pd

def plot_words_hist(freq, file):
    """Draw ``freq.hist()`` and save the figure it belongs to as ``file``."""
    freq.hist().get_figure().savefig(file)

if __name__ == '__main__':
    # Plot a histogram of the per-word occurrence counts.
    inputfile = 'neko.txt.mecab'
    outputfile = 'neko.mecab_words_hist.png'
    # The original never closed the input file; the context manager does.
    with open(inputfile, 'r') as f:
        sentences = problem30.mecab_reader(f)
        counter = problem36.count_words(sentences)
    freq = pd.Series(list(counter.values()), index=list(counter.keys()))
    plot_words_hist(freq, outputfile)

neko.mecab_words_hist.png

39. Zipf's Law

Plot a log-log graph with the frequency rank of each word on the horizontal axis and its frequency of occurrence on the vertical axis.

# -*- coding: utf-8 -*-
__author__ = 'todoroki'

import problem30
import problem36
import matplotlib.pyplot as plt


def plot_words_hist_log(counter, file):
    """Save a log-log rank/frequency (Zipf) plot of the word counts.

    Args:
        counter: mapping from word to occurrence count.
        file: destination passed to ``plt.savefig``.

    The rank (1 = most frequent word) goes on the horizontal axis and the
    occurrence count on the vertical axis, which is the usual layout for
    a Zipf plot. The original call passed the two sequences the other
    way round.
    """
    frequencies = sorted(counter.values(), reverse=True)
    ranks = list(range(1, len(frequencies) + 1))
    plt.figure()
    plt.xscale('log')
    plt.yscale('log')
    plt.plot(ranks, frequencies)
    plt.savefig(file)


if __name__ == '__main__':
    # Plot word frequency against frequency rank on log-log axes.
    inputfile = 'neko.txt.mecab'
    outputfile = 'neko.mecab_words_hist_log.png'
    # The counter is fully built inside the block, so the file can be
    # closed before plotting.
    with open(inputfile, 'r') as f:
        sentences = problem30.mecab_reader(f)
        counter = problem36.count_words(sentences)
    plot_words_hist_log(counter, outputfile)

neko.mecab_words_hist_log.png

Recommended Posts