100 language processing knocks (2020): 40

ans40.py

"""
40. Reading the dependency analysis result (morphemes)
Implement the class Morph that represents morphemes. This class has the surface form (surface), base form (base), part of speech (pos), and part-of-speech subcategory 1 (pos1) as member variables.
Furthermore, read the analysis result of CaboCha (neko.txt.cabocha), express each sentence as a list of Morph objects, and display the morpheme sequence of the third sentence.

Generate neko.txt.cabocha with ans40_cabocha.py.
The example generated by ans40.sh is neko.txt.cabocha2. If you compare the two, neko.txt.cabocha is cleaner.

The ans40.py implementation is fast because it loads neko.txt.cabocha directly.
The ans40_2.py implementation spends time on analysis at run time. Not recommended.
"""
from typing import List


class Morph:
    """A single morpheme built from a mapping of its fields.

    The ``data`` mapping passed to the constructor must provide the keys
    ``"surface"``, ``"base"``, ``"pos"`` and ``"pos1"``; each one is stored
    as an attribute of the same name.
    """

    def __init__(self, data):
        # Copy the fields one by one, in a fixed order; a missing key raises KeyError.
        for key in ("surface", "base", "pos", "pos1"):
            setattr(self, key, data[key])

    def __repr__(self):
        # Render as tab-separated "name[value]" pairs.
        fields = (
            ("surface", self.surface),
            ("base", self.base),
            ("pos", self.pos),
            ("pos1", self.pos1),
        )
        return "\t".join("{}[{}]".format(label, value) for label, value in fields)


def read_file(fpath: str) -> List[List[str]]:
    """Read a CaboCha lattice dump and group its lines by sentence.

    A sentence ends at a line that is exactly ``EOS``. Matching the whole line
    (instead of splitting the text on the substring ``"EOS\\n"``) keeps a
    morpheme line that merely ends in "EOS" inside its sentence, and also
    handles a final ``EOS`` that is not followed by a newline.

    Args:
        fpath (str): Path to a UTF-8 encoded file in CaboCha lattice format.

    Returns:
        List[List[str]]: One list per non-empty sentence, holding its lines
                         without the trailing newline and without the ``EOS``
                         marker, e.g.:
                           ['* 0 1D 0/1 0.000000',
                            '<surface>\\t<pos>,<pos1>,...',
                            '* 1 -1D 0/2 0.000000',
                            '<surface>\\t<pos>,<pos1>,...']
    """
    sentences = []
    current = []
    with open(fpath, mode="rt", encoding="utf-8") as f:
        for raw_line in f:
            # Only drop the newline: leading whitespace can be a real surface
            # (e.g. a full-width space token), so lines are not stripped.
            line = raw_line.rstrip("\n")
            if line == "EOS":
                if current:  # consecutive EOS markers produce no empty sentence
                    sentences.append(current)
                current = []
            elif line.strip():  # ignore blank lines
                current.append(line)
    if current:  # data after the last EOS marker (or a file without one)
        sentences.append(current)
    return sentences


# ans40
def convert_sent_to_morph(sent: List[str]) -> List[Morph]:
    """Convert the lines of one parsed sentence into Morph objects.

    Args:
        sent (List[str]): Lines of one sentence in CaboCha lattice format.
                          Chunk header lines look like
                          '* 0 2D 0/0 -0.764522'; morpheme lines look like
                          '<surface>\\t<pos>,<pos1>,...' where the seventh
                          comma-separated feature is used as the base form.

    Returns:
        List[Morph]: One Morph per morpheme line, in input order. Chunk
                     header lines, blank lines and 'EOS' markers are skipped.

    Raises:
        IndexError: If a morpheme line has fewer than seven features.
    """
    res = []
    for word in sent:
        # A chunk header is "*" followed by a space. A morpheme whose surface
        # is a literal "*" is followed by a tab and must not be skipped.
        if not word or word == "EOS" or word.startswith("* "):
            continue
        # Split on the tab first so a surface containing "," does not shift
        # the feature columns.
        surface, _, feature_text = word.partition("\t")
        features = feature_text.split(",")
        dic = {
            "surface": surface,
            "base": features[6],
            "pos": features[0],
            "pos1": features[1],
        }
        res.append(Morph(dic))

    return res


fpath = "neko.txt.cabocha"
sentences = read_file(fpath)
morph_sents = list(map(convert_sent_to_morph, sentences))

# Show every morpheme of the third sentence (index 2), one per line.
for m in morph_sents[2]:
    print(m)  # print() uses str(m), which falls back to Morph.__repr__

# surface[name]   base[name]      pos[noun]       pos1[General]
# surface[Is]     base[Is]        pos[Particle]       pos1[係Particle]
# surface[yet]   base[yet]      pos[adverb]       pos1[Particle connection]
# surface[No]   base[No]      pos[adjective]     pos1[Independence]
# surface[。]     base[。]        pos[symbol]       pos1[Kuten]

ans40_2.py

from typing import List

import CaboCha


def read_file(path: str) -> List[str]:
    """Read a text file and return its non-blank lines, stripped.

    Args:
        path (str): Path to a UTF-8 encoded text file.

    Returns:
        List[str]: Every line with surrounding whitespace removed; lines that
                   are empty after stripping are dropped.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path, encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        return [line for line in stripped_lines if line]


class Morph:
    """A single morpheme with its surface, base form and part-of-speech tags."""

    def __init__(self, surface, base, pos, pos1):
        self.surface = surface  # surface form
        self.base = base  # base form
        self.pos = pos  # part of speech
        self.pos1 = pos1  # part-of-speech subcategory 1

    def __str__(self):
        # Render as tab-separated "name[value]" pairs in a fixed field order.
        return "\t".join(
            "{}[{}]".format(name, getattr(self, name))
            for name in ("surface", "base", "pos", "pos1")
        )


_PARSER = None


def _get_parser():
    """Return the shared CaboCha parser, creating it on first use."""
    global _PARSER
    if _PARSER is None:
        _PARSER = CaboCha.Parser()
    return _PARSER


def get_morph(sent: str) -> list:
    """Parse one sentence with CaboCha and return its morphemes.

    Args:
        sent (str): Raw sentence text.

    Returns:
        list: One Morph per morpheme line of the lattice output, in order.
              Chunk header lines, blank lines and the 'EOS' marker are
              skipped.

    Raises:
        IndexError: If a morpheme line has fewer than seven features.
    """
    # Reuse a single parser across calls instead of constructing one per sentence.
    parsed_sent = _get_parser().parse(sent).toString(CaboCha.FORMAT_LATTICE)
    # e.g. '* 0 -1D 0/0 0.000000\n<surface>\t<pos>,<pos1>,...\nEOS\n'

    morphs = []
    for word in parsed_sent.split("\n"):
        stripped = word.strip()
        # A chunk header is "*" followed by a space. A morpheme whose surface
        # is a literal "*" is followed by a tab and must not be skipped.
        if not stripped or stripped == "EOS" or word.startswith("* "):
            continue

        # The surface is tab-delimited from the features; split on the tab
        # first so a surface containing "," does not shift the columns.
        surface, _, feature_text = word.partition("\t")
        features = feature_text.split(",")

        morphs.append(
            Morph(
                surface,  # surface
                features[6],  # base
                features[0],  # pos
                features[1],  # pos1
            )
        )

    return morphs


file_path = "neko.txt"
sentence_list = read_file(file_path)
# e.g. ['one', 'I am a cat.', 'There is no name yet.', 'I have no idea where I was born.']

# Each sentence is analysed at run time, so building this list takes a while.
morphs = list(map(get_morph, sentence_list))

# Show every morpheme of the sentence at index 3, one per line.
for m in morphs[3]:
    print(m)

# surface[Where]   base[Where]      pos[noun]       pos1[代noun]
# surface[so]     base[so]        pos[Particle]       pos1[格Particle]
# surface[Born]   base[Bornる]    pos[verb]       pos1[Independence]
# surface[Ta]     base[Ta]        pos[Auxiliary verb]     pos1[*]
# surface[Or]     base[Or]        pos[Particle]       pos1[副Particle／並立Particle／終Particle]
# surface[Tonto] base[Tonto]    pos[adverb]       pos1[General]
# surface[Register]   base[Register]      pos[noun]       pos1[Change connection]
# surface[But]     base[But]        pos[Particle]       pos1[格Particle]
# surface[Tsuka]   base[Tsukuri]      pos[verb]       pos1[Independence]
# surface[Nu]     base[Nu]        pos[Auxiliary verb]     pos1[*]
# surface[。]     base[。]        pos[symbol]       pos1[Kuten]

ans40_parse_to_cabocha_format.py

import CaboCha


def parse_txt(file_in: str, file_out: str) -> None:
    """Parse a text file line by line with CaboCha and write the lattice output.

    Blank lines in the input are skipped. Each remaining line is stripped,
    parsed, and its lattice-format result is appended to the output file.

    Args:
        file_in (str): Path to the UTF-8 encoded input text file.
        file_out (str): Path of the output file; it is overwritten.
    """
    # Create the parser before opening the output so a failure here does not
    # leave behind a truncated file.
    cabocha = CaboCha.Parser()

    # Use explicit UTF-8 on both sides so the result does not depend on the
    # platform's default locale encoding.
    with open(file_in, encoding="utf-8") as f_in, open(
        file_out, "w", encoding="utf-8"
    ) as f_out:
        for line in f_in:
            line = line.strip()
            if not line:
                continue
            parsed_sent = cabocha.parse(line).toString(CaboCha.FORMAT_LATTICE)
            f_out.write(parsed_sent)


file_in, file_out = "neko.txt", "neko.txt.cabocha"

# Convert the raw text into the CaboCha lattice dump.
parse_txt(file_in, file_out)

ans40.sh

# Run CaboCha on neko.txt (-f1 selects the lattice output format) and save the result.
cabocha -f1 < neko.txt > neko.txt.cabocha