ans40.py
"""
40. Reading the dependency analysis result (morphemes)
Implement the class Morph that represents a morpheme. This class has the surface form (surface), base form (base), part of speech (pos), and part-of-speech subcategory 1 (pos1) as member variables.
Furthermore, read the CaboCha analysis result (neko.txt.cabocha), express each sentence as a list of Morph objects, and display the morpheme sequence of the third sentence.
Generate neko.txt.cabocha with ans40_cabocha.py.
The example generated by ans40.sh is neko.txt.cabocha2. Comparing the two, neko.txt.cabocha is cleaner.
The ans40.py implementation is fast because it loads neko.txt.cabocha directly.
The ans40_2.py implementation spends time on analysis at run time. Not recommended.
"""
from typing import List
class Morph:
    """One morpheme built from a mapping of its fields.

    The constructor copies the ``"surface"``, ``"base"``, ``"pos"`` and
    ``"pos1"`` entries of ``data`` onto attributes of the same name.
    """

    # Keys read from the input mapping, in the order they are read and shown.
    _FIELDS = ("surface", "base", "pos", "pos1")

    def __init__(self, data):
        for key in self._FIELDS:
            setattr(self, key, data[key])

    def __repr__(self):
        template = "surface[{}]\tbase[{}]\tpos[{}]\tpos1[{}]"
        return template.format(*(getattr(self, key) for key in self._FIELDS))
def read_file(fpath: str) -> List[List[str]]:
    """Load a parsed file and group its lines by sentence.

    The whole file is read as UTF-8 and cut at every ``EOS`` + newline marker.
    Each piece is stripped of surrounding whitespace and split into lines;
    pieces that are empty after stripping are dropped.

    Args:
        fpath (str): File path.

    Returns:
        List[List[str]]: One list of lines per sentence, e.g. a chunk header
            line such as ``* 0 1D 0/1 0.000000`` followed by the morpheme
            lines of that chunk.
    """
    with open(fpath, mode="rt", encoding="utf-8") as f:
        raw_text = f.read()

    grouped = []
    for piece in raw_text.split("EOS\n"):
        body = piece.strip()
        if body:
            grouped.append(body.split("\n"))
    return grouped
# ans40
def convert_sent_to_morph(sent: List[str]) -> List[Morph]:
    """Convert the lines of one parsed sentence into ``Morph`` objects.

    A morpheme line has the form ``surface<TAB>pos,pos1,...,base,...`` where
    the base form is the seventh comma-separated feature. Lines without a tab
    (chunk headers such as ``* 0 2D 0/0 -0.764522``, or blank lines) carry no
    morpheme and are skipped.

    Args:
        sent (List[str]): The lines of one sentence, chunk headers included.

    Returns:
        List[Morph]: One ``Morph`` per morpheme line, in input order.

    Raises:
        IndexError: If a morpheme line has fewer than seven features.
    """
    res = []
    for word in sent:
        # Split off the surface at the first tab *before* splitting the
        # features on commas; otherwise a surface that itself contains a comma
        # shifts every field. Testing for the tab (instead of a leading "*")
        # also keeps morphemes whose surface starts with "*" and tolerates
        # empty lines.
        surface, tab, feature_text = word.partition("\t")
        if not tab:
            continue
        features = feature_text.split(",")
        res.append(
            Morph(
                {
                    "surface": surface,
                    "base": features[6],
                    "pos": features[0],
                    "pos1": features[1],
                }
            )
        )
    return res
# Load the pre-generated CaboCha output and turn every sentence into a list
# of Morph objects.
fpath = "neko.txt.cabocha"
sentences = read_file(fpath)
morph_sents = list(map(convert_sent_to_morph, sentences))

# Show the third sentence, one morpheme per line. print() ends up using
# Morph.__repr__ because the class defines no __str__.
for m in morph_sents[2]:
    print(m)
# Sample output:
# surface[name] base[name] pos[noun] pos1[General]
# surface[Is] base[Is] pos[Particle] pos1[係Particle]
# surface[yet] base[yet] pos[adverb] pos1[Particle connection]
# surface[No] base[No] pos[adjective] pos1[Independence]
# surface[。] base[。] pos[symbol] pos1[Kuten]
ans40_2.py
from typing import List
import CaboCha
def read_file(path: str, encoding: str = "utf-8") -> List[str]:
    """Return the non-blank lines of a text file, stripped of whitespace.

    Args:
        path (str): File path.
        encoding (str): Text encoding used to decode the file. Defaults to
            UTF-8 so the result does not depend on the platform's locale
            encoding.

    Returns:
        List[str]: Every line that is non-empty after ``str.strip()``, in
            file order.
    """
    with open(path, encoding=encoding) as f:
        stripped_lines = (line.strip() for line in f)
        return [line for line in stripped_lines if line]
class Morph:
    """Morpheme record holding its surface form, base form and two POS fields."""

    def __init__(self, surface, base, pos, pos1):
        # surface: surface form; base: base (uninflected) form
        self.surface, self.base = surface, base
        # pos: part of speech; pos1: part-of-speech subcategory 1
        self.pos, self.pos1 = pos, pos1

    def __str__(self):
        values = (self.surface, self.base, self.pos, self.pos1)
        return "surface[{}]\tbase[{}]\tpos[{}]\tpos1[{}]".format(*values)
_PARSER = None  # shared CaboCha parser, created on first use


def _get_parser():
    """Return the shared CaboCha parser, constructing it on the first call."""
    global _PARSER
    if _PARSER is None:
        _PARSER = CaboCha.Parser()
    return _PARSER


def get_morph(sent: str) -> list:
    """Analyse one sentence with CaboCha and return its morphemes.

    The sentence is parsed and rendered in lattice format, e.g.
    ``['* 0 -1D 0/0 0.000000', 'one<TAB>noun,number,*,*,*,*,one,Ichi,Ichi', 'EOS']``.
    Morpheme lines have the form ``surface<TAB>pos,pos1,...,base,...``; lines
    without a tab (chunk headers starting with ``*``, the ``EOS`` marker,
    blank lines) are skipped.

    Args:
        sent (str): The raw sentence to analyse.

    Returns:
        list: ``Morph`` objects in sentence order.

    Raises:
        IndexError: If a morpheme line has fewer than seven features.
    """
    # One parser serves many parse() calls (parse_txt in
    # ans40_parse_to_cabocha_format.py reuses one the same way), so it is
    # built once instead of once per sentence.
    parsed_sent = _get_parser().parse(sent).toString(CaboCha.FORMAT_LATTICE)

    morphs = []
    for line in parsed_sent.strip().split("\n"):
        # Split off the surface at the first tab before splitting the features
        # on commas, so a surface containing "," or starting with "*" is
        # still parsed correctly.
        surface, tab, feature_text = line.partition("\t")
        if not tab:
            continue
        features = feature_text.split(",")
        morphs.append(
            Morph(
                surface,  # surface
                features[6],  # base
                features[0],  # pos
                features[1],  # pos1
            )
        )
    return morphs
# Analyse neko.txt directly: every non-blank line is run through CaboCha
# here, so this takes a while.
file_path = "neko.txt"
sentence_list = read_file(file_path)
# e.g. ['one', 'I am a cat.', 'There is no name yet.', 'I have no idea where I was born.']
morphs = list(map(get_morph, sentence_list))

# Print the morphemes of the entry at index 3, one per line.
for m in morphs[3]:
    print(m)
# Sample output:
# surface[Where] base[Where] pos[noun] pos1[代noun]
# surface[so] base[so] pos[Particle] pos1[格Particle]
# surface[Born] base[Bornる] pos[verb] pos1[Independence]
# surface[Ta] base[Ta] pos[Auxiliary verb] pos1[*]
# surface[Or] base[Or] pos[Particle] pos1[副Particle/並立Particle/終Particle]
# surface[Tonto] base[Tonto] pos[adverb] pos1[General]
# surface[Register] base[Register] pos[noun] pos1[Change connection]
# surface[But] base[But] pos[Particle] pos1[格Particle]
# surface[Tsuka] base[Tsukuri] pos[verb] pos1[Independence]
# surface[Nu] base[Nu] pos[Auxiliary verb] pos1[*]
# surface[。] base[。] pos[symbol] pos1[Kuten]
ans40_parse_to_cabocha_format.py
import CaboCha
def parse_txt(file_in: str, file_out: str) -> None:
    """Parse each non-blank line of ``file_in`` with CaboCha into ``file_out``.

    Every line is stripped, parsed, rendered in lattice format and appended to
    the output file; blank lines are skipped.

    Args:
        file_in (str): Path of the plain-text input, read as UTF-8.
        file_out (str): Path of the lattice-format output, written as UTF-8
            (overwritten if it exists).
    """
    # Build the parser first so a failure to construct it does not leave an
    # already-truncated output file behind.
    cabocha = CaboCha.Parser()
    # Explicit UTF-8 on both sides: the reader in ans40.py opens the generated
    # file with encoding="utf-8", so writing with the platform's locale
    # encoding could produce a file it cannot decode.
    with open(file_in, encoding="utf-8") as f_in, open(
        file_out, "w", encoding="utf-8"
    ) as f_out:
        for line in f_in:
            line = line.strip()
            if not line:
                continue
            parsed_sent = cabocha.parse(line).toString(CaboCha.FORMAT_LATTICE)
            f_out.write(parsed_sent)
# Convert the raw text into the CaboCha lattice file consumed by ans40.py.
file_in, file_out = "neko.txt", "neko.txt.cabocha"
parse_txt(file_in, file_out)
ans40.sh
# Parse neko.txt with CaboCha in lattice format (-f1) and save the result.
# Redirecting stdin instead of piping from `cat` means a missing neko.txt
# makes the command fail without overwriting neko.txt.cabocha, and the exit
# status reflects that failure.
cabocha -f1 < neko.txt > neko.txt.cabocha
Recommended Posts