"""
43.Extraire les clauses contenant la nomenclature liée aux clauses contenant des verbes
Lorsque des clauses contenant une nomenclature se rapportent à des clauses contenant des verbes, extrayez-les dans un format délimité par des tabulations. Cependant, n'émettez pas de symboles tels que des signes de ponctuation.
"""
from collections import defaultdict
from typing import List
def read_file(fpath: str) -> List[List[str]]:
    """Read a CaboCha-parsed file and split it into sentences.

    Sentences are separated by "EOS" lines in the input file; each sentence
    becomes a list of its raw lines (chunk-header lines starting with "*"
    and tab/comma-separated token lines).

    Args:
        fpath: Path to the parsed file.

    Returns:
        List of sentences, each a list of raw analysis lines.
    """
    with open(fpath, mode="rt", encoding="utf-8") as fin:
        raw_sentences = fin.read().split("EOS\n")
    return [
        sentence.strip().split("\n")
        for sentence in raw_sentences
        if sentence.strip() != ""
    ]
class Morph:
    """A single morpheme (token) from the parsed output.

    Args:
        data (dict): Mapping with keys "surface", "base", "pos", "pos1".

    Attributes:
        surface (str): Surface form as it appears in the text.
        base (str): Base (dictionary) form.
        pos (str): Part of speech.
        pos1 (str): Part-of-speech sub-category 1.
    """

    def __init__(self, data):
        self.surface = data["surface"]
        self.base = data["base"]
        self.pos = data["pos"]
        self.pos1 = data["pos1"]

    def __repr__(self):
        return f"Morph({self.surface})"

    def __str__(self):
        # Tab-separated field dump, mainly for debugging.
        return (
            f"surface[{self.surface}]\tbase[{self.base}]"
            f"\tpos[{self.pos}]\tpos1[{self.pos1}]"
        )
class Chunk:
    """A clause/phrase chunk from the dependency analysis.

    Args:
        chunk_id: Chunk number within the sentence.
        dst: Index (as a string) of the dependency target chunk.

    Attributes:
        id (str): Chunk number within the sentence.
        morphs (List[Morph]): Morphemes contained in this chunk.
        dst (str): Index of the dependency target chunk.
        srcs (List[str]): Indices of chunks that depend on this one.
    """

    def __init__(self, chunk_id, dst):
        self.id = chunk_id
        self.morphs = []
        self.dst = dst
        self.srcs = []

    def __repr__(self):
        return (
            f"Chunk( id: {self.id}, dst: {self.dst}, "
            f"srcs: {self.srcs}, morphs: {self.morphs} )"
        )
def convert_sent_to_chunks(sent: List[str]) -> List["Chunk"]:
    """Convert one parsed sentence into a list of Chunk objects.

    Each line starting with "*" opens a new chunk and encodes its id and
    dependency target; every other line is a token line that is parsed
    into a Morph and attached to the current chunk.

    Header line format, e.g. "* 0 1D 0/1 0.000000":
        col 1: literal "*" (marks a dependency-analysis header line)
        col 2: chunk number (integer starting from 0)
        col 3: dependency target number followed by "D"
        col 4: head / function-word positions
        col 5: dependency score (higher = more likely attachment)

    Args:
        sent: Lines of a single parsed sentence (headers + token lines).

    Returns:
        List[Chunk]: Chunks with morphs, dst and srcs filled in.
        (Fix: the original annotation wrongly said List[Morph].)
    """
    chunks = []
    chunk = None
    srcs = defaultdict(list)
    for word in sent:
        if word.startswith("*"):
            # A new header line closes the previous chunk, if any.
            if chunk is not None:
                chunks.append(chunk)
            fields = word.split(" ")
            chunk_id = fields[1]
            dst = fields[2].rstrip("D")
            chunk = Chunk(chunk_id, dst)
            srcs[dst].append(chunk_id)  # record target -> source mapping
        else:
            # Token line: "surface\tpos,pos1,*,*,*,*,base,..."
            features = word.split(",")
            surface_pos = features[0].split("\t")
            chunk.morphs.append(
                Morph(
                    {
                        "surface": surface_pos[0],
                        "base": features[6],
                        "pos": surface_pos[1],
                        "pos1": features[1],
                    }
                )
            )
    # Append the trailing chunk once, after the loop (the original checked
    # `i == len(sent) - 1` on every iteration).
    if chunk is not None:
        chunks.append(chunk)
    # Fill in the dependency sources for each chunk.
    for chunk in chunks:
        chunk.srcs = list(srcs[chunk.id])
    return chunks
def concat_morphs_surface(chunk: "Chunk") -> str:
    """Concatenate the morph surfaces of a chunk, skipping symbols.

    Args:
        chunk: e.g. Chunk( id: 0, dst: 5, srcs: [], morphs: [Morph(je), Morph(Est)] )

    Returns:
        The surfaces of all non-symbol morphs joined together,
        e.g. 'je suis'.
    """
    return "".join(
        morph.surface for morph in chunk.morphs if morph.pos != "symbole"
    )
# ans43
def validate_pos_in_chunk(chunk: "Chunk", pos: str) -> bool:
    """Return True if any morph in the chunk has the given part of speech.

    Args:
        chunk: Chunk whose morphs are inspected.
        pos: Part-of-speech tag to look for (e.g. 'nom', 'verbe').

    Returns:
        True if at least one morph matches, otherwise False.
    """
    # Generator instead of a list inside any(): short-circuits on first match.
    return any(morph.pos == pos for morph in chunk.morphs)
def concat_chunks_surface(chunks: List["Chunk"]):
    """Extract noun-chunk -> verb-chunk dependency pairs from one sentence.

    For every chunk that contains a verb and has dependency sources, pair it
    with each of its source chunks that contains a noun, concatenating the
    two surfaces (symbols excluded).

    Args:
        chunks: Chunks representing one sentence,
            e.g. [Chunk( id: 0, dst: 5, srcs: [], morphs: [Morph(je), Morph(Est)] ),
                  ...,
                  Chunk( id: 5, dst: -1, srcs: ['0', '4'], morphs: [...] )]

    Returns:
        List of "source target" surface strings.
        NOTE(review): the task statement asks for tab-delimited output, but
        this emits a space separator — confirm before changing.
    """
    pairs = []
    for chunk in chunks:
        # Only verb chunks that are a dependency target qualify.
        if not chunk.srcs or not validate_pos_in_chunk(chunk, "verbe"):
            continue
        target_surface = concat_morphs_surface(chunk)
        for src_id in chunk.srcs:
            src_chunk = chunks[int(src_id)]
            if not validate_pos_in_chunk(src_chunk, "nom"):
                continue
            source_surface = concat_morphs_surface(src_chunk)
            pairs.append(f"{source_surface} {target_surface}")
    return pairs
fpath = "neko.txt.cabocha"
sentences = read_file(fpath)
chunks = [convert_sent_to_chunks(sent) for sent in sentences] # ans41
result = [concat_chunks_surface(sent) for sent in chunks] # ans43
result = list(filter(lambda x: len(x) != 0, result)) # filtering the empty list
for sent in result[:3]:
print(sent)
# ['Où êtes-vous né', 'Je n'ai aucune idée']
# ['Pleurer sur place', 'Miaou miaou pleurer', 'Je me souviens seulement de ce que j'étais']
# ['Pour la première fois ici', 'j'ai vu', 'j'ai vu quelque chose']
# Recommended Posts