A record of working through the problems in the second half of Chapter 6. The target file is nlp.txt, as indicated on the web page. The XML files read below (nlp.txt.xml, nlp_line.txt.xml) are presumably the Stanford CoreNLP analyses of that text produced in the first half of the chapter.
Perform the following processing on the English text (nlp.txt).
Extract all person names from the input text.
# -*- coding: utf-8 -*-
__author__ = 'todoroki'

import re

# The CoreNLP XML dump writes each token's surface form and NER tag
# on separate lines, so a simple line scan is enough here.
WORD = re.compile(r"<word>(\w+)</word>")
NER = re.compile(r"<NER>(\w+)</NER>")

token = ""

f = open('nlp.txt.xml', 'r')
for line in f:
    line = line.strip()
    # Remember the latest <word>; its <NER> tag appears on a following line.
    word = WORD.search(line)
    if word:
        token = word.group(1)
        continue
    ner = NER.search(line)
    # Print the remembered word when it is tagged as a person name.
    if ner and ner.group(1) == "PERSON":
        print token
f.close()
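The regex scan relies on each <word> line preceding its matching <NER> line in the XML dump. An equivalent sketch that parses the same nlp.txt.xml with xml.etree instead of regexes (assuming the ner annotator was enabled when the file was produced):

# Alternative sketch: walk the XML tree instead of scanning lines.
import xml.etree.ElementTree as et

root = et.parse('nlp.txt.xml').getroot()
for token in root.findall('.//tokens/token'):
    # Every <token> carries its surface form and NER tag as child elements.
    if token.find('NER').text == 'PERSON':
        print token.find('word').text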
Based on the result of Stanford CoreNLP's coreference resolution, replace the referring expressions in the text with their representative mention. When replacing, take care that the original referring expression remains recognisable, using the form "representative mention (referring expression)".
# -*- coding: utf-8 -*-
__author__ = 'todoroki'

import re
import xml.etree.ElementTree as et
from functools import partial

# Patterns undoing the PTB-style escapes in the tokenised output
# (-LRB-/-RRB- for brackets, ``/'' for quotes) and re-attaching punctuation.
LRB = re.compile(r"-LRB- ")
RRB = re.compile(r" -RRB-")
NOTATION = re.compile(r" ([,\.:;])")
LDQ = re.compile(r"`` ")
RDQ = re.compile(r" \'\'")
SQ = re.compile(r" \'")
SQS = re.compile(r" \'s")

class StanfordDocument():
    """Thin wrapper around the Stanford CoreNLP XML output."""
    def __init__(self, file):
        self.xmltree = et.parse(file)
        root = self.xmltree.getroot()
        self.sentences = root.find('document/sentences')
        self.coreferences = root.find('document/coreference')

    def getListOfSentences(self):
        # Each sentence becomes a list of its surface tokens.
        sentences = []
        for sentence in self.sentences.findall('sentence'):
            sentences.append([word.text for word in sentence.findall('tokens/token/word')])
        return sentences

def main(file):
    doc = StanfordDocument(file)
    sentences = doc.getListOfSentences()
    for coref in doc.coreferences.findall('coreference'):
        mentions = coref.findall('mention')
        represent = coref.find('mention[@representative="true"]')
        for mention in mentions:
            if mention != represent:
                # <sentence>, <start> and <end> are 1-based; <end> is exclusive.
                sentence_i = int(mention.find('sentence').text) - 1
                start_i = int(mention.find('start').text) - 1
                end_i = int(mention.find('end').text) - 2
                target_sentence = sentences[sentence_i]
                # Wrap the original mention as "representative (original mention)".
                target_sentence[start_i] = represent.find('text').text.strip() + ' (' + target_sentence[start_i]
                target_sentence[end_i] = target_sentence[end_i] + ')'
    return sentences

def prettifySentence(sentence):
    # Join the tokens, then apply each de-escaping substitution in order.
    s = " ".join(sentence)
    partials = map(
        lambda x: partial(x[0], x[1]),
        [
            (LRB.sub, '('),
            (RRB.sub, ')'),
            (LDQ.sub, '\"'),
            (RDQ.sub, '\"'),
            (SQS.sub, "\'s"),
            (SQ.sub, "\'"),
            (NOTATION.sub, r'\1')
        ]
    )
    for part in partials:
        s = part(s)
    return s

if __name__ == "__main__":
    file = "nlp_line.txt.xml"
    sentences = main(file)
    for sentence in sentences:
        s = prettifySentence(sentence)
        print s
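prettifySentence undoes the PTB-style token escapes and re-attaches punctuation to the preceding word; a quick check on a hand-made token list, assuming the script above is saved as problem56.py (as the later problems do):

# Quick check of the de-escaping: PTB tokens back to plain text.
import problem56

print problem56.prettifySentence(['``', 'Hello', "''", ',', 'he', 'said', '.'])
# -> "Hello", he said.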
Visualize the collapsed dependencies produced by Stanford CoreNLP as a directed graph. For the visualization, it is convenient to convert the dependency tree into the DOT language and use Graphviz. To draw directed graphs directly from Python, use pydot.
# -*- coding: utf-8 -*-
__author__ = 'todoroki'

import sys
import problem56  # the StanfordDocument class from the previous problem

def dependToDot(i, dependency):
    """Convert one sentence's dependency list into a DOT digraph string."""
    header = "digraph sentence{0} ".format(i)
    body_head = "{ graph [rankdir = LR]; "
    body = ""
    for dep in dependency:
        # Each <dep> carries the relation type plus <governor> and <dependent> tokens.
        governor, dependent, label = dep.find('governor').text, dep.find('dependent').text, dep.get('type')
        body += '"{gov}"->"{dep}" [label = "{label}"]; '.format(gov=governor, dep=dependent, label=label)
    dotString = header + body_head + body + "}"
    return dotString

def main(file):
    doc = problem56.StanfordDocument(file)
    sentences = doc.sentences.findall('sentence')
    dotSentences = []
    for i, sentence in enumerate(sentences):
        # Use the collapsed dependencies, as the problem asks.
        dependency = sentence.find("dependencies[@type='collapsed-dependencies']")
        dotSentences.append(dependToDot(i + 1, dependency))
    return dotSentences

if __name__ == '__main__':
    dotSentences = main('nlp_line.txt.xml')
    if len(sys.argv) > 1:
        # With a 1-based sentence number as argument, print only that graph.
        target = int(sys.argv[1]) - 1
        print dotSentences[target]
    else:
        for dotSentence in dotSentences:
            print dotSentence
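The DOT output can be rendered with Graphviz from the shell, for example by piping one sentence's graph into dot -Tpng. The problem statement also suggests pydot for drawing directly from Python; a minimal sketch, assuming this script is saved as problem57.py and that pydot and Graphviz are installed:

# Render the first sentence's dependency graph to a PNG via pydot.
import pydot
import problem57

dotSentences = problem57.main('nlp_line.txt.xml')
# graph_from_dot_data returns a list of graphs in pydot >= 1.2;
# older versions return a single graph object instead.
graphs = pydot.graph_from_dot_data(dotSentences[0])
graphs[0].write_png('sentence1.png')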
Output the set of "subject predicate object" triples in tab-separated format, based on the result of Stanford CoreNLP's dependency parsing (collapsed dependencies). Use the following definitions of subject, predicate and object:
- Predicate: a word that has dependents (children) in both the nsubj and dobj relations
- Subject: a dependent (child) of the predicate in the nsubj relation
- Object: a dependent (child) of the predicate in the dobj relation
# -*- coding: utf-8 -*-
__author__ = 'todoroki'

import problem56

def extractTuples(sentence):
    """Collect (predicate, subject, object) triples from one sentence."""
    dependencies = sentence.find("dependencies[@type='collapsed-dependencies']")
    dep_triple = []
    dep_dic = {}
    for dep in dependencies:
        # Key each governor by (index, word) so repeated words stay distinct.
        gov = (dep.find('governor').get('idx'), dep.find('governor').text)
        if dep.get('type') in ['nsubj', 'dobj']:
            dep_dic.setdefault(gov, []).append((dep.get('type'), dep.find('dependent').text))
    # A predicate must govern at least one nsubj and one dobj dependent.
    verbs = [key for key, value in dep_dic.iteritems()
             if set([t for (t, d) in value]) == set(['nsubj', 'dobj'])]
    for verb in verbs:
        nsubj = [d for (t, d) in dep_dic[verb] if t == 'nsubj']
        dobj = [d for (t, d) in dep_dic[verb] if t == 'dobj']
        # Emit every subject-object combination for this predicate.
        dep_triple += [[verb[1], n, d] for n in nsubj for d in dobj]
    return dep_triple

def main(file):
    doc = problem56.StanfordDocument(file)
    sentences = doc.sentences.findall('sentence')
    dep_triple = []
    for sentence in sentences:
        dep_triple.append(extractTuples(sentence))
    return dep_triple

if __name__ == '__main__':
    dep_triple = main('nlp_line.txt.xml')
    for dep in dep_triple:
        for dt in dep:
            # subject <TAB> predicate <TAB> object
            print "%s\t%s\t%s" % (dt[1], dt[0], dt[2])
Read the result of Stanford CoreNLP's phrase structure analysis (the S-expressions in the <parse> elements) and display all noun phrases (NP) in the text. Display all nested noun phrases as well.
# -*- coding: utf-8 -*-
__author__ = 'todoroki'

import problem56

class TreeParser():
    """Parse an S-expression parse tree into nested Python lists."""
    def __init__(self):
        self.root = None
        self._stack = [[]]

    def parse(self, tree_string):
        read = []
        for character in tree_string.strip():
            if character == "(":
                # A new constituent starts: open a fresh list on the stack.
                self._stack.append([])
            elif character.isspace():
                # Any whitespace ends a token (the CoreNLP parse string
                # spans several lines, so newlines must count too).
                if read:
                    self._stack[-1].append("".join(read))
                    read = []
            elif character == ")":
                # The constituent ends: flush the token, fold it into its parent.
                if read:
                    self._stack[-1].append("".join(read))
                    read = []
                self._stack[-2].append(self._stack.pop())
            else:
                read.append(character)
        self.root = self._stack.pop()

    def get_phrase(self, tag):
        # Search below the ROOT node for all subtrees labelled with the tag.
        s = self.root[0][1]
        return self._recursive_finder(s, tag)

    def _recursive_finder(self, lst, tag):
        res = []
        if lst[0] == tag:
            res.append(lst)
        for l in lst[1:]:
            if isinstance(l, list):
                res.extend(self._recursive_finder(l, tag))
        return res

def main(file, tag):
    doc = problem56.StanfordDocument(file)
    sentences = doc.sentences.findall('sentence')
    tag_phrases = []
    for sentence in sentences:
        parser = TreeParser()
        tree_string = sentence.find('parse').text
        parser.parse(tree_string)
        tag_phrases.append(parser.get_phrase(tag))
    return tag_phrases

def str_phrase(phrase):
    # Flatten a subtree back into its leaf words, depth first.
    res = []
    for p in phrase:
        if isinstance(p, list):
            if isinstance(p[1], list):
                res += str_phrase(p)
            else:
                res.append(p[1])
    return res

if __name__ == "__main__":
    np_phrases = main("nlp_line.txt.xml", "NP")
    for np_phrase in np_phrases:
        for np in np_phrase:
            phrase_list = str_phrase(np)
            np_string = problem56.prettifySentence(phrase_list)
            print np_string
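Finally, a quick check of TreeParser on a hand-written parse tree (not taken from nlp.txt) shows that nested noun phrases are reported along with the outer one, assuming this script is saved as problem59.py:

# Toy check: all NPs, including nested ones, come out of get_phrase.
import problem59

parser = problem59.TreeParser()
parser.parse("(ROOT (S (NP (NP (DT the) (NN author)) "
             "(PP (IN of) (NP (DT the) (NN book)))) (VP (VBZ writes))))")
for np in parser.get_phrase("NP"):
    print " ".join(problem59.str_phrase(np))
# -> the author of the book
#    the author
#    the book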