A record of solving the problems in the second half of Chapter 6. The target file is nlp.txt, as provided on the problem page.
Perform the following processing on the English text (nlp.txt).
55. Extract all person names in the input text.
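The XML files read below are assumed to be Stanford CoreNLP output, e.g. java -cp "stanford-corenlp/*" edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma,ner,parse,dcoref -file nlp.txt, which writes nlp.txt.xml (the classpath here is an assumption). The nlp_line.txt.xml used from the coreference problem onward is assumed to be the same pipeline run over the one-sentence-per-line file produced in the first half of the chapter.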
# -*- coding: utf-8 -*-
__author__ = 'todoroki'
import re
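# Scan the CoreNLP XML output line by line: remember the most recent
# <word> value and print it when the following <NER> element says PERSON.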
WORD = re.compile(r"<word>(\w+)</word>")
NER = re.compile(r"<NER>(\w+)</NER>")
token = ""
person = ""
f = open('nlp.txt.xml', 'r')
for line in f:
line = line.strip()
word = WORD.search(line)
if word:
token = word.group(1)
continue
ner = NER.search(line)
if ner:
if ner.group(1) == "PERSON":
person = token
print person
f.close()
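The same names can be extracted by walking the XML tree instead of line-matching with regexes; a minimal ElementTree sketch, assuming the document/sentences/sentence/tokens/token layout that the later problems also rely on:
import xml.etree.ElementTree as et
root = et.parse('nlp.txt.xml').getroot()
for token in root.findall('document/sentences/sentence/tokens/token'):
    if token.find('NER').text == 'PERSON':
        print token.find('word').text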
56. Based on the results of coreference resolution by Stanford CoreNLP, replace each referring expression (mention) in the text with its representative mention. When replacing, make sure the original mention is still recoverable, e.g. by writing "representative mention (original mention)".
# -*- coding: utf-8 -*-
__author__ = 'todoroki'
import re
import xml.etree.ElementTree as et
from functools import partial
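# Regexes to undo PTB-style tokenization artifacts: -LRB-/-RRB- brackets,
# ``/'' quotes, and the spaces inserted before punctuation and clitics.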
LRB = re.compile(r"-LRB- ")
RRB = re.compile(r" -RRB-")
NOTATION = re.compile(r" ([,\.:;])")
LDQ = re.compile(r"`` ")
RDQ = re.compile(r" \'\'")
SQ = re.compile(r" \'")
SQS = re.compile(r" \'s")
class StanfordDocument(object):
def __init__(self, file):
self.xmltree = et.parse(file)
root = self.xmltree.getroot()
self.sentences = root.find('document/sentences')
self.coreferences = root.find('document/coreference')
def getListOfSentences(self):
sentences = []
for sentence in self.sentences.findall('sentence'):
sentences.append([word.text for word in sentence.findall('tokens/token/word')])
return sentences
def main(file):
doc = StanfordDocument(file)
sentences = doc.getListOfSentences()
for coref in doc.coreferences.findall('coreference'):
mentions = coref.findall('mention')
represent = coref.find('mention[@representative="true"]')
for mention in mentions:
if mention != represent:
sentence_i = int(mention.find('sentence').text) - 1
start_i = int(mention.find('start').text) - 1
end_i = int(mention.find('end').text) - 2
target_sentence = sentences[sentence_i]
                represent_text = represent.find('text').text.strip()
                target_sentence[start_i] = represent_text + ' (' + target_sentence[start_i]
                target_sentence[end_i] = target_sentence[end_i] + ')'
return sentences
def prettifySentence(sentence):
s = " ".join(sentence)
partials = map(
lambda x: partial(x[0], x[1]),
[
(LRB.sub, '('),
(RRB.sub, ')'),
(LDQ.sub, '\"'),
(RDQ.sub, '\"'),
(SQS.sub, "\'s"),
(SQ.sub, "\'"),
(NOTATION.sub, r'\1')
]
)
for part in partials:
s = part(s)
return s
if __name__ == "__main__":
file = "nlp_line.txt.xml"
sentences = main(file)
for sentence in sentences:
s = prettifySentence(sentence)
print s
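As a quick sanity check, prettifySentence reassembles PTB-style tokens into normal text. On a made-up token list (assuming the script above is saved as problem56.py, as the later scripts expect):
>>> from problem56 import prettifySentence
>>> print prettifySentence(['He', 'said', ',', '``', 'hi', "''", '.'])
He said, "hi".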
57. Visualize the collapsed dependencies produced by Stanford CoreNLP as a directed graph. For visualization, convert the dependency tree to the DOT language and use Graphviz. To draw directed graphs directly from Python, use pydot (a sketch follows the script below).
# -*- coding: utf-8 -*-
__author__ = 'todoroki'
import sys
import problem56
def dependToDot(i, dependency):
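    # Emit one DOT edge per dependency, labelled with the dependency type,
    # laid out left to right.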
header = "digraph sentence{0} ".format(i)
body_head = "{ graph [rankdir = LR]; "
body = ""
for dep in dependency:
governor, dependent, label = dep.find('governor').text, dep.find('dependent').text, dep.get('type')
body += '"{gov}"->"{dep}" [label = "{label}"]; '.format(gov=governor, dep=dependent, label=label)
dotString = header + body_head + body + "}"
return dotString
def main(file):
doc = problem56.StanfordDocument(file)
sentences = doc.sentences.findall('sentence')
dotSentences = []
for i, sentence in enumerate(sentences):
dependency = sentence.find("dependencies[@type='collapsed-dependencies']")
dotSentences.append(dependToDot(i+1, dependency))
return dotSentences
if __name__ == '__main__':
dotSentences = main('nlp_line.txt.xml')
if len(sys.argv) > 1:
target = int(sys.argv[1]) - 1
print dotSentences[target]
else:
for dotSentence in dotSentences:
print dotSentence
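To render the graphs directly from Python, a minimal pydot sketch (assuming the script above is saved as problem57.py and that pydot and Graphviz are installed; note that newer pydot versions return a list from graph_from_dot_data):
import pydot
import problem57  # assumed module name for the script above
dotSentences = problem57.main('nlp_line.txt.xml')
graph = pydot.graph_from_dot_data(dotSentences[0])
if isinstance(graph, list):  # newer pydot returns a list of graphs
    graph = graph[0]
graph.write_png('sentence1.png')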
58. Based on the result of the dependency analysis (collapsed dependencies) of Stanford CoreNLP, output the set of "subject predicate object" triples in tab-delimited format. Use the following definitions (a toy example follows the script below):
- Predicate: a word that has children (dependents) in both the nsubj and dobj relations
- Subject: a child (dependent) of the predicate in the nsubj relation
- Object: a child (dependent) of the predicate in the dobj relation
# -*- coding: utf-8 -*-
__author__ = 'todoroki'
import problem56
def extractTuples(sentence):
dependencies = sentence.find("dependencies[@type='collapsed-dependencies']")
dep_triple = []
dep_dic = {}
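    # Group nsubj/dobj dependents under their governor (idx, word);
    # a governor that has both relations is treated as a predicate.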
for dep in dependencies:
gov = (dep.find('governor').get('idx'), dep.find('governor').text)
if dep.get('type') in ['nsubj', 'dobj']:
dep_dic.setdefault(gov, []).append((dep.get('type'), dep.find('dependent').text))
verbs = [key for key, value in dep_dic.iteritems() if set([t for (t, d) in value]) == set(['nsubj', 'dobj'])]
for verb in verbs:
nsubj = [d for (t, d) in dep_dic[verb] if t == 'nsubj']
dobj = [d for (t, d) in dep_dic[verb] if t == 'dobj']
dep_triple += [[verb[1], n, d] for n in nsubj for d in dobj]
return dep_triple
def main(file):
doc = problem56.StanfordDocument(file)
sentences = doc.sentences.findall('sentence')
dep_triple = []
for sentence in sentences:
dep_triple.append(extractTuples(sentence))
return dep_triple
if __name__ == '__main__':
dep_triple = main('nlp_line.txt.xml')
for dep in dep_triple:
for dt in dep:
print "%s\t%s\t%s" % (dt[1], dt[0], dt[2])
59. Read the result of the phrase structure analysis (S-expression) of Stanford CoreNLP and display all noun phrases (NP) in the text, including nested noun phrases (a small usage sketch of the parser follows the script below).
# -*- coding: utf-8 -*-
__author__ = 'todoroki'
import problem56
class TreeParser(object):
def __init__(self):
self.root = None
self._stack = [[]]
def parse(self, tree_string):
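        # Build nested lists from the S-expression: '(' opens a new list,
        # whitespace and ')' flush the current token, and ')' pops the
        # finished list into its parent.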
read = []
for character in tree_string.strip():
if character == "(":
self._stack.append([])
            elif character.isspace():
if read:
self._stack[-1].append("".join(read))
read = []
elif character == ")":
if read:
self._stack[-1].append("".join(read))
read = []
self._stack[-2].append(self._stack.pop())
else:
read.append(character)
self.root = self._stack.pop()
def get_phrase(self, tag):
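        # self.root[0] is ['ROOT', tree]; start searching below the ROOT node.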
s = self.root[0][1]
return self._recursive_finder(s, tag)
def _recursive_finder(self, lst, tag):
res = []
if lst[0] == tag:
res.append(lst)
for l in lst[1:]:
if isinstance(l, list):
res.extend(self._recursive_finder(l, tag))
return res
def main(file, tag):
doc = problem56.StanfordDocument(file)
sentences = doc.sentences.findall('sentence')
    tag_phrases = []
for sentence in sentences:
parser = TreeParser()
tree_string = sentence.find('parse').text
parser.parse(tree_string)
        tag_phrases.append(parser.get_phrase(tag))
    return tag_phrases
def str_phrase(phrase):
res = []
for p in phrase:
if isinstance(p, list):
if isinstance(p[1], list):
res += str_phrase(p)
else:
res.append(p[1])
return res
if __name__ == "__main__":
    np_phrases = main("nlp_line.txt.xml", "NP")
    for np_phrase in np_phrases:
        for np in np_phrase:
            phrase_list = str_phrase(np)
            np_string = problem56.prettifySentence(phrase_list)
            print np_string
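A toy check of TreeParser on a made-up S-expression (assuming TreeParser is in scope, e.g. the script above imported as a module):
parser = TreeParser()
parser.parse("(ROOT (S (NP (DT A) (NN dog)) (VP (VBZ barks))))")
print parser.get_phrase("NP")
# [['NP', ['DT', 'A'], ['NN', 'dog']]]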