100 Natural Language Processing Knocks, Chapter 6: English Text Processing (second half)

A record of solving the problems in the second half of Chapter 6. The target file is nlp.txt, as distributed on the problem page. The XML files read below (nlp.txt.xml and nlp_line.txt.xml) are the Stanford Core NLP analysis results for that text.

Perform the following processing on the English text (nlp.txt).

55. Named entity extraction

Extract all personal names in the input text.

# -*- coding: utf-8 -*-
__author__ = 'todoroki'

import re

# The CoreNLP XML output is scanned line by line with regexes,
# pairing each <word> with the <NER> tag inside the same <token>.
WORD = re.compile(r"<word>(\w+)</word>")
NER = re.compile(r"<NER>(\w+)</NER>")

token = ""
f = open('nlp.txt.xml', 'r')
for line in f:
    line = line.strip()
    word = WORD.search(line)
    if word:
        # Remember the most recent <word> so it can be paired with its NER tag.
        token = word.group(1)
        continue
    ner = NER.search(line)
    if ner and ner.group(1) == "PERSON":
        print token
f.close()
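
Since the input is well-formed XML, the same extraction can also be done with an XML parser instead of line-by-line regexes. A minimal sketch using xml.etree.ElementTree, assuming the same nlp.txt.xml layout:

# -*- coding: utf-8 -*-
import xml.etree.ElementTree as et

root = et.parse('nlp.txt.xml').getroot()
for token in root.iterfind('document/sentences/sentence/tokens/token'):
    # Each <token> carries its surface form in <word> and its entity tag in <NER>.
    ner = token.find('NER')
    if ner is not None and ner.text == 'PERSON':
        print token.find('word').text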

56. Coreference analysis

Using the coreference analysis results from Stanford Core NLP, replace each mention in the text with its representative mention. When replacing, make sure the original mention can still be recovered, e.g. by writing it as "representative mention (original mention)".

# -*- coding: utf-8 -*-
__author__ = 'todoroki'

import re
import xml.etree.ElementTree as et
from functools import partial

LRB = re.compile(r"-LRB- ")
RRB = re.compile(r" -RRB-")
NOTATION = re.compile(r" ([,\.:;])")
LDQ = re.compile(r"`` ")
RDQ = re.compile(r" \'\'")
SQ = re.compile(r" \'")
SQS = re.compile(r" \'s")

# Thin wrapper around the Stanford Core NLP XML output.
class StanfordDocument(object):
    def __init__(self, file):
        self.xmltree = et.parse(file)
        root = self.xmltree.getroot()
        self.sentences = root.find('document/sentences')
        self.coreferences = root.find('document/coreference')

    def getListOfSentences(self):
        sentences = []
        for sentence in self.sentences.findall('sentence'):
            sentences.append([word.text for word in sentence.findall('tokens/token/word')])
        return sentences

def main(file):
    doc = StanfordDocument(file)
    sentences = doc.getListOfSentences()

    for coref in doc.coreferences.findall('coreference'):
        mentions = coref.findall('mention')
        represent = coref.find('mention[@representative="true"]')
        for mention in mentions:
            if mention != represent:
                # Token indices in the XML are 1-based, and <end> points
                # one past the last token of the mention.
                sentence_i = int(mention.find('sentence').text) - 1
                start_i = int(mention.find('start').text) - 1
                end_i = int(mention.find('end').text) - 2

                # Wrap the mention as "representative mention (original mention)".
                target_sentence = sentences[sentence_i]
                target_sentence[start_i] = represent.find('text').text.strip() + ' (' + target_sentence[start_i]
                target_sentence[end_i] = target_sentence[end_i] + ')'
    return sentences

# Undo CoreNLP tokenization artifacts: bracket escapes (-LRB-/-RRB-),
# backquote-style quotes, and spaces before punctuation and clitics.
def prettifySentence(sentence):
    s = " ".join(sentence)
    partials = map(
        lambda x: partial(x[0], x[1]),
        [
            (LRB.sub, '('),
            (RRB.sub, ')'),
            (LDQ.sub, '\"'),
            (RDQ.sub, '\"'),
            (SQS.sub, "\'s"),
            (SQ.sub, "\'"),
            (NOTATION.sub, r'\1')
        ]
    )
    for part in partials:
        s = part(s)
    return s

if __name__ == "__main__":
    file = "nlp_line.txt.xml"
    sentences = main(file)
    for sentence in sentences:
        s = prettifySentence(sentence)
        print s
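
A quick sanity check of prettifySentence on a hand-made token list (illustrative input, not taken from nlp.txt):

>>> prettifySentence(['He', 'said', ',', '``', 'hi', "''", '.'])
'He said, "hi".'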

57. Dependency analysis

Visualize the collapsed dependencies produced by Stanford Core NLP as a directed graph. For visualization, convert the dependency tree into the DOT language and use Graphviz. Also, use pydot to draw the directed graphs directly from Python.

# -*- coding: utf-8 -*-
__author__ = 'todoroki'

import sys
import problem56

# Convert one sentence's dependency list into a DOT digraph string.
def dependToDot(i, dependency):
    header = "digraph sentence{0} ".format(i)
    body_head = "{ graph [rankdir = LR]; "
    body = ""
    for dep in dependency:
        # Each <dep> holds a governor -> dependent edge labeled with its relation type.
        governor, dependent, label = dep.find('governor').text, dep.find('dependent').text, dep.get('type')
        body += '"{gov}"->"{dep}" [label = "{label}"]; '.format(gov=governor, dep=dependent, label=label)

    dotString = header + body_head + body + "}"
    return dotString

def main(file):
    doc = problem56.StanfordDocument(file)
    sentences = doc.sentences.findall('sentence')
    dotSentences = []
    for i, sentence in enumerate(sentences):
        dependency = sentence.find("dependencies[@type='collapsed-dependencies']")
        dotSentences.append(dependToDot(i+1, dependency))
    return dotSentences

if __name__ == '__main__':
    dotSentences = main('nlp_line.txt.xml')
    if len(sys.argv) > 1:
        target = int(sys.argv[1]) - 1
        print dotSentences[target]
    else:
        for dotSentence in dotSentences:
            print dotSentence
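
The script only prints DOT strings; to draw the graphs directly from Python as the problem suggests, something along the following lines should work with pydot (a sketch assuming the script above is saved as problem57.py; note that newer pydot versions return a list from graph_from_dot_data):

# -*- coding: utf-8 -*-
import pydot
import problem57

dotSentences = problem57.main('nlp_line.txt.xml')
graph = pydot.graph_from_dot_data(dotSentences[0])
if isinstance(graph, list):
    # Newer pydot versions return a list of graphs.
    graph = graph[0]
graph.write_png('sentence1.png')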

58. Extraction of tuples

Based on the output of the dependency analysis (collapsed-dependencies) of Stanford Core NLP, output all "subject predicate object" tuples in tab-delimited format. Use the following definitions of subject, predicate, and object:

- Predicate: a word that has children (dependents) in both nsubj and dobj relations
- Subject: the child (dependent) related to the predicate by nsubj
- Object: the child (dependent) related to the predicate by dobj

# -*- coding: utf-8 -*-
__author__ = 'todoroki'

import problem56

def extractTuples(sentence):
    dependencies = sentence.find("dependencies[@type='collapsed-dependencies']")
    dep_triple = []
    dep_dic = {}

    # Group nsubj/dobj dependents under their governor, keyed by (index, word).
    for dep in dependencies:
        gov = (dep.find('governor').get('idx'), dep.find('governor').text)
        if dep.get('type') in ['nsubj', 'dobj']:
            dep_dic.setdefault(gov, []).append((dep.get('type'), dep.find('dependent').text))

    # A predicate is a governor with both an nsubj child and a dobj child.
    verbs = [key for key, value in dep_dic.iteritems() if set([t for (t, d) in value]) == set(['nsubj', 'dobj'])]

    for verb in verbs:
        nsubj = [d for (t, d) in dep_dic[verb] if t == 'nsubj']
        dobj = [d for (t, d) in dep_dic[verb] if t == 'dobj']
        dep_triple += [[verb[1], n, d] for n in nsubj for d in dobj]

    return dep_triple

def main(file):
    doc = problem56.StanfordDocument(file)
    sentences = doc.sentences.findall('sentence')
    dep_triple = []

    for sentence in sentences:
        dep_triple.append(extractTuples(sentence))

    return dep_triple

if __name__ == '__main__':
    dep_triple = main('nlp_line.txt.xml')
    for dep in dep_triple:
        for dt in dep:
            # Each triple is stored as [predicate, subject, object];
            # print it as "subject\tpredicate\tobject".
            print "%s\t%s\t%s" % (dt[1], dt[0], dt[2])

59. Analysis of S-expressions

Read the phrase structure analysis results (S-expressions) of Stanford Core NLP and display all noun phrases (NP) in the text, including nested noun phrases.

# -*- coding: utf-8 -*-
__author__ = 'todoroki'

import problem56

# Character-by-character S-expression parser that builds nested lists,
# e.g. ['S', ['NP', ['DT', 'a'], ['NN', 'dog']], ...].
class TreeParser(object):
    def __init__(self):
        self.root = None
        self._stack = [[]]

    def parse(self, tree_string):
        read = []
        for character in tree_string.strip():
            if character == "(":
                # Open a new subtree.
                self._stack.append([])
            elif character == " ":
                if read:
                    self._stack[-1].append("".join(read))
                    read = []
            elif character == ")":
                if read:
                    self._stack[-1].append("".join(read))
                    read = []
                # Close the current subtree and attach it to its parent.
                self._stack[-2].append(self._stack.pop())
            else:
                read.append(character)
        self.root = self._stack.pop()

    def get_phrase(self, tag):
        # self.root[0] is the ROOT node; its second element is the sentence subtree.
        s = self.root[0][1]
        return self._recursive_finder(s, tag)

    def _recursive_finder(self, lst, tag):
        res = []
        if lst[0] == tag:
            res.append(lst)
        for l in lst[1:]:
            if isinstance(l, list):
                res.extend(self._recursive_finder(l, tag))
        return res

def main(file, tag):
    doc = problem56.StanfordDocument(file)
    sentences = doc.sentences.findall('sentence')
    tag_phrases = []

    for sentence in sentences:
        parser = TreeParser()
        tree_string = sentence.find('parse').text
        parser.parse(tree_string)
        tag_phrases.append(parser.get_phrase(tag))

    return tag_phrases

# Flatten a parsed subtree back into its list of leaf words.
def str_phrase(phrase):
    res = []
    for p in phrase:
        if isinstance(p, list):
            if isinstance(p[1], list):
                res += str_phrase(p)
            else:
                # A leaf looks like [POS, word]; keep the word.
                res.append(p[1])
    return res

if __name__ == "__main__":
    np_phrases = main("nlp_line.txt.xml", "NP")
    for np_phrase in np_phrases:
        for np in np_phrase:
            phrase_list = str_phrase(np)
            np_string = problem56.prettifySentence(phrase_list)
            print np_string
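
For intuition about the intermediate data structure, a tiny S-expression run through TreeParser and str_phrase (illustrative input; assumes the script above is saved as problem59.py):

# -*- coding: utf-8 -*-
from problem59 import TreeParser, str_phrase

parser = TreeParser()
parser.parse("(ROOT (S (NP (DT a) (NN dog)) (VP (VBZ barks))))")
phrases = parser.get_phrase("NP")
print phrases                  # [['NP', ['DT', 'a'], ['NN', 'dog']]]
print str_phrase(phrases[0])   # ['a', 'dog']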
