100 natural language processing knocks Chapter 3 Regular expressions (second half)

A record of solving the problems in the second half of Chapter 3.

</ i> 25. Template extraction

Extract the field names and values of the "basic information" template included in the article and store them as a dictionary object.

# -*- coding: utf-8 -*-
__author__ = 'todoroki'

import re
import json

def pass_string(string):
    return string

def fundamental_data(file, func):
    response = []
    data = {}
    start = re.compile(r"\{\{Basic information")
    end = re.compile(r"\}\}")
    row = re.compile(r"^\s?\|?\s?(.+?)\s?=(.*)\|?")
    flag = False

    pre_key = None
    for line in file:
        if start.match(line):
            flag = True
            continue

        if end.match(line):
            flag = False

        if flag:
            l = row.match(line)

            if l:
                data[l.group(1).strip()] = func(l.group(2).strip())
                pre_key = l.group(1).strip()

            else:
                m = re.match(r"(.*)\}\}$", line)

                if m:
                    data[pre_key] += func(m.group(1).strip())
                    flag = False

                else:
                    data[pre_key] += func(line.strip())
        else:
            if len(data) > 0:
                response.append(data.copy())
                data.clear()

    return response

if __name__ == "__main__":
    inputfile = 'jawiki-england.txt'
    outputfile = 'jawiki-england_fundamental.json'
    f = open(inputfile)
    res = fundamental_data(f, pass_string)
    with open(outputfile, 'w') as g:
        json.dump(res, g, ensure_ascii=False)

#=> (File jawiki-england_fundamental.Output to json)

Templates for basic information are from {{basic information to }} . </ i> There were some exceptions, and }} was not broken at the end of the line, so exception handling was done for that part. I'm wearing it. (I try to pass the function as the second argument of the fundamental_data function for the problem that follows)

</ i> 26. Removal of highlighted markup

When processing> 25, remove MediaWiki's emphasis markup (all weak, emphasized, and strongly emphasized) from the template value and convert it to text (reference: markup quick reference table).

# -*- coding: utf-8 -*-
__author__ = 'todoroki'

import re
import json
import problem25

def remove_emphasis(string):
    emphasis = re.compile(r"''('*)(.+)''\1")
    return emphasis.sub(r"\2", string)

if __name__ == "__main__":
    inputfile = 'jawiki-england.txt'
    outputfile = 'jawiki-england_fundamental-rmEmpha.json'
    f = open(inputfile)
    res = problem25.fundamental_data(f, remove_emphasis)
    with open(outputfile, 'w') as g:
        json.dump(res, g, ensure_ascii=False)

#=> (File jawiki-england_fundamental-rmEmpha.Output to json)

The function that removes the highlighted markup is passed to the fundamental_data function created in the previous problem. The \ number in the regular expression represents the numberth pattern, similar to the group method.

</ i> 27. Removal of internal links

In addition to the processing of> 26, remove MediaWiki's internal link markup from the template value and convert it to text (reference: markup quick reference table).

# -*- coding: utf-8 -*-
__author__ = 'todoroki'

import re
import json
import problem25
import problem26

def remove_internalLink(string):
    internallink = re.compile(r"\[\[((.+?)\|)?(.+?)\]\]")
    emphasis_removed = problem26.remove_emphasis(string)
    return internallink.sub(r"\3", emphasis_removed)

if __name__ == "__main__":
    inputfile = 'jawiki-england.txt'
    outputfile = 'jawiki-england_fundamental-rmEmpha-rmLink.json'
    f = open(inputfile)
    res = problem25.fundamental_data(f, remove_internalLink)
    with open(outputfile, 'w') as g:
        json.dump(res, g, ensure_ascii=False)

#=> (File jawiki-england_fundamental-rmEmpha-rmLink.Output to json)

Added processing to remove internal link markup in addition to processing the previous problem.

</ i> 28. Removal of MediaWiki markup

In addition to the> 27 process, remove MediaWiki markup from the template values as much as possible and format the basic country information.

# -*- coding: utf-8 -*-
__author__ = 'todoroki'

import re
import json
import problem25
import problem26
import problem27

def remove_markup(string):
    markups = [
        re.compile(r"\[https?://[a-zA-Z0-9\./]+\s(.+)?\]"),
        re.compile(r"#REDIRECT\s?(.+)"),
        re.compile(r"<!--\s?(.+)\s?-->"),
        re.compile(r"\{\{.*[Ll]ang\|[a-zA-Z\-]+\|(.+)\}\}"),
        re.compile(r"(.*)<ref.+(</ref>)?>"),
        re.compile(r"(.*?)<br\s?/?>"),
        re.compile(r"<[a-z]+.*>(.*?)</[a-z]+>")
    ]
    removed_string = problem27.remove_internalLink(string)
    for m in markups:
        removed_string = m.sub(r"\1", removed_string)
    return removed_string

if __name__ == "__main__":
    inputfile = 'jawiki-england.txt'
    outputfile = 'jawiki-england_fundamental-rmMUs.json'
    f = open(inputfile)
    res = problem25.fundamental_data(f, remove_markup)
    with open(outputfile, 'w') as g:
        json.dump(res, g, ensure_ascii=False)

#=> (File jawiki-england_fundamental-rmMUs.Output to json)

The markups found were external links, redirects, comments, linguistic information, and HTML statements, so I added a feature to remove them.

</ i> 29. Get the URL of the national flag image

Use the contents of the template to get the URL of the national flag image. (Hint: Call imageinfo in the MediaWiki API to convert file references to URLs)

# -*- coding: utf-8 -*-
__author__ = 'todoroki'

import requests
import json

inputfile = 'jawiki-england_fundamental-rmMUs.json'
outputfile = 'jawiki-england_nationalflags.txt'

with open(inputfile, 'r') as f:
    template = json.load(f)

wikipedia_api = "http://ja.wikipedia.org/w/api.php?"
prop = {
    'action': "query",
    'prop': "imageinfo",
    'iiprop': "url",
    'format': "json",
    'formatversion': '2',
    'utf8': '',
    'continue': ''
}

g = open(outputfile, "w")

for country in template:
    if u'National flag image' in country:
        countryname = country[u'Abbreviated name']
        filename = country[u'National flag image']
        prop['titles'] = "Image:" + filename
        res = requests.get(url=wikipedia_api, params=prop)
        datum = json.loads(res.text)
        try:
            file_url = datum['query']['pages'][0]['imageinfo'][0]['url']
        except:
            print(datum)
            break
        print(filename, file_url)
        g.write(countryname.encode('utf8').replace('|', ''))
        g.write(", %s\n" % file_url)
g.close()

#=> (File jawiki-england_nationalflags.Output to txt)

Hit the API using the requests module.

Recommended Posts