Get the minutes of the Diet via API

Call the API from Python to collect arbitrary parliamentary minutes.

1. Official information

You can also search through the GUI of the National Diet Library's Diet Record Search System, but there is a proper API manual as well (https://kokkai.ndl.go.jp/api.html).

2. Search & get by specifying keywords

Here, we collect the statements made during the 10 years from 2010 to 2019 whose minutes include any of the following keywords: artificial intelligence, AI, big data, machine learning.

# -*- coding: utf-8 -*-
"""
Created on Thu Dec 26 15:05:04 2019

@author: boomin
"""
# pip install untangle

import urllib
import untangle
import urllib.parse

import re
import pandas as pd
import os

spt = os.sep
pklDir  = "pkl"

def getSpeech(keyword:str):
    """Collect Diet statements that contain ``keyword`` into a DataFrame.

    The speech-search API is queried page by page for statements made
    between ``startdate`` and ``enddate``; every returned record whose text
    starts with a "○<speaker> " prefix is kept, with that prefix removed.

    Args:
        keyword: Search term sent to the API as the ``any`` parameter.

    Returns:
        A DataFrame with the columns ``date`` (converted with
        ``pd.to_datetime``), ``speaker``, ``speech``, ``speakerGroup`` and
        ``speakerPosition``. It is empty when nothing matched.
    """
    start = "1"  # Serial number of the first record to request
    # NOTE(review): the endpoint was an empty string in the source (the URL
    # was lost); this is presumably the National Diet Library speech
    # endpoint -- confirm against the official API manual before use.
    apipath = 'https://kokkai.ndl.go.jp/api/1.0/speech?'

    # Regular expression to remove the speaker part from the content of the
    # statement.
    # NOTE(review): the source had the literal text "You?" where the optional
    # honorific belongs, which can never match Japanese minutes; presumably a
    # translation artifact of the honorifics below -- confirm.
    p = re.compile(r'^○([^ ]+?)(?:君|さん)?\s(.+)')

    # The article collects the 10 years from 2010 to 2019.
    startdate = '2010-01-01'
    enddate = '2020-01-01'

    columns = ['date', 'speaker', 'speech', 'speakerGroup', 'speakerPosition']
    frames = []

    while start is not None:
        date = []
        speaker = []
        speech = []
        speakerGroup = []
        speakerPosition = []

        # NOTE(review): the first query parameter was missing in the source;
        # 'maximumRecords=100' is assumed from the "only 100 items are
        # returned at a time" remark -- confirm.
        url = apipath+urllib.parse.quote(
            'maximumRecords=100'
            + '&from=' + startdate
            + '&until=' + enddate
            + '&any=' + keyword
            + f'&startRecord={start}'
            )
        # Send the GET request and parse the search results (XML)
        obj = untangle.parse(url)

        try:
            # NOTE(review): element path assumed from the API's XML layout.
            records = obj.data.records.record
        except AttributeError:
            # No record elements in the response (presumably zero hits).
            break

        for record in records:
            speechrecord = record.recordData.speechRecord

            speechdata = speechrecord.speech.cdata.replace("\u3000"," ").replace("\n"," ")
            m = p.match(speechdata)
            if m is not None:
                date.append(speechrecord.date.cdata)
                speaker.append(speechrecord.speaker.cdata)
                speech.append(m.group(2))
                speakerGroup.append(speechrecord.speakerGroup.cdata)
                speakerPosition.append(speechrecord.speakerPosition.cdata)

        if date:
            # Index the kept rows starting from this page's first position.
            offset = int(start)-1
            index = range(offset, offset+len(date))
            frames.append(pd.DataFrame({
                'date': date,
                'speaker': speaker,
                'speech': speech,
                'speakerGroup': speakerGroup,
                'speakerPosition': speakerPosition,
              }, index=index))

        # Since only 100 items are returned at a time, change the start
        # position and repeatedly send the GET request.
        try:
            start = obj.data.nextRecordPosition.cdata.strip() or None
        except AttributeError:
            # No next position in the response: this was the last page.
            start = None
        else:
            print(f"finished: {start}")

    # Concatenate once at the end instead of once per page.
    df = pd.concat(frames) if frames else pd.DataFrame(columns=columns)
    df["date"] = pd.to_datetime(df["date"])
    return df

if __name__ == '__main__':
    # NOTE(review): the minutes are in Japanese, so these search terms
    # presumably need to be the original Japanese keywords -- confirm.
    keywords = ['Artificial intelligence', 'AI', 'big data', 'Machine learning']

    # One search per keyword, merged into a single table.
    df = pd.concat([getSpeech(keyword) for keyword in keywords])
    # Delete duplicate remarks (the same statement can match several keywords)
    df.drop_duplicates(subset=["date","speaker","speech"], inplace=True)

    df.reset_index(drop=True, inplace=True)

    # Create the output directory first; writing fails if it does not exist.
    os.makedirs(pklDir, exist_ok=True)
    pd.to_pickle(df, os.path.join(pklDir, "kokkailog.pkl"))
    df.to_csv(os.path.join(pklDir, "kokkailog.tsv"), sep="\t")

3. Obtained data

In[4]: df.tail()
#           date speaker  ...         speakerGroup speakerPosition
#4288 2019-12-05 Taku Eto...Liberal Democratic Party, Group of Independents Minister of Agriculture, Forestry and Fisheries
#4289 2019-12-05 Masayoshi Hamada...Komeito
#4290 2019-12-05 Mitsuko Ishii...Japan Restoration Party
#4291 2019-12-05 Takashi Midorikawa...Constitutional Democratic / National / Social Insurance / Independent Forum
#4292 2019-12-05 Koichi Hagiuda...Liberal Democratic Party, Independent Minister of Education, Culture, Sports, Science and Technology
#[5 rows x 5 columns]

