Good morning! :relaxed: Sometimes you want to pull information from Wikipedia programmatically. The MediaWiki API is made for exactly that, so let's use it.
The API exposes the wiki's features: logging in, and adding, updating, and searching pages.
API: https://ja.wikipedia.org/w/api.php
You choose the operation by passing it to the URL above as the "action" parameter; this gives access to functions such as searching and updating. This time we will look at the most basic one, retrieval (action=query), and learn how to search for and fetch wiki information.
getWikiData
import requests
import json
def getWikiData(url, params, timeout=10):
    """Send a GET request to a MediaWiki API endpoint and return the JSON body.

    Args:
        url: API endpoint, e.g. "https://ja.wikipedia.org/w/api.php".
        params: Mapping of query-string parameters ("action", "format", ...).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests may wait forever on an unresponsive server.

    Returns:
        The response body decoded from JSON.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: The request failed or timed out.
    """
    res = requests.get(url, params=params, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to decode an error page.
    res.raise_for_status()
    return res.json()
# Endpoint of the Japanese Wikipedia's MediaWiki API.
url = "https://ja.wikipedia.org/w/api.php"

# Smallest useful query: look up one page by title and ask for JSON output.
params = dict(
    action="query",
    titles="Python",
    format="json",
)

# Fetch the page record and show the decoded response.
print(getWikiData(url, params))
【result】
{
'batchcomplete': '',
'query': {
'pages': {
'993': {
'pageid': 993, ##Unique ID
'ns': 0,## nameSpace
'title': 'Python' ##page title
}
}
}
}
# Adding "prop": "info" makes the response include basic page metadata
# (content model, page language, last-touched time, last revision id, length).
params = { "action" : "query",
"titles" : "Python",
"prop" : "info",
"format" : "json"
}
{
'batchcomplete': '',
'query': {
'pages': {
'993': {
'pageid': 993,
'ns': 0,
'title': 'Python',
'contentmodel': 'wikitext',
'pagelanguage': 'ja',
'pagelanguagehtmlcode': 'ja',
'pagelanguagedir': 'ltr',
'touched': '2020-09-21T07:44:48Z',
'lastrevid': 79623671,
'length': 50355
}
}
}
}
(In most cases the sub-property parameter is named [abbreviation of the prop module] + "prop", e.g. "inprop" for "info" and "clprop" for "categories".)
# Ask the "info" property module for one extra field, the watcher count.
params = { "action" : "query",
"titles" : "Python",
"prop" : "info",  # main item: the property module to run
"inprop" : "watchers",  # sub-item: additional field requested from that module
"format" : "json"
}
{
'batchcomplete': '',
'query': {
'pages': {
'993': {
'pageid': 993,
'ns': 0,
'title': 'Python',
'contentmodel': 'wikitext',
'pagelanguage': 'ja',
'pagelanguagehtmlcode': 'ja',
'pagelanguagedir': 'ltr',
'touched': '2020-09-21T07:44:48Z',
'lastrevid': 79623671,
'length': 50355,
'watchers': 157 ##here
}
}
}
}
# "prop": "categories" lists the categories the page belongs to;
# "clprop": "sortkey" adds the sortkey / sortkeyprefix fields to each entry.
params = { "action" : "query",
"titles" : "Python",
"prop" : "categories",
"clprop" : "sortkey",
"format" : "json"
}
{
'batchcomplete': '',
'query': {
'pages': {
'993': {
'pageid': 993,
'ns': 0,
'title': 'Python',
'categories': [
{
'ns': 14,
'title': 'Category:Python',
'sortkey': '2a0a505954484f4e',
'sortkeyprefix': '*'
},
{
'ns': 14,
'title': 'Category:Object-oriented language',
'sortkey': '505954484f4e0a505954484f4e',
'sortkeyprefix': 'PYTHON'
},
{
'ns': 14,
'title': 'Category:Open source software',
'sortkey': '505954484f4e0a505954484f4e',
'sortkeyprefix': 'PYTHON'
},
{
'ns': 14,
'title': 'Category:Scripting language',
'sortkey': '505954484f4e0a505954484f4e',
'sortkeyprefix': 'PYTHON'
},
{
'ns': 14,
'title': 'Category:Basic Information Technology Engineer Examination',
'sortkey': 'e381afe38184e3819de382930a505954484f4e',
'sortkeyprefix': 'Yes'
},
{
'ns': 14,
'title': 'Category:Articles containing invalid sources/2018',
'sortkey': '420a505954484f4e',
'sortkeyprefix': 'B'
}
]
}
}
}
}
# "list" modules enumerate items across the wiki instead of describing one page.
params = { "action" : "query",
"titles" : "Python",
"list" : "allcategories",
"acprop" : "size",  # properties to return; each list module has its own property parameter
"aclimit" : 5,  # maximum number of results; this parameter name also changes per list module
"format" : "json"
}
{
'batchcomplete': '',
'continue': {
'accontinue': '.22_LR_firearms',
'continue': '-||'
},
'query': {
'pages': {
'993': {
'pageid': 993,
'ns': 0,
'title': 'Python'
}
},
'allcategories': [
{
'size': 1,
'pages': 1,
'files': 0,
'subcats': 0,
'*': '" + afterCat + "'
},
{
'size': 1,
'pages': 1,
'files': 0,
'subcats': 0,
'*': '" + afterCat + "$2'
},
{
'size': 2,
'pages': 2,
'files': 0,
'subcats': 0,
'*': '$1'
},
{
'size': 3,
'pages': 3,
'files': 0,
'subcats': 0,
'*': '((documentation))Pages with unusual use of'
},
{
'size': 9,
'pages': 9,
'files': 0,
'subcats': 0,
'*': '+Ultra'
}
]
}
}
# List the pages that belong to a single category.
params = { "action" : "query",
"titles" : "Python",
"list" : "categorymembers",
"cmtitle" : "Category:Object-oriented language",  # categorymembers needs the category to enumerate set here
"cmlimit" : 5,  # maximum number of results
"cmprop" : "ids|title|sortkey",  # fields to return; separate multiple values with "|"
"format" : "json"
}
{
'batchcomplete': '',
'continue': {
'cmcontinue': 'page|4345594c4f4e|2496222',
'continue': '-||'
},
'query': {
'pages': {
'993': {
'pageid': 993,
'ns': 0,
'title': 'Python'
}
},
'categorymembers': [
{
'pageid': 821212,
'ns': 0,
'title': 'Comparison of object-oriented languages',
'sortkey': '2ae381b2e3818be3818f0ae382aae38396e382b8e382a7e382afe38388e68c87e59091e8a880e8aa9ee381aee6af94e8bc83'
},
{
'pageid': 181337,
'ns': 0,
'title': 'ActiveBasic',
'sortkey': '41435449564542415349430a4143544956454241534943'
},
{
'pageid': 3785500,
'ns': 0,
'title': 'Ballerina',
'sortkey': '42414c4c4552494e41'
},
{
'pageid': 2066745,
'ns': 0,
'title': 'Boo (Programming language)',
'sortkey': '424f4f2028e38397e383ade382b0e383a9e3839fe383b3e382b0e8a880e8aa9e29'
},
{
'pageid': 1503,
'ns': 0,
'title': 'C Sharp',
'sortkey': '43230a43205348415250'
}
]
}
}
While building a user dictionary for MeCab, I wanted the titles of every page contained in certain categories, so I wrote the following code.
makeOwaraiList
import requests
import json
import csv
import re
def getWikiData(url, params, timeout=10):
    """Send a GET request to a MediaWiki API endpoint and return the JSON body.

    Args:
        url: API endpoint, e.g. "https://ja.wikipedia.org/w/api.php".
        params: Mapping of query-string parameters ("action", "format", ...).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests may wait forever on an unresponsive server.

    Returns:
        The response body decoded from JSON.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: The request failed or timed out.
    """
    res = requests.get(url, params=params, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to decode an error page.
    res.raise_for_status()
    return res.json()
# Export the titles of all main-namespace pages in the listed categories to a
# CSV file (one title per row), following the API's continuation tokens until
# each category has been read completely.
fileName = "wikiList"
url = "https://ja.wikipedia.org/w/api.php"
params = {
    "action": "query",
    "list": "categorymembers",
    "cmlimit": "50",
    "format": "json",
}
# NOTE(review): the first two entries are identical, so that category is
# fetched twice and its titles are written twice - confirm the intended names.
categories = ['Japanese comedy combination','Japanese comedy combination','Japanese comedy duo','Japanese comedy trio','Japanese comedy group','Japan>Couple comedy combination']

# Parenthesised suffixes removed from page titles before they are written.
# Compiled once, and written as a raw string so "\(" is not an invalid escape.
suffixPattern = re.compile(r"\(comedy\)|\(comedyコンビ\)|\(comedytrio\)|\(talent\)|\(comedyグループ\)|\(trio\)|\(unit\)|\(Entertainer\)")

# newline="" lets the csv module control line endings itself; without it,
# extra blank rows appear on platforms that translate "\n" to "\r\n".
with open(fileName + ".csv", 'a', encoding="utf-8", newline="") as f:
    writer = csv.writer(f)
    for category in categories:
        params['cmtitle'] = 'Category:' + category
        # Start every category from its first batch: drop any continuation
        # token left over from the previous category.
        params.pop('cmcontinue', None)
        while True:
            wikiData = getWikiData(url, params)
            for page in wikiData['query']['categorymembers']:
                # Namespace 0 holds the articles; skip sub-categories, files, etc.
                if page['ns'] == 0:
                    title = suffixPattern.sub("", page['title']).strip()
                    writer.writerow([title])
            # A "continue" block carrying cmcontinue means more batches remain.
            nextToken = wikiData.get('continue', {}).get('cmcontinue')
            if not nextToken:
                break
            params['cmcontinue'] = nextToken
Once again, I relied entirely on the official documentation to learn and use this API. :relaxed: Information from a wiki is useful in many situations, so mastering this API should make your implementations more fun.
Recommended Posts