PyCharm 5.0 was released on November 23, 2015. The Release Notes include Python 3.5 support and asynchronous execution visualization. The visualization function of asynchronous execution seemed to be useful, so I tried using it immediately.
I took the async/await example from an article I wrote earlier and visualized it with PyCharm.
A program that simultaneously downloads 13 Wikipedia articles and saves them in a file
I wrote it roughly.
async_web.py
# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals
import aiohttp
import asyncio
async def download_file(title, url):
    """
    Download *url* and save the response body to a local text file.

    :param title: str - base name of the output file ("<title>.txt")
    :param url: str - URL to fetch
    :return: str - the local filename the text was saved under
    """
    local_filename = title + ".txt"
    # ClientSession is an asynchronous context manager; entering it with a
    # plain `with` fails on current aiohttp releases.
    async with aiohttp.ClientSession() as client:
        async with client.get(url) as resp:
            assert resp.status == 200
            data = await resp.text()
    await save_file(local_filename, data)
    return local_filename
async def save_file(filename, text):
    """
    Save *text* under ./data/<filename>.

    NOTE(review): this is still blocking file I/O executed on the event
    loop; it is only `async` so callers can await it uniformly.

    :param filename: str
    :param text: str
    """
    path = "./data/{}".format(filename)
    # Context manager guarantees the handle is closed even on error.
    # Explicit UTF-8 avoids platform-default-codec failures on the
    # non-ASCII (Japanese Wikipedia) text being written.
    with open(path, 'w', encoding='utf-8') as f:
        f.write(text)
async def task_download(title, url):
    """
    Thin wrapper so each download can be scheduled as its own Task.

    NOTE(review): the original signature named these (url, title), but the
    call site passes (title, url) positionally and forwards them unchanged;
    the parameters are renamed to match actual usage (behavior identical).

    :param title: str
    :param url: str
    """
    await download_file(title, url)
# (title, url) pairs for the 13 Wikipedia articles fetched concurrently.
urls = [
    ['Yakult', 'https://ja.wikipedia.org/wiki/%E6%9D%B1%E4%BA%AC%E3%83%A4%E3%82%AF%E3%83%AB%E3%83%88%E3%82%B9%E3%83%AF%E3%83%AD%E3%83%BC%E3%82%BA'],
    ['Giant', 'https://ja.wikipedia.org/wiki/%E8%AA%AD%E5%A3%B2%E3%82%B8%E3%83%A3%E3%82%A4%E3%82%A2%E3%83%B3%E3%83%84'],
    ['Hanshin', 'https://ja.wikipedia.org/wiki/%E9%98%AA%E7%A5%9E%E3%82%BF%E3%82%A4%E3%82%AC%E3%83%BC%E3%82%B9'],
    ['Hiroshima', 'https://ja.wikipedia.org/wiki/%E5%BA%83%E5%B3%B6%E6%9D%B1%E6%B4%8B%E3%82%AB%E3%83%BC%E3%83%97'],
    ['Chunichi', 'https://ja.wikipedia.org/wiki/%E4%B8%AD%E6%97%A5%E3%83%89%E3%83%A9%E3%82%B4%E3%83%B3%E3%82%BA'],
    ['Yokohama', 'https://ja.wikipedia.org/wiki/%E6%A8%AA%E6%B5%9CDeNA%E3%83%99%E3%82%A4%E3%82%B9%E3%82%BF%E3%83%BC%E3%82%BA'],
    ['Softbank', 'https://ja.wikipedia.org/wiki/%E7%A6%8F%E5%B2%A1%E3%82%BD%E3%83%95%E3%83%88%E3%83%90%E3%83%B3%E3%82%AF%E3%83%9B%E3%83%BC%E3%82%AF%E3%82%B9'],
    ['Nippon-Ham', 'https://ja.wikipedia.org/wiki/%E5%8C%97%E6%B5%B7%E9%81%93%E6%97%A5%E6%9C%AC%E3%83%8F%E3%83%A0%E3%83%95%E3%82%A1%E3%82%A4%E3%82%BF%E3%83%BC%E3%82%BA'],
    ['Lotte', 'https://ja.wikipedia.org/wiki/%E5%8D%83%E8%91%89%E3%83%AD%E3%83%83%E3%83%86%E3%83%9E%E3%83%AA%E3%83%BC%E3%83%B3%E3%82%BA'],
    ['Seibu', 'https://ja.wikipedia.org/wiki/%E5%9F%BC%E7%8E%89%E8%A5%BF%E6%AD%A6%E3%83%A9%E3%82%A4%E3%82%AA%E3%83%B3%E3%82%BA'],
    ['Orix', 'https://ja.wikipedia.org/wiki/%E3%82%AA%E3%83%AA%E3%83%83%E3%82%AF%E3%82%B9%E3%83%BB%E3%83%90%E3%83%95%E3%82%A1%E3%83%AD%E3%83%BC%E3%82%BA'],
    ['Rakuten', 'https://ja.wikipedia.org/wiki/%E6%9D%B1%E5%8C%97%E6%A5%BD%E5%A4%A9%E3%82%B4%E3%83%BC%E3%83%AB%E3%83%87%E3%83%B3%E3%82%A4%E3%83%BC%E3%82%B0%E3%83%AB%E3%82%B9'],
    ['Japan national football team', 'https://ja.wikipedia.org/wiki/%E3%82%B5%E3%83%83%E3%82%AB%E3%83%BC%E6%97%A5%E6%9C%AC%E4%BB%A3%E8%A1%A8'],
]

# Build one coroutine per article and run them all to completion.
loop = asyncio.get_event_loop()
download_coros = [task_download(title, url) for title, url in urls]
loop.run_until_complete(asyncio.wait(download_coros))
loop.close()
■ Threading graph
■ Asyncio graph
Looking at the asyncio graph, Tasks 15-27 are the parts that perform HTTP communication with `aiohttp`, and they work well by switching between tasks. However, it turns out that the subsequent processing takes time. Since the `save_file` function writes to disk with blocking I/O, it presumably spent a long time waiting for writes to finish. In other words, rewriting the `save_file` function with non-blocking I/O should improve it.
I wanted to rewrite the `save_file` function's disk writes with non-blocking I/O, but I didn't know how to do it, so I rewrote it with old-fashioned multi-process processing instead.
async_web_mp.py
# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals
import random
import aiohttp
import asyncio
import multiprocessing as mp
async def task_download(i, title, url):
    """
    Fetch *url* and hand the body off to a child process for saving.

    :param i: int - index of the URL, used only for progress printing
    :param title: str - base name of the output file ("<title>_mp.txt")
    :param url: str - URL to fetch
    :return: str - local filename the child process writes to
    """
    local_filename = title + "_mp.txt"
    # ClientSession is an asynchronous context manager; entering it with a
    # plain `with` fails on current aiohttp releases.
    async with aiohttp.ClientSession() as client:
        async with client.get(url) as resp:
            assert resp.status == 200
            data = await resp.text()
    print(i, title, url)
    # Offload the blocking disk write to a separate process so the event
    # loop is not stalled.  NOTE(review): the process is never join()ed;
    # the interpreter reaps it at exit (best-effort, as in the original).
    process = mp.Process(target=save_file, args=(local_filename, data))
    process.start()
    return local_filename
def save_file(filename, text):
    """
    Save *text* under ./data/<filename> (runs in a child process).

    :param filename: str
    :param text: str
    """
    path = "./data/{}".format(filename)
    # Context manager guarantees the handle is closed even on error.
    # Explicit UTF-8 avoids platform-default-codec failures on the
    # non-ASCII (Japanese Wikipedia) text being written.
    with open(path, 'w', encoding='utf-8') as f:
        f.write(text)
# Random query-string suffix appended to every URL to defeat caching.
n = "?" + str(random.randint(1, 100000))

# (title, url) pairs for the 13 Wikipedia articles fetched concurrently.
urls = [
    ['Yakult', 'https://ja.wikipedia.org/wiki/%E6%9D%B1%E4%BA%AC%E3%83%A4%E3%82%AF%E3%83%AB%E3%83%88%E3%82%B9%E3%83%AF%E3%83%AD%E3%83%BC%E3%82%BA'],
    ['Giant', 'https://ja.wikipedia.org/wiki/%E8%AA%AD%E5%A3%B2%E3%82%B8%E3%83%A3%E3%82%A4%E3%82%A2%E3%83%B3%E3%83%84'],
    ['Hanshin', 'https://ja.wikipedia.org/wiki/%E9%98%AA%E7%A5%9E%E3%82%BF%E3%82%A4%E3%82%AC%E3%83%BC%E3%82%B9'],
    ['Hiroshima', 'https://ja.wikipedia.org/wiki/%E5%BA%83%E5%B3%B6%E6%9D%B1%E6%B4%8B%E3%82%AB%E3%83%BC%E3%83%97'],
    ['Chunichi', 'https://ja.wikipedia.org/wiki/%E4%B8%AD%E6%97%A5%E3%83%89%E3%83%A9%E3%82%B4%E3%83%B3%E3%82%BA'],
    ['Yokohama', 'https://ja.wikipedia.org/wiki/%E6%A8%AA%E6%B5%9CDeNA%E3%83%99%E3%82%A4%E3%82%B9%E3%82%BF%E3%83%BC%E3%82%BA'],
    ['Softbank', 'https://ja.wikipedia.org/wiki/%E7%A6%8F%E5%B2%A1%E3%82%BD%E3%83%95%E3%83%88%E3%83%90%E3%83%B3%E3%82%AF%E3%83%9B%E3%83%BC%E3%82%AF%E3%82%B9'],
    ['Nippon-Ham', 'https://ja.wikipedia.org/wiki/%E5%8C%97%E6%B5%B7%E9%81%93%E6%97%A5%E6%9C%AC%E3%83%8F%E3%83%A0%E3%83%95%E3%82%A1%E3%82%A4%E3%82%BF%E3%83%BC%E3%82%BA'],
    ['Lotte', 'https://ja.wikipedia.org/wiki/%E5%8D%83%E8%91%89%E3%83%AD%E3%83%83%E3%83%86%E3%83%9E%E3%83%AA%E3%83%BC%E3%83%B3%E3%82%BA'],
    ['Seibu', 'https://ja.wikipedia.org/wiki/%E5%9F%BC%E7%8E%89%E8%A5%BF%E6%AD%A6%E3%83%A9%E3%82%A4%E3%82%AA%E3%83%B3%E3%82%BA'],
    ['Orix', 'https://ja.wikipedia.org/wiki/%E3%82%AA%E3%83%AA%E3%83%83%E3%82%AF%E3%82%B9%E3%83%BB%E3%83%90%E3%83%95%E3%82%A1%E3%83%AD%E3%83%BC%E3%82%BA'],
    ['Rakuten', 'https://ja.wikipedia.org/wiki/%E6%9D%B1%E5%8C%97%E6%A5%BD%E5%A4%A9%E3%82%B4%E3%83%BC%E3%83%AB%E3%83%87%E3%83%B3%E3%82%A4%E3%83%BC%E3%82%B0%E3%83%AB%E3%82%B9'],
    ['Japan national football team', 'https://ja.wikipedia.org/wiki/%E3%82%B5%E3%83%83%E3%82%AB%E3%83%BC%E6%97%A5%E6%9C%AC%E4%BB%A3%E8%A1%A8'],
]

# Schedule one download coroutine per article (cache-buster appended).
loop = asyncio.get_event_loop()
download_coros = [
    task_download(index, title, url + n)
    for index, (title, url) in enumerate(urls)
]
loop.run_until_complete(asyncio.wait(download_coros))
loop.close()
It improved to 11.5 seconds >> 5.5 seconds.
jawiki-latest-all-titles-in-ns0
async_web_counter.py
# -*- coding: utf-8 -*-
import threading
import time
import urllib
import urllib.parse
class WikipediaCrawler(object):
    """
    Yields Wikipedia article URLs built from a local title-dump file,
    serialising access with a lock so multiple worker threads can share
    one instance.
    """
    # Path to the jawiki "all titles in namespace 0" dump
    # (one article title per line).
    PATH = './data/jawiki-latest-all-titles-in-ns0'

    def __init__(self):
        self.lock = threading.Lock()

    def get_url(self):
        """
        Generator yielding one article URL per title line in PATH.

        A thread that cannot acquire the lock within 3 seconds skips
        that title instead of blocking forever.
        """
        # Context manager so the dump file is closed when the generator
        # is exhausted or garbage-collected.
        with open(WikipediaCrawler.PATH, 'r') as f:
            for title in f:
                time.sleep(0.5)
                if not self.lock.acquire(timeout=3):
                    # Could not get the lock.
                    # BUG FIX: the original printed `t.name` where `t` is
                    # undefined in this scope; use the current thread.
                    print('%s: Cannot acquire lock (timed out)'
                          % threading.current_thread().name)
                    continue
                # Lock acquired.
                try:
                    time.sleep(0.5)
                    # BUG FIX: the original called
                    # WikipediaCrawler.get_url(title) - the wrong method.
                    url = self.get_wikipedia_url(title)
                    print(url)
                    yield url
                finally:
                    # Unlock.
                    self.lock.release()

    @classmethod
    def get_wikipedia_url(cls, title):
        """
        Generate the Japanese-Wikipedia URL for *title*.

        :param title: str - article title, possibly newline-terminated
                      (as read straight from the dump file)
        :return: str
        """
        _base_url = "https://ja.wikipedia.org/wiki/{}"
        # BUG FIX: `urllib.quote_plus` does not exist in Python 3; use
        # urllib.parse.quote_plus.  Strip the trailing newline before
        # quoting instead of the original url[:-3] hack, which chopped
        # the quoted newline ('%0A') off the end of the URL.
        url = _base_url.format(urllib.parse.quote_plus(title.rstrip('\n')))
        return url
def worker(crawler):
    """
    Thread body: drain the crawler's URL generator, printing each URL.

    (The original bound `t = threading.current_thread()` here but never
    used it; the unused local is removed.)

    :param crawler: WikipediaCrawler - any object exposing a get_url()
                    generator works
    """
    for url in crawler.get_url():
        print(url)
# Launch three worker threads that share a single crawler instance and
# wait for them all to finish.
threads_count = 3
crawler = WikipediaCrawler()
threads = [
    threading.Thread(target=worker, args=(crawler,))
    for _ in range(threads_count)
]
for t in threads:
    t.start()
for t in threads:
    t.join()
What’s New in PyCharm 5 [Exploring the Python standard library (18) ~ threading.Lock ~](http://mocobeta-backup.tumblr.com/post/86764185357/python-%E6%A8%99%E6%BA%96%E3%83%A9%E3%82%A4%E3%83%96%E3%83%A9%E3%83%AA%E6%8E%A2%E8%A8%AA-18-threadinglock%E7%B7%A8)
Recommended Posts