In the article below, I wrote code that downloads files from a URL list one at a time; here I have modified it to download multiple images simultaneously.
Download images from URL list in Python
Features: randomly select a file from the list; download multiple files at the same time; restrict simultaneous downloads to one per domain; use a lock so that file writes never happen concurrently.
async_downloader.py
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import time, os, glob, random, requests, threading
# Browser-like User-Agent; some hosts reject requests with no/default UA.
headers = { 'User-Agent' : 'Mozilla/5.0' }
cwd = os.getcwd()
# Downloaded images are written under ./download/, mirroring each URL's path.
result_dir = cwd + '/download/'
# Input URL list files live in ./list/ (one URL per line per file).
list_dir = cwd + '/list/'
# Logs (relative to the working directory): completed and failed URLs.
done_file = 'done.txt'
fail_file = 'fail.txt'
# Seconds to sleep while the thread count is at the limit.
wait_sec = 1
# Threshold on threading.active_count(), which includes the main thread.
max_download = 5
class FileHandler(object):
    """Thread-safe access to the URL list files and the done/fail logs.

    A single lock serializes every file read/write so concurrent download
    threads never touch the same file at the same time.
    """

    def __init__(self):
        self.lock = threading.Lock()
        # All list files that still hold work; entries are removed as
        # their files are exhausted.
        self.file_list = glob.glob(list_dir + '*')
        self.done_file = done_file
        self.fail_file = fail_file

    def clearEmptyRows(self):
        """Strip blank lines from every list file so getUrl never picks
        an empty URL."""
        with self.lock:
            for url_list in self.file_list:
                with open(url_list, 'r') as f:
                    # Single pass instead of the O(n^2) repeated remove('').
                    urls = [line for line in f.read().split('\n') if line]
                with open(url_list, 'w') as f:
                    f.write('\n'.join(urls))

    def hasFile(self):
        """Return True while at least one list file remains."""
        return bool(self.file_list)

    def saveDone(self, url):
        """Append a successfully downloaded URL to the done log."""
        with self.lock:
            with open(self.done_file, 'a') as f:
                f.write(url + '\n')

    def saveFail(self, url):
        """Append a failed URL to the fail log."""
        with self.lock:
            with open(self.fail_file, 'a') as f:
                f.write(url + '\n')

    def extractDomain(self, url):
        """Return the host part of *url* (scheme stripped, path dropped)."""
        return url.replace('http://', '').replace('https://', '').split('/')[0]

    def getUrl(self):
        """Pop and return a random URL whose domain no live thread is
        currently downloading (worker threads are named after their URL).

        Rewritten iteratively: the original released the lock and called
        itself recursively, which could busy-spin and eventually raise
        RecursionError when every remaining URL shares a domain with an
        active download.
        """
        while True:
            with self.lock:
                url_file = random.choice(self.file_list)
                with open(url_file, 'r') as f:
                    urls = f.read().split('\n')
                i = random.randrange(len(urls))
                domain_new = self.extractDomain(urls[i])
                # Reject this candidate if any live thread already works
                # on the same domain; the main thread's name never matches
                # a real domain.
                busy = any(domain_new == self.extractDomain(t.name)
                           for t in threading.enumerate())
                if not busy:
                    result = urls.pop(i)
                    if urls:
                        with open(url_file, 'w') as f:
                            f.write('\n'.join(urls))
                    else:
                        # List file exhausted: delete it and forget it.
                        os.remove(url_file)
                        self.file_list.remove(url_file)
                    return result
            # Give the conflicting download a chance to finish, then retry.
            time.sleep(wait_sec)
def asyncDownload():
    """Drain every URL list file, downloading images concurrently.

    Spawns worker threads until ``threading.active_count()`` (which
    includes the main thread) exceeds ``max_download``; each worker is
    named after its URL so FileHandler.getUrl can avoid hitting the same
    domain from two threads at once.
    """

    def saveImage(file_handler, response):
        """Write the response body under result_dir, mirroring the URL path."""
        url = response.url
        image = response.content
        path_relative = url.replace('http://', '').replace('https://', '')
        dir_path = result_dir + os.path.split(path_relative)[0]
        # makedirs in one shot; tolerate the directory already existing
        # (another thread may create it between check and call).
        try:
            os.makedirs(dir_path)
        except OSError:
            if not os.path.isdir(dir_path):
                raise
        with open(result_dir + path_relative, 'wb') as f:
            f.write(image)

    def downloadImage(file_handler, url):
        """Fetch one URL, save it, and log the outcome."""
        print('download ' + url)
        try:
            res = requests.get(url, headers = headers)
            saveImage(file_handler, res)
            file_handler.saveDone(url)
            print('done ' + url)
        except requests.exceptions.RequestException as e:
            file_handler.saveFail(url)
            # BUG FIX: 'fail ' + e raised TypeError (str + exception).
            print('fail ' + str(e))

    file_handler = FileHandler()
    file_handler.clearEmptyRows()
    while file_handler.hasFile():
        url = file_handler.getUrl()
        threading.Thread(name = url, target = downloadImage,
                         args = (file_handler, url)).start()
        # Throttle: wait while the thread count is over the limit.
        while threading.active_count() > max_download:
            time.sleep(wait_sec)


# Guard the entry point so importing this module does not start downloads.
if __name__ == '__main__':
    asyncDownload()
References: threading — managing parallel processing with threads; "How to download images" (2012, Python version).
Recommended Posts