Dans l'article suivant, j'avais écrit du code qui télécharge les images d'une liste d'URL dans l'ordre ; je l'ai donc modifié pour télécharger plusieurs images en même temps.
Télécharger des images à partir de la liste d'URL en Python
Fonctionnalités : sélectionner au hasard un fichier dans la liste ; télécharger plusieurs fichiers en même temps ; restreindre les téléchargements simultanés depuis le même domaine ; verrouiller pour que deux écritures de fichier ne se produisent pas en même temps.
async_downloader.py
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import time, os, glob, random, requests, threading
# HTTP headers sent with every request; a browser-like User-Agent avoids basic bot blocks.
headers = { 'User-Agent' : 'Mozilla/5.0' }
cwd = os.getcwd()
# Downloaded images are mirrored under ./download/; URL list files are read from ./list/.
result_dir = cwd + '/download/'
list_dir = cwd + '/list/'
# Append-only logs of finished and failed downloads (one URL per line).
done_file = 'done.txt'
fail_file = 'fail.txt'
# Polling interval in seconds, and the cap on concurrent download threads.
wait_sec = 1
max_download = 5
class FileHandler(object):
    """Thread-safe manager for the URL list files and the done/fail logs.

    All file access is serialized through a single lock so that concurrent
    download threads do not corrupt the lists or interleave log writes.
    """

    def __init__(self):
        self.lock = threading.Lock()
        # One file per URL list; a file is deleted once its URLs are exhausted.
        self.file_list = glob.glob(list_dir + '*')
        self.done_file = done_file
        self.fail_file = fail_file

    def clearEmptyRows(self):
        """Strip blank lines from every list file so random picks are always valid URLs."""
        # `with self.lock:` guarantees release even if file I/O raises; the
        # original acquire()/release() pair would deadlock all workers on error.
        with self.lock:
            for url_list in self.file_list:
                with open(url_list, 'r') as f:
                    urls = [line for line in f.read().split('\n') if line]
                with open(url_list, 'w') as f:
                    f.write('\n'.join(urls))

    def hasFile(self):
        """Return True while at least one URL list file remains."""
        return bool(self.file_list)

    def saveDone(self, url):
        """Append *url* to the success log (thread-safe)."""
        with self.lock:
            with open(self.done_file, 'a') as f:
                f.write(url + '\n')

    def saveFail(self, url):
        """Append *url* to the failure log (thread-safe)."""
        with self.lock:
            with open(self.fail_file, 'a') as f:
                f.write(url + '\n')

    def extractDomain(self, url):
        """Return the host part of *url* (scheme stripped, path discarded)."""
        return url.replace('http://', '').replace('https://', '').split('/')[0]

    def getUrl(self):
        """Pop and return a random URL whose domain is not currently being downloaded.

        Uses an iterative retry loop instead of the original recursion, which
        could raise RecursionError when every candidate domain was busy.
        """
        while True:
            with self.lock:
                url_file = random.choice(self.file_list)
                with open(url_file, 'r') as f:
                    urls = f.read().split('\n')
                i = random.randrange(len(urls))
                domain_new = self.extractDomain(urls[i])
                # Worker threads are named after the URL they download, so
                # comparing thread names enforces one download per domain.
                busy = any(domain_new == self.extractDomain(thread.name)
                           for thread in threading.enumerate())
                if busy:
                    continue  # lock released here; pick again
                result = urls.pop(i)
                if urls:
                    with open(url_file, 'w') as f:
                        f.write('\n'.join(urls))
                else:
                    # List exhausted: delete the file and forget about it.
                    os.remove(url_file)
                    self.file_list.remove(url_file)
                return result
def asyncDownload():
    """Download every listed URL using up to *max_download* worker threads.

    Each worker thread is named after the URL it handles so that
    FileHandler.getUrl can avoid hitting the same domain from two threads.
    """
    def saveImage(response):
        # Mirror the URL's path (posix-style '/') under result_dir.
        url = response.url
        path_relative = url.replace('http://', '').replace('https://', '')
        target_dir = result_dir + os.path.split(path_relative)[0]
        if not os.path.isdir(target_dir):
            # makedirs replaces the original one-level-at-a-time mkdir loop
            # and also creates result_dir itself if it is missing.
            os.makedirs(target_dir)
        with open(result_dir + path_relative, 'wb') as f:
            f.write(response.content)

    def downloadImage(file_handler, url):
        print('download ' + url)
        try:
            # timeout keeps a stalled server from blocking a worker forever;
            # requests.Timeout is a RequestException, so it is caught below.
            res = requests.get(url, headers=headers, timeout=30)
            saveImage(res)
            file_handler.saveDone(url)
            print('done ' + url)
        except requests.exceptions.RequestException as e:
            file_handler.saveFail(url)
            # BUG FIX: 'fail ' + e raised TypeError (str + exception object),
            # crashing the worker exactly when an error needed reporting.
            print('fail ' + str(e))

    file_handler = FileHandler()
    file_handler.clearEmptyRows()
    while file_handler.hasFile():
        url = file_handler.getUrl()
        threading.Thread(name=url, target=downloadImage,
                         args=(file_handler, url)).start()
        # Throttle: block until a worker slot frees up.
        while threading.active_count() > max_download:
            time.sleep(wait_sec)

asyncDownload()
Références : threading — gérer le traitement parallèle par threads ; Comment télécharger des images de seins — Version Python 2012.
Recommended Posts