Parallel download in Python

In the article linked below, I wrote code that downloads the images in a URL list one at a time. Here I have modified that code so that it downloads multiple images at the same time.

Download images from URL list in Python

Key points

- A URL list file is selected at random.
- Multiple files are downloaded at the same time.
- Simultaneous downloads from the same domain are restricted.
- A lock ensures that file writes do not occur at the same time.

code

`async_downloader.py`


# -*- coding: utf-8 -*-

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import time, os, glob, random, requests, threading

# HTTP headers sent with every download request.
headers = { 'User-Agent' : 'Mozilla/5.0' }

# All paths are built from the directory the script is started from.
cwd = os.getcwd()
result_dir = cwd + '/download/'  # root directory under which downloaded files are saved
list_dir = cwd + '/list/'  # every file matching this directory + '*' is read as a URL list
done_file = 'done.txt'  # URLs that were downloaded successfully are appended here
fail_file = 'fail.txt'  # URLs that could not be downloaded are appended here
wait_sec = 1  # seconds to sleep between checks of the active thread count
max_download = 5  # new downloads wait while threading.active_count() exceeds this

class FileHandler(object):
    """Serialized access to the URL list files and the done/fail logs.

    Every method that touches a file does so while holding ``self.lock``,
    so the launcher thread and the download threads never read or write
    the same file at the same time. The lock is always taken through a
    ``with`` block so that an I/O error cannot leave it held forever.

    Attributes:
        lock: ``threading.Lock`` guarding all file access of this object.
        file_list: paths of the URL list files that still contain URLs.
        done_file: path of the log that receives downloaded URLs.
        fail_file: path of the log that receives failed URLs.
    """

    # Seconds to pause before scanning the lists again when every remaining
    # URL belongs to a domain that is currently being downloaded.
    retry_sec = 0.1

    def __init__(self):
        self.lock = threading.Lock()
        self.file_list = glob.glob(list_dir + '*')
        self.done_file = done_file
        self.fail_file = fail_file

    def _readUrls(self, url_list):
        """Return the non-blank lines of ``url_list``, whitespace stripped."""
        with open(url_list, 'r') as f:
            lines = f.read().splitlines()
        # Stripping also drops the '\r' left behind by CRLF line endings.
        return [line.strip() for line in lines if line.strip()]

    def _writeUrls(self, url_list, urls):
        """Store ``urls`` in ``url_list``; delete and forget the file if empty."""
        if urls:
            with open(url_list, 'w') as f:
                f.write('\n'.join(urls))
        else:
            os.remove(url_list)
            self.file_list.remove(url_list)

    def _appendLine(self, path, url):
        """Append ``url`` plus a newline to ``path`` while holding the lock."""
        with self.lock:
            with open(path, 'a') as f:
                f.write(url + '\n')

    def _takeFreeUrl(self):
        """Remove and return one URL whose domain is idle (lock must be held).

        Returns None when URLs remain but all of them belong to busy domains.
        Raises IndexError when no list file with URLs is left.
        """
        # Download threads are named after the URL they fetch, so the thread
        # names tell which domains are being downloaded right now.
        busy = {self.extractDomain(t.name) for t in threading.enumerate()}
        candidates = list(self.file_list)
        random.shuffle(candidates)
        for url_file in candidates:
            urls = self._readUrls(url_file)
            if not urls:
                # An exhausted list must not be handed out as the URL ''.
                self._writeUrls(url_file, urls)
                continue
            free = [i for i, url in enumerate(urls)
                    if self.extractDomain(url) not in busy]
            if free:
                result = urls.pop(random.choice(free))
                self._writeUrls(url_file, urls)
                return result
        if not self.file_list:
            raise IndexError('no URL left in any list file')
        return None

    def clearEmptyRows(self):
        """Drop blank lines from every list file; delete lists left empty."""
        with self.lock:
            # Iterate over a copy: _writeUrls removes exhausted files.
            for url_list in list(self.file_list):
                self._writeUrls(url_list, self._readUrls(url_list))

    def hasFile(self):
        """Return True while at least one URL list file remains."""
        return bool(self.file_list)

    def saveDone(self, url):
        """Record ``url`` in the done log."""
        self._appendLine(self.done_file, url)

    def saveFail(self, url):
        """Record ``url`` in the fail log."""
        self._appendLine(self.fail_file, url)

    def extractDomain(self, url):
        """Return the part of ``url`` before the first '/' after the scheme."""
        return url.replace('http://', '').replace('https://', '').split('/')[0]

    def getUrl(self):
        """Remove one random URL from the list files and return it.

        A URL is only handed out when no running thread is named after a URL
        of the same domain. While every remaining URL belongs to a busy
        domain, the call waits (lock released) and scans again instead of
        recursing, so it can neither exhaust the stack nor spin at full speed.

        Raises:
            IndexError: if no list file with URLs is left.
        """
        while True:
            with self.lock:
                result = self._takeFreeUrl()
            if result is not None:
                return result
            time.sleep(self.retry_sec)

def asyncDownload():
    """Download every URL from the list files, several at a time.

    The calling thread keeps asking the FileHandler for the next URL and
    starts one thread per URL. Each thread is named after its URL, which is
    what FileHandler.getUrl inspects to avoid handing out a second URL of a
    domain that is already being downloaded. After each start the caller
    sleeps in steps of ``wait_sec`` while more than ``max_download`` threads
    are alive. Each URL ends up in either the done log or the fail log.
    """
    # Seconds without any data from the server before a request is given up.
    # Without a timeout a stalled server would occupy a download slot forever.
    request_timeout_sec = 30

    def saveImage(response):
        """Write the response body below result_dir, mirroring the URL path."""
        url = response.url
        path_relative = url.replace('http://', '').replace('https://', '')
        target = result_dir + path_relative
        # response.url is the final URL after redirects, i.e. chosen by the
        # remote side; refuse anything that resolves outside result_dir.
        root = os.path.abspath(result_dir)
        if not os.path.abspath(target).startswith(root + os.sep):
            raise IOError('refusing to write outside ' + root + ': ' + url)
        directory = os.path.dirname(target)
        try:
            # Creates missing parents too, including result_dir itself.
            os.makedirs(directory)
        except OSError:
            # Another download thread may have created the same directory
            # at the same moment; only a genuinely missing directory is fatal.
            if not os.path.isdir(directory):
                raise
        with open(target, 'wb') as f:
            f.write(response.content)

    def downloadImage(file_handler, url):
        """Fetch one URL, store it, and log the outcome as done or fail."""
        print('download ' + url)
        try:
            res = requests.get(url, headers=headers,
                               timeout=request_timeout_sec)
            # Treat 4xx/5xx answers as failures instead of saving the error
            # page as if it were the image.
            res.raise_for_status()
            saveImage(res)
        except (requests.exceptions.RequestException, IOError, OSError) as e:
            file_handler.saveFail(url)
            # str() is required: concatenating the exception object itself
            # raises TypeError.
            print('fail ' + url + ' (' + str(e) + ')')
        else:
            file_handler.saveDone(url)
            print('done ' + url)

    file_handler = FileHandler()
    file_handler.clearEmptyRows()
    while file_handler.hasFile():
        url = file_handler.getUrl()
        threading.Thread(name=url, target=downloadImage,
                         args=(file_handler, url)).start()
        # Throttle: do not start another download while too many are running.
        while threading.active_count() > max_download:
            time.sleep(wait_sec)

# Script entry point: the download run starts as soon as this module is executed.
asyncDownload()

Referenced site

- threading – manage parallel processing by threads
- How to download breast images-2012 Python version