Parallel download in Python

Parallel download in Python

In the article linked below, I wrote code that downloads files from a URL list one at a time. Here I have modified it to download multiple images at the same time.

Download images from URL list in Python

Key points

- Randomly select a file from the list.
- Download multiple files at the same time.
- Restrict simultaneous downloads from the same domain.
- Use a lock so that file writes do not occur at the same time.

code

async_downloader.py


# -*- coding: utf-8 -*-

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import time, os, glob, random, requests, threading

# Request headers sent with every download.
headers = {'User-Agent': 'Mozilla/5.0'}

# All input/output directories are derived from the directory the script
# was started in.
cwd = os.getcwd()
result_dir = '{0}/download/'.format(cwd)  # downloaded files are written below here
list_dir = '{0}/list/'.format(cwd)        # every file in here is read as a URL list

# Log files (relative paths) that collect finished and failed URLs.
done_file, fail_file = 'done.txt', 'fail.txt'

# wait_sec: seconds the main loop sleeps while too many threads are active.
# max_download: thread-count threshold checked against threading.active_count().
wait_sec, max_download = 1, 5

class FileHandler(object):
    """Serialized access to the URL list files and the done/fail logs.

    Every method that reads or writes a file holds ``self.lock`` while doing
    so, so that the download threads and the main loop never touch the same
    file at the same time.

    Attributes:
        lock: ``threading.Lock`` guarding all file access of this object.
        file_list: paths of the URL list files that still contain URLs.
        done_file: path of the log that receives successfully handled URLs.
        fail_file: path of the log that receives failed URLs.
    """

    # Seconds to pause before picking again when the chosen URL's domain is
    # already being downloaded by a running thread.
    retry_interval = 0.1

    def __init__(self):
        self.lock = threading.Lock()
        self.file_list = glob.glob(list_dir + '*')
        self.done_file = done_file
        self.fail_file = fail_file

    def clearEmptyRows(self):
        """Strip empty lines from every list file.

        A list file that contains no URL at all is deleted and dropped from
        ``file_list``; otherwise ``getUrl`` would later hand out an empty
        string as a URL before deleting the file itself.
        """
        with self.lock:
            # Iterate over a copy because exhausted files are removed from
            # self.file_list inside the loop.
            for url_list in list(self.file_list):
                with open(url_list, 'r') as f:
                    urls = [url for url in f.read().split('\n') if url]
                if urls:
                    with open(url_list, 'w') as f:
                        f.write('\n'.join(urls))
                else:
                    os.remove(url_list)
                    self.file_list.remove(url_list)

    def hasFile(self):
        """Return True while at least one list file is left."""
        return bool(self.file_list)

    def _appendLine(self, path, line):
        """Append ``line`` plus a newline to ``path`` while holding the lock."""
        with self.lock:
            with open(path, 'a') as f:
                f.write(line + '\n')

    def saveDone(self, url):
        """Record ``url`` in the done log."""
        self._appendLine(self.done_file, url)

    def saveFail(self, url):
        """Record ``url`` in the fail log."""
        self._appendLine(self.fail_file, url)

    def extractDomain(self, url):
        """Return the part of ``url`` before the first '/' after the scheme."""
        return url.replace('http://', '').replace('https://', '').split('/')[0]

    def getUrl(self):
        """Pick a random URL, remove it from its list file and return it.

        A URL is only handed out if no live thread is named after a URL of
        the same domain (download threads are expected to be named after the
        URL they fetch). While the picked domain is busy, the lock is
        released, the method waits ``retry_interval`` seconds and picks
        again. This is a loop rather than recursion, so a long-running
        download can no longer exhaust the recursion limit.

        A list file whose last URL was taken is deleted and removed from
        ``file_list``.

        Returns:
            The selected URL string.

        Raises:
            IndexError: if ``file_list`` is empty.
        """
        while True:
            with self.lock:
                url_file = random.choice(self.file_list)
                with open(url_file, 'r') as f:
                    urls = f.read().split('\n')
                i = random.randrange(len(urls))
                domain_new = self.extractDomain(urls[i])
                domain_busy = any(
                    self.extractDomain(thread.name) == domain_new
                    for thread in threading.enumerate()
                )
                if not domain_busy:
                    result = urls.pop(i)
                    if urls:
                        with open(url_file, 'w') as f:
                            f.write('\n'.join(urls))
                    else:
                        os.remove(url_file)
                        self.file_list.remove(url_file)
                    return result
            # Sleep outside the lock so download threads can still write
            # their done/fail entries while we wait for the domain to free up.
            time.sleep(self.retry_interval)

def asyncDownload(timeout=30):
    """Download every URL from the list files, several at a time.

    URLs are taken from a ``FileHandler`` and each one is fetched in its own
    thread. After starting a thread the main loop sleeps ``wait_sec`` seconds
    at a time while ``threading.active_count()`` exceeds ``max_download``.

    Args:
        timeout: seconds passed to ``requests.get`` as its ``timeout``
            argument, so that a stalled server cannot block a download
            thread forever.
    """
    def saveImage(response):
        """Write the response body below ``result_dir``, mirroring the URL path."""
        path_relative = response.url.replace('http://', '').replace('https://', '')
        file_path = result_dir + path_relative
        dir_path = os.path.dirname(file_path)
        try:
            os.makedirs(dir_path)
        except OSError:
            # Usually the directory already exists, possibly created by
            # another download thread a moment ago. Anything else is a real
            # error and must propagate.
            if not os.path.isdir(dir_path):
                raise
        with open(file_path, 'wb') as f:
            f.write(response.content)

    def downloadImage(file_handler, url):
        """Fetch one URL and record it in the done or fail log."""
        print('download ' + url)
        try:
            res = requests.get(url, headers=headers, timeout=timeout)
            # Treat HTTP error responses as failures instead of saving the
            # error page as if it were the requested file.
            res.raise_for_status()
            saveImage(res)
        except (requests.exceptions.RequestException, IOError, OSError) as e:
            # File-system errors (e.g. a URL ending in '/') are recorded as
            # failures too instead of killing the thread silently.
            file_handler.saveFail(url)
            print('fail ' + url + ': ' + str(e))
        else:
            file_handler.saveDone(url)
            print('done ' + url)

    file_handler = FileHandler()
    file_handler.clearEmptyRows()
    while file_handler.hasFile():
        url = file_handler.getUrl()
        if not url:
            # A blank entry is not a URL; nothing to download.
            continue
        # The thread is named after its URL because FileHandler.getUrl
        # inspects thread names to avoid hitting one domain in parallel.
        threading.Thread(name=url, target=downloadImage,
                         args=(file_handler, url)).start()
        while threading.active_count() > max_download:
            time.sleep(wait_sec)

# Start downloading as soon as this file is executed; this is a top-level
# call, so it also runs if the file is imported as a module.
asyncDownload()

Referenced site

- threading – manage parallel processing by threads
- How to download breast images-2012 Python version

Recommended Posts

Parallel download in Python
Download the file in Python
Run Python unittests in parallel
Download python
Download Google Drive files in Python
Read files in parallel with Python
Quadtree in Python --2
Python in optimization
CURL in python
Metaprogramming in Python
Python 3.3 in Anaconda
Geocoding in python
SendKeys in Python
Meta-analysis in Python
Unittest in python
Epoch in Python
Discord in Python
Sudoku in Python
DCI in Python
quicksort in python
nCr in python
N-Gram in Python
Programming in python
Plink in Python
Constant in python
Lifegame in Python.
FizzBuzz in Python
Sqlite in python
StepAIC in Python
N-gram in python
LINE-Bot [0] in Python
Csv in python
Disassemble in Python
Reflection in Python
Constant in python
nCr in Python.
format in python
Scons in Python3
Puyo Puyo in python
python in virtualenv
PPAP in Python
Quad-tree in Python
Reflection in Python
Chemistry in Python
Hashable in python
DirectLiNGAM in Python
LiNGAM in Python
Flatten in python
flatten in python
Download images from URL list in Python
Download files in any format using Python
Parallel task execution using concurrent.futures in Python
Sorted list in Python
Daily AtCoder # 36 in Python
Clustering text in Python
Daily AtCoder # 2 in Python
Implement Enigma in python
Daily AtCoder # 32 in Python
Daily AtCoder # 6 in Python
Daily AtCoder # 18 in Python
Edit fonts in Python