This time I would like to write and explain code that collects images from a specified URL using web scraping.
import requests
from requests.compat import urljoin
from bs4 import BeautifulSoup
import time
from PIL import Image
import urllib.request
import sys
import os


class web_scryping:
    def __init__(self, url):
        self.url = url
        self.soup = BeautifulSoup(requests.get(self.url).content, 'lxml')


class download_images(web_scryping):
    def download(self, max_down_num):
        self.down_num = 0
        self.max_down_num = max_down_num
        # Images are saved into ./img/ with sequential numbers as file names.
        self.save_path = './img/' + str(self.down_num + 1) + '.jpg'
        now_num = 0
        for link in self.soup.find_all("img"):
            src_attr = link.get("src")
            target = urljoin(self.url, src_attr)
            resp = requests.get(target)
            image = resp.content
            #breakpoint()
            print(str(resp) + ' ' + str(now_num))
            now_num = now_num + 1
            if str(resp) != '<Response [404]>':
                with open(self.save_path, 'wb') as f:
                    f.write(image)
                self.down_num = self.down_num + 1
                time.sleep(1)
                self.save_path = './img/' + str(self.down_num + 1) + '.jpg'
            if self.down_num == self.max_down_num:
                break

    def img_resize(self, img_path):
        try:
            im = Image.open(img_path)
            print("Original image size width: {}, height: {}".format(im.size[0], im.size[1]))
            im_resize = im.resize(size=(800, 1200))
            im_resize.save(img_path)
            print('image resize success')
        except Exception:
            print('image resize failed')


def main():
    url = sys.argv[1]
    di = download_images(url)
    di.download(50)


if __name__ == '__main__':
    main()
def main():
    url = sys.argv[1]
    di = download_images(url)
    di.download(50)
Specify the URL as the first command-line argument; main reads it from sys.argv[1] and passes it to the download_images class, which inherits from the web_scryping class.
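One thing to note: the download method saves files into an ./img directory but does not create it, so it has to exist beforehand. Here is a minimal sketch of preparing the directory and kicking off a download directly (the example URL is made up, and the classes above are assumed to be defined in the same file):

import os

# The download method writes to ./img/<number>.jpg, so create the folder first.
os.makedirs('img', exist_ok=True)

# Hypothetical example URL - use whatever page you want to scrape.
di = download_images('https://example.com')
di.download(50)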
class web_scryping:
    def __init__(self, url):
        self.url = url
        self.soup = BeautifulSoup(requests.get(self.url).content, 'lxml')
The download_images class inherits from the web_scryping class, and since download_images does not define its own __init__ method, the __init__ method of web_scryping is called. Here, the contents of the URL are fetched with requests.get and the HTML is parsed with BeautifulSoup. The parsed result is stored in an instance variable called self.soup.
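To illustrate what self.soup ends up holding, here is a small self-contained sketch (the HTML fragment is made up for illustration) of parsing markup with BeautifulSoup and listing the img tags, which is exactly what the download method later iterates over:

from bs4 import BeautifulSoup

# Made-up HTML fragment, just to show what the parser returns.
html = '<html><body><img src="/static/a.jpg"><img src="b.png"></body></html>'
soup = BeautifulSoup(html, 'lxml')

for tag in soup.find_all('img'):
    print(tag.get('src'))  # prints /static/a.jpg, then b.png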
class download_images(web_scryping):
    def download(self, max_down_num):
        self.down_num = 0
        self.max_down_num = max_down_num
        self.save_path = './img/' + str(self.down_num + 1) + '.jpg'
        now_num = 0
        for link in self.soup.find_all("img"):
            src_attr = link.get("src")
            target = urljoin(self.url, src_attr)
            resp = requests.get(target)
            image = resp.content
            #breakpoint()
            print(str(resp) + ' ' + str(now_num))
            now_num = now_num + 1
            if str(resp) != '<Response [404]>':
                with open(self.save_path, 'wb') as f:
                    f.write(image)
                self.down_num = self.down_num + 1
                time.sleep(1)
                self.save_path = './img/' + str(self.down_num + 1) + '.jpg'
            if self.down_num == self.max_down_num:
                break
Use the download method of the download_images class (from step 1) to start the download:

- self.save_path: the image files are saved in the img directory with sequential numbers as names, i.e. 1.jpg, 2.jpg, and so on.
- self.soup.find_all("img"): finds all the img tags in the HTML.
- src_attr = link.get("src"): gets the src attribute from the img tag.
- target = urljoin(self.url, src_attr): joins the page URL and the src value into an absolute URL (see the sketch below).
- image = resp.content: the image variable holds the raw bytes of the downloaded image.
- if str(resp) != '<Response [404]>': the resp variable holds the response, so the image is saved only if the result is not 404.
- time.sleep(1): when scraping, it is not desirable to put a burden on the website, so sleep for a second between downloads.
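The urljoin call is what turns a relative src value into an absolute URL that requests.get can fetch. A quick illustration with made-up paths:

from requests.compat import urljoin

base = 'https://example.com/gallery/index.html'        # hypothetical page URL
print(urljoin(base, '/static/a.jpg'))                   # https://example.com/static/a.jpg
print(urljoin(base, 'b.png'))                           # https://example.com/gallery/b.png
print(urljoin(base, 'https://cdn.example.com/c.jpg'))   # absolute URLs pass through unchanged

As a side note, comparing resp.status_code against 404 (or checking resp.ok) would express the same check more directly than comparing the string form of the response.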
def img_resize(self, img_path):
    try:
        im = Image.open(img_path)
        print("Original image size width: {}, height: {}".format(im.size[0], im.size[1]))
        im_resize = im.resize(size=(800, 1200))
        im_resize.save(img_path)
        print('image resize success')
    except Exception:
        print('image resize failed')
This method adjusts the size of an image, but I ended up not using it because the quality was not very good when I enlarged the images.
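If resizing is still needed, one possible variation (my own sketch, not part of the original script) is to shrink with Image.thumbnail, which preserves the aspect ratio and never enlarges, so it avoids the quality loss mentioned above:

from PIL import Image

def shrink_image(img_path, max_size=(800, 1200)):
    # thumbnail keeps the aspect ratio and only ever shrinks the image, in place.
    im = Image.open(img_path)
    im.thumbnail(max_size)
    im.save(img_path)
    print("Resized to width: {}, height: {}".format(im.size[0], im.size[1]))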