There are already several articles on how to scrape images from image search, but few of them could be used as-is, probably because small specification changes are frequent. So I wrote this article, pointing out the places that are likely to change. When the code stops working, all you probably need to do is use your browser's developer tools to find out which class and id names you need. That is obvious to those who know, but if you don't know what's wrong, it takes a long time.
macOS Mojave 10.14.5 Python 3.7.2 Chrome 80.0 date 2020-04-01
pip install selenium
As a first test: launch the browser → search on Google → quit.
import time

from selenium import webdriver

# Minimal smoke test: launch Chrome, search Google for "Qiita", then quit.
# NOTE: this uses the Selenium 3 style API (executable_path=...,
# find_element_by_*), matching the environment the article was written for.

# Path of the downloaded ChromeDriver binary (adjust to your environment).
DRIVER_PATH = '/User/hoge/1bin/chromedriver'

# Launch the web browser.
wd = webdriver.Chrome(executable_path=DRIVER_PATH)
try:
    # Open Google.
    wd.get('https://google.com')
    # Select the search box. There are several equivalent ways to locate it:
    search_box = wd.find_element_by_name('q')
    # search_box = wd.find_element_by_css_selector('input.gLFyf')
    # search_box = wd.find_element_by_class_name('gLFyf')
    # Search for "Qiita".
    search_box.send_keys('Qiita')
    search_box.submit()
    # Pause so the result page stays visible before the browser closes.
    time.sleep(5)
finally:
    # Quit the browser even if a step above raised, so no Chrome/chromedriver
    # process is left behind.
    wd.quit()
First, get the URL of the thumbnail of the search result.
from urllib.parse import quote_plus

# Time to wait after an interaction such as a click (seconds).
sleep_between_interactions = 2
# Number of images to download.
download_num = 3
# Search word.
query = "cat"
# URL template for Google image search; {q} is replaced by the search word.
search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

# Percent-encode the search word so spaces, '&', '#', or non-ASCII characters
# cannot break the query string.
encoded_query = quote_plus(query)

wd = webdriver.Chrome(executable_path=DRIVER_PATH)
wd.get(search_url.format(q=encoded_query))
# Collect the thumbnail <img> elements. If nothing is found here, inspect the
# page with the developer tools and update the selector.
thumbnail_results = wd.find_elements_by_css_selector("img.rg_i")
Click each obtained thumbnail with the webdriver to display the image, then get the URL of the original image. Note that the class name used to locate the original image is subject to change, so check it with the developer tools and update it as needed.
import time

# Click each thumbnail and collect the URL(s) of the full-size image.
image_urls = set()
try:
    for img in thumbnail_results[:download_num]:
        try:
            img.click()
            time.sleep(sleep_between_interactions)
        except Exception:
            # Best effort: skip thumbnails that cannot be clicked.
            continue
        # The URL cannot be obtained in one shot, so gather candidates first
        # and narrow them down afterwards.
        # 'n3VNCb' is subject to change; inspect the element of the clicked
        # image and update the class name accordingly.
        url_candidates = wd.find_elements_by_class_name('n3VNCb')
        for candidate in url_candidates:
            url = candidate.get_attribute('src')
            # Keep only candidates whose src contains 'https'.
            if url and 'https' in url:
                image_urls.add(url)
    # Without a short extra wait the browser did not shut down cleanly,
    # so wait 3 more seconds.
    time.sleep(sleep_between_interactions + 3)
finally:
    # Always close the browser, even when an unexpected error occurred above.
    wd.quit()
All that remains is to download the images from the collected URLs; the web driver is no longer needed for this step.
import hashlib
import io
import os

import requests
from PIL import Image

# Folder the downloaded images are written to.
image_save_folder_path = 'data'
# Create the folder up front; otherwise every open() below fails.
os.makedirs(image_save_folder_path, exist_ok=True)

for url in image_urls:
    try:
        # A timeout prevents a stalled server from hanging the script forever.
        image_content = requests.get(url, timeout=10).content
    except Exception as e:
        print(f"ERROR - Could not download {url} - {e}")
        # Skip this URL: image_content is undefined (or still holds the
        # previous image) when the download failed.
        continue

    try:
        image_file = io.BytesIO(image_content)
        # Convert to RGB so the image can be written as JPEG.
        image = Image.open(image_file).convert('RGB')
        # File name: first 10 hex digits of the SHA-1 of the downloaded bytes.
        file_path = os.path.join(image_save_folder_path, hashlib.sha1(image_content).hexdigest()[:10] + '.jpg')
        with open(file_path, 'wb') as f:
            image.save(f, "JPEG", quality=90)
        print(f"SUCCESS - saved {url} - as {file_path}")
    except Exception as e:
        print(f"ERROR - Could not save {url} - {e}")
For the Google search box, use `search_box = wd.find_element_by_name('q')`. For the original image in the image search results, use `wd.find_elements_by_class_name('n3VNCb')`.
These can be confirmed using the developer tools of the browser.
Browser "..."-> Other Tools-> Developer Tools
The shortcuts are `Ctrl + Shift + J` (on Windows) or `Cmd + Option + J` (on Mac).
If there is an image you want to target, right-click it and select "Inspect" to display the element corresponding to that image. In image search, the image element looks like the following. You can copy an element by clicking the '...' that appears when it is selected.
<img alt="Dogs | The Humane Society of the United States" class="n3VNCb" src="https://www.humanesociety.org/sites/default/files/styles/1240x698/public/2019/02/dog-451643.jpg?h=bf654dbc&itok=MQGvBmuo" data-noaft="1" jsname="HiaYvf" jsaction="load:XAeZkd;" style="width: 450px; height: 253.306px; margin: 0px;">
Since it has `class="n3VNCb"`, you can select it with `wd.find_elements_by_class_name('n3VNCb')`.
Full code for copy-and-paste
import os
import time
from selenium import webdriver
from PIL import Image
import io
import requests
import hashlib
from urllib.parse import quote_plus

# Path of the downloaded ChromeDriver binary (adjust to your environment).
# It must be defined here for this copy-paste script to run on its own.
DRIVER_PATH = '/User/hoge/1bin/chromedriver'
# Time to wait after an interaction such as a click (seconds).
sleep_between_interactions = 2
# Number of images to download.
download_num = 3
# Search word.
query = "cat"
# URL template for Google image search; {q} is replaced by the search word.
search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

# Percent-encode the search word so spaces, '&', '#', or non-ASCII characters
# cannot break the query string.
encoded_query = quote_plus(query)

# Open the image search result page.
wd = webdriver.Chrome(executable_path=DRIVER_PATH)
wd.get(search_url.format(q=encoded_query))
# Collect the thumbnail <img> elements. If nothing is found here, inspect the
# page with the developer tools and update the selector.
thumbnail_results = wd.find_elements_by_css_selector("img.rg_i")
# Click each thumbnail and collect the URL(s) of the full-size image.
image_urls = set()
try:
    for img in thumbnail_results[:download_num]:
        try:
            img.click()
            time.sleep(sleep_between_interactions)
        except Exception:
            # Best effort: skip thumbnails that cannot be clicked.
            continue
        # The URL cannot be obtained in one shot, so gather candidates first
        # and narrow them down afterwards.
        # 'n3VNCb' is subject to change; inspect the element of the clicked
        # image and update the class name accordingly.
        url_candidates = wd.find_elements_by_class_name('n3VNCb')
        for candidate in url_candidates:
            url = candidate.get_attribute('src')
            # Keep only candidates whose src contains 'https'.
            if url and 'https' in url:
                image_urls.add(url)
    # Without a short extra wait the browser did not shut down cleanly,
    # so wait 3 more seconds.
    time.sleep(sleep_between_interactions + 3)
finally:
    # Always close the browser, even when an unexpected error occurred above.
    wd.quit()
# Download every collected image and store it as a JPEG file.
image_save_folder_path = 'data'
# Create the folder up front; otherwise every open() below fails.
os.makedirs(image_save_folder_path, exist_ok=True)

for url in image_urls:
    try:
        # A timeout prevents a stalled server from hanging the script forever.
        image_content = requests.get(url, timeout=10).content
    except Exception as e:
        print(f"ERROR - Could not download {url} - {e}")
        # Skip this URL: image_content is undefined (or still holds the
        # previous image) when the download failed.
        continue

    try:
        image_file = io.BytesIO(image_content)
        # Convert to RGB so the image can be written as JPEG.
        image = Image.open(image_file).convert('RGB')
        # File name: first 10 hex digits of the SHA-1 of the downloaded bytes.
        file_path = os.path.join(image_save_folder_path, hashlib.sha1(image_content).hexdigest()[:10] + '.jpg')
        with open(file_path, 'wb') as f:
            image.save(f, "JPEG", quality=90)
        print(f"SUCCESS - saved {url} - as {file_path}")
    except Exception as e:
        print(f"ERROR - Could not save {url} - {e}")
Recommended Posts