Earlier I wrote about how to download thumbnail images from Google Image Search. This time, I found that when you open an image's detail view (by clicking a search result once), the link to the original image appears in the page source. Let's use this behavior to download the original images. (Be careful not to send too many requests; Google is strict about this.)
**Run a Google image search with Selenium ↓ Open the detail view of the first image, then press the right arrow key repeatedly ↓ Get the links to the original images ↓ Download**
If you haven't installed Selenium, requests, and tqdm yet, please install them first.
ChromeDriver is assumed to be reachable from where the script is run, so change `DRIVER_PATH` (just below the import statements) to match your environment.
I haven't refactored the code, so it's messy. Sorry about that.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import time
import requests
import re
import urllib.request
import os
from tqdm import tqdm
# Path to the ChromeDriver executable; adjust it for your environment.
DRIVER_PATH = 'chromedriver.exe'

# Chrome start-up switches, registered in the order listed.
options = Options()
for _flag in (
        '--disable-gpu',
        '--disable-extensions',
        '--proxy-server="direct://"',
        '--proxy-bypass-list=*',
        '--start-maximized',
):
    options.add_argument(_flag)
# The browser window is kept visible on purpose so that you can step in
# manually when scrolling does not work well; uncomment to run headless.
#options.add_argument('--headless')
def search():
    """Open Google Image Search for the global ``query`` and click the first hit.

    Starts a Chrome session and stores it, together with an ``ActionChains``
    bound to it, in the module-level globals ``driver`` and ``actions`` so
    that the later steps can reuse them.  Blocks (polling twice a second)
    until at least one result element is present, then clicks the first one.
    """
    global driver, actions
    # Local import: the file only does ``import urllib.request``.
    from urllib.parse import quote_plus

    # NOTE(review): ``executable_path``/``chrome_options`` and the
    # ``find_element(s)_by_class_name`` helpers are Selenium 3 style APIs;
    # confirm they exist in the installed Selenium version.
    driver = webdriver.Chrome(executable_path=DRIVER_PATH,
                              chrome_options=options)
    actions = ActionChains(driver)

    # Percent-encode the search terms so characters such as '&', '#', '+'
    # or '%' cannot break or truncate the query string.  Whitespace runs
    # are still collapsed and joined with '+', as before.
    terms = quote_plus(' '.join(query.split()))
    url = ("https://www.google.com/search?q=" + terms +
           "&safe=off&hl=ja&source=lnms&tbm=isch&sa=X")
    driver.get(url)

    # Dot-joined class list of the element to click; presumably a result
    # thumbnail -- verify against the current Google markup.
    thumbnail_class = "wXeWr.islib.nfEiy.mM5pbd"
    # Wait without a timeout: the window is visible, so the user can deal
    # with anything that blocks the results from appearing.
    while not driver.find_elements_by_class_name(thumbnail_class):
        time.sleep(.5)
    driver.find_element_by_class_name(thumbnail_class).click()
def _find_image_urls():
    """Return every ``imgurl=...&`` value found in the current page source."""
    # Local import: the file only does ``import urllib.request``, which
    # makes ``urllib.parse`` available merely as a side effect.
    from urllib.parse import unquote

    return re.findall("imgurl=(.+?)&", unquote(driver.page_source))


def _scroll_until(is_finished, cnt):
    """Scroll and replay the queued key presses until ``is_finished()`` holds.

    Each pass scrolls to the bottom of the page, performs the queued
    actions and sleeps for one second.  On every fifth pass the page source
    is scanned and the loop stops early once more than ``max_num + 5``
    links are present.  Returns the updated pass counter so that a second
    call continues the same five-pass rhythm.
    """
    while not is_finished():
        if cnt % 5 == 0 and len(_find_image_urls()) > max_num + 5:
            break
        driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);")
        actions.perform()
        time.sleep(1)
        cnt += 1
    return cnt


def getLinks():
    """Collect original-image links into the global ``srcs`` and end the session.

    Uses the global ``driver`` and ``actions`` created by ``search()`` and
    the global ``max_num``.  The browser is always shut down on exit, even
    when a lookup or scroll step raises.
    """
    global srcs
    try:
        # Presumably the "show more results" button and the end-of-results
        # marker -- verify the class names against the current markup.
        more = driver.find_element_by_class_name("mye4qd")
        end = driver.find_element_by_class_name("OuJzKb.Yu2Dnd")

        # Queue 100 right-arrow key-downs; each actions.perform() below
        # executes this queued chain.
        for _ in range(100):
            actions.key_down(Keys.ARROW_RIGHT)

        cnt = _scroll_until(
            lambda: more.is_displayed() or end.is_displayed(), 1)
        if more.is_displayed():
            more.click()
        _scroll_until(end.is_displayed, cnt)

        # A few extra passes before the final scan of the page source.
        for _ in range(5):
            actions.perform()
            time.sleep(1)
        srcs = _find_image_urls()
    finally:
        # quit() ends the whole WebDriver session (browser and chromedriver
        # process); close() would only close the current window.
        driver.quit()
def _fetch_image(src, timeout=30):
    """Return the bytes served at ``src``, or ``None`` if both attempts fail.

    Tries ``requests`` first and falls back to ``urllib.request``.  An HTTP
    error status counts as a failure, so an error page is never returned as
    image data.  ``timeout`` (seconds) keeps a stalled server from hanging
    the whole run.
    """
    try:
        response = requests.get(src, timeout=timeout)
        response.raise_for_status()
        return response.content
    except Exception:
        # Best effort: any failure here just triggers the urllib fallback.
        pass
    try:
        with urllib.request.urlopen(src, timeout=timeout) as u:
            return u.read()
    except Exception:
        return None


def download():
    """Save up to ``max_num`` of the links in ``srcs`` into a new directory.

    The directory is named after the global ``query`` (words joined with
    ``_``); underscores are appended until the name is unused, so an
    existing directory is never written into.  Files are named
    ``<directory><index><ext>``, where the extension is the link's last
    four characters if they are ``.jpg``, ``.png`` or ``.gif`` and ``.png``
    otherwise.  Links that cannot be downloaded are skipped and counted.
    """
    dirname = '_'.join(query.split())
    while os.path.exists(dirname):
        dirname += "_"
    os.mkdir(dirname)

    skipped = 0
    for i, src in enumerate(tqdm(srcs[:max_num])):
        # Download before opening the file so that a failed download does
        # not leave an empty file behind.
        data = _fetch_image(src)
        if data is None:
            skipped += 1
            continue
        ext = src[-4:] if src[-4:] in ('.jpg', '.png', '.gif') else '.png'
        # os.path.join instead of a hard-coded backslash keeps the files
        # inside the directory on every platform.
        with open(os.path.join(dirname, f"{dirname}{i}{ext}"), "wb") as f:
            f.write(data)
    if skipped:
        print(f"Skipped {skipped} image(s) that could not be downloaded.")
if __name__ == "__main__":
    # Both values are module-level globals read by the functions above.
    query = input("Search: ")
    max_num = int(input("How many will you download? (maximum)"))

    # Run the three stages in order, announcing each one before it starts
    # and confirming it once it returns.
    for banner, stage in (
            ("Searching...", search),
            ("Getting links...", getLinks),
            ("Now downloading...", download),
    ):
        print(banner)
        stage()
        print("Done.")
The whole process takes some time.
Please use the script in moderation.
Recommended Posts