I want to classify images by deep learning, so I implemented it while investigating scraping.

0. Module import

Use Beautiful Soup.

from bs4 import BeautifulSoup
import urllib
import os
from PIL import Image
import matplotlib.pyplot as plt
from numpy import *
import six
%matplotlib inline

1. Scraping from image search

I'm pulling it out of yahoo and bing search engines.

class imageGetter():
    def __init__(self):
        pass

    def set_search_engine(self, key="yahoo"):
        if key == "yahoo":
            self.search_engine = u"https://search.yahoo.co.jp/image/search?p={0}&oq=&ei=UTF-8&b=21"
        elif key == "bing":
            self.search_engine = u"https://www.bing.com/images/search?q={0}"
    def series_process(self, word_list, padir="padir", key="yahoo", rec=1000):
        os.mkdir(padir)
        for word in word_list:
            o = imageGetter()
            o.search(word, False, key)
            
            cnt = 0
            try:
                for i in range(2, rec):
                    print(i)
                    o.next(i)
                    cnt = i
            except:
                pass
            print("Curation of the images from {} pages is Succeed".format(cnt))
            path = padir+"/"+word
            os.mkdir(path)
            print("Images is dumped at {}".format(path))
            o.dump(path)

    def search(self, search_word, is_show=False, key="yahoo"):
        self.set_search_engine(key)
        self.search_word = search_word
        response = urllib.request.urlopen(self.search_engine.format(urllib.parse.quote(search_word)))
        soup = BeautifulSoup(response, "lxml")

        urllst = []
        for obj in soup.find_all("img"):
            line = obj.get("rel")
            try:
                n = line.index("jpg")
                urllst += [line[:n+3]]
            except:
                pass
        self.urllst = urllst
        if is_show:
            self.print()

    def next(self, idx=2, is_show=False):
        url = self.soup.find_all("a",string="%s"%idx)[0].get("href")
        response = urllib.request.urlopen(url)
        self.soup = BeautifulSoup(response, "lxml")
        
        for obj in self.soup.find_all("img"):
            line = obj.get("rel")
            try:
                n = line.index("jpg")
                self.urllst += [line[:n+3]]
            except:
                pass
        if is_show:
            self.print()

    def print(self):
        for line in self.urllst:
            print(line)

    def show(self):
        print("{0} images.".format(len(self.urllst)))
        for line in self.urllst:
            try:
                file = six.BytesIO(urllib.request.urlopen(line).read())
                plt.figure(figsize=(10, 10), dpi=80)
                plt.subplots_adjust(left=0.0, right=1.0, bottom=0.0, top=1.0, hspace=0.0, wspace=0.0)
                plt.axis('off')
                plt.imshow(array(Image.open(file)))
            except urllib.error.HTTPError:
                print("{0} is not found".format(line))

        plt.show()

    def dump(self, path):
        for i, line in enumerate(self.urllst):
            try:
                file = six.BytesIO(request.urlopen(line).read())
                img = Image.open(file)
                img.save("{0}/{1}_{2}.jpg ".format(path, self.search_word, i))
            except urllib.error.HTTPError:
                print("{0} is not found".format(line))

You can only get it on the order of 10 sheets. .. .. Is there any good way?

1.1. How to use

Get a list of image urls.

o = imageGetter()
o.search("dog", False) #When set to True, url is output as standard

Image preview (output). If it is a jupyter notebook, the images will be listed in the output.

o.show()

：：：

You can dump the image to a directory.

path = "./tmp"
o.dump(path)

2. Scraping from Flickr

You will need the API key (see here or here (http://helog.jp/api-2/introduction-6/)).

class Flickr_handler:
    def __init__(self, key):
        self.key = key
    
    def search(self, search_word):
        self.search_word = search_word
        line = "https://api.flickr.com/services/rest/?method=flickr.photos.search&api_key={0}&per_page=500&format=rest&text={1}"
        
        response = urllib.request.urlopen(line.format(self.key, search_word))
        soup = BeautifulSoup(response, "lxml")
        self.soup = soup
        lst = soup.find_all("photo")
        self.lst = lst
        urllst = []
        for tag in lst:
            urllst += ["http://farm{0}.staticflickr.com/{1}/{2}_{3}.jpg ".format(lst[0].get("farm"),
                 tag.get("server"),
                 tag.get("id"),
                 tag.get("secret"))]
        self.urllst = urllst
        #response = urllib.request.urlopen(url.format(urllib.parse.quote(search_word)))
        
    def show(self):
        print("{0} images.".format(len(self.urllst)))
        for line in self.urllst:
            try:
                file = six.BytesIO(urllib.request.urlopen(line).read())
                plt.figure(figsize=(10, 10), dpi=80)
                plt.subplots_adjust(left=0.0, right=1.0, bottom=0.0, top=1.0, hspace=0.0, wspace=0.0)
                plt.axis('off')
                plt.imshow(array(Image.open(file)))
            except urllib.error.HTTPError:
                print("{0} is not found".format(line))
                
        plt.show()
    
    def dump(self, path):
        for i, line in enumerate(self.urllst):
            try:
                file = six.BytesIO(urllib.request.urlopen(line).read())
                img = Image.open(file)
                img.save("{0}/{1}_{2}.jpg ".format(path, self.search_word, i))
            except urllib.error.HTTPError:
                print("{0} is not found".format(line))

key = #your key 
o = Flickr_handler(key)
o.search("car")

Other usage is the same as 1. You can get up to 500 sheets here.

Image scraping ②-Get images from bing, yahoo, Flickr

0. Module import

1. Scraping from image search

1.1. How to use

2. Scraping from Flickr