I want to classify images with deep learning, so I implemented this while studying scraping.
Use BeautifulSoup.
from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
import urllib.error
import os
from PIL import Image
import matplotlib.pyplot as plt
from numpy import array
import six
%matplotlib inline
I pull the images from the Yahoo and Bing search engines.
class imageGetter():
    def __init__(self):
        pass

    def set_search_engine(self, key="yahoo"):
        # Template URL of the image search page; {0} is replaced by the query.
        if key == "yahoo":
            self.search_engine = u"https://search.yahoo.co.jp/image/search?p={0}&oq=&ei=UTF-8&b=21"
        elif key == "bing":
            self.search_engine = u"https://www.bing.com/images/search?q={0}"

    def series_process(self, word_list, padir="padir", key="yahoo", rec=1000):
        # Crawl every word in word_list and dump its images under padir/<word>.
        os.mkdir(padir)
        for word in word_list:
            o = imageGetter()
            o.search(word, False, key)
            cnt = 0
            try:
                for i in range(2, rec):
                    print(i)
                    o.next(i)
                    cnt = i
            except Exception:
                pass
            print("Curation of the images from {} pages succeeded".format(cnt))
            path = padir + "/" + word
            os.mkdir(path)
            print("Images are dumped at {}".format(path))
            o.dump(path)

    def search(self, search_word, is_show=False, key="yahoo"):
        self.set_search_engine(key)
        self.search_word = search_word
        response = urllib.request.urlopen(self.search_engine.format(urllib.parse.quote(search_word)))
        soup = BeautifulSoup(response, "lxml")
        self.soup = soup  # keep the parsed page so that next() can follow the pagination links
        urllst = []
        for obj in soup.find_all("img"):
            line = obj.get("rel")
            try:
                n = line.index("jpg")
                urllst += [line[:n + 3]]
            except Exception:
                pass
        self.urllst = urllst
        if is_show:
            self.print()

    def next(self, idx=2, is_show=False):
        # Follow the link to result page `idx` and append its image URLs.
        url = self.soup.find_all("a", string="%s" % idx)[0].get("href")
        response = urllib.request.urlopen(url)
        self.soup = BeautifulSoup(response, "lxml")
        for obj in self.soup.find_all("img"):
            line = obj.get("rel")
            try:
                n = line.index("jpg")
                self.urllst += [line[:n + 3]]
            except Exception:
                pass
        if is_show:
            self.print()

    def print(self):
        for line in self.urllst:
            print(line)

    def show(self):
        print("{0} images.".format(len(self.urllst)))
        for line in self.urllst:
            try:
                file = six.BytesIO(urllib.request.urlopen(line).read())
                plt.figure(figsize=(10, 10), dpi=80)
                plt.subplots_adjust(left=0.0, right=1.0, bottom=0.0, top=1.0, hspace=0.0, wspace=0.0)
                plt.axis('off')
                plt.imshow(array(Image.open(file)))
            except urllib.error.HTTPError:
                print("{0} is not found".format(line))
        plt.show()

    def dump(self, path):
        # Download every collected URL and save it as <path>/<search_word>_<i>.jpg
        for i, line in enumerate(self.urllst):
            try:
                file = six.BytesIO(urllib.request.urlopen(line).read())
                img = Image.open(file)
                img.save("{0}/{1}_{2}.jpg".format(path, self.search_word, i))
            except urllib.error.HTTPError:
                print("{0} is not found".format(line))
You can only get on the order of 10 images this way... Is there a good way to get more?
Get a list of image URLs.
o = imageGetter()
o.search("chien", False) #Lorsqu'il est défini sur True, l'URL est affichée en standard
Preview the images (output). In a Jupyter notebook, the images are listed inline in the cell output.
o.show()
You can dump the images into a directory.
path = "./tmp"
o.dump(path)
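The series_process method bundles these steps (search, pagination, directory creation, dump) for several search words at once. A minimal sketch, assuming a hypothetical word list and output directory (rec=5 stops after result page 4):
o = imageGetter()
o.series_process(["chien", "chat"], padir="dataset", key="yahoo", rec=5)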
You need an API key (see here or here (http://helog.jp/api-2/introduction-6/)).
class Flickr_handler:
    def __init__(self, key):
        self.key = key

    def search(self, search_word):
        self.search_word = search_word
        # flickr.photos.search endpoint; per_page=500 requests up to 500 results.
        line = "https://api.flickr.com/services/rest/?method=flickr.photos.search&api_key={0}&per_page=500&format=rest&text={1}"
        response = urllib.request.urlopen(line.format(self.key, search_word))
        soup = BeautifulSoup(response, "lxml")
        self.soup = soup
        lst = soup.find_all("photo")
        self.lst = lst
        urllst = []
        for tag in lst:
            # Build the static image URL from the fields of each <photo> tag.
            urllst += ["http://farm{0}.staticflickr.com/{1}/{2}_{3}.jpg".format(tag.get("farm"),
                                                                                tag.get("server"),
                                                                                tag.get("id"),
                                                                                tag.get("secret"))]
        self.urllst = urllst
        #response = urllib.request.urlopen(url.format(urllib.parse.quote(search_word)))

    def show(self):
        print("{0} images.".format(len(self.urllst)))
        for line in self.urllst:
            try:
                file = six.BytesIO(urllib.request.urlopen(line).read())
                plt.figure(figsize=(10, 10), dpi=80)
                plt.subplots_adjust(left=0.0, right=1.0, bottom=0.0, top=1.0, hspace=0.0, wspace=0.0)
                plt.axis('off')
                plt.imshow(array(Image.open(file)))
            except urllib.error.HTTPError:
                print("{0} is not found".format(line))
        plt.show()

    def dump(self, path):
        for i, line in enumerate(self.urllst):
            try:
                file = six.BytesIO(urllib.request.urlopen(line).read())
                img = Image.open(file)
                img.save("{0}/{1}_{2}.jpg".format(path, self.search_word, i))
            except urllib.error.HTTPError:
                print("{0} is not found".format(line))
key = "YOUR_API_KEY"  # replace with your Flickr API key
o = Flickr_handler(key)
o.search("car")
The rest of the usage is the same as with the BeautifulSoup version above. Here you can get up to 500 images.
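As a quick illustration, the same preview and dump calls work on the Flickr handler too; a minimal sketch (the dump directory is hypothetical and must already exist):
o.show()
path = "./tmp_flickr"
o.dump(path)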