I want to classify images with deep learning, so I implemented this while studying scraping.
Use BeautifulSoup.
from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
import urllib.error
import os
from PIL import Image
import matplotlib.pyplot as plt
from numpy import array
import six
%matplotlib inline
I pull the images from the Yahoo and Bing search engines.
class imageGetter():
    def __init__(self):
        pass

    def set_search_engine(self, key="yahoo"):
        # Template URL of the image search page; {0} is replaced by the query.
        if key == "yahoo":
            self.search_engine = u"https://search.yahoo.co.jp/image/search?p={0}&oq=&ei=UTF-8&b=21"
        elif key == "bing":
            self.search_engine = u"https://www.bing.com/images/search?q={0}"

    def series_process(self, word_list, padir="padir", key="yahoo", rec=1000):
        # Crawl every word in word_list and dump its images under padir/<word>.
        os.mkdir(padir)
        for word in word_list:
            o = imageGetter()
            o.search(word, False, key)
            cnt = 0
            try:
                for i in range(2, rec):
                    print(i)
                    o.next(i)
                    cnt = i
            except Exception:
                pass
            print("Curation of the images from {} pages succeeded".format(cnt))
            path = padir + "/" + word
            os.mkdir(path)
            print("Images are dumped at {}".format(path))
            o.dump(path)

    def search(self, search_word, is_show=False, key="yahoo"):
        self.set_search_engine(key)
        self.search_word = search_word
        response = urllib.request.urlopen(self.search_engine.format(urllib.parse.quote(search_word)))
        soup = BeautifulSoup(response, "lxml")
        self.soup = soup  # keep the parsed page so that next() can follow the pagination links
        urllst = []
        for obj in soup.find_all("img"):
            line = obj.get("rel")
            try:
                n = line.index("jpg")
                urllst += [line[:n + 3]]
            except Exception:
                pass
        self.urllst = urllst
        if is_show:
            self.print()

    def next(self, idx=2, is_show=False):
        # Follow the link to result page `idx` and append its image URLs.
        url = self.soup.find_all("a", string="%s" % idx)[0].get("href")
        response = urllib.request.urlopen(url)
        self.soup = BeautifulSoup(response, "lxml")
        for obj in self.soup.find_all("img"):
            line = obj.get("rel")
            try:
                n = line.index("jpg")
                self.urllst += [line[:n + 3]]
            except Exception:
                pass
        if is_show:
            self.print()

    def print(self):
        for line in self.urllst:
            print(line)

    def show(self):
        print("{0} images.".format(len(self.urllst)))
        for line in self.urllst:
            try:
                file = six.BytesIO(urllib.request.urlopen(line).read())
                plt.figure(figsize=(10, 10), dpi=80)
                plt.subplots_adjust(left=0.0, right=1.0, bottom=0.0, top=1.0, hspace=0.0, wspace=0.0)
                plt.axis('off')
                plt.imshow(array(Image.open(file)))
            except urllib.error.HTTPError:
                print("{0} is not found".format(line))
        plt.show()

    def dump(self, path):
        # Download every collected URL and save it as <path>/<search_word>_<i>.jpg
        for i, line in enumerate(self.urllst):
            try:
                file = six.BytesIO(urllib.request.urlopen(line).read())
                img = Image.open(file)
                img.save("{0}/{1}_{2}.jpg".format(path, self.search_word, i))
            except urllib.error.HTTPError:
                print("{0} is not found".format(line))
You can only get on the order of 10 images this way... Is there a good way to get more?
Get a list of image URLs.
o = imageGetter()
o.search("chien", False) #Lorsqu'il est défini sur True, l'URL est affichée en standard
Preview the images (output). In a Jupyter notebook, the images are listed inline in the cell output.
o.show()
You can dump the images into a directory.
path = "./tmp"
o.dump(path)
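The series_process method bundles these steps (search, pagination, directory creation, dump) for several search words at once. A minimal sketch, assuming a hypothetical word list and output directory (rec=5 stops after result page 4):
o = imageGetter()
o.series_process(["chien", "chat"], padir="dataset", key="yahoo", rec=5)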
You need an API key (see here or here (http://helog.jp/api-2/introduction-6/)).
class Flickr_handler:
    def __init__(self, key):
        self.key = key

    def search(self, search_word):
        self.search_word = search_word
        # flickr.photos.search endpoint; per_page=500 requests up to 500 results.
        line = "https://api.flickr.com/services/rest/?method=flickr.photos.search&api_key={0}&per_page=500&format=rest&text={1}"
        response = urllib.request.urlopen(line.format(self.key, search_word))
        soup = BeautifulSoup(response, "lxml")
        self.soup = soup
        lst = soup.find_all("photo")
        self.lst = lst
        urllst = []
        for tag in lst:
            # Build the static image URL from the fields of each <photo> tag.
            urllst += ["http://farm{0}.staticflickr.com/{1}/{2}_{3}.jpg".format(tag.get("farm"),
                                                                                tag.get("server"),
                                                                                tag.get("id"),
                                                                                tag.get("secret"))]
        self.urllst = urllst
        #response = urllib.request.urlopen(url.format(urllib.parse.quote(search_word)))

    def show(self):
        print("{0} images.".format(len(self.urllst)))
        for line in self.urllst:
            try:
                file = six.BytesIO(urllib.request.urlopen(line).read())
                plt.figure(figsize=(10, 10), dpi=80)
                plt.subplots_adjust(left=0.0, right=1.0, bottom=0.0, top=1.0, hspace=0.0, wspace=0.0)
                plt.axis('off')
                plt.imshow(array(Image.open(file)))
            except urllib.error.HTTPError:
                print("{0} is not found".format(line))
        plt.show()

    def dump(self, path):
        for i, line in enumerate(self.urllst):
            try:
                file = six.BytesIO(urllib.request.urlopen(line).read())
                img = Image.open(file)
                img.save("{0}/{1}_{2}.jpg".format(path, self.search_word, i))
            except urllib.error.HTTPError:
                print("{0} is not found".format(line))
key = "YOUR_API_KEY"  # replace with your Flickr API key
o = Flickr_handler(key)
o.search("car")
The rest of the usage is the same as with the BeautifulSoup version above. Here you can get up to 500 images.
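As a quick illustration, the same preview and dump calls work on the Flickr handler too; a minimal sketch (the dump directory is hypothetical and must already exist):
o.show()
path = "./tmp_flickr"
o.dump(path)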