If you just want the images and are not interested in the code, skip ahead.
This script scrapes the Hinatazaka46 members' blogs; this installment focuses only on downloading the images.
Environment: Python 3.7, beautifulsoup4 4.8.1
If you have any improvements or corrections, please leave a comment or contact me on Twitter (@Azumi_cpa).
import os
import time

import requests
from bs4 import BeautifulSoup
def get_picture_url(url):
    """Collect the image URLs from every page of one member's blog.

    Pages are fetched as ``url + "&page=n"`` for n = 0, 1, 2, ... until a
    page comes back with no blog-article divs.

    Args:
        url: Base blog URL for one member (page 0; the page number is
            appended as a query parameter).

    Returns:
        List of ``src`` URLs of all ``<img>`` tags found inside
        ``div.p-blog-article`` elements, in page order.
    """
    pic_urls = []
    page_number = 0
    while True:
        # Build the page URL once instead of twice (print + get).
        page_url = url + "&page=" + str(page_number)
        print(page_url)
        # timeout keeps a dead server from hanging the scraper forever.
        response = requests.get(page_url, timeout=10)
        soup = BeautifulSoup(response.text, 'lxml')
        articles = soup.find_all('div', class_='p-blog-article')
        # An empty result means we have run past the last page.
        if not articles:
            break
        for article in articles:
            for img in article.find_all('img'):
                pic_urls.append(img["src"])
        print(page_number)
        page_number += 1
        time.sleep(3)  # be polite to the server between page fetches
    return pic_urls
def save_pictures(name, url):
    """Download all blog images for one member into a directory named
    after the member.

    Args:
        name: Member name; used as the output directory (``name/i.jpg``).
        url: Base blog URL for that member, passed to get_picture_url.
    """
    # Get image url
    pic_urls = get_picture_url(url)
    # The original crashed if the member directory did not exist yet.
    os.makedirs(name, exist_ok=True)
    # Start saving. NOTE: loop variable renamed so it no longer shadows
    # the `url` parameter.
    for i, pic_url in enumerate(pic_urls):
        try:
            response = requests.get(pic_url, timeout=10)
            image = response.content
            # Member name/number.jpg — the original literal was ".jpg "
            # with a trailing space, producing broken filenames.
            file_name = name + "/" + str(i) + ".jpg"
            with open(file_name, "wb") as out_file:
                out_file.write(image)
            time.sleep(3)  # be polite between image downloads
        except (requests.RequestException, OSError) as exc:
            # Narrowed from a bare `except:` so Ctrl-C still works;
            # one bad image should not abort the whole member.
            print("error", exc)
def get_members():
    """Fetch the member index page and return {member name: blog URL}.

    The URL is page 0 of the member's blog; later pages are reached by
    appending "&page=n".
    """
    index_page = requests.get('https://www.hinatazaka46.com/s/official/diary/member?ima=0000')
    parsed = BeautifulSoup(index_page.text, 'lxml')
    member_list = {}
    for anchor in parsed.find_all("a", class_="p-blog-face__list"):
        # The displayed name contains padding spaces and newlines.
        cleaned_name = anchor.text.replace(" ", "").replace("\n", "")
        member_list[cleaned_name] = "https://www.hinatazaka46.com" + anchor.attrs["href"]
    return member_list
def main():
    """Download blog images for every member, one member at a time."""
    for name, url in get_members().items():
        print(name + "start")
        save_pictures(name, url)
main()