Scrape lens information from Kakaku.com in Python: collect all price information and lens specs into a CSV file. The items to acquire are as follows.
Camera name, ranking, lowest price, lowest credit price, price URL, compatible mount, lens type, focus, detailed lens type, full-size compatible, APS-C only, lens configuration, number of aperture blades, focal length, shortest shooting distance, maximum shooting magnification, open F value, angle of view, image stabilization mechanism, drip-proof, dust-proof, wide-angle, telephoto, macro, high magnification, fisheye, tilt shooting, mirror, large diameter, pancake, filter diameter, maximum diameter x length, weight
renzu.py
from bs4 import BeautifulSoup
import urllib.request
import re
import requests
import time
import datetime
#Target site URL
#kakaku.com lens ranking
url = "https://kakaku.com/camera/camera-lens/ranking_1050/"
page_count = 1
linklist = []
#Collect the item page of every lens from all ranking pages
while True:
    category_res = requests.get(url + "?page=" + str(page_count)).text
    soup = BeautifulSoup(category_res, 'html.parser')  #Initialize BeautifulSoup
    print("Page {}".format(page_count))
    for elm in soup.find_all("a"):
        if 'href' in elm.attrs:
            link_url = elm.attrs['href']
            #Only individual item pages are of interest
            if "https://kakaku.com/item/" in link_url:
                linklist.append(link_url)
    #Keep paging until there is no "next" link
    a_next_tag = soup.find_all("li", {"class": "next"})
    if a_next_tag:
        page_count += 1
        time.sleep(1)  #Pause between requests out of courtesy to the server
        continue
    break
#Remove duplicates while keeping the original (ranking) order
linklist = sorted(set(linklist), key=linklist.index)
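#For illustration (hypothetical values): set() drops duplicates and
#key=linklist.index re-sorts by first appearance, e.g.
#  x = ["b", "a", "b", "c"]
#  sorted(set(x), key=x.index)  ->  ["b", "a", "c"]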
################################################################
#Output file name (acquisition date and time)
now = datetime.datetime.now()
filename = "renzu" + now.strftime('%Y%m%d_%H%M%S') + '.csv'
#cp932 (Shift-JIS) so the CSV opens cleanly in Japanese-locale Excel
f = open(filename, 'a', encoding='cp932', errors='ignore')
f.write("Camera name,Ranking,Lowest price,Lowest credit price,Price URL,")
#Open the spec sheet of the first lens to build the CSV header row
page_html = linklist[0] + "spec/#tab"
res = urllib.request.urlopen(page_html)
page_soup = BeautifulSoup(res, 'html.parser')
#Get the lens spec table
table = page_soup.findAll("table", {"class": "tblBorderGray mTop15"})[0]
rows = table.findAll("tr")
index = -1
#Write each heading of the spec table, skipping section-header cells
for row in rows:
    for cell in row.findAll('th'):
        index += 1
        if index in (0, 17, 26, 29):  #Section headers, not spec columns
            continue
        cell = cell.get_text()
        cell = re.sub(r"[\n\t\s]*", "", str(cell))
        f.write(cell)
        f.write(",")
f.write("\n")
#Write the price information of every lens
for page_url in linklist:
    page_html = page_url + "spec/#tab"
    res = urllib.request.urlopen(page_html)
    page_soup = BeautifulSoup(res, 'html.parser')
    #Required elements and class names; missing elements become empty fields
    name = page_soup.find("h2", itemprop="name").text
    try:
        rank = page_soup.find("span", class_="rankNum").text
    except AttributeError:
        rank = ''
    try:
        low_price = page_soup.find("div", class_="priceWrap").find("span", class_="priceTxt").text
        low_price = low_price.replace(',', '')
    except AttributeError:
        low_price = ''
    try:
        cre_price = page_soup.find("div", class_="creditCard").find("span", class_="priceTxt").text
        cre_price = cre_price.replace(',', '')
    except AttributeError:
        cre_price = ''
    print(rank)
    print(low_price)
    #One CSV row per lens: price fields first, then the spec columns
    f.write(name + "," + rank + "," + low_price + "," + cre_price + "," + page_url + ",")
    #Write the lens spec information
    #Same table layout as the header pass above
    table = page_soup.findAll("table", {"class": "tblBorderGray mTop15"})[0]
    rows = table.findAll("tr")
    #Append every data cell to the current row
    for row in rows:
        for cell in row.findAll('td'):
            cell = cell.get_text()
            cell = re.sub(r"[\n\t\s]*", "", str(cell))
            f.write(cell)
            f.write(",")
    f.write("\n")
    time.sleep(1)  #Pause between item pages
f.close()
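To sanity-check the output, the finished CSV can be read back with the standard csv module. This is a minimal sketch, assuming the renzuYYYYMMDD_HHMMSS.csv naming scheme and the cp932 encoding used when writing above.
check_csv.py
import csv
import glob

#Pick the most recently written renzu*.csv (assumes the naming scheme above)
latest = sorted(glob.glob("renzu*.csv"))[-1]
with open(latest, encoding='cp932', errors='ignore') as f:
    for i, row in enumerate(csv.reader(f)):
        print(len(row), row[:5])  #Column count and the first five fields
        if i >= 3:  #Only peek at the first few rows
            break
If the column count of the data rows does not match the header row, a lens page probably had a spec table with extra or missing cells.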