Extract the 10 most-stocked articles for a given tag.
Environment: Windows 8.1, Python 3.5
This example ranks the articles under the Python tag.
How to run → python stock_rank.py > output.html
stock_rank.py
# -*- coding: utf-8 -*-
import urllib.request
from bs4 import BeautifulSoup
# Listing URL of the tag being ranked; the page number is appended to it.
TAG_ITEMS_URL = "https://qiita.com/tags/Python/items?page="
# Number of articles kept in the ranking.
TOP_N = 10
# Stop after this many consecutive pages without any article entry, so a site
# that answers with an empty listing instead of an HTTP error cannot keep the
# crawl running forever.
MAX_EMPTY_PAGES = 3


def parse_stock_count(text):
    """Return the stock count contained in *text*, or None if it is not a number.

    Surrounding whitespace is ignored. Text that is empty or not an integer
    yields None so the caller can skip that article.
    """
    try:
        return int(text.strip())
    except ValueError:
        return None


def update_ranking(ranking, count, item, top_n=TOP_N):
    """Insert (count, item) into *ranking* and keep only the best *top_n*.

    *ranking* is a list of (count, item) tuples sorted by descending count and
    is modified in place. A new entry is placed in front of existing entries
    with the same count, so on ties the most recently seen item ranks higher.
    """
    for pos, (ranked_count, _) in enumerate(ranking):
        if count >= ranked_count:
            ranking.insert(pos, (count, item))
            break
    else:
        # Smaller than everything ranked so far: it goes last, and is dropped
        # again by the truncation below when the ranking is already full.
        ranking.append((count, item))
    del ranking[top_n:]


def fetch_page(page_num):
    """Download listing page *page_num* and return it as a BeautifulSoup tree."""
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(TAG_ITEMS_URL + str(page_num)) as response:
        html = response.read()
    return BeautifulSoup(html, "html.parser")


def collect_top_articles():
    """Walk the listing pages and return the ranking of the most-stocked articles.

    Returns a list of at most TOP_N (stock_count, article_body_tag) tuples in
    descending order of stock count. Crawling stops at the first page that
    cannot be fetched, or after MAX_EMPTY_PAGES consecutive pages that contain
    no article entries.
    """
    ranking = []
    page_num = 1
    empty_pages = 0
    while empty_pages < MAX_EMPTY_PAGES:
        try:
            soup = fetch_page(page_num)
        except OSError:
            # urllib's HTTPError/URLError are OSError subclasses. An HTTP
            # error (e.g. 404 past the last page) or a network failure ends
            # the crawl; whatever was collected so far is still reported.
            break
        # Advance before any "skip this page" decision so that a skipped page
        # is never requested again.
        page_num += 1

        #HTML extraction by specifying the class
        bodies = soup.find_all(class_="publicItem_body")
        if not bodies:
            empty_pages += 1
            continue
        empty_pages = 0

        counts = soup.find_all(class_="publicItem_stockCount")
        # The n-th body is paired with the n-th stock counter, as found on the page.
        for body, count_tag in zip(bodies, counts):
            # get_text() ignores nested markup such as the stock icon tag.
            count = parse_stock_count(count_tag.get_text())
            if count is None:
                #Skip articles whose stock count cannot be read
                continue
            update_ranking(ranking, count, body)
    return ranking


def main():
    """Print the ranking as HTML lines: stock count, article link, <br>."""
    for count, body in collect_top_articles():
        # Prefix the site host to the href of the article link.
        link = str(body.a).replace('href="', 'href="http://qiita.com')
        print(str(count) + " " + link + "<br>")


if __name__ == "__main__":
    main()
When the output was displayed as UTF-8, the characters were garbled, so I changed the encoding to Shift-JIS.
Program execution time is long (> _ <)
Get information on the net with Python3 + urllib + BeautifulSoup Scraping with Python and Beautiful Soup Scraping with Beautiful Soup
Recommended Posts