Beautiful Soup
A scraping library featuring a simple API that is easy to remember.
from urllib.parse import urljoin
from bs4 import BeautifulSoup
# Read the HTML file and parse it with the built-in 'html.parser' backend.
with open('html file') as f:
    soup = BeautifulSoup(f, 'html.parser')

# Get the list of elements you want with select().
# NOTE: `element` is a placeholder -- replace it with a CSS selector string.
for a in soup.select(element):
    # Pull out the data you want from each matched element here.
    pass
pyquery
pyquery is a library that lets you scrape HTML using the same selector syntax as jQuery. It uses lxml internally, so it can process documents at high speed.
from pyquery import PyQuery as pq
# Read an HTML file and get a PyQuery object.
d = pq(filename='html file')

# Get the list of elements you want.
# NOTE: `element` is a placeholder -- replace it with a jQuery-style selector string.
for a in d(element):
    # Pull out the data you want from each matched element here.
    pass
Recommended Posts