BeautifulSoup4
# Fetch a page with a browser-like User-Agent header and parse it with
# BeautifulSoup (some sites block requests' default User-Agent).
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

url = "http://example.jp"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

r = requests.get(url, headers=headers)
r.raise_for_status()  # raise requests.HTTPError on 4xx/5xx responses
soup = BeautifulSoup(r.content, "html.parser")

# FIX: the urljoin() result was previously discarded (a no-op statement);
# bind it so the resolved absolute URL is actually usable.
index_url = urljoin(url, "index.html")  # -> "http://example.jp/index.html"

# Session variant: reuses the underlying TCP connection (keep-alive) and is
# closed automatically by the context manager.
with requests.Session() as s:
    r = s.get(url, headers=headers)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, "html.parser")
Pandas
import pandas as pd
# NOTE(review): pd.read_html() returns a *list* of DataFrames, one per <table>
# found in the page — so `df` here is a list, not a single DataFrame; the
# caller presumably wants df[0]. Verify against downstream usage.
# header=0 uses each table's first row as column names; index_col=0 uses the
# first column as the row index.
df = pd.read_html("http://example.jp", header=0, index_col=0)
Selenium
# Jupyter/Colab setup cell — the leading "!" runs each line as a shell command.
# Install the Chromium driver package, copy the chromedriver binary onto PATH,
# then install the Selenium Python bindings.
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
!pip install selenium
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time

# Headless Chrome options — the no-sandbox / disable-dev-shm-usage flags are
# needed in containerized environments (e.g. Colab) with no display and a
# small /dev/shm.
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

# FIX: passing the driver path as the first positional argument was deprecated
# and removed in Selenium 4. With chromedriver on PATH (copied to /usr/bin in
# the setup cell), no path argument is needed.
driver = webdriver.Chrome(options=options)
driver.implicitly_wait(10)  # poll up to 10 s for elements before raising NoSuchElementException

# Main window: remember its handle so we can switch back after opening new tabs.
parent_window = driver.current_window_handle
driver.get("http://example.jp")

# URL display — may differ from the requested URL after redirects.
print(driver.current_url)
time.sleep(3)

# FIX: find_element_by_link_text() was removed in Selenium 4;
# use the find_element(By.LINK_TEXT, ...) API instead.
driver.find_element(By.LINK_TEXT, "XXXXX").click()

# Window switching: jump to the most recently opened window/tab.
driver.switch_to.window(driver.window_handles[-1])
Recommended Posts