BeautifulSoup4
# Fetch a page with a browser-like User-Agent header and parse it with
# BeautifulSoup (some sites block requests' default User-Agent).
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

url = "http://example.jp"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

r = requests.get(url, headers=headers)
r.raise_for_status()  # raise requests.HTTPError on 4xx/5xx responses
soup = BeautifulSoup(r.content, "html.parser")

# FIX: the urljoin() result was previously discarded (a no-op statement);
# bind it so the resolved absolute URL is actually usable.
index_url = urljoin(url, "index.html")  # -> "http://example.jp/index.html"

# Session variant: reuses the underlying TCP connection (keep-alive) and is
# closed automatically by the context manager.
with requests.Session() as s:
    r = s.get(url, headers=headers)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, "html.parser")
Pandas
import pandas as pd
# NOTE(review): pd.read_html() returns a *list* of DataFrames, one per <table>
# found in the page — so `df` here is a list, not a single DataFrame; the
# caller presumably wants df[0]. Verify against downstream usage.
# header=0 uses each table's first row as column names; index_col=0 uses the
# first column as the row index.
df = pd.read_html("http://example.jp", header=0, index_col=0)
Selenium
# Jupyter/Colab setup cell — the leading "!" runs each line as a shell command.
# Install the Chromium driver package, copy the chromedriver binary onto PATH,
# then install the Selenium Python bindings.
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
!pip install selenium
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time

# Headless Chrome options — the no-sandbox / disable-dev-shm-usage flags are
# needed in containerized environments (e.g. Colab) with no display and a
# small /dev/shm.
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

# FIX: passing the driver path as the first positional argument was deprecated
# and removed in Selenium 4. With chromedriver on PATH (copied to /usr/bin in
# the setup cell), no path argument is needed.
driver = webdriver.Chrome(options=options)
driver.implicitly_wait(10)  # poll up to 10 s for elements before raising NoSuchElementException

# Main window: remember its handle so we can switch back after opening new tabs.
parent_window = driver.current_window_handle
driver.get("http://example.jp")

# URL display — may differ from the requested URL after redirects.
print(driver.current_url)
time.sleep(3)

# FIX: find_element_by_link_text() was removed in Selenium 4;
# use the find_element(By.LINK_TEXT, ...) API instead.
driver.find_element(By.LINK_TEXT, "XXXXX").click()

# Window switching: jump to the most recently opened window/tab.
driver.switch_to.window(driver.window_handles[-1])
Recommended Posts