The explanation in the following article was easy to understand. Reference: Scraping with Python (1) Introduction | Let's automatically extract data using scraping
I now understand the overall flow of web scraping. As a next step, I would like to modify the code so that it can obtain information from multiple sites; a rough sketch of that idea is at the end of this post.
import requests
from bs4 import BeautifulSoup

html_doc = requests.get("https://www.yahoo.co.jp/").text  # Fetch the HTML of the Yahoo! JAPAN top page
soup = BeautifulSoup(html_doc, 'html.parser')  # Initialize Beautiful Soup
print(soup.prettify())  # Indent the HTML to make it easier to read

# Get the title
title = soup.title.text
print(title)
# Reference)
# Python: how to use Requests
# https://note.nkmk.me/python-requests-usage/
#
# Response object attributes:
#   url: url attribute
#   status code: status_code attribute
#   encoding: encoding attribute
#   response headers: headers attribute
#   text: text attribute
#   binary data: content attribute
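# A minimal sketch (my own addition, not from the article above) of checking
# these attributes, reusing the same Yahoo! JAPAN URL:
response = requests.get("https://www.yahoo.co.jp/")
print(response.url)            # final URL after any redirects
print(response.status_code)    # e.g. 200 on success
print(response.encoding)       # encoding guessed from the response headers
print(response.headers["Content-Type"])  # headers behave like a dict
print(response.text[:100])     # decoded text body (first 100 characters)
print(response.content[:100])  # raw binary body (first 100 bytes)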
# Get the description
meta_description = soup.find('meta', {'name': 'description'})
description = meta_description['content']
print(description)
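# Note (my own addition): soup.find() returns None when the page has no matching
# meta tag, so the lines above would raise a TypeError on such a page. A safer sketch:
meta_description = soup.find('meta', {'name': 'description'})
if meta_description is not None:
    print(meta_description['content'])
else:
    print("No description meta tag found")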
# Getting multiple tags at once
tags = soup.find_all("a")
print(tags)
## Result
[<a class="yMWCYupQNdgppL-NV6sMi _3sAlKGsIBCxTUbNi86oSjt" data-ylk="slk:help;pos:0" href="https://www.yahoo-help.jp/">help</a>,
<a class="yMWCYupQNdgppL-NV6sMi _3sAlKGsIBCxTUbNi86oSjt" data-ylk="rsec:header;slk:logo;pos:0" href="https://www.yahoo.co.jp">Yahoo! JAPAN</a>,
<a aria-label="Transition to premium" class="yMWCYupQNdgppL-NV6sMi _3sAlKGsIBCxTUbNi86oSjt" data-ylk="rsec:header;slk:premium;pos:0" href="https://premium.yahoo.co.jp/"><p class="oLvk9L5Yk-9JOuzi-OHW5"><span class="t_jb9bKlgIcajcRS2hZAP">premium</span><span class="_2Uq6Pw5lfFfxr_OD36xHp6 _3JuM5k4sY_MJiSvJYtVLd_ Y8gFtzzcdGMdFngRO9qFV" style="width:36px;height:38px"></span></p></a>,
<a aria-label="Transition to card" class="yMWCYupQNdgppL-NV6sMi _3sAlKGsIBCxTUbNi86oSjt" data-ylk="rsec:header;slk:card;pos:0" href="https://card.yahoo.co.jp/service/redirect/top/"><p class="oLvk9L5Yk-9JOuzi-OHW5"><span class="t_jb9bKlgIcajcRS2hZAP">card</span><span class="_2Uq6Pw5lfFfxr_OD36xHp6 _3JuM5k4sY_MJiSvJYtVLd_ _1MaEI7rEHB4FpQ1MwfWxIK" style="width:36px;height:38px"></span></p></a>,
<a aria-label="Transition to email" class="yMWCYupQNdgppL-NV6sMi _3sAlKGsIBCxTUbNi86oSjt" data-ylk="rsec:header;slk:mail;pos:0" href="https://mail.yahoo.co.jp/"><p class="oLvk9L5Yk-9JOuzi-OHW5"><span class="t_jb9bKlgIcajcRS2hZAP">Email</span><span class="_2Uq6Pw5lfFfxr_OD36xHp6 _3JuM5k4sY_MJiSvJYtVLd_ _3Qi5P0lTFbNkWishPzz8tb" style="width:36px;height:38px"></span></p></a>,
...]
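As a side note of my own (not from the article above), find_all can also filter on attributes, which helps narrow the list down, for example to <a> tags that actually carry an href:

links_with_href = soup.find_all("a", href=True)  # only <a> tags that have an href attribute
print(len(links_with_href))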
# Get the text and link of each obtained a tag
for tag in tags:
    print(tag.string)
    print(tag.get("href"))
## Result
help
https://www.yahoo-help.jp/
Yahoo! JAPAN
https://www.yahoo.co.jp
None
https://premium.yahoo.co.jp/
...
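The None in the output above comes from tag.string, which returns None when a tag wraps more than one child element (as with the premium link, which contains nested span tags). A sketch of my own using get_text(), which joins all the nested text, avoids that:

for tag in tags:
    print(tag.get_text(strip=True))  # concatenated nested text (empty string instead of None)
    print(tag.get("href"))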
import pandas as pd
from google.colab import files

columns = ["name", "url"]
df = pd.DataFrame(columns=columns)

# Add each link's text and URL to the dataframe
for tag in tags:
    name = tag.string
    url = tag.get("href")
    se = pd.Series([name, url], index=columns)
    print(se)
    df = pd.concat([df, se.to_frame().T], ignore_index=True)  # append the row (pd.concat, since DataFrame.append was removed in newer pandas)

# Output to CSV under the name result.csv
filename = "result.csv"
df.to_csv(filename, encoding='utf-8-sig', index=False)
files.download(filename)
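As a side note of my own (not from the article above), building up a plain list of rows and creating the DataFrame once at the end does the same job without the repeated concatenation:

rows = []
for tag in tags:
    rows.append({"name": tag.string, "url": tag.get("href")})
df = pd.DataFrame(rows, columns=columns)
df.to_csv(filename, encoding='utf-8-sig', index=False)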
# Reference)
# Exporting / appending CSV files with pandas (to_csv)
# https://note.nkmk.me/python-pandas-to-csv/
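Finally, for the goal mentioned at the top of obtaining information from multiple sites, here is a minimal sketch of my own (the URL list is just an example) that wraps the same flow in a loop over URLs:

import requests
import pandas as pd
from bs4 import BeautifulSoup

urls = ["https://www.yahoo.co.jp/", "https://www.example.com/"]  # example list of target sites
rows = []
for target in urls:
    soup = BeautifulSoup(requests.get(target).text, 'html.parser')
    for tag in soup.find_all("a"):
        rows.append({"site": target, "name": tag.get_text(strip=True), "url": tag.get("href")})

df = pd.DataFrame(rows, columns=["site", "name", "url"])
df.to_csv("result_multi.csv", encoding='utf-8-sig', index=False)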