The data to be scraped is here
# -*- coding: utf-8 -*-
import pandas as pd
import urllib.request
from bs4 import BeautifulSoup
# Page to scrape. The query string selects prec_no=44, block_no=47662 and
# the date 2019-01-01.
url = 'https://www.data.jma.go.jp/obd/stats/etrn/view/10min_s1.php?prec_no=44&block_no=47662&year=2019&month=01&day=01&view=p1'

# Fetch and parse the page. The context manager closes the HTTP response as
# soon as BeautifulSoup has finished reading it.
with urllib.request.urlopen(url) as html:
    soup = BeautifulSoup(html, 'html.parser')

# Rows of interest: every <tr> with class "mtx" and style "text-align:right;".
element = soup.find_all('tr', attrs={'class': 'mtx', 'style': 'text-align:right;'})

# Build one list of cell texts per matched row. Only the <td> cells are read,
# so stray text nodes between cells can never add extra columns.
out = []
for ele in element:
    data_list = [e.text for e in ele.find_all('td')]
    out.append(data_list)

# Column labels written as the CSV header, in the order the cells are read.
columns = [
    'Hour and minute',
    'Local barometric pressure',
    'Sea level pressure',
    'Precipitation',
    'temperature',
    'Relative humidity',
    'Average wind speed',
    'Average wind direction',
    'Maximum instantaneous wind speed',
    'Maximum instantaneous wind direction',
    'Daylight hours',
]
df = pd.DataFrame(data=out, columns=columns)

# index=False keeps the DataFrame's row index out of the CSV file.
df.to_csv('tokyo_2019-01-01.csv', index=False, encoding='SJIS')
# ① Create a BeautifulSoup object that parses the fetched HTML with 'html.parser'
soup = BeautifulSoup(html, 'html.parser')
# ② Collect every tr tag whose class is 'mtx' and whose style is 'text-align:right;'
element = soup.find_all('tr', attrs={'class':'mtx', 'style':'text-align:right;'})
After that, the text of each element in a row is extracted and appended to a list, one row at a time.
**↓ Output result ↓**
require 'csv'
require 'mechanize'
# Fetch the page at `url` with a Mechanize agent.
url = 'https://www.data.jma.go.jp/obd/stats/etrn/view/10min_s1.php?prec_no=44&block_no=47662&year=2019&month=01&day=01&view=p1'
agent = Mechanize.new
page = agent.get(url)

# Take every <tr>, keep only those whose style attribute is exactly
# 'text-align:right;', and turn each kept row into an array holding the
# inner text of its <td> cells.
html = page.search('tr')
matching_rows = html.select do |row|
  row.get_attribute('style') == 'text-align:right;'
end
out = matching_rows.map do |row|
  row.search('td').map { |cell| cell.inner_text }
end

# Column labels written as the first line of the CSV file.
header = [
  'Hour and minute',
  'Local barometric pressure',
  'Sea level pressure',
  'Precipitation',
  'temperature',
  'Relative humidity',
  'Average wind speed',
  'Average wind direction',
  'Maximum instantaneous wind speed',
  'Maximum instantaneous wind direction',
  'Daylight hours'
]

# Write the header followed by one CSV line per collected row.
CSV.open('tokyo_2019-01-01.csv', 'w') do |csv|
  csv << header
  out.each { |record| csv << record }
end
#① Create an instance of the Mechanize class and get the HTML of the specified url
agent = Mechanize.new
page = agent.get(url)
#② Search for the tr tags with the search method, then, for each one whose style attribute is 'text-align:right;', get the text of its td cells with the inner_text method
html = page.search('tr')
out = []
html.each do |element|
if element.get_attribute('style') == 'text-align:right;' then
data_list=[]
ele = element.search('td')
ele.each do |e|
data_list << e.inner_text
#~abridgement~
**↓ Output result ↓**
Surprisingly, there was not much difference in the amount of code. I wonder whether it is also possible in Ruby to search for tags by specifying their attributes...
That's all!