Python
https://github.com/yuzuru-program/scraping-python-yahoo
index.py
import urllib.request as request
from bs4 import BeautifulSoup

# Fetch the Yahoo! JAPAN top page and print each news item's text and link.
req = request.Request(
    "https://www.yahoo.co.jp",
    None,
    {}
)
# Use a context manager so the HTTP response is always closed (the original
# never closed it, leaking the connection).
with request.urlopen(req) as instance:
    soup = BeautifulSoup(instance, "html.parser")

li = soup.select('main article section ul')[0].select('li')
for m in li:
    print(m.text)
    print(m.select("a")[0].get("href"))
    print()
Node.js
https://github.com/yuzuru-program/scraping-node-yahoo
package.json
{
"dependencies": {
"cheerio": "^1.0.0-rc.3",
"node-fetch": "^2.6.0"
}
}
index.js
const fetch = require('node-fetch');
const cheerio = require('cheerio');

// Fetch the Yahoo! JAPAN top page and print each news item's text and link.
const main = async () => {
  let _ret;
  try {
    // Send a GET request to https://www.yahoo.co.jp/
    _ret = await fetch('https://www.yahoo.co.jp/', {
      method: 'get',
      headers: {
        'User-Agent':
          'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
      },
      referrer: '',
    });
  } catch (err) {
    // The original used .catch() and fell through, so _ret was undefined
    // and _ret.status below threw a TypeError. Bail out instead.
    console.log(err);
    return;
  }
  if (_ret.status !== 200) {
    console.log(`error status:${_ret.status}`);
    return;
  }
  // Load the HTML into cheerio for jQuery-style selection
  const $ = cheerio.load(await _ret.text());
  const _li = $('main article section ul').eq(0).find('li');
  // Print the Yahoo! top news items
  _li.map(function (i) {
    console.log(_li.eq(i).text());
    console.log(_li.eq(i).find('a').attr()['href']);
    console.log();
  });
};
main();
PHP
https://github.com/yuzuru-program/scraping-php-yahoo
index.php
<?php
require_once './phpQuery-onefile.php';

/**
 * Fetch the given URL with cURL and return the response body as a string,
 * or false when the request fails.
 *
 * @param string $url URL to fetch
 * @return string|false
 */
function my_curl($url)
{
    $cp = curl_init();
    /* Return the response body as a string instead of echoing it */
    curl_setopt($cp, CURLOPT_RETURNTRANSFER, 1);
    /* Target URL */
    curl_setopt($cp, CURLOPT_URL, $url);
    /* Give up after 30 seconds */
    curl_setopt($cp, CURLOPT_TIMEOUT, 30);
    /* Present a desktop Chrome user agent */
    curl_setopt($cp, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36');
    $data = curl_exec($cp);
    curl_close($cp);
    return $data;
}

$url = 'https://www.yahoo.co.jp';
$html = my_curl($url);
/* curl_exec returns false on failure; don't feed that to phpQuery */
if ($html === false) {
    fwrite(STDERR, "Failed to fetch $url\n");
    exit(1);
}
$doc = phpQuery::newDocument($html);
$ul = $doc->find('main article section')->find("ul:eq(0)");
/* Hoist the item count: the original re-ran the li query every iteration */
$count = count($ul->find("li"));
for ($i = 0; $i < $count; ++$i) {
    $li = $ul->find("li:eq($i)");
    echo $li[0]->text();
    echo "\n";
    echo $li[0]->find("a")->attr('href').PHP_EOL;
    echo "\n";
}
?>
phpQuery-onefile.php https://github.com/yuzuru-program/scraping-php-yahoo/blob/master/phpQuery-onefile.php
Ruby
https://github.com/yuzuru-program/scraping-ruby-yahoo
index.rb
require "nokogiri"
require "open-uri"

# Fetch the Yahoo! JAPAN top page and print each news item's text and link.
# Kernel#open on a URL was deprecated in Ruby 2.7 and removed in 3.0;
# use URI.open (provided by open-uri) instead.
doc = Nokogiri::HTML(URI.open("https://www.yahoo.co.jp"))
test = doc.css("main article section ul")[0].css("li")
test.each do |li|
  puts li.content
  puts li.css("a")[0][:href]
  puts
end
Go
https://github.com/yuzuru-program/scraping-go-yahoo
index.go
package main

import (
	"fmt"
	"log"
	"net/http"

	"github.com/PuerkitoBio/goquery"
)

// main fetches the Yahoo! JAPAN top page and prints each news item's
// text and link.
func main() {
	req, err := http.NewRequest("GET", "http://yahoo.co.jp", nil)
	if err != nil {
		// The original discarded this error with _, risking a nil deref.
		log.Fatal(err)
	}
	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36")
	res, err := new(http.Client).Do(req)
	if err != nil {
		log.Fatal(err)
	}
	// Always close the response body so the connection is not leaked.
	defer res.Body.Close()
	if res.StatusCode != 200 {
		log.Fatalf("status code error: %d %s\n", res.StatusCode, res.Status)
	}
	doc, err := goquery.NewDocumentFromReader(res.Body)
	if err != nil {
		// The original only logged and then dereferenced a nil doc; stop here.
		log.Fatal(err)
	}
	li := doc.Find("main article section ul").Eq(0).Find("li")
	li.Each(func(index int, s *goquery.Selection) {
		fmt.Println(s.Text())
		// Attr returns (value string, exists bool) — the original named the
		// bool "err" and compared it with != true.
		href, exists := s.Find("a").Attr("href")
		if !exists {
			log.Fatal("no href attribute on anchor")
		}
		fmt.Println(href + "\n")
	})
}
VBA
'Microsoft HTML Object Library
'Microsoft Internet Controls
'Function to delete IE process
Function IeProcessKill()
    'Force-kill every running Internet Explorer process via taskkill
    Dim shell As Object
    Set shell = CreateObject("WScript.Shell")
    shell.Exec ("taskkill.exe /F /IM iexplore.exe")
    'Give Windows a moment to finish tearing the processes down
    Application.Wait Now + TimeValue("0:00:2")
End Function
'Yahoo top scraping
'Scrape the Yahoo! JAPAN top page with IE automation and print each
'news item's text and link to the Immediate window.
Sub main()
    Dim ie As InternetExplorer
    'Kill any stale IE processes first
    Call IeProcessKill
    'Start a fresh IE instance
    Set ie = New InternetExplorer
    'Keep the browser window hidden
    ie.Visible = False
    Debug.Print "Loading..."
    Debug.Print
    'Load the Yahoo! JAPAN top page
    ie.Navigate "https://www.yahoo.co.jp/"
    'Wait for the page to finish loading. DoEvents yields to the message
    'pump; the original empty loop busy-waited and froze Excel.
    Do While ie.Busy = True Or ie.readyState < READYSTATE_COMPLETE
        DoEvents
    Loop
    'Print each news item's text and its first link
    For Each tmp In ie.document.querySelector("main article section ul").getElementsByTagName("li")
        Debug.Print Trim(tmp.textContent)
        Debug.Print tmp.getElementsByTagName("a")(0).href
        Debug.Print
    Next tmp
    'Close the browser and release the reference
    ie.Quit
    Set ie = Nothing
End Sub
Recommended Posts