See the following articles for environment setup and the npm installation procedure:
- python (pyenv) installation memo
- Scraping with Golang + PhantomJS
$ sudo yum install -y bzip2
$ npm install --save phantomjs
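If the install succeeded, the PhantomJS binary should be under node_modules (the same path the samples below use); a quick way to check:
$ node_modules/phantomjs/bin/phantomjs --version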
selenium
$ pip install selenium
nose
$ pip install nose
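nose.tools provides the small assertion helpers used in the sample below; a minimal sanity check, for reference:
# -*- coding:utf-8 -*-
import nose.tools as nose

# eq_ raises AssertionError when the two values differ
nose.eq_(1 + 1, 2)
# ok_ asserts that its argument is truthy
nose.ok_('phantom' in 'phantomjs')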
sample
# -*- coding:utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
import nose.tools as nose
# account
email = 'user'
password = 'password'
#############
# phantomjs
#############
# user agent
user_agent = 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36'
# path to the PhantomJS binary
pjs_path = 'node_modules/phantomjs/bin/phantomjs'
dcap = {
"phantomjs.page.settings.userAgent" : user_agent,
'marionette' : True
}
driver = webdriver.PhantomJS(executable_path=pjs_path, desired_capabilities=dcap)
# wait up to 5 seconds
wait = WebDriverWait(driver, 5)
#############
# get html
#############
# login page
login_page_url = 'http://127.0.0.1/sign_in'
driver.get(login_page_url)
# wait until the page body is present
wait.until(ec.presence_of_element_located((By.TAG_NAME, 'body')))
# check the current URL
nose.eq_('http://127.0.0.1/sign_in', driver.current_url)
#############
# login
#############
# button click
show_signin = driver.find_element_by_id('showSignIn')
show_signin.click()
# email
login_xpath = '//*[@id="user_email"]'
#Wait until the target element is visible
wait.until(ec.visibility_of_element_located((By.XPATH, login_xpath)))
# fill in the email form
login_id_form = driver.find_element_by_xpath(login_xpath)
login_id_form.clear()
login_id_form.send_keys(email)
# password
password_xpath = '//*[@id="user_password"]'
#Wait until the target element is visible
wait.until(ec.visibility_of_element_located((By.XPATH, password_xpath)))
# fill in the password form
password_form = driver.find_element_by_xpath(password_xpath)
password_form.clear()
password_form.send_keys(password)
# submit
submit_xpath = '//*[@id="new_user"]/div[4]/input'
driver.find_element_by_xpath(submit_xpath).click()
#############
# result
#############
driver.get('http://127.0.0.1/users/edit')
# wait until the page body is present
wait.until(ec.presence_of_element_located((By.TAG_NAME, 'body')))
# check the current URL
nose.eq_('http://127.0.0.1/users/edit', driver.current_url)
# check an element on the page to confirm the login succeeded
user_email = driver.find_element_by_xpath('//*[@id="user_email"]').get_attribute("value")
nose.eq_(email, user_email)
# quit the driver so the PhantomJS process does not keep running
driver.quit()
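If you save the sample as, say, sample.py (the file name here is my assumption), it runs as a plain Python script; nose.eq_ raises an AssertionError and stops the script if any check fails:
$ python sample.py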
BeautifulSoup
You can easily parse HTML by using BeautifulSoup in combination.
$ pip install beautifulsoup4
It is safer to install the following parsers as well. Reference: Let's specify the parser explicitly in Beautiful Soup 4.x
$ pip install html5lib
$ pip install lxml
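For reference, the parser is passed as the second argument to BeautifulSoup; a minimal sketch exercising the two parsers installed above:
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup

html = '<html><body><h1>hello</h1></body></html>'
# specify the parser explicitly instead of relying on the default
print(BeautifulSoup(html, 'lxml').h1.string)
print(BeautifulSoup(html, 'html5lib').h1.string)
The full sample combined with PhantomJS: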
# -*- coding:utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
import nose.tools as nose
from bs4 import BeautifulSoup
#############
# phantomjs
#############
# user agent
user_agent = 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36'
# path to the PhantomJS binary
pjs_path = 'node_modules/phantomjs/bin/phantomjs'
dcap = {
"phantomjs.page.settings.userAgent" : user_agent,
'marionette' : True
}
driver = webdriver.PhantomJS(executable_path=pjs_path, desired_capabilities=dcap)
# wait up to 5 seconds
wait = WebDriverWait(driver, 5)
#############
# load page
#############
driver.get('http://127.0.0.1/users/edit')
data = driver.page_source.encode('utf-8')
#############
# parse html
#############
# specify the parser explicitly
html = BeautifulSoup(data, 'lxml')
print(html)
print(html.title)
print(html.title.string)
print(html.find('h1'))
print(html.find('select', {'id': 'hoge'}))
# quit the driver so the PhantomJS process does not keep running
driver.quit()
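As a minimal standalone sketch of pulling data out of the parsed tree (the select id 'hoge' and the option values here are made up for illustration):
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup

# hypothetical markup; in practice, pass driver.page_source as in the sample above
src = '<select id="hoge"><option value="1">foo</option><option value="2">bar</option></select>'
soup = BeautifulSoup(src, 'lxml')
for option in soup.find('select', {'id': 'hoge'}).find_all('option'):
    print(option.get('value'), option.get_text())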
pandas
Tables on a page can also be read directly into DataFrames with pandas's read_html.
# -*- coding:utf-8 -*-
import pandas as pd
url = 'http://stocks.finance.yahoo.co.jp/stocks/history/?code=998407.O'
tables = pd.io.html.read_html(url, flavor='bs4')
print(tables[1])
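The result is an ordinary DataFrame, so continuing the script above you can, for example, write it straight to CSV (the output file name is an arbitrary placeholder):
# save the scraped table to CSV (file name is arbitrary)
tables[1].to_csv('table.csv', index=False)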
You can parse from an HTML source string as well as from a URL.
# -*- coding:utf-8 -*-
import pandas as pd
html = '''
<html>
<body>
<table>
<tr><td>sample1</td></tr>
<tr><td>sample2</td></tr>
<tr><td>sample3</td></tr>
<tr><td>sample4</td></tr>
</table>
</body>
</html>
'''
tables = pd.io.html.read_html(html, flavor='bs4')
print(tables[0])
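For reference, read_html always returns a list of DataFrames, one per table element, so individual cells come out with normal pandas indexing; continuing the script above:
# one DataFrame per <table> in the input
print(len(tables))           # 1
# normal pandas indexing works on the result
print(tables[0].iloc[1, 0])  # sample2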
PhantomJS with pandas
# -*- coding:utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from bs4 import BeautifulSoup
import pandas as pd
#############
# phantomjs
#############
# user agent
user_agent = 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36'
# path to the PhantomJS binary
pjs_path = 'node_modules/phantomjs/bin/phantomjs'
dcap = {
"phantomjs.page.settings.userAgent" : user_agent,
'marionette' : True
}
driver = webdriver.PhantomJS(executable_path=pjs_path, desired_capabilities=dcap)
# wait up to 5 seconds
wait = WebDriverWait(driver, 5)
#############
# load page
#############
driver.get('http://127.0.0.1/users/edit')
data = driver.page_source.encode('utf-8')
# parse
soup = BeautifulSoup(data, 'lxml')
table = soup.find_all('table')[0]
df = pd.read_html(str(table))
print(df[0])
# quit the driver so the PhantomJS process does not keep running
driver.quit()
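As a defensive variant (my own sketch, not from the referenced articles), wrapping the scrape in try/finally guarantees the PhantomJS process is terminated even when a step throws:
# -*- coding:utf-8 -*-
from selenium import webdriver

pjs_path = 'node_modules/phantomjs/bin/phantomjs'
driver = webdriver.PhantomJS(executable_path=pjs_path)
try:
    driver.get('http://127.0.0.1/users/edit')
    print(driver.title)
finally:
    # always terminate the PhantomJS process, even on failure
    driver.quit()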
References
- Scraping with Python + Selenium + Phantom.js + Beautifulsoup
- Pandas is super useful when scraping HTML tables
- Easy scraping of table with pandas