See the following articles for environment setup and the npm installation procedure:
- python (pyenv) installation memo
- Scraping with Golang + PhantomJS
$ sudo yum install -y bzip2
$ npm install --save phantomjs
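If the install succeeded, the PhantomJS binary should be under node_modules (the same path the samples below use); a quick way to check:
$ node_modules/phantomjs/bin/phantomjs --version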
selenium
$ pip install selenium
nose
$ pip install nose
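nose.tools provides the small assertion helpers used in the sample below; a minimal sanity check, for reference:
# -*- coding:utf-8 -*-
import nose.tools as nose

# eq_ raises AssertionError when the two values differ
nose.eq_(1 + 1, 2)
# ok_ asserts that its argument is truthy
nose.ok_('phantom' in 'phantomjs')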
sample
# -*- coding:utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
import nose.tools as nose
# account
email = 'user'
password = 'password'
#############
# phantomjs
#############
# user agent
user_agent = 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36'
# path to the PhantomJS binary
pjs_path = 'node_modules/phantomjs/bin/phantomjs'
dcap = {
"phantomjs.page.settings.userAgent" : user_agent,
'marionette' : True
}
driver = webdriver.PhantomJS(executable_path=pjs_path, desired_capabilities=dcap)
# wait up to 5 seconds
wait = WebDriverWait(driver, 5)
#############
# get html
#############
# login page
login_page_url = 'http://127.0.0.1/sign_in'
driver.get(login_page_url)
# wait until the page body is present
wait.until(ec.presence_of_element_located((By.TAG_NAME, 'body')))
# check the current URL
nose.eq_('http://127.0.0.1/sign_in', driver.current_url)
#############
# login
#############
# button click
show_signin = driver.find_element_by_id('showSignIn')
show_signin.click()
# email
login_xpath = '//*[@id="user_email"]'
#Wait until the target element is visible
wait.until(ec.visibility_of_element_located((By.XPATH, login_xpath)))
# fill in the email form
login_id_form = driver.find_element_by_xpath(login_xpath)
login_id_form.clear()
login_id_form.send_keys(email)
# password
password_xpath = '//*[@id="user_password"]'
#Wait until the target element is visible
wait.until(ec.visibility_of_element_located((By.XPATH, password_xpath)))
# fill in the password form
password_form = driver.find_element_by_xpath(password_xpath)
password_form.clear()
password_form.send_keys(password)
# submit
submit_xpath = '//*[@id="new_user"]/div[4]/input'
driver.find_element_by_xpath(submit_xpath).click()
#############
# result
#############
driver.get('http://127.0.0.1/users/edit')
# wait until the page body is present
wait.until(ec.presence_of_element_located((By.TAG_NAME, 'body')))
# check the current URL
nose.eq_('http://127.0.0.1/users/edit', driver.current_url)
# check an element on the page to confirm the login succeeded
user_email = driver.find_element_by_xpath('//*[@id="user_email"]').get_attribute("value")
nose.eq_(email, user_email)
# quit the driver so the PhantomJS process does not keep running
driver.quit()
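If you save the sample as, say, sample.py (the file name here is my assumption), it runs as a plain Python script; nose.eq_ raises an AssertionError and stops the script if any check fails:
$ python sample.py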
BeautifulSoup
You can easily parse HTML by using BeautifulSoup in combination.
$ pip install beautifulsoup4
It is safer to install the following parsers as well. Reference: Let's specify the parser explicitly in Beautiful Soup 4.x
$ pip install html5lib
$ pip install lxml
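For reference, the parser is passed as the second argument to BeautifulSoup; a minimal sketch exercising the two parsers installed above:
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup

html = '<html><body><h1>hello</h1></body></html>'
# specify the parser explicitly instead of relying on the default
print(BeautifulSoup(html, 'lxml').h1.string)
print(BeautifulSoup(html, 'html5lib').h1.string)
The full sample combined with PhantomJS: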
# -*- coding:utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
import nose.tools as nose
from bs4 import BeautifulSoup
#############
# phantomjs
#############
# user agent
user_agent = 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36'
# path to the PhantomJS binary
pjs_path = 'node_modules/phantomjs/bin/phantomjs'
dcap = {
"phantomjs.page.settings.userAgent" : user_agent,
'marionette' : True
}
driver = webdriver.PhantomJS(executable_path=pjs_path, desired_capabilities=dcap)
# wait up to 5 seconds
wait = WebDriverWait(driver, 5)
#############
# load page
#############
driver.get('http://127.0.0.1/users/edit')
data = driver.page_source.encode('utf-8')
#############
# parse html
#############
# specify the parser explicitly
html = BeautifulSoup(data, 'lxml')
print(html)
print(html.title)
print(html.title.string)
print(html.find('h1'))
print(html.find('select', {'id': 'hoge'}))
# quit the driver so the PhantomJS process does not keep running
driver.quit()
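As a minimal standalone sketch of pulling data out of the parsed tree (the select id 'hoge' and the option values here are made up for illustration):
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup

# hypothetical markup; in practice, pass driver.page_source as in the sample above
src = '<select id="hoge"><option value="1">foo</option><option value="2">bar</option></select>'
soup = BeautifulSoup(src, 'lxml')
for option in soup.find('select', {'id': 'hoge'}).find_all('option'):
    print(option.get('value'), option.get_text())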
pandas
Tables on a page can also be read directly into DataFrames with pandas's read_html.
# -*- coding:utf-8 -*-
import pandas as pd
url = 'http://stocks.finance.yahoo.co.jp/stocks/history/?code=998407.O'
tables = pd.io.html.read_html(url, flavor='bs4')
print(tables[1])
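The result is an ordinary DataFrame, so continuing the script above you can, for example, write it straight to CSV (the output file name is an arbitrary placeholder):
# save the scraped table to CSV (file name is arbitrary)
tables[1].to_csv('table.csv', index=False)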
You can parse from an HTML source string as well as from a URL.
# -*- coding:utf-8 -*-
import pandas as pd
html = '''
<html>
<body>
<table>
<tr><td>sample1</td></tr>
<tr><td>sample2</td></tr>
<tr><td>sample3</td></tr>
<tr><td>sample4</td></tr>
</table>
</body>
</html>
'''
tables = pd.io.html.read_html(html, flavor='bs4')
print(tables[0])
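For reference, read_html always returns a list of DataFrames, one per table element, so individual cells come out with normal pandas indexing; continuing the script above:
# one DataFrame per <table> in the input
print(len(tables))           # 1
# normal pandas indexing works on the result
print(tables[0].iloc[1, 0])  # sample2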
PhantomJS with pandas
# -*- coding:utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from bs4 import BeautifulSoup
import pandas as pd
#############
# phantomjs
#############
# user agent
user_agent = 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36'
# path to the PhantomJS binary
pjs_path = 'node_modules/phantomjs/bin/phantomjs'
dcap = {
"phantomjs.page.settings.userAgent" : user_agent,
'marionette' : True
}
driver = webdriver.PhantomJS(executable_path=pjs_path, desired_capabilities=dcap)
# wait up to 5 seconds
wait = WebDriverWait(driver, 5)
#############
# load page
#############
driver.get('http://127.0.0.1/users/edit')
data = driver.page_source.encode('utf-8')
# parse
soup = BeautifulSoup(data, 'lxml')
table = soup.find_all('table')[0]
df = pd.read_html(str(table))
print(df[0])
# quit the driver so the PhantomJS process does not keep running
driver.quit()
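As a defensive variant (my own sketch, not from the referenced articles), wrapping the scrape in try/finally guarantees the PhantomJS process is terminated even when a step throws:
# -*- coding:utf-8 -*-
from selenium import webdriver

pjs_path = 'node_modules/phantomjs/bin/phantomjs'
driver = webdriver.PhantomJS(executable_path=pjs_path)
try:
    driver.get('http://127.0.0.1/users/edit')
    print(driver.title)
finally:
    # always terminate the PhantomJS process, even on failure
    driver.quit()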
References
- Scraping with Python + Selenium + Phantom.js + Beautifulsoup
- Pandas is super useful when scraping HTML tables
- Easy scraping of table with pandas