Notes on the various ways to obtain a BeautifulSoup object. For sites whose content is rendered by JavaScript, such as Google Image Search, it seems you have to use Selenium.
get_soup.py
#-*- coding:utf-8 -*-
from bs4 import BeautifulSoup
def get_soup_uulib2(url):
    """Fetch ``url`` with urllib2 and parse the response with BeautifulSoup.

    The request is sent with a ``Mozilla/5.0`` User-agent header. Only the
    markup returned by the server is parsed; nothing here runs JavaScript.

    Args:
        url: Address of the page to download.

    Returns:
        A ``BeautifulSoup`` object built with the ``lxml`` parser.
    """
    from contextlib import closing
    try:
        import urllib2  # Python 2
    except ImportError:
        # Python 3 exposes the same build_opener API under urllib.request.
        import urllib.request as urllib2

    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    # closing() guarantees the HTTP response is released even if parsing
    # raises, instead of leaving the connection open.
    with closing(opener.open(url)) as page:
        return BeautifulSoup(page, "lxml")
def get_soup_urequests(url):
    """Fetch ``url`` with requests and parse the response with BeautifulSoup.

    Only the markup returned by the server is parsed; nothing here runs
    JavaScript.

    Args:
        url: Address of the page to download.

    Returns:
        A ``BeautifulSoup`` object built with the ``lxml`` parser. The soup
        is returned to the caller rather than printed, so the result can
        actually be used.
    """
    import requests

    # Using the session as a context manager closes its pooled connections
    # once the request has completed.
    with requests.Session() as session:
        response = session.get(url)
    return BeautifulSoup(response.text, "lxml")
def get_soup_uselenium(url, chromedriver="./chromedriver"):
    """Load ``url`` in Chrome via Selenium and parse the rendered page.

    Because the page is loaded in a real browser, the returned soup is built
    from ``driver.page_source`` after the browser has processed the page.

    Args:
        url: Address of the page to open.
        chromedriver: Path to the chromedriver executable. Defaults to
            ``./chromedriver`` (the previously hard-coded location).

    Returns:
        A ``BeautifulSoup`` object built with the ``lxml`` parser.
    """
    from selenium import webdriver

    # Requires the chromedriver binary:
    # https://sites.google.com/a/chromium.org/chromedriver/downloads
    driver = webdriver.Chrome(chromedriver)
    try:
        driver.get(url)
        return BeautifulSoup(driver.page_source, "lxml")
    finally:
        # Always shut the browser down; otherwise every call leaves a Chrome
        # window and a chromedriver process running.
        driver.quit()
# JavaScript enabled: the page is rendered by a real browser via Selenium.
print(get_soup_uselenium("https://www.google.co.jp/search?q=Cat"))
# JavaScript disabled: plain HTTP fetch, no browser involved.
# print get_soup_uulib2("https://www.google.co.jp/search?q=Cat")
# print get_soup_uulib2("https://www.google.co.jp/search?q=Cat")
Having a browser window launch on every call is a nuisance.
I searched for "headless selenium"
and tried various approaches, but eventually gave up.
Could anyone tell me how to do it?
For headless operation (no browser window is launched),
just set driver = webdriver.PhantomJS().
PhantomJS can be installed with, for example, brew install phantomjs.