- Get the URLs contained in the tweets of a specified Twitter account
- Scrape the desired information from the page at each URL with a specific string appended
- Save the scraped information in PostgreSQL
- Stop crawling once we hit data that has already been saved
- Use Beautiful Soup
crawl.py
# -*- coding: utf-8 -*-
try:
    # Python 3
    from urllib import request
except ImportError:
    # Python 2
    import urllib2 as request
from bs4 import BeautifulSoup
import twpy
import time
# PostgreSQL connection
import psycopg2


def main():
    # Read the tweet data
    api = twpy.api
    tweets = api.user_timeline(screen_name="ID of a twitter account")
    connector = psycopg2.connect(host="hoge", port=5432, dbname="hogehoge", user="hoge", password="hoge")
    max_hoge_id_fetcher = connector.cursor()
    cursor = connector.cursor()
    # Get the id of the latest hoge saved in the DB
    max_hoge_id_fetcher.execute('select MAX(hoge_id) from hoge')
    for row in max_hoge_id_fetcher:
        max_hoge_id = row[0]
    print("The latest saved ID is " + str(max_hoge_id))
    # Read the tweets one by one and crawl each URL
    for tweet in tweets:
        text = tweet.text
        url = tweet.entities['urls']
        expanded_url = url[0]['expanded_url']
        # This time, the crawl destination is the URL with a specific string appended
        crawl_url = expanded_url + "hogehoge"
        response = request.urlopen(crawl_url)
        # Read the response into body
        body = response.read()
        # Parse the HTML into soup
        soup = BeautifulSoup(body, 'html.parser')
        hoge_id = soup.find('id').text
        print("Start " + str(hoge_id))
        # user_timeline returns tweets newest first, so once we reach the
        # latest saved hoge_id, everything after it is already stored
        if int(hoge_id) <= max_hoge_id:
            print('This data is already included.')
            break
        description = soup.find('description').text
        # (remaining fields omitted)
        # Insert the data
        cursor.execute('insert into hoge(hoge_id, description, hogehoge, ...) values (%s, %s, %s, ...)',
                       (hoge_id, description, hoge, ...))
        print("inserted!")
        # Wait 3 seconds between requests
        time.sleep(3)
    # Save the changes
    connector.commit()
    cursor.close()
    connector.close()


if __name__ == '__main__':
    main()
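crawl.py assumes the hoge table already exists. Here is a minimal setup sketch; the table and column names follow crawl.py, but the integer type for hoge_id is an assumption, and the columns beyond the two visible above are elided just as in the original insert.

# setup sketch -- hoge_id's type is an assumption; extra columns are elided
import psycopg2

connector = psycopg2.connect(host="hoge", port=5432, dbname="hogehoge", user="hoge", password="hoge")
cursor = connector.cursor()
cursor.execute('''
    create table if not exists hoge (
        hoge_id     integer primary key,
        description text
        -- remaining columns elided, as in crawl.py
    )
''')
connector.commit()
cursor.close()
connector.close()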
twpy.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Import the Tweepy library
import tweepy

# Set the various keys
CONSUMER_KEY = 'hoge'
CONSUMER_SECRET = 'hoge'
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
ACCESS_TOKEN = 'hoge'
ACCESS_SECRET = 'hoge'
auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)
# Create an API instance
api = tweepy.API(auth)
# Now we are ready to operate the Twitter API from Python
print("Done!")
$ python crawl.py
Done!
The latest saved ID is 92
Start 98
inserted!
Start 97
inserted!
Start 96
inserted!
Start 95
inserted!
Start 94
inserted!
Start 93
inserted!
Start 92
This data is already included.
References:

[Python: Scraping websites with BeautifulSoup4](http://momijiame.tumblr.com/post/114227737756/python-beautifulsoup4-%E3%82%92%E4%BD%BF%E3%81%A3%E3%81%A6-web-%E3%82%B5%E3%82%A4%E3%83%88%E3%82%92%E3%82%B9%E3%82%AF%E3%83%AC%E3%82%A4%E3%83%94%E3%83%B3%E3%82%B0%E3%81%99%E3%82%8B)
Using Tweepy to operate the Twitter API from Python very easily
Thank you very much.