- Get the URLs contained in the tweets of a specified Twitter account
- Scrape the desired information from the page at each URL with a specific string appended
- Save the scraped information in PostgreSQL
- Stop crawling once we hit data that has already been saved
- Use Beautiful Soup
crawl.py
# -*- coding: utf-8 -*-
try:
    # Python 3
    from urllib import request
except ImportError:
    # Python 2
    import urllib2 as request
from bs4 import BeautifulSoup
import twpy
import time
# PostgreSQL connection
import psycopg2


def main():
    # Read the tweet data
    api = twpy.api
    tweets = api.user_timeline(screen_name="ID of a twitter account")
    connector = psycopg2.connect(host="hoge", port=5432, dbname="hogehoge", user="hoge", password="hoge")
    max_hoge_id_fetcher = connector.cursor()
    cursor = connector.cursor()
    # Get the id of the latest hoge saved in the DB
    max_hoge_id_fetcher.execute('select MAX(hoge_id) from hoge')
    for row in max_hoge_id_fetcher:
        max_hoge_id = row[0]
    print("The latest saved ID is " + str(max_hoge_id))
    # Read the tweets one by one and crawl each URL
    for tweet in tweets:
        text = tweet.text
        url = tweet.entities['urls']
        expanded_url = url[0]['expanded_url']
        # This time, the crawl destination is the URL with a specific string appended
        crawl_url = expanded_url + "hogehoge"
        response = request.urlopen(crawl_url)
        # Read the response into body
        body = response.read()
        # Parse the HTML into soup
        soup = BeautifulSoup(body, 'html.parser')
        hoge_id = soup.find('id').text
        print("Start " + str(hoge_id))
        # user_timeline returns tweets newest first, so once we reach the
        # latest saved hoge_id, everything after it is already stored
        if int(hoge_id) <= max_hoge_id:
            print('This data is already included.')
            break
        description = soup.find('description').text
        # (remaining fields omitted)
        # Insert the data
        cursor.execute('insert into hoge(hoge_id, description, hogehoge, ...) values (%s, %s, %s, ...)',
                       (hoge_id, description, hoge, ...))
        print("inserted!")
        # Wait 3 seconds between requests
        time.sleep(3)
    # Save the changes
    connector.commit()
    cursor.close()
    connector.close()


if __name__ == '__main__':
    main()
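crawl.py assumes the hoge table already exists. Here is a minimal setup sketch; the table and column names follow crawl.py, but the integer type for hoge_id is an assumption, and the columns beyond the two visible above are elided just as in the original insert.

# setup sketch -- hoge_id's type is an assumption; extra columns are elided
import psycopg2

connector = psycopg2.connect(host="hoge", port=5432, dbname="hogehoge", user="hoge", password="hoge")
cursor = connector.cursor()
cursor.execute('''
    create table if not exists hoge (
        hoge_id     integer primary key,
        description text
        -- remaining columns elided, as in crawl.py
    )
''')
connector.commit()
cursor.close()
connector.close()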
twpy.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Import the Tweepy library
import tweepy

# Set the various keys
CONSUMER_KEY = 'hoge'
CONSUMER_SECRET = 'hoge'
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
ACCESS_TOKEN = 'hoge'
ACCESS_SECRET = 'hoge'
auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)
# Create an API instance
api = tweepy.API(auth)
# Now we are ready to operate the Twitter API from Python
print("Done!")
$ python crawl.py
Done!
The latest saved ID is 92
Start 98
inserted!
Start 97
inserted!
Start 96
inserted!
Start 95
inserted!
Start 94
inserted!
Start 93
inserted!
Start 92
This data is already included.
References:

[Python: Scraping websites with BeautifulSoup4](http://momijiame.tumblr.com/post/114227737756/python-beautifulsoup4-%E3%82%92%E4%BD%BF%E3%81%A3%E3%81%A6-web-%E3%82%B5%E3%82%A4%E3%83%88%E3%82%92%E3%82%B9%E3%82%AF%E3%83%AC%E3%82%A4%E3%83%94%E3%83%B3%E3%82%B0%E3%81%99%E3%82%8B)
Using Tweepy to operate the Twitter API from Python very easily
Thank you very much.