For the last few days, I've been hooked on extracting publication dates from news sites. I've cleaned up the code I wrote into a reasonably practical class, so I'm publishing it here. Of course, some sites do not allow bots to crawl, so be careful when using it. It targets the following sites:
Asahi Shimbun, Nikkei Shimbun, Sankei Shimbun, Yomiuri Shimbun, Mainichi Shimbun, Yahoo! News, CNN, Bloomberg, BBC, Reuters, Wall Street Journal, Forbes Japan, Newsweek, CNN.co.jp, ABC News, Ministry of Foreign Affairs, AFP BB, NHK News, Nikkan Kogyo Shimbun, EUROPA NEWSWIRE, United Nations Information Center, OPCW News, HAARETZ, THE DAILY STAR, INDEPENDENT, JETRO, Yukan Fuji
Pages are fetched with the requests library and parsed with BS4; the date is then pulled out with a regular expression and converted to a datetime with datetime.strptime. The granularity varies: some sites only expose the date, while others are precise to the minute. Strictly speaking, values only available per day should be date objects rather than datetime, but I haven't gone that far. The variable naming is also inconsistent in places. This is a script whose only goal is to "get a rough update date with as high a probability as possible." I also wonder whether there is a better way to obtain the update date than this, so I'd welcome comments from anyone familiar with the topic.
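Every per-site parser below follows the same basic shape. Here is a minimal sketch of that pattern (the URL and the .published selector are placeholders for illustration, not taken from any real site):

import datetime
import re

import bs4
import requests

# Fetch the page and parse it into a BeautifulSoup tree.
html = requests.get('https://example.com/news/article').content
soup = bs4.BeautifulSoup(html, 'lxml')

# Pull out the raw date text, e.g. "2020/2/15 10:30".
raw = soup.select('.published')[0].get_text()

# Isolate the date with a regular expression, then parse it with strptime.
m = re.search(r'\d{4}/\d{1,2}/\d{1,2} \d{1,2}:\d{1,2}', raw)
if m:
    news_timestamp = datetime.datetime.strptime(m.group(), '%Y/%m/%d %H:%M')
    print(news_timestamp)  # -> 2020-02-15 10:30:00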
news_timestamp.py
import bs4
import requests
import datetime
import re
from jeraconv import jeraconv  # converts Japanese era years (wareki) to Western years
class ScrapeNewsTimestamp:
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        }
    def scrape_return_timestamp_1(self, bs4Obj):
        try:
            # Asahi Shimbun: e.g. "2020年2月15日10時30分"
            return datetime.datetime.strptime(bs4Obj.select('time')[0].string, "%Y年%m月%d日%H時%M分")
        except Exception:
            return None
    def scrape_return_timestamp_2(self, bs4Obj):
        try:
            # Nikkei Shimbun (money section): e.g. "2020年2月15日10:30"
            return datetime.datetime.strptime(bs4Obj.select('time')[1].string, "%Y年%m月%d日%H:%M")
        except Exception:
            return None
    def scrape_return_timestamp_3(self, bs4Obj):
        try:
            # Nikkei Shimbun overseas financial news: e.g. "2020/2/15 10:30"
            return datetime.datetime.strptime(bs4Obj.select('.cmnc-publish')[0].string, "%Y/%m/%d %H:%M")
        except Exception:
            return None
    def scrape_return_timestamp_4(self, bs4Obj):
        try:
            # Nikkei Shimbun "Shunju" column: e.g. "2020/2/15付"
            return datetime.datetime.strptime(bs4Obj.select('.cmnc-publish')[0].string, "%Y/%m/%d付")
        except Exception:
            return None
    def scrape_return_timestamp_5(self, bs4Obj):
        try:
            # Sankei Shimbun international news: e.g. "2020.2.15 10:30"
            return datetime.datetime.strptime(bs4Obj.select('#__r_publish_date__')[0].string, "%Y.%m.%d %H:%M")
        except Exception:
            return None

    def scrape_return_timestamp_6(self, bs4Obj):
        try:
            # Yomiuri Shimbun domestic news: e.g. "2020/02/15 10:30"
            return datetime.datetime.strptime(bs4Obj.select('time')[0].string, "%Y/%m/%d %H:%M")
        except Exception:
            return None
    def scrape_return_timestamp_7(self, bs4Obj):
        try:
            # Mainichi Shimbun Tokyo morning edition: e.g. "2020年2月15日 東京朝刊"
            return datetime.datetime.strptime(bs4Obj.select('time')[0].string, "%Y年%m月%d日 東京朝刊")
        except Exception:
            return None

    def scrape_return_timestamp_8(self, bs4Obj):
        try:
            # Mainichi Shimbun Tokyo evening edition: e.g. "2020年2月15日 東京夕刊"
            return datetime.datetime.strptime(bs4Obj.select('time')[0].string, "%Y年%m月%d日 東京夕刊")
        except Exception:
            return None

    def scrape_return_timestamp_9(self, bs4Obj):
        try:
            # Mainichi Shimbun breaking news: e.g. "2020年2月15日10時30分"
            return datetime.datetime.strptime(bs4Obj.select('time')[0].string, "%Y年%m月%d日%H時%M分")
        except Exception:
            return None

    def scrape_return_timestamp_10(self, bs4Obj):
        try:
            # Mainichi Shimbun Premier: e.g. "2020年2月15日"
            return datetime.datetime.strptime(bs4Obj.select('time')[0].string, "%Y年%m月%d日")
        except Exception:
            return None
    def scrape_return_timestamp_11(self, bs4Obj):
        try:
            # Yahoo! News: e.g. "2/15 10:30" — the year is not shown,
            # so this assumes the article is from the current year.
            source = str(bs4Obj.select('p.source')[0].string)
            m1 = re.match(r'\d{1,2}/\d{1,2}', source)
            m2 = re.search(r'\d{1,2}:\d{1,2}', source)
            return datetime.datetime.strptime(str(datetime.datetime.now().year) + m1.group() + ' ' + m2.group(), "%Y%m/%d %H:%M")
        except Exception:
            return None
    def scrape_return_timestamp_12(self, bs4Obj):
        try:
            # CNN: e.g. "Updated 1030 GMT ... February 15, 2020"
            update_text = str(bs4Obj.select('.update-time')[0].getText())
            m1 = re.search(r'Updated (\d{4}) GMT', update_text)
            m2 = re.search(r'(January|February|March|April|May|June|July|August|September|October|November|December) (\d{1,2}), (\d{4})', update_text)
            # Reassemble as year + day + month name + HHMM for strptime.
            news_timestamp_tmp = m2.groups()[2] + m2.groups()[1] + m2.groups()[0] + m1.groups()[0]
            return datetime.datetime.strptime(news_timestamp_tmp, "%Y%d%B%H%M")
        except Exception:
            return None
    def scrape_return_timestamp_13(self, bs4Obj):
        try:
            # Bloomberg: e.g. "2020年2月15日 10:30 JST" (whitespace stripped first)
            timestamp_tmp = re.sub(' ', '', str(bs4Obj.select('time')[0].string))
            timestamp_tmp = re.sub('\n', '', timestamp_tmp)
            return datetime.datetime.strptime(timestamp_tmp, "%Y年%m月%d日%H:%MJST")
        except Exception:
            return None
    def scrape_return_timestamp_14(self, bs4Obj):
        try:
            # BBC: e.g. "15 February 2020"
            return datetime.datetime.strptime(bs4Obj.select("div.date.date--v2")[0].string, "%d %B %Y")
        except Exception:
            return None

    def scrape_return_timestamp_15(self, bs4Obj):
        try:
            # Reuters: e.g. "February 15, 2020 / 10:30 AM"
            date_text = str(bs4Obj.select(".ArticleHeader_date")[0].string)
            m1 = re.match(r'(January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}, \d{4}', date_text)
            m2 = re.search(r'\d{1,2}:\d{1,2}', date_text)
            return datetime.datetime.strptime(m1.group() + ' ' + m2.group(), "%B %d, %Y %H:%M")
        except Exception:
            return None
    def scrape_return_timestamp_16(self, bs4Obj):
        try:
            # Wall Street Journal: e.g. "Feb. 15, 2020 10:30 am ET" (whitespace stripped first)
            text = re.sub(' ', '', str(bs4Obj.select(".timestamp.article__timestamp")[0].string))
            text = re.sub('\n', '', text)
            # The period after the month is optional (e.g. "May 15, 2020").
            m = re.match(r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\.?(\d{1,2}),(\d{4})(\d{1,2}):(\d{1,2})', text)
            tmp = m.groups()
            timestamp_tmp = tmp[0] + ' ' + tmp[1].zfill(2) + ' ' + tmp[2] + ' ' + tmp[3].zfill(2) + ' ' + tmp[4].zfill(2)
            return datetime.datetime.strptime(timestamp_tmp, "%b %d %Y %H %M")
        except Exception:
            return None
    def scrape_return_timestamp_17(self, bs4Obj):
        try:
            # Forbes Japan: e.g. "2020/02/15 10:30"
            return datetime.datetime.strptime(bs4Obj.select("time")[0].string, "%Y/%m/%d %H:%M")
        except Exception:
            return None
    def scrape_return_timestamp_18(self, bs4Obj):
        try:
            # Newsweek: e.g. "2/15/20 at 10:30 AM" — two-digit year, so "20" is prepended.
            m = re.search(r'(\d{1,2})/(\d{1,2})/(\d{1,2}) at (\d{1,2}:\d{1,2}) ', str(bs4Obj.select('time')[0].string))
            tmp = m.groups()
            timestamp_tmp = tmp[0].zfill(2) + ' ' + tmp[1].zfill(2) + ' ' + '20' + tmp[2].zfill(2) + ' ' + tmp[3]
            return datetime.datetime.strptime(timestamp_tmp, "%m %d %Y %H:%M")
        except Exception:
            return None
    def scrape_return_timestamp_19(self, bs4Obj):
        try:
            # CNN.co.jp: e.g. "2020.02.15 ... 10:30"
            meta_text = str(bs4Obj.select("div .metadata-updatetime")[0])
            m1 = re.search(r'\d{4}.\d{2}.\d{2}', meta_text)
            m2 = re.search(r'\d{1,2}:\d{1,2}', meta_text)
            return datetime.datetime.strptime(m1.group() + ' ' + m2.group(), "%Y.%m.%d %H:%M")
        except Exception:
            return None
    def scrape_return_timestamp_20(self, bs4Obj):
        try:
            # BBC (Japanese date format): e.g. "2020年2月15日"
            return datetime.datetime.strptime(bs4Obj.select("div.date.date--v2")[0].string, "%Y年%m月%d日")
        except Exception:
            return None
    def scrape_return_timestamp_21(self, bs4Obj):
        try:
            # ABC News: e.g. "February 15, 2020, 10:30 AM"
            tmp = re.match(r'(January|February|March|April|May|June|July|August|September|October|November|December) (\d{1,2}), (\d{4}), (\d{1,2}:\d{1,2}) (AM|PM)', bs4Obj.select(".Byline__Meta.Byline__Meta--publishDate")[0].string)
            # Convert 12-hour to 24-hour time (12 AM -> 0, 12 PM -> 12).
            hour = int(tmp.groups()[3].split(':')[0]) % 12
            mini = tmp.groups()[3].split(':')[1]
            if tmp.groups()[4] == 'PM':
                hour += 12
            return datetime.datetime.strptime(tmp.groups()[0] + ' ' + tmp.groups()[1] + ' ' + tmp.groups()[2] + ' ' + str(hour) + ' ' + mini, "%B %d %Y %H %M")
        except Exception:
            return None
    def scrape_return_timestamp_22(self, bs4Obj):
        try:
            # Ministry of Foreign Affairs: dates use the Japanese era, e.g. "令和2年2月15日",
            # so jeraconv converts the era year to a Western year first.
            j2w = jeraconv.J2W()
            m = bs4Obj.select('.rightalign')[0].string
            y = m.split('年')[0]
            md = m.split('年')[1]
            return datetime.datetime.strptime(str(j2w.convert(str(y) + '年')) + '年' + md, "%Y年%m月%d日")
        except Exception:
            return None
    def scrape_return_timestamp_23(self, bs4Obj):
        try:
            # AFP BB: taken from the article:modified_time meta tag, e.g. "2020-02-15T10:30:00+09:00"
            for meta_tag in bs4Obj.find_all('meta', attrs={'property': "article:modified_time"}):
                m = re.match(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}', meta_tag.get('content'))
                return datetime.datetime.strptime(m.group(), "%Y-%m-%dT%H:%M:%S")
        except Exception:
            return None

    def scrape_return_timestamp_24(self, bs4Obj):
        try:
            # NHK News: taken from the datetime attribute of a <time> tag, e.g. "2020-02-15T10:30"
            for time_tag in bs4Obj.find_all('time'):
                return datetime.datetime.strptime(time_tag.get('datetime'), '%Y-%m-%dT%H:%M')
        except Exception:
            return None
    def scrape_return_timestamp_25(self, bs4Obj):
        try:
            # Nikkei Shimbun: e.g. "2020/2/15 10:30"
            m = re.search(r'\d{4}/\d{1,2}/\d{1,2} \d{1,2}:\d{1,2}', bs4Obj.select('.cmnc-publish')[0].string)
            return datetime.datetime.strptime(m.group(), "%Y/%m/%d %H:%M")
        except Exception:
            return None

    def scrape_return_timestamp_26(self, bs4Obj):
        try:
            # Nikkan Kogyo Shimbun: e.g. "2020/2/15 05:00"
            m = re.search(r'\d{4}/\d{1,2}/\d{1,2} \d{2}:\d{1,2}', str(bs4Obj.select('.date')[1]))
            return datetime.datetime.strptime(m.group(), "%Y/%m/%d %H:%M")
        except Exception:
            return None
    def scrape_return_timestamp_27(self, bs4Obj):
        try:
            # Asahi Shimbun Ronza: e.g. "2020年2月15日"
            return datetime.datetime.strptime(bs4Obj.select('.date')[0].string, "%Y年%m月%d日")
        except Exception:
            return None

    def scrape_return_timestamp_28(self, bs4Obj):
        try:
            # Asahi Shimbun junior and senior high school newspaper: e.g. "2020年2月15日"
            return datetime.datetime.strptime(bs4Obj.select('.date')[0].string, "%Y年%m月%d日")
        except Exception:
            return None
    def scrape_return_timestamp_35(self, bs4Obj):
        try:
            # EUROPA NEWSWIRE: e.g. "Feb 15, 2020"
            # (renumbered from a duplicate scrape_return_timestamp_28, which silently shadowed the Asahi parser above)
            m = re.search(r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) (\d{1,2}), (\d{4})', bs4Obj.select(".icon-cal")[0].string)
            return datetime.datetime.strptime(''.join(m.groups()), "%b%d%Y")
        except Exception:
            return None
    def scrape_return_timestamp_29(self, bs4Obj):
        try:
            # United Nations Information Center: e.g. "2020年2月15日"
            return datetime.datetime.strptime(bs4Obj.select("#cm_header_text")[0].string, "%Y年%m月%d日")
        except Exception:
            return None
    def scrape_return_timestamp_36(self, bs4Obj):
        try:
            # OPCW News: e.g. "15 February 2020"
            # (renumbered from a duplicate scrape_return_timestamp_29, which silently shadowed the UN parser above)
            m = re.search(r'(\d{1,2}) (January|February|March|April|May|June|July|August|September|October|November|December) \d{4}', bs4Obj.select(".news__date")[0].get_text())
            return datetime.datetime.strptime(m.group(), "%d %B %Y")
        except Exception:
            return None
    def scrape_return_timestamp_30(self, bs4Obj):
        try:
            # HAARETZ: e.g. "Feb 15, 2020 10:30"
            m = re.search(r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) \d{1,2}, \d{4} \d{1,2}:\d{1,2}', bs4Obj.select("time")[1].get_text())
            return datetime.datetime.strptime(m.group(), "%b %d, %Y %H:%M")
        except Exception:
            return None

    def scrape_return_timestamp_31(self, bs4Obj):
        try:
            # THE DAILY STAR: e.g. "Feb. 15, 2020 | 10:30"
            m = re.match(r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec). (\d{1,2}), (\d{4}) \| (\d{1,2}):(\d{1,2})', bs4Obj.select("time")[0].get_text())
            return datetime.datetime.strptime(''.join(m.groups()), "%b%d%Y%H%M")
        except Exception:
            return None

    def scrape_return_timestamp_32(self, bs4Obj):
        try:
            # INDEPENDENT: taken from an <amp-timeago> tag, e.g. "2020-02-15T10:30"
            m = re.search(r'\d{4}-\d{1,2}-\d{1,2}T\d{1,2}:\d{1,2}', str(bs4Obj.select("amp-timeago")[0]))
            return datetime.datetime.strptime(m.group(), "%Y-%m-%dT%H:%M")
        except Exception:
            return None
    def scrape_return_timestamp_33(self, bs4Obj):
        try:
            # JETRO: e.g. "2020年2月15日"
            m = re.search(r'\d{4}年\d{1,2}月\d{1,2}日', str(bs4Obj.select('p')))
            return datetime.datetime.strptime(m.group(), "%Y年%m月%d日")
        except Exception:
            return None
    def scrape_return_timestamp_34(self, bs4Obj):
        try:
            # Yukan Fuji: e.g. "2020.2.15"
            m = re.search(r'\d{4}.\d{1,2}.\d{1,2}', str(bs4Obj.select('#__r_publish_date__')[0]))
            return datetime.datetime.strptime(m.group(), "%Y.%m.%d")
        except Exception:
            return None
    def main(self, URL):
        self.URL = URL
        try:
            get_url_info = requests.get(URL, headers=self.headers)
            bs4Obj = bs4.BeautifulSoup(get_url_info.content, 'lxml')
        except Exception as e:
            print(e)
            return 'Unable to access URL'
        # Try each parser in turn and return the first timestamp that parses.
        # getattr replaces the original eval() dispatch; the range covers
        # parsers 1-36 (35 and 36 are the renumbered duplicates above).
        for i in range(1, 37):
            parser = getattr(self, 'scrape_return_timestamp_' + str(i), None)
            if parser is None:
                continue
            ts_temp = parser(bs4Obj)
            if ts_temp:
                return ts_temp  # fixed: was "td_temp", a NameError
test.py
from news_timestamp import *
TS = ScrapeNewsTimestamp()
news_timestamp = TS.main('https://www.mofa.go.jp/mofaj/press/release/press1_000423.html')
print(news_timestamp)
2020-02-15 00:00:00
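One wrinkle worth noting: the Ministry of Foreign Affairs pages print dates in the Japanese era calendar (for example 令和2年2月15日), which strptime cannot parse directly. That is why the class pulls in jeraconv. A minimal sketch of the conversion step, assuming the same J2W API used in the class (the input string is a made-up example):

from jeraconv import jeraconv

j2w = jeraconv.J2W()
print(j2w.convert('令和2年'))  # -> 2020, the Western year for Reiwa 2
print(str(j2w.convert('令和2年')) + '年2月15日')  # -> '2020年2月15日', now parseable with "%Y年%m月%d日"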
I have also published the code here: https://github.com/KanikaniYou/news_timestamp