For the last few days, I've been hooked on extracting publication dates from news sites. I've cleaned up the code I wrote into a reasonably practical class, so I'm publishing it here. Of course, some sites do not allow bots to crawl, so be careful when using it. It targets the following sites:
Asahi Shimbun, Nikkei Shimbun, Sankei Shimbun, Yomiuri Shimbun, Mainichi Shimbun, Yahoo! News, CNN, Bloomberg, BBC, Reuters, Wall Street Journal, Forbes Japan, Newsweek, CNN.co.jp, ABC News, Ministry of Foreign Affairs, AFP BB, NHK News, Nikkan Kogyo Shimbun, EUROPA NEWSWIRE, United Nations Information Center, OPCW News, HAARETZ, THE DAILY STAR, INDEPENDENT, JETRO, Yukan Fuji
Pages are fetched with the requests library and parsed with BS4; the date is then pulled out with a regular expression and converted to a datetime with datetime.strptime. The granularity varies: some sites only expose the date, while others are precise to the minute. Strictly speaking, values only available per day should be date objects rather than datetime, but I haven't gone that far. The variable naming is also inconsistent in places. This is a script whose only goal is to "get a rough update date with as high a probability as possible." I also wonder whether there is a better way to obtain the update date than this, so I'd welcome comments from anyone familiar with the topic.
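Every per-site parser below follows the same basic shape. Here is a minimal sketch of that pattern (the URL and the .published selector are placeholders for illustration, not taken from any real site):

import datetime
import re

import bs4
import requests

# Fetch the page and parse it into a BeautifulSoup tree.
html = requests.get('https://example.com/news/article').content
soup = bs4.BeautifulSoup(html, 'lxml')

# Pull out the raw date text, e.g. "2020/2/15 10:30".
raw = soup.select('.published')[0].get_text()

# Isolate the date with a regular expression, then parse it with strptime.
m = re.search(r'\d{4}/\d{1,2}/\d{1,2} \d{1,2}:\d{1,2}', raw)
if m:
    news_timestamp = datetime.datetime.strptime(m.group(), '%Y/%m/%d %H:%M')
    print(news_timestamp)  # -> 2020-02-15 10:30:00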
news_timestamp.py
import bs4
import requests
import datetime
import re
from jeraconv import jeraconv  # converts Japanese era years (wareki) to Western years
class ScrapeNewsTimestamp:
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        }
    def scrape_return_timestamp_1(self, bs4Obj):
        try:
            # Asahi Shimbun: e.g. "2020年2月15日10時30分"
            return datetime.datetime.strptime(bs4Obj.select('time')[0].string, "%Y年%m月%d日%H時%M分")
        except Exception:
            return None
    def scrape_return_timestamp_2(self, bs4Obj):
        try:
            # Nikkei Shimbun (money section): e.g. "2020年2月15日10:30"
            return datetime.datetime.strptime(bs4Obj.select('time')[1].string, "%Y年%m月%d日%H:%M")
        except Exception:
            return None
    def scrape_return_timestamp_3(self, bs4Obj):
        try:
            # Nikkei Shimbun overseas financial news: e.g. "2020/2/15 10:30"
            return datetime.datetime.strptime(bs4Obj.select('.cmnc-publish')[0].string, "%Y/%m/%d %H:%M")
        except Exception:
            return None
    def scrape_return_timestamp_4(self, bs4Obj):
        try:
            # Nikkei Shimbun "Shunju" column: e.g. "2020/2/15付"
            return datetime.datetime.strptime(bs4Obj.select('.cmnc-publish')[0].string, "%Y/%m/%d付")
        except Exception:
            return None
    def scrape_return_timestamp_5(self, bs4Obj):
        try:
            # Sankei Shimbun international news: e.g. "2020.2.15 10:30"
            return datetime.datetime.strptime(bs4Obj.select('#__r_publish_date__')[0].string, "%Y.%m.%d %H:%M")
        except Exception:
            return None

    def scrape_return_timestamp_6(self, bs4Obj):
        try:
            # Yomiuri Shimbun domestic news: e.g. "2020/02/15 10:30"
            return datetime.datetime.strptime(bs4Obj.select('time')[0].string, "%Y/%m/%d %H:%M")
        except Exception:
            return None
    def scrape_return_timestamp_7(self, bs4Obj):
        try:
            # Mainichi Shimbun Tokyo morning edition: e.g. "2020年2月15日 東京朝刊"
            return datetime.datetime.strptime(bs4Obj.select('time')[0].string, "%Y年%m月%d日 東京朝刊")
        except Exception:
            return None

    def scrape_return_timestamp_8(self, bs4Obj):
        try:
            # Mainichi Shimbun Tokyo evening edition: e.g. "2020年2月15日 東京夕刊"
            return datetime.datetime.strptime(bs4Obj.select('time')[0].string, "%Y年%m月%d日 東京夕刊")
        except Exception:
            return None

    def scrape_return_timestamp_9(self, bs4Obj):
        try:
            # Mainichi Shimbun breaking news: e.g. "2020年2月15日10時30分"
            return datetime.datetime.strptime(bs4Obj.select('time')[0].string, "%Y年%m月%d日%H時%M分")
        except Exception:
            return None

    def scrape_return_timestamp_10(self, bs4Obj):
        try:
            # Mainichi Shimbun Premier: e.g. "2020年2月15日"
            return datetime.datetime.strptime(bs4Obj.select('time')[0].string, "%Y年%m月%d日")
        except Exception:
            return None
    def scrape_return_timestamp_11(self, bs4Obj):
        try:
            # Yahoo! News: e.g. "2/15 10:30" — the year is not shown,
            # so this assumes the article is from the current year.
            source = str(bs4Obj.select('p.source')[0].string)
            m1 = re.match(r'\d{1,2}/\d{1,2}', source)
            m2 = re.search(r'\d{1,2}:\d{1,2}', source)
            return datetime.datetime.strptime(str(datetime.datetime.now().year) + m1.group() + ' ' + m2.group(), "%Y%m/%d %H:%M")
        except Exception:
            return None
    def scrape_return_timestamp_12(self, bs4Obj):
        try:
            # CNN: e.g. "Updated 1030 GMT ... February 15, 2020"
            update_text = str(bs4Obj.select('.update-time')[0].getText())
            m1 = re.search(r'Updated (\d{4}) GMT', update_text)
            m2 = re.search(r'(January|February|March|April|May|June|July|August|September|October|November|December) (\d{1,2}), (\d{4})', update_text)
            # Reassemble as year + day + month name + HHMM for strptime.
            news_timestamp_tmp = m2.groups()[2] + m2.groups()[1] + m2.groups()[0] + m1.groups()[0]
            return datetime.datetime.strptime(news_timestamp_tmp, "%Y%d%B%H%M")
        except Exception:
            return None
    def scrape_return_timestamp_13(self, bs4Obj):
        try:
            # Bloomberg: e.g. "2020年2月15日 10:30 JST" (whitespace stripped first)
            timestamp_tmp = re.sub(' ', '', str(bs4Obj.select('time')[0].string))
            timestamp_tmp = re.sub('\n', '', timestamp_tmp)
            return datetime.datetime.strptime(timestamp_tmp, "%Y年%m月%d日%H:%MJST")
        except Exception:
            return None
    def scrape_return_timestamp_14(self, bs4Obj):
        try:
            # BBC: e.g. "15 February 2020"
            return datetime.datetime.strptime(bs4Obj.select("div.date.date--v2")[0].string, "%d %B %Y")
        except Exception:
            return None

    def scrape_return_timestamp_15(self, bs4Obj):
        try:
            # Reuters: e.g. "February 15, 2020 / 10:30 AM"
            date_text = str(bs4Obj.select(".ArticleHeader_date")[0].string)
            m1 = re.match(r'(January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}, \d{4}', date_text)
            m2 = re.search(r'\d{1,2}:\d{1,2}', date_text)
            return datetime.datetime.strptime(m1.group() + ' ' + m2.group(), "%B %d, %Y %H:%M")
        except Exception:
            return None
    def scrape_return_timestamp_16(self, bs4Obj):
        try:
            # Wall Street Journal: e.g. "Feb. 15, 2020 10:30 am ET" (whitespace stripped first)
            text = re.sub(' ', '', str(bs4Obj.select(".timestamp.article__timestamp")[0].string))
            text = re.sub('\n', '', text)
            # The period after the month is optional (e.g. "May 15, 2020").
            m = re.match(r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\.?(\d{1,2}),(\d{4})(\d{1,2}):(\d{1,2})', text)
            tmp = m.groups()
            timestamp_tmp = tmp[0] + ' ' + tmp[1].zfill(2) + ' ' + tmp[2] + ' ' + tmp[3].zfill(2) + ' ' + tmp[4].zfill(2)
            return datetime.datetime.strptime(timestamp_tmp, "%b %d %Y %H %M")
        except Exception:
            return None
    def scrape_return_timestamp_17(self, bs4Obj):
        try:
            # Forbes Japan: e.g. "2020/02/15 10:30"
            return datetime.datetime.strptime(bs4Obj.select("time")[0].string, "%Y/%m/%d %H:%M")
        except Exception:
            return None
    def scrape_return_timestamp_18(self, bs4Obj):
        try:
            # Newsweek: e.g. "2/15/20 at 10:30 AM" — two-digit year, so "20" is prepended.
            m = re.search(r'(\d{1,2})/(\d{1,2})/(\d{1,2}) at (\d{1,2}:\d{1,2}) ', str(bs4Obj.select('time')[0].string))
            tmp = m.groups()
            timestamp_tmp = tmp[0].zfill(2) + ' ' + tmp[1].zfill(2) + ' ' + '20' + tmp[2].zfill(2) + ' ' + tmp[3]
            return datetime.datetime.strptime(timestamp_tmp, "%m %d %Y %H:%M")
        except Exception:
            return None
    def scrape_return_timestamp_19(self, bs4Obj):
        try:
            # CNN.co.jp: e.g. "2020.02.15 ... 10:30"
            meta_text = str(bs4Obj.select("div .metadata-updatetime")[0])
            m1 = re.search(r'\d{4}.\d{2}.\d{2}', meta_text)
            m2 = re.search(r'\d{1,2}:\d{1,2}', meta_text)
            return datetime.datetime.strptime(m1.group() + ' ' + m2.group(), "%Y.%m.%d %H:%M")
        except Exception:
            return None
    def scrape_return_timestamp_20(self, bs4Obj):
        try:
            # BBC (Japanese date format): e.g. "2020年2月15日"
            return datetime.datetime.strptime(bs4Obj.select("div.date.date--v2")[0].string, "%Y年%m月%d日")
        except Exception:
            return None
    def scrape_return_timestamp_21(self, bs4Obj):
        try:
            # ABC News: e.g. "February 15, 2020, 10:30 AM"
            tmp = re.match(r'(January|February|March|April|May|June|July|August|September|October|November|December) (\d{1,2}), (\d{4}), (\d{1,2}:\d{1,2}) (AM|PM)', bs4Obj.select(".Byline__Meta.Byline__Meta--publishDate")[0].string)
            # Convert 12-hour to 24-hour time (12 AM -> 0, 12 PM -> 12).
            hour = int(tmp.groups()[3].split(':')[0]) % 12
            mini = tmp.groups()[3].split(':')[1]
            if tmp.groups()[4] == 'PM':
                hour += 12
            return datetime.datetime.strptime(tmp.groups()[0] + ' ' + tmp.groups()[1] + ' ' + tmp.groups()[2] + ' ' + str(hour) + ' ' + mini, "%B %d %Y %H %M")
        except Exception:
            return None
    def scrape_return_timestamp_22(self, bs4Obj):
        try:
            # Ministry of Foreign Affairs: dates use the Japanese era, e.g. "令和2年2月15日",
            # so jeraconv converts the era year to a Western year first.
            j2w = jeraconv.J2W()
            m = bs4Obj.select('.rightalign')[0].string
            y = m.split('年')[0]
            md = m.split('年')[1]
            return datetime.datetime.strptime(str(j2w.convert(str(y) + '年')) + '年' + md, "%Y年%m月%d日")
        except Exception:
            return None
    def scrape_return_timestamp_23(self, bs4Obj):
        try:
            # AFP BB: taken from the article:modified_time meta tag, e.g. "2020-02-15T10:30:00+09:00"
            for meta_tag in bs4Obj.find_all('meta', attrs={'property': "article:modified_time"}):
                m = re.match(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}', meta_tag.get('content'))
                return datetime.datetime.strptime(m.group(), "%Y-%m-%dT%H:%M:%S")
        except Exception:
            return None

    def scrape_return_timestamp_24(self, bs4Obj):
        try:
            # NHK News: taken from the datetime attribute of a <time> tag, e.g. "2020-02-15T10:30"
            for time_tag in bs4Obj.find_all('time'):
                return datetime.datetime.strptime(time_tag.get('datetime'), '%Y-%m-%dT%H:%M')
        except Exception:
            return None
    def scrape_return_timestamp_25(self, bs4Obj):
        try:
            # Nikkei Shimbun: e.g. "2020/2/15 10:30"
            m = re.search(r'\d{4}/\d{1,2}/\d{1,2} \d{1,2}:\d{1,2}', bs4Obj.select('.cmnc-publish')[0].string)
            return datetime.datetime.strptime(m.group(), "%Y/%m/%d %H:%M")
        except Exception:
            return None

    def scrape_return_timestamp_26(self, bs4Obj):
        try:
            # Nikkan Kogyo Shimbun: e.g. "2020/2/15 05:00"
            m = re.search(r'\d{4}/\d{1,2}/\d{1,2} \d{2}:\d{1,2}', str(bs4Obj.select('.date')[1]))
            return datetime.datetime.strptime(m.group(), "%Y/%m/%d %H:%M")
        except Exception:
            return None
    def scrape_return_timestamp_27(self, bs4Obj):
        try:
            # Asahi Shimbun Ronza: e.g. "2020年2月15日"
            return datetime.datetime.strptime(bs4Obj.select('.date')[0].string, "%Y年%m月%d日")
        except Exception:
            return None

    def scrape_return_timestamp_28(self, bs4Obj):
        try:
            # Asahi Shimbun junior and senior high school newspaper: e.g. "2020年2月15日"
            return datetime.datetime.strptime(bs4Obj.select('.date')[0].string, "%Y年%m月%d日")
        except Exception:
            return None
    def scrape_return_timestamp_35(self, bs4Obj):
        try:
            # EUROPA NEWSWIRE: e.g. "Feb 15, 2020"
            # (renumbered from a duplicate scrape_return_timestamp_28, which silently shadowed the Asahi parser above)
            m = re.search(r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) (\d{1,2}), (\d{4})', bs4Obj.select(".icon-cal")[0].string)
            return datetime.datetime.strptime(''.join(m.groups()), "%b%d%Y")
        except Exception:
            return None
    def scrape_return_timestamp_29(self, bs4Obj):
        try:
            # United Nations Information Center: e.g. "2020年2月15日"
            return datetime.datetime.strptime(bs4Obj.select("#cm_header_text")[0].string, "%Y年%m月%d日")
        except Exception:
            return None
    def scrape_return_timestamp_36(self, bs4Obj):
        try:
            # OPCW News: e.g. "15 February 2020"
            # (renumbered from a duplicate scrape_return_timestamp_29, which silently shadowed the UN parser above)
            m = re.search(r'(\d{1,2}) (January|February|March|April|May|June|July|August|September|October|November|December) \d{4}', bs4Obj.select(".news__date")[0].get_text())
            return datetime.datetime.strptime(m.group(), "%d %B %Y")
        except Exception:
            return None
    def scrape_return_timestamp_30(self, bs4Obj):
        try:
            # HAARETZ: e.g. "Feb 15, 2020 10:30"
            m = re.search(r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) \d{1,2}, \d{4} \d{1,2}:\d{1,2}', bs4Obj.select("time")[1].get_text())
            return datetime.datetime.strptime(m.group(), "%b %d, %Y %H:%M")
        except Exception:
            return None

    def scrape_return_timestamp_31(self, bs4Obj):
        try:
            # THE DAILY STAR: e.g. "Feb. 15, 2020 | 10:30"
            m = re.match(r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec). (\d{1,2}), (\d{4}) \| (\d{1,2}):(\d{1,2})', bs4Obj.select("time")[0].get_text())
            return datetime.datetime.strptime(''.join(m.groups()), "%b%d%Y%H%M")
        except Exception:
            return None

    def scrape_return_timestamp_32(self, bs4Obj):
        try:
            # INDEPENDENT: taken from an <amp-timeago> tag, e.g. "2020-02-15T10:30"
            m = re.search(r'\d{4}-\d{1,2}-\d{1,2}T\d{1,2}:\d{1,2}', str(bs4Obj.select("amp-timeago")[0]))
            return datetime.datetime.strptime(m.group(), "%Y-%m-%dT%H:%M")
        except Exception:
            return None
    def scrape_return_timestamp_33(self, bs4Obj):
        try:
            # JETRO: e.g. "2020年2月15日"
            m = re.search(r'\d{4}年\d{1,2}月\d{1,2}日', str(bs4Obj.select('p')))
            return datetime.datetime.strptime(m.group(), "%Y年%m月%d日")
        except Exception:
            return None
    def scrape_return_timestamp_34(self, bs4Obj):
        try:
            # Yukan Fuji: e.g. "2020.2.15"
            m = re.search(r'\d{4}.\d{1,2}.\d{1,2}', str(bs4Obj.select('#__r_publish_date__')[0]))
            return datetime.datetime.strptime(m.group(), "%Y.%m.%d")
        except Exception:
            return None
    def main(self, URL):
        self.URL = URL
        try:
            get_url_info = requests.get(URL, headers=self.headers)
            bs4Obj = bs4.BeautifulSoup(get_url_info.content, 'lxml')
        except Exception as e:
            print(e)
            return 'Unable to access URL'
        # Try each parser in turn and return the first timestamp that parses.
        # getattr replaces the original eval() dispatch; the range covers
        # parsers 1-36 (35 and 36 are the renumbered duplicates above).
        for i in range(1, 37):
            parser = getattr(self, 'scrape_return_timestamp_' + str(i), None)
            if parser is None:
                continue
            ts_temp = parser(bs4Obj)
            if ts_temp:
                return ts_temp  # fixed: was "td_temp", a NameError
test.py
from news_timestamp import *
TS = ScrapeNewsTimestamp()
news_timestamp = TS.main('https://www.mofa.go.jp/mofaj/press/release/press1_000423.html')
print(news_timestamp)
2020-02-15 00:00:00
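One wrinkle worth noting: the Ministry of Foreign Affairs pages print dates in the Japanese era calendar (for example 令和2年2月15日), which strptime cannot parse directly. That is why the class pulls in jeraconv. A minimal sketch of the conversion step, assuming the same J2W API used in the class (the input string is a made-up example):

from jeraconv import jeraconv

j2w = jeraconv.J2W()
print(j2w.convert('令和2年'))  # -> 2020, the Western year for Reiwa 2
print(str(j2w.convert('令和2年')) + '年2月15日')  # -> '2020年2月15日', now parseable with "%Y年%m月%d日"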
I have also published the code here: https://github.com/KanikaniYou/news_timestamp