We look back on the past year through a TF-IDF analysis of news articles related to the novel coronavirus.
Month | Number of articles | Main events |
---|---|---|
1 | 64 | 1/6 Ministry of Health, Labor and Welfare calls attention "Pneumonia of unknown cause in Wuhan, China" 1/16 First confirmed infected person in Japan, Chinese man traveling to Wuhan |
2 | 210 | 2/3 Passengers confirmed infected on a cruise ship entering Yokohama port 2/13 First death in Japan: a woman in her 80s living in Kanagawa Prefecture |
3 | 88 | 3/9 Expert meeting calls for avoidance of "three dense" 3/24 Decided to postpone the Tokyo Olympics |
4 | 320 | 4/7 State of emergency declared for 7 prefectures: the Greater Tokyo Area, Osaka, Hyogo, and Fukuoka 4/16 State of emergency expanded nationwide |
5 | 357 | 5/4 State of emergency extended until May 31 5/25 Completely lift the state of emergency |
6 | 65 | 6/19 Self-restraint on travel across prefectural borders eased nationwide 6/29 Over 500,000 dead worldwide |
7 | 35 | 7/3 Over 200 people infected in Japan for the first time in 2 months 7/22 GoTo Travel Start/795 people infected daily in Japan, the highest number ever |
8 | 18 | 8/17 April–June GDP falls at an annualized rate of 27.8% 8/20 Countermeasures subcommittee views that the epidemic has reached its peak |
9 | 7 | 9/5 WHO “Vaccine distribution will start in the middle of next year” 9/18 GoTo Travel Reservation to / from Tokyo lifted |
10 | 12 | 10/1 GoTo eat start 10/12 Rapid spread of infection in Europe |
11 | 25 | 11/19 The number of domestically infected people reached a record high for the second consecutive day 11/20 Government Subcommittee Recommendations for Government to Review GoTo |
12 | 15 | 12/14 GoTo Travel Stopped all over Japan 12/17 Tokyo, 822 new infections per day, to the highest alert level |
Total | 1216 |
from google.colab import files
# Prompt for the monthly text files through the Colab upload helper; the loop
# below then opens them by name.
uploaded = files.upload()

# Two-digit, zero-padded month labels "01" .. "12".
months = [str(number).zfill(2) for number in range(1, 13)]

# Read each monthly corpus file into memory, in calendar order.
docs = []
for month in months:
    path = "nipponcom_covid19_2020-" + month + ".txt"
    # utf-8-sig strips a leading byte-order mark if the file has one.
    with open(path, mode='rt', encoding='utf-8-sig') as handle:
        docs.append(handle.read())
The `docs` data used for TF-IDF covers 12 months on a monthly basis and contains only nouns, separated by single-byte spaces.
import pandas as pd
# Per-month corpus size: [token count, distinct token count].
metrics = []
for doc in docs:
    # Tokens are delimited by a single space character.
    tokens = pd.Series(doc.split(" "))
    metrics.append([len(tokens), tokens.nunique()])

# Tabulate with one row per month.
# This rebinds `months` to the display labels that the later tables reuse.
names = ["Number of extracts", "Vocabulary number"]
months = ['{0}Moon'.format(i) for i in range(1, 13, 1)]
pd.DataFrame(metrics, columns=names, index=months)
from collections import Counter
# For every month, keep the ten most frequent tokens together with their counts.
# Each month's entry is a one-element list wrapping the (token, count) pairs.
rank_frequency = [
    [Counter(pd.Series(doc.split(" "))).most_common(10)]
    for doc in docs
]
rank_frequency
import numpy as np
# Strip the counts: one list of ten words per month, most frequent first.
ranking = [
    [top[position][0] for top in entry for position in range(10)]
    for entry in rank_frequency
]

# Transpose so that columns are months and rows are ranks.
data = np.array(ranking).T
rank = ['{0}Rank'.format(i) for i in range(1, 11, 1)]
pd.DataFrame(data, columns=months, index=rank)
Use `TfidfVectorizer` to calculate the $tfidf$ score.
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit one TF-IDF model over the monthly documents (one document per month).
# NOTE: with the default token_pattern, scikit-learn ignores tokens shorter
# than two characters, so single-character nouns never receive a score.
vectorizer = TfidfVectorizer(smooth_idf=False)
X = vectorizer.fit_transform(docs)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on releases that predate get_feature_names_out().
try:
    feature_names = vectorizer.get_feature_names_out()
except AttributeError:
    feature_names = vectorizer.get_feature_names()

# Dense score table: rows are months "01".."12", columns are vocabulary terms.
values = X.toarray()
month_num = ['{0:02d}'.format(i) for i in range(1,13,1)]
df_score = pd.DataFrame(values, columns = feature_names, index=month_num)
print(df_score)
# Print each month's ten highest-scoring terms.
# Iterating the frame's own index (rather than a fixed range of 12) keeps this
# working when df_score holds fewer than twelve months; with a shorter frame
# the fixed range produced an empty slice and sort_values raised KeyError.
for label in df_score.index:
    # Single-row frame transposed into one column named after the month.
    month_scores = df_score.loc[[label]].T
    print(month_scores.sort_values(label, ascending=False).head(10))
# Collect, month by month, the labels of the ten highest-scoring terms.
# Only the index (the terms themselves) is kept; the scores are discarded.
result = [
    df_score[pos:pos + 1].T.sort_values(label, ascending=False).head(10).index
    for pos, label in zip(range(0, 12, 1), month_num)
]

# Ranks as rows, months as columns.
pd.DataFrame(result,columns=rank,index=months).T
Collect the words that appeared before the target month into `word_list`.
import itertools
# Target month (1-based); 10 = October.
n = 10

# Vocabulary already seen before the target month: every term with a non-zero
# score in any of the first n-1 rows. A single slice covers all of them; the
# earlier per-offset slices only re-collected the same columns, so the list
# was full of duplicates and its length overstated the vocabulary size.
df = df_score[:n-1]
word_list = list(df.columns[(df != 0).any(axis=0)])
len(word_list)
The `df_score` used here is calculated separately from the data from January to October.
#Extract only this month
# Keep only the target month's row, restricted to terms it actually uses.
df_current = df_score[n-1:n]
df_current = df_current.loc[:, (df_current != 0).any(axis=0)]

# Drop every term that already appeared in an earlier month. One boolean mask
# replaces the column-by-column drop, which copied the frame on each removal.
df_current = df_current.loc[:, ~df_current.columns.isin(word_list)]

# Sort by the row's own label instead of str(n): when the index is zero-padded
# (like the "01".."12" labels), str(n) only matches for two-digit months and
# raises KeyError otherwise.
month_label = df_current.index[0]
df_current = df_current.T
df_sorted = df_current.sort_values(month_label, ascending=False)
# Top 10 TF-IDF terms that are new in the target month.
df_sorted.head(10)
Recommended Posts