As part of my study of Python, I tried Twitter API, word cloud creation, word negative / positive judgment, etc.
What I did was create WordCloud using only positive words (adjectives, verbs) among the words tweeted with the keyword "transfer girl".
The work proceeds through the following steps.
I will try to get the tweet immediately.
get_time_lines.py
import json
import config
from requests_oauthlib import OAuth1Session
from time import sleep
import emoji
from mongo_dao import MongoDAO
def remove_emoji(src_str):
    """Return *src_str* with every emoji character stripped out.

    WordCloud cannot render emoji glyphs, so tweets are cleaned with
    this before being stored.
    """
    kept_chars = [ch for ch in src_str if ch not in emoji.UNICODE_EMOJI]
    return ''.join(kept_chars)
# --- API key settings (defined in a separate file, config.py) ---
CK = config.CONSUMER_KEY
CS = config.CONSUMER_SECRET
AT = config.ACCESS_TOKEN
ATS = config.ACCESS_TOKEN_SECRET

# OAuth1 authenticated session for the Twitter REST API
twitter = OAuth1Session(CK, CS, AT, ATS)

# Tweet search endpoint
url = "https://api.twitter.com/1.1/search/tweets.json"

# Keyword to search for
keyword = 'Transfer girls'

# Request parameters: up to 200 tweets per page
params = {'q': keyword,
          'count': 200}

# arg1: DB name, arg2: collection name
mongo = MongoDAO("db", "tenkou")
mongo.delete_many({})

# The first request fetches the newest tweets; afterwards params['max_id']
# is set so that each page contains only tweets older than the last one.
for j in range(100):
    res = twitter.get(url, params=params)
    if res.status_code != 200:
        # Stop paging on an API error instead of silently re-requesting
        # the same page forever (the original ignored failures).
        print("request failed: " + str(res.status_code))
        break
    # Remaining request quota reported by the API (header value is a string)
    limit = res.headers['x-rate-limit-remaining']
    print("API remain: " + limit)
    # BUG FIX: the original compared the *string* header to the int 1
    # (`limit == 1`), which could never be true, so the rate-limit sleep
    # never happened. Compare numerically instead.
    if int(limit) <= 1:
        sleep(60 * 15)
    result = json.loads(res.text)
    # Process tweet by tweet
    tweets = result['statuses']
    for tweet in tweets:
        # WordCloud cannot handle emoji, so strip them before storing
        tweet['text'] = remove_emoji(tweet['text'])
        # Store the whole tweet document — the extra fields may be
        # useful later.
        mongo.insert_one(tweet)
    if len(tweets) >= 1:
        # Page backwards: request ids strictly older than the oldest seen
        params['max_id'] = tweets[-1]['id'] - 1
Write config to another py file
config.py
# Twitter API credentials.
# Placeholders — replace "****" with the keys issued for your app.
CONSUMER_KEY = "****"
CONSUMER_SECRET = "****"
ACCESS_TOKEN = "****"
ACCESS_TOKEN_SECRET = "****"
The operation class of mongoDB looks like this
(Partial excerpt) mongo_dao.py
from pymongo import MongoClient
class MongoDAO(object):
    """Thin data-access wrapper around one pymongo collection."""

    def __init__(self, dbName, collectionName):
        # A default MongoClient connects to localhost:27017.
        self.client = MongoClient()
        self.db = self.client[dbName]  # select the database by name
        self.collection = self.db.get_collection(collectionName)

    def insert_one(self, document):
        """Insert a single document; returns pymongo's InsertOneResult."""
        return self.collection.insert_one(document)

    def insert_many(self, documents):
        """Insert an iterable of documents; returns InsertManyResult."""
        return self.collection.insert_many(documents)

    # The scripts in this project also call find / find_one / delete_many
    # on this class; delegate them to the collection so this excerpt is
    # consistent with those callers.
    def find(self, *args, **kwargs):
        """Delegate to pymongo Collection.find; returns a Cursor."""
        return self.collection.find(*args, **kwargs)

    def find_one(self, *args, **kwargs):
        """Delegate to pymongo Collection.find_one; returns a dict or None."""
        return self.collection.find_one(*args, **kwargs)

    def delete_many(self, query):
        """Delete every document matching *query*; returns DeleteResult."""
        return self.collection.delete_many(query)
It's enough to use only text data for tweet data, but I think that it may be used for something later, so I'm putting everything in.
This process creates data for negative / positive judgment of words. For the data for judgment, we used the "Japanese Evaluation Polar Dictionary (Noun Edition)" created and published by Tohoku University Inui-Okazaki Laboratory.
Japanese Evaluation Polarity Dictionary (Noun Edition)
The dictionary data has the following format.
A few days e ~ becomes / becomes (state) objective
10%e ~ becomes / becomes (state) objective
100%e ~ becomes / becomes (state) objective
"Word", "negative (n) / positive (p) / neutral (e)", and "state" are registered as tab-separated data.
insert_noun.py
from mongo_dao import MongoDAO
import codecs
mongo = MongoDAO("db", "noun")

# Path to the downloaded polarity dictionary. Format per line:
#   word <TAB> n/p/e <TAB> evaluation
dict_path = './dict/noun_dict.trim'

# Numeric score per polarity flag; anything else (e.g. 'e') scores 0.
POLARITY_SCORES = {'n': -1, 'p': 1}

with codecs.open(dict_path, "r", "utf-8") as f:
    for line in f:
        # rstrip('\n') instead of line[:-1]: the original sliced off the
        # last character even on a final line without a trailing newline.
        fields = line.rstrip('\n').split('\t')
        if len(fields) < 3:
            # Skip blank/malformed lines instead of crashing with IndexError.
            continue
        mongo.insert_one({"word": fields[0],
                          "np": fields[1],
                          "evaluation": fields[2],
                          "score": POLARITY_SCORES.get(fields[1], 0)})
In this process, the downloaded dictionary data is split on tabs and loaded into mongoDB. Since the negative / positive judgment might be useful later, we also added a "score" field: "1" for positive, "-1" for negative, and "0" otherwise.
tweet_analyze.py
import MeCab
from mongo_dao import MongoDAO
import word_cloud
from wordcloud import WordCloud
target = "tenkou"

# MeCab tokenizer; -Ochasen output is "surface<TAB>reading<TAB>..." per line
tagger = MeCab.Tagger("-Ochasen")

# Fetch the collected tweets from mongoDB
mongo = MongoDAO("db", target)
target_results = mongo.find()

# Buckets for the sentiment classification
positive_words = []
negative_words = []
neutral_words = []
tweet_score = 0

# Re-point the DAO at the polarity dictionary collection
mongo = MongoDAO("db", "noun")

for target_result in target_results:
    text = target_result['text']
    mecab_results = tagger.parse(text)
    for result in mecab_results.split('\n'):
        word = result.split('\t')[0]
        # Skip MeCab's 'EOS' terminator and empty lines — the original
        # looked these up in mongo on every tweet for no benefit.
        if word in ('', 'EOS'):
            continue
        mongo_result = mongo.find_one(filter={"word": word})
        if isinstance(mongo_result, dict):  # idiomatic type check
            tweet_score += mongo_result['score']
            if mongo_result['np'] == 'n':
                negative_words.append(word)
            elif mongo_result['np'] == 'p':
                positive_words.append(word)
            else:
                # 'e' (neutral) and any unexpected flag
                neutral_words.append(word)
        else:
            # Word not in the polarity dictionary
            neutral_words.append(word)

# Words to exclude from the word cloud
stop_words = ['RT','@', '//','NECOPLASTIC', 'Nekopla', 'cat','chuLa', 'FESTIVE','FES', 'TIVE',
'Nana Land','JYAPON','Nana','land','JAPONISM','JYA','NEO','PON','What kind of kini','What',
'Kini','To do','Take','Teru','come','Become','Is','To be','Let','Oru','Dollar','Blur']

# Use a font that is installed on this device
font_path = 'C:\\WINDOWS\\Fonts\\meiryo.ttc'


def save_wordcloud(words, suffix):
    """Render *words* as a word-cloud PNG tagged with *suffix*."""
    wc = WordCloud(background_color="white", font_path=font_path,
                   contour_color='steelblue', collocations=False,
                   contour_width=3, width=900, height=500,
                   stopwords=set(stop_words)).generate(
                       word_cloud.parseWordCloudText(words))
    # BUG FIX: the original filenames ended in ".png " — a trailing
    # space inside the literal produced awkward file names.
    wc.to_file("./output_wordcloud/wordcloud_" + target + "_" + suffix + ".png")


# Create one word cloud from the positive words and one from the negative
save_wordcloud(positive_words, "positive")
save_wordcloud(negative_words, "negative")
word_cloud.py
from janome.tokenizer import Tokenizer
from collections import defaultdict
def counter(texts):
    """Tokenize each text and tally adjective/verb base forms.

    Returns a pair ``(words_count, words)``: a defaultdict mapping each
    kept base form to its frequency, and a flat list of every kept
    base form in encounter order.
    """
    tokenizer = Tokenizer()
    counts = defaultdict(int)
    collected = []
    # Filler words observed in real output that carry no meaning here.
    ignored = ("thing", "Yo", "so", "this", "It")
    for text in texts:
        for token in tokenizer.tokenize(text):
            # Keep only adjectives and verbs (first part-of-speech field).
            if token.part_of_speech.split(',')[0] not in ('adjective', 'verb'):
                continue
            base = token.base_form
            if base in ignored:
                continue
            counts[base] += 1
            collected.append(base)
    return counts, collected
def parseWordCloudText(textList):
    """Join a list of words into the space-separated string WordCloud expects.

    Args:
        textList: a list of word strings. Any non-list input is treated
            as "no words".

    Returns:
        A single space-joined string, or "" when *textList* is not a list.
    """
    # isinstance is the idiomatic type check (and also accepts list
    # subclasses, which join identically).
    if not isinstance(textList, list):
        return ""
    return " ".join(textList)
I learned a lot from "Python — How to create a Word Cloud" and "Sentiment analysis of Japanese sentences with Python (+ basics of language processing)". Thank you very much.
Word cloud of positive words It turned out that they were tweeted with wonderful idol-like words such as "passion", "beautiful girl", "perfect", and "neat".
Negative word word cloud When I searched for tweets, I found a lot of tweets of people who were bullied with the word transfer girl. It can be inferred that these tweets also influenced this result.
Through this project, I got a feel for natural language processing with Python. Because tweets are informal writing rather than polished prose, many words slip past the analysis. In addition, the search catches many tweets unrelated to the topic you actually want to analyze, and I felt that filtering those out is the difficult part.
Also, I realized that it is quite difficult to judge from the context whether a word that is used in both good and bad meanings such as "dangerous" is used for negative / positive judgment.
Recommended Posts