Get tweets containing any keyword with the Twitter API, save the retrieved text to a file, and pass it to MeCab. Perform morphological analysis and try to generate sentences with Markov chains.
This time I kept the length to 140 characters so the result can be used as a tweet as-is; you could try making it longer. The accuracy is low. It just made me realize how good the "compressed newspaper" bot really is.
markov.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from requests_oauthlib import OAuth1Session
import json
import sys
import MeCab
import random

C_KEY = "******************************"
C_SECRET = "******************************"
A_KEY = "******************************"
A_SECRET = "******************************"

def Limit_Status():
    # Show how many requests remain in the current rate-limit window.
    url = "https://api.twitter.com/1.1/application/rate_limit_status.json"
    params = {}
    tw = OAuth1Session(C_KEY, C_SECRET, A_KEY, A_SECRET)
    req = tw.get(url, params=params)
    if req.status_code == 200:
        limit = req.headers["x-rate-limit-remaining"]
        print ("API remain: " + limit)

def Search_words():
    # Search recent Japanese tweets for the keyword and append their text to test.txt.
    url = "https://api.twitter.com/1.1/search/tweets.json?"
    params = {
        "q": unicode(search_words, "utf-8"),
        "lang": "ja",
        "result_type": "recent",
        "count": "100"
    }
    tw = OAuth1Session(C_KEY, C_SECRET, A_KEY, A_SECRET)
    req = tw.get(url, params=params)
    tweets = json.loads(req.text)
    f = open("test.txt", "a")  # "aw" is not a valid file mode; "a" appends
    for tweet in tweets["statuses"]:
        text = tweet["text"].encode("utf-8")
        # Crude cleanup: cut off URLs, mentions and retweets with split.
        if "http" in text:
            text = text.split("http", 1)[0]
        text = text.split("@")[0]
        text = text.split("RT")[0]
        f.write(text)
        f.flush()
    f.close()

def Mecab_file():
    # Build an 8-word Markov table from the collected tweets and generate a sentence.
    f = open("test.txt", "rb")
    data = f.read()
    f.close()
    mt = MeCab.Tagger("-Owakati")
    wordlist = mt.parse(data).rstrip(" \n").split(" ")
    markov = {}
    w1 = w2 = w3 = w4 = w5 = w6 = w7 = w8 = ""
    for word in wordlist:
        if w1 and w2 and w3 and w4 and w5 and w6 and w7 and w8:
            if (w1, w2, w3, w4, w5, w6, w7, w8) not in markov:
                markov[(w1, w2, w3, w4, w5, w6, w7, w8)] = []
            markov[(w1, w2, w3, w4, w5, w6, w7, w8)].append(word)
        w1, w2, w3, w4, w5, w6, w7, w8 = w2, w3, w4, w5, w6, w7, w8, word
    # Walk the chain from a random starting state.
    count = 0
    sentence = ""
    w1, w2, w3, w4, w5, w6, w7, w8 = random.choice(markov.keys())
    while count < 140:
        if (w1, w2, w3, w4, w5, w6, w7, w8) not in markov:
            break  # the chain has reached a state with no known continuation
        tmp = random.choice(markov[(w1, w2, w3, w4, w5, w6, w7, w8)])
        sentence += tmp
        w1, w2, w3, w4, w5, w6, w7, w8 = w2, w3, w4, w5, w6, w7, w8, tmp
        count += 1
    if " " in sentence:
        sentence = sentence.split(" ", 1)[0]
    print sentence

while True:
    search_words = raw_input(u"Keyword?: ")
    if search_words:
        Search_words()
        Mecab_file()
        Limit_Status()
    else:
        break
I ran it with 8-word chains, but it turned out that it isn't very interesting unless you stop at about 4 chains.
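If the chain length is pulled out into a parameter, 8-word and 4-word chains can be compared without rewriting the function. The following is only a minimal sketch, not code from the script above; build_markov and generate are hypothetical helper names, and the input is assumed to be the word list produced by splitting MeCab's -Owakati output on spaces.

import random

def build_markov(words, n):
    # Sketch (assumed helper): map every n-word window to the words that followed it.
    markov = {}
    for i in range(len(words) - n):
        key = tuple(words[i:i + n])
        markov.setdefault(key, []).append(words[i + n])
    return markov

def generate(markov, n, max_words=140):
    # Start from a random state and walk the chain until it dies or max_words is reached.
    state = random.choice(list(markov.keys()))
    out = list(state)
    for _ in range(max_words):
        if state not in markov:
            break
        out.append(random.choice(markov[state]))
        state = tuple(out[-n:])
    return "".join(out)  # Japanese words are joined without spaces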
Originally I wanted to strip all of the unnecessary data out of the JSON, but my knowledge at the moment is limited. For the time being, when "http" appears in the text, I cut it off with split.
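A regular expression can do the same cleanup a little more thoroughly. This is only a sketch under the assumption that URLs, @mentions, the RT marker, and hashtags should all be removed; clean_tweet is a hypothetical helper, not part of the script above.

import re

def clean_tweet(text):
    # Sketch (assumed rules): strip the parts that only add noise to the corpus.
    text = re.sub(r"https?://\S+", "", text)  # URLs
    text = re.sub(r"@\w+", "", text)          # @mentions
    text = re.sub(r"RT\s*", "", text)         # retweet marker
    text = re.sub(r"#\S+", "", text)          # hashtags
    return text.strip()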
As usual, if test.txt does not exist in the same directory it is created; if it does exist, the newly fetched tweets are appended to it (the file is opened in append mode).
The while loop breaks when the script is run without entering a search word. It might be better to store the tweets for different search words separately.
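One way to keep the corpora separate is to derive the file name from the keyword. A small sketch; the naming scheme and the corpus_path helper are assumptions, not something the original script does.

def corpus_path(keyword):
    # Sketch (assumed naming scheme): one corpus file per search keyword,
    # e.g. "tweets_python.txt" for the keyword "python".
    return "tweets_%s.txt" % keyword.replace(" ", "_")

f = open(corpus_path(search_words), "a")  # append tweets for this keyword only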
I tried editing it: the unneeded parts are trimmed away with regular expressions, and the sentence ending is chosen at random from endings like "desu" and "masu" so that the end of the sentence doesn't come out strange.
I felt this version was more practical.
import re  # needed in addition to the imports above

def Mecab_file():
    # Build a one-word Markov table from the collected tweets.
    f = open("tweet.txt", "rb")
    data = f.read()
    f.close()
    mt = MeCab.Tagger("-Owakati")
    wordlist = mt.parse(data)
    wordlist = wordlist.rstrip(" \n").split(" ")
    markov = {}
    w = ""
    for x in wordlist:
        if w:
            if w in markov:
                new_list = markov[w]
            else:
                new_list = []
            new_list.append(x)
            markov[w] = new_list
        w = x
    # Walk the chain, starting from the first word of the corpus.
    choice_words = wordlist[0]
    sentence = ""
    count = 0
    while count < 90:
        sentence += choice_words
        if choice_words not in markov:
            break  # no known successor for this word
        choice_words = random.choice(markov[choice_words])
        count += 1
    sentence = sentence.split(" ", 1)[0]
    # Strip ASCII symbols left over from the tweets.
    p = re.compile("[!-/:-@[-`{-~]")
    sus = p.sub("", sentence)
    # End the sentence with a randomly chosen ending ("。", "です。" or "ます。").
    random_words_list = [u"。", u"です。", u"ます。"]
    last_word = random.choice(random_words_list)
    print re.sub(re.compile("[!-~]"), "", sus), last_word
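Since the output is kept short enough for a tweet, it could also be posted directly with the same credentials. The sketch below is not part of the original post; it assumes the v1.1 statuses/update endpoint and the OAuth1Session keys defined earlier, and post_tweet is a hypothetical helper.

def post_tweet(status):
    # Sketch (assumed helper): post the generated sentence via the 1.1 REST API.
    url = "https://api.twitter.com/1.1/statuses/update.json"
    tw = OAuth1Session(C_KEY, C_SECRET, A_KEY, A_SECRET)
    req = tw.post(url, data={"status": status})
    if req.status_code == 200:
        print ("Tweeted: " + status)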