An experiment in running morphological analysis on the page at each URL you specify. I tried to remove the HTML tags with a regular expression, but I could not remove them.
urlmecab.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib
import sys
import MeCab
import re
# Matches, in order: whole <script>/<style> elements including their
# contents, HTML comments, and finally any remaining single tag.  A bare
# tag pattern alone leaves script/CSS source and comment text behind,
# which would then be tokenized and counted as if it were page text.
_MARKUP_RE = re.compile(
    r"<(script|style)\b[^>]*>.*?</\1\s*>|<!--.*?-->|<[^>]*?>",
    re.DOTALL | re.IGNORECASE)


def Mecab_file(url=None):
    """Download a page, strip its markup and print word frequencies.

    The page body is fetched with ``urllib.urlopen``, markup is removed
    with ``_MARKUP_RE``, and the remaining text is tokenized with MeCab.
    Every token whose ``posid`` lies in 36..67 (the range this script
    uses to pick out nouns) is counted, and one ``word count`` line is
    printed per distinct word.

    Args:
        url: Address to fetch.  When omitted, the module-level
            ``search_url`` set by the prompt loop is used, so a bare
            ``Mecab_file()`` call keeps working.

    Raises:
        IOError: propagated from ``urllib.urlopen`` when the URL cannot
            be opened.
    """
    if url is None:
        url = search_url

    # Close the connection even if read() fails.
    req = urllib.urlopen(url)
    try:
        dlText = req.read()
    finally:
        req.close()

    text = _MARKUP_RE.sub("", dlText)

    mt = MeCab.Tagger("mecabrc")
    # `text` stays bound for the whole walk over the node list.
    node = mt.parseToNode(text)
    words = {}
    while node:
        word = node.surface
        if word and 36 <= node.posid <= 67:
            words[word] = words.get(word, 0) + 1
        node = node.next

    # Tuples compare by word first, so the listing is ordered by word,
    # descending (not by count).
    for word, count in sorted(words.items(), reverse=True):
        # Single pre-formatted argument: prints "word count" identically
        # whether `print` is a statement or a function.
        print("%s %s" % (word, count))


# Prompt for URLs until an empty line is entered.
while True:
    search_url = raw_input(u"input URL: ")
    if not search_url:
        break
    Mecab_file(search_url)
Only nouns are extracted, selected by MeCab's part-of-speech ID.
if word and node.posid >=36 and node.posid <=67:
Changing this condition lets you experiment in many ways. The script keeps looping as long as you keep entering URLs; pressing Enter on an empty line ends the loop. The URL must be entered in full, starting with http://.
Recommended Posts