Morphological analysis and TF-IDF (with test code) that you can get running in about one minute

Preparation

pip install nltk
pip install mecab-python

Try pasting the code below and running it

The function that computes TF-IDF is `tfidf`. The function that performs morphological analysis is `extract_words`. The long block below `import unittest` near the bottom is the test code.

#!/usr/bin/env python
#-*- encoding: utf-8 -*-
import nltk
import MeCab
import urllib2
from urllib2 import HTTPError
from itertools import chain


def tfidf(doc,docs):
  """Return the TF-IDF score of every distinct token in ``docs``, scored against ``doc``.

  doc  -- the target document, as a list of tokens.
  docs -- the whole collection, as a list of token lists.

  Returns a list of ``{"word": token, "tfidf": score}`` dicts, one per distinct
  token found anywhere in ``docs``. The list order follows set iteration and is
  therefore unspecified; sort the result if a ranking is needed.

  An empty ``doc`` yields a score of 0.0 for every token.
  """
  token_types = set(chain.from_iterable(docs))  # distinct tokens across all documents
  if not doc:
    # An empty target document contains no occurrence of any token, so every
    # term frequency (and hence every TF-IDF) is zero. Without this guard the
    # term-frequency computation divides by len(doc) and raises ZeroDivisionError.
    return [{"word": token_type, "tfidf": 0.0} for token_type in token_types]
  collection = nltk.TextCollection(docs)
  return [{"word": token_type, "tfidf": collection.tf_idf(token_type, doc)} for token_type in token_types]
    

def extract_words(text):
  """Run MeCab over ``text`` and collect the surface forms tagged as nouns.

  text -- the input string; a ``unicode`` value is encoded to UTF-8 bytes
          before being handed to MeCab, anything else is passed through as-is.

  Returns a list of the non-empty surface strings whose first feature field
  equals 'noun', in the order MeCab produced them.

  NOTE(review): the part-of-speech label compared against is the literal
  'noun'; whether the installed MeCab dictionary emits that exact label is not
  visible from this file -- confirm.
  """
  if isinstance(text, unicode):
    text = text.encode("utf-8")
  tagger = MeCab.Tagger("")
  nouns = []
  current = tagger.parseToNode(text)
  # Walk the linked list of nodes until it runs out.
  while current:
    part_of_speech = current.feature.split(",")[0]
    surface = current.surface
    if surface and part_of_speech == u'noun':
      nouns.append(surface)
    current = current.next
  return nouns

import unittest

class MachineLearningTest(unittest.TestCase):
  """Tests for extract_words and tfidf.

  test_tfidf downloads live pages from qiita.com, so it needs network access
  and its expected value depends on the current content of those pages.
  """

  def test_extract_words(self):
    """Morphological analysis test"""
    text = "Morphologically parse text and return a list of nouns"
    keywords = extract_words(text)
    self.assertEqual(keywords, ["text","morpheme","analysis","noun","list"])

  def test_tfidf(self):
    """tfidf test"""
    urls = ["http://qiita.com/puriketu99/items/"+str(i) for i in range(1,10)]
    def url2words(url):
      # Fetch one page and reduce it to its noun list; a page that answers
      # with an HTTP error is treated as an empty document.
      try:
        html = urllib2.urlopen(url).read()
      except HTTPError:
        html = ""
      # NOTE(review): assumes the installed NLTK still provides clean_html -- confirm.
      plain_text = nltk.clean_html(html).replace('\n','')
      return extract_words(plain_text)
    docs = [url2words(url) for url in urls]
    tfidfs_fizzbuzz = tfidf(docs[0],docs)
    # Highest score first. key= replaces the Python-2-only cmp= argument; both
    # sorts are stable, so ties keep the same relative order as before.
    tfidfs_fizzbuzz.sort(key=lambda entry: entry["tfidf"], reverse=True)
    # From the 30 best-scoring entries keep only words longer than two characters.
    result = [e for e in tfidfs_fizzbuzz[:30] if len(e["word"]) > 2]
    self.assertEqual(result[7]["word"],"yaotti")#If Qiita side changes the design, the test may fail
    print(result)
    #[{'tfidf': 0.08270135278254376, 'word': 'quot'},
    # {'tfidf': 0.02819364299404901, 'word': 'FizzBuzz'},
    # {'tfidf': 0.02067533819563594, 'word': 'fizzbuzz'},
    # {'tfidf': 0.02067533819563594, 'word': 'Buzz'},
    # {'tfidf': 0.016916185796429405, 'word': 'Fizz'},
    # {'tfidf': 0.016726267030018446, 'word': 'end'},
    # {'tfidf': 0.015036609596826138, 'word': 'map'},
    # {'tfidf': 0.015036609596826138, 'word': 'yaotti'},
    # {'tfidf': 0.011277457197619604, 'word': 'def'}]

# Run the unit tests defined above when this file is executed as a script.
if __name__ == '__main__':
  unittest.main()