Information gain is useful for feature selection, for example to pick out characteristic features before training a maximum entropy classifier. [This book](http://www.amazon.co.jp/%E8%A8%80%E8%AA%9E%E5%87%A6%E7%90%86%E3%81%AE%E3%81%9F%E3%82%81%E3%81%AE%E6%A9%9F%E6%A2%B0%E5%AD%A6%E7%BF%92%E5%85%A5%E9%96%80-%E8%87%AA%E7%84%B6%E8%A8%80%E8%AA%9E%E5%87%A6%E7%90%86%E3%82%B7%E3%83%AA%E3%83%BC%E3%82%BA-%E9%AB%98%E6%9D%91-%E5%A4%A7%E4%B9%9F/dp/4339027510/ref=sr_1_2?ie=UTF8&qid=1328698086&sr=8-2 "Introduction to Machine Learning for Language Processing") explains it very clearly.
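For reference, the score computed below is the standard information-gain (mutual-information) criterion for a binary feature X marking whether a word occurs in a document:

$$
IG(X) = H(C) - \Pr(X{=}1)\,H(C \mid X{=}1) - \Pr(X{=}0)\,H(C \mid X{=}0),
\qquad
H(C) = -\sum_{c}\Pr(c)\,\log_2\Pr(c)
$$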
`informationgain.py` (updated to Python 3 and the current NLTK API, with the deprecated `FreqDist.inc()` / `samples()` calls replaced):

```python
from math import log

from nltk.probability import ConditionalFreqDist, FreqDist
from nltk.tokenize import word_tokenize
from nltk.util import ngrams


def information_gain(labeled_documents):
    ig = {}
    labeldist = FreqDist()          # number of documents per label
    docnumdist = FreqDist()         # number of documents containing each n-gram
    cldist = ConditionalFreqDist()  # per-n-gram document counts, split by label
    n = 1

    # Count, for every n-gram, how many documents of each label contain it.
    # Each document contributes at most once per n-gram (hence the set()).
    for label, doc in labeled_documents:
        labeldist[label] += 1
        unigrams = set(ngrams(word_tokenize(doc), n))
        for unit in unigrams:
            cldist[unit][label] += 1
            docnumdist[unit] += 1

    # Entropy of the class distribution, H(C).
    pr_label = {label: labeldist[label] / labeldist.N() for label in labeldist}
    H_C = -sum(pr * log(pr, 2) for pr in pr_label.values())
    print("H(C) = %.2f" % H_C)

    # Conditional entropies H(C|X = 1) and H(C|X = 0) for each n-gram X.
    # Only labels that actually co-occur with the n-gram are iterated,
    # so pr_label_given_x1 is always > 0.
    H_C_given_X = {}
    for c in cldist.conditions():
        for label in cldist[c]:
            pr_label_given_x1 = cldist[c][label] / labeldist[label]
            pr_label_given_x0 = 1.0 - pr_label_given_x1
            print("Pr(%s|X_{%s} = 1) = %.2f" % (label, c, pr_label_given_x1))
            print("Pr(%s|X_{%s} = 0) = %.2f" % (label, c, pr_label_given_x0))
            H_C_given_X.setdefault("%s = 1" % (c,), 0.0)
            H_C_given_X["%s = 1" % (c,)] -= pr_label_given_x1 * log(pr_label_given_x1, 2)
            H_C_given_X.setdefault("%s = 0" % (c,), 0.0)
            H_C_given_X["%s = 0" % (c,)] -= pr_label_given_x0 * log(pr_label_given_x0, 2)
    print()

    # IG(X) = H(C) - Pr(X=1) H(C|X=1) - Pr(X=0) H(C|X=0)
    for c in cldist.conditions():
        pr_x1 = docnumdist[c] / docnumdist.N()
        pr_x0 = 1.0 - pr_x1
        if c not in ig:
            ig[c] = (H_C - pr_x1 * H_C_given_X["%s = 1" % (c,)]
                         - pr_x0 * H_C_given_X["%s = 0" % (c,)])

    for i, j in sorted(ig.items(), key=lambda x: x[1], reverse=True):
        print(i, j)


documents = [("pos", "good good good excellent"), ("pos", "good very excellent"),
             ("pos", "good fine good excellent"), ("pos", "bad very fine"),
             ("neg", "bad bad worse"), ("neg", "worse good worse excellent"),
             ("neg", "excellent very bad"), ("neg", "bad very worse")]
information_gain(documents)
```
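To make the feature-extraction step concrete, here is what the unigram set looks like for the first document (same NLTK calls as above; NLTK's punkt tokenizer models must be downloaded for `word_tokenize`):

```python
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

# Each document is reduced to the SET of unigrams it contains,
# so "good good good excellent" counts "good" only once.
print(set(ngrams(word_tokenize("good good good excellent"), 1)))
# e.g. {('good',), ('excellent',)} -- set order may vary
```

This is also why the keys in the output below are 1-tuples like `('worse',)`.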
As expected, "fine" and "worse", which each appear in documents of only one polarity, receive high information gain: they are characteristic words. Conversely, "very", which appears equally often in both polarities, is uninformative, and its information gain is exactly zero.
```
('worse',) 0.52573480121
('fine',) 0.5
('good',) 0.188721875541
('bad',) 0.188721875541
('excellent',) 0.0428913353502
('very',) 0.0
```
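As a sanity check, the top score can be reproduced by tracing the script's arithmetic for `('worse',)`: it occurs in 3 of the 4 "neg" documents and in no "pos" document, and `docnumdist.N()`, the denominator the script uses for Pr(X=1), is 22 here (the total number of (document, distinct word) pairs):

$$
H(C \mid X{=}1) = -\tfrac{3}{4}\log_2\tfrac{3}{4} \approx 0.311,
\qquad
H(C \mid X{=}0) = -\tfrac{1}{4}\log_2\tfrac{1}{4} = 0.5
$$

$$
IG \approx 1 - \tfrac{3}{22}\cdot 0.311 - \tfrac{19}{22}\cdot 0.5 \approx 0.526
$$

For `('very',)` both conditional distributions are uniform (2 of 4 documents for each label), so both conditional entropies are 1 bit and the gain is exactly 0.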