Last time, in "Machine learning beginners try to touch naive Bayes (1) -- Theory", I explained the theory (the mathematical formulas) behind naive Bayes.
On this page, I'll try a simple implementation in Python. As usual, I built it by imitating the site I referred to, so please also see that site if you are interested. By the way, I have barely written any Python before, so please watch with warm eyes ...
The sample code is at the bottom.
The naive Bayes scoring formula can be expressed as follows.
\log P(cat|doc) \propto \log P(cat) + \sum_{k=1}^{n} \log P(word_k|cat)
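For reference, this follows from Bayes' rule plus the naive independence assumption; the constant P(doc) is dropped because it is the same for every category:
P(cat|doc) = \frac{P(cat)\,P(doc|cat)}{P(doc)} \propto P(cat) \prod_{k=1}^{n} P(word_k|cat)
Taking the logarithm of both sides turns the product into the sum above.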
It's not obvious how to turn this into a program, so I will first organize what I need.
< P(cat) >
{'cat1': 1, 'cat2': 1, ...}
< P(word|cat) >
{'cat1': {'word1': 1, ...}, ...}
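The training data itself is just a list of documents, where the first element of each document is its category label and the remaining elements are its words (the same format as the sample data at the bottom):

data = [["yes", "Chinese", "Beijing", "Chinese"],
        ["no", "Tokyo", "Japan", "Chinese"]]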
nb = NaiveBayes()
nb.train(data)
I want to use it as above, so I design it as a class. Declaring what we need in the constructor, the code for __init__ looks like this.
class NaiveBayes:
    def __init__(self):
        self.vocabularies = set()  # non-duplicate vocabulary (unique words)
        self.categories = set()    # non-duplicate categories
        self.category_count = {}   # number of occurrences of each category, category_count[cat]
        self.word_count = {}       # number of word occurrences per category, word_count[cat]
        ...
set() is a collection of unique elements (no duplicates); it is used for vocabularies and categories. {} is a dictionary; it is used to count how many times each category appears and how many times each word appears per category.

Next, train:

def train(self, data):
    self.category_count = defaultdict(lambda: 0)
    for d in data:
        category = d[0]
        self.categories.add(category)  # add the category
        self.category_count[category] += 1
        for word in d[1:]:  # add the words to the vocabulary
            self.vocabularies.add(word)
    for category in self.categories:
        self.word_count[category] = defaultdict(lambda: 0)
    for d in data:
        category = d[0]
        for word in d[1:]:
            self.word_count[category][word] += 1
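As a quick sanity check, here is a small sketch using two documents from the sample data at the bottom; the counters fill in as expected:

nb = NaiveBayes()
nb.train([["yes", "Chinese", "Beijing", "Chinese"],
          ["no", "Tokyo", "Japan", "Chinese"]])
print(nb.categories)         # {'yes', 'no'} (order may vary)
print(nb.category_count)     # defaultdict(..., {'yes': 1, 'no': 1})
print(nb.word_count["yes"])  # defaultdict(..., {'Chinese': 2, 'Beijing': 1})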
That's all the preparation there is to it.
from collections import defaultdict

counts = {}
counts = defaultdict(lambda: 0)

With defaultdict, you can specify the default value to use when a key has no value yet (here 0), which is exactly what we want for counting. As for Laplace smoothing, I was unsure how to handle it, but I decided to simply add +1 in the calculation.
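A minimal demonstration of the difference, using nothing beyond the standard library:

from collections import defaultdict

counts = defaultdict(lambda: 0)
counts["apple"] += 1    # no KeyError even though "apple" was never initialized
print(counts["apple"])  # 1
print(counts["pear"])   # 0: missing keys return the default value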
< P(word|cat) >

Define a function called word_probability to compute each term of

\sum_{k=1}^{n} \log P(word_k|cat)

From last time, the way to find each term is

P(word_k|cat) = \frac{(\text{occurrences of } word_k \text{ in category } cat) + 1}{(\text{total word occurrences in category } cat) + (\text{vocabulary size})}

The +1 in the numerator and the vocabulary size in the denominator are the Laplace smoothing.
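For example, with the sample data at the bottom: the category "yes" contains 8 word occurrences in total, "Chinese" appears 5 times among them, and the overall vocabulary has 6 distinct words, so

P(Chinese|yes) = \frac{5 + 1}{8 + 6} = \frac{6}{14} = \frac{3}{7} \approx 0.4286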
def word_probability(self, word, category):
    '''P(word|cat): probability of a word given a category'''
    # apply Laplace smoothing
    word_count = self.word_count[category][word] + 1
    vocabulary_count = sum(self.word_count[category].values()) + len(self.vocabularies)
    return float(word_count) / float(vocabulary_count)
Next, calculate P(cat|doc).

\log P(cat|doc) \propto \log P(cat) + \sum_{k=1}^{n} \log P(word_k|cat)

Given a sentence, we use the function above to build a score function. In probability space we would take the product of each word's probability, but since the log of a product is the sum of the logs, in log space we take a sum, not a product.

Also note that Python's math.log is the natural logarithm, and the log of any probability (a value below 1) is negative, so the computed score comes out negative. I personally disliked that, so I multiplied by -1 at the end. Either way is fine, as long as classify stays consistent with it: with the sign flipped, a smaller score means a higher probability.
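A quick check of that claim, using only the standard library: math.log is base e, and probabilities below 1 give negative logs.

import math

print(math.log(math.e))  # 1.0 -- the base is e, not 10
print(math.log(0.5))     # -0.6931... -- negative, since 0.5 < 1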
First, initialize score with log P(cat), then add log P(word|cat) for each word in a loop.
def score(self, words, category):
    '''Score of a category given a document (a list of words), based on P(cat|doc)'''
    documents_count = sum(self.category_count.values())
    score = math.log(float(self.category_count[category]) / documents_count)
    for word in words:
        score += math.log(self.word_probability(word, category))
    # log probabilities are negative, so flip the sign to make the score positive
    return score * (-1)
Finally, classification. In practice, this is the function you'll call after training.
def classify(self, words):
    '''Returns the category with the largest P(cat|doc)'''
    best = None
    value = float("inf")  # score is a sign-flipped log probability, so smaller is better
    for category in self.categories:
        v = self.score(words, category)
        if v < value:
            best = category
            value = v
    return best
The structure is simple: it computes the score for every category seen during training and returns the most probable one (the smallest score, since the sign was flipped).
# coding: utf-8
from collections import defaultdict
import math


class NaiveBayes:
    def __init__(self):
        self.vocabularies = set()  # non-duplicate vocabulary (unique words)
        self.categories = set()    # non-duplicate categories
        self.category_count = {}   # number of occurrences of each category, category_count[cat]
        self.word_count = {}       # number of word occurrences per category, word_count[cat]

    def train(self, data):
        self.category_count = defaultdict(lambda: 0)
        for d in data:
            category = d[0]
            self.categories.add(category)  # add the category
            self.category_count[category] += 1
            for word in d[1:]:  # add the words to the vocabulary
                self.vocabularies.add(word)
        for category in self.categories:
            self.word_count[category] = defaultdict(lambda: 0)
        for d in data:
            category = d[0]
            for word in d[1:]:
                self.word_count[category][word] += 1

    def word_probability(self, word, category):
        '''P(word|cat): probability of a word given a category'''
        # apply Laplace smoothing
        word_count = self.word_count[category][word] + 1
        vocabulary_count = sum(self.word_count[category].values()) + len(self.vocabularies)
        return float(word_count) / float(vocabulary_count)

    def score(self, words, category):
        '''Score of a category given a document (a list of words), based on P(cat|doc)'''
        documents_count = sum(self.category_count.values())
        score = math.log(float(self.category_count[category]) / documents_count)
        for word in words:
            score += math.log(self.word_probability(word, category))
        # log probabilities are negative, so flip the sign to make the score positive
        return score * (-1)

    def classify(self, words):
        '''Returns the category with the largest P(cat|doc)'''
        best = None
        value = float("inf")  # score is a sign-flipped log probability, so smaller is better
        for category in self.categories:
            v = self.score(words, category)
            if v < value:
                best = category
                value = v
        return best
if __name__ == "__main__":
    data = [["yes", "Chinese", "Beijing", "Chinese"],
            ["yes", "Chinese", "Chinese", "Shanghai"],
            ["yes", "Chinese", "Macao"],
            ["no", "Tokyo", "Japan", "Chinese"]]

    # train the naive Bayes classifier
    nb = NaiveBayes()
    nb.train(data)

    print("P(Chinese|yes) = ", nb.word_probability("Chinese", "yes"))
    print("P(Tokyo|yes) = ", nb.word_probability("Tokyo", "yes"))
    print("P(Japan|yes) = ", nb.word_probability("Japan", "yes"))
    print("P(Chinese|no) = ", nb.word_probability("Chinese", "no"))
    print("P(Tokyo|no) = ", nb.word_probability("Tokyo", "no"))
    print("P(Japan|no) = ", nb.word_probability("Japan", "no"))

    # predict the category of the test data
    test = ["Chinese", "Chinese", "Chinese", "Tokyo", "Japan"]
    print("-log P(yes|test) =", nb.score(test, "yes"))
    print("-log P(no|test) =", nb.score(test, "no"))
    print(nb.classify(test))
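For reference, working these out by hand with the formulas above (remembering that math.log is the natural log), the output should come out approximately as:

P(Chinese|yes) = 3/7 ≈ 0.4286
P(Tokyo|yes) = P(Japan|yes) = 1/14 ≈ 0.0714
P(Chinese|no) = P(Tokyo|no) = P(Japan|no) = 2/9 ≈ 0.2222
-log P(yes|test) ≈ 8.11
-log P(no|test) ≈ 8.91

so classify(test) returns "yes", since the smaller sign-flipped score wins.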