Personal notes on using MeCab from Python.
mecab.py
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Run MeCab on a single sentence and print the analysis."""
import MeCab

tagger = MeCab.Tagger()
# parse() returns the whole analysis as one string (one line per token,
# followed by an EOS line). print() with a single argument behaves the
# same on Python 2 and Python 3, unlike the bare `print x` statement.
print(tagger.parse("If a dog walks, it hits a stick."))
$ ./mecab.py
Dog noun,General,*,*,*,*,dog,Dog,Dog
Also particles,Particle,*,*,*,*,Also,Mo,Mo
Walk verb,Independence,*,*,Five-dan / Ka line,Assumed form,walk,Arche,Arche
If Particle,Connection particle,*,*,*,*,If,Ba,Ba
Stick noun,General,*,*,*,*,rod,Bow,baud
Particles,Case particles,General,*,*,*,To,D,D
Hit verb,Independence,*,*,Five-dan / La line,Uninflected word,Hit,Ataru,Ataru
。 symbol,Kuten,*,*,*,*,。,。,。
EOS
mecab_from_file.py
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Print the MeCab feature string of every node in a text file.

Usage: ./mecab_from_file.py INPUT_FILE
"""
import sys

import MeCab

if len(sys.argv) < 2:
    # Fail with a readable message instead of a bare IndexError.
    sys.exit("usage: %s INPUT_FILE" % sys.argv[0])

tagger = MeCab.Tagger()

# `with` closes the input file when the block ends (the manual
# open()/readline() version never closed it). Iterating the file object
# yields one line at a time and stops at end of file.
with open(sys.argv[1]) as infile:
    for line in infile:
        node = tagger.parseToNode(line)
        # The nodes of one parsed line are chained through .next;
        # walk the chain until it runs out.
        while node:
            # Example feature: noun,General,*,*,*,*,dog,Dog,Dog
            print(node.feature)
            node = node.next
It's easier to use collections.defaultdict when counting elements.
mecab_class_count.py
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Count how often each top-level part-of-speech class occurs in a text file.

Usage: ./mecab_class_count.py INPUT_FILE

Every node MeCab returns is counted, so the sentence-boundary nodes
(reported as BOS/EOS) appear in the totals as well.
"""
import sys
from collections import defaultdict

import MeCab

if len(sys.argv) < 2:
    # Fail with a readable message instead of a bare IndexError.
    sys.exit("usage: %s INPUT_FILE" % sys.argv[0])

tagger = MeCab.Tagger()

# defaultdict(int) starts every unseen key at 0, so `+= 1` needs no
# "is this key already present?" check.
frequency = defaultdict(int)

# `with` closes the input file when the block ends; iterating the file
# object yields one line at a time and stops at end of file.
with open(sys.argv[1]) as infile:
    for line in infile:
        node = tagger.parseToNode(line)
        # The nodes of one parsed line are chained through .next.
        while node:
            # Example feature: noun,General,*,*,*,*,dog,Dog,Dog
            # The first comma-separated field is the class being counted.
            pos_class = node.feature.split(",", 1)[0]
            frequency[pos_class] += 1
            node = node.next

# items() exists on both Python 2 and 3 (iteritems() is Python 2 only).
# The %-format keeps the "class count" output identical on both versions.
for pos_class, count in frequency.items():
    print("%s %s" % (pos_class, count))
$ ./mecab_class_count.py input.txt
Verb 4
BOS/EOS 8
Noun 9
Particle 7
Auxiliary verb 1
# Creating a MeCab instance with options.
# The option string uses the same flags as the mecab command line.
# The two assignments below are presumably alternatives (the second
# overwrites the first) -- pick the one you need.
#
# -d sets the dictionary directory; here it points at a
# mecab-ipadic-neologd directory under a Homebrew-style install path,
# which will differ on other machines.
m = MeCab.Tagger(' -d /usr/local/Cellar/mecab/0.996/lib/mecab/dic/mecab-ipadic-neologd')
# -r names the resource (mecabrc) file to load instead of the default one.
m = MeCab.Tagger('-r my_mecabrc')
Recommended Posts