When using MeCab from Python, I kept having to rewrite my code whenever I wanted to restrict the output to certain parts of speech or simply split text into words, so I wrote a class to remove that inconvenience and am publishing it here.
import MeCab
import unicodedata
import re
class MecabParser():
def __init__(self, word_classes=None, word_class_details=None):
"""
Args:
word_classes (list, optional):Part of speech specified in Japanese. Defaults to None.
word_class_details (list, optional):Specifying the details of part of speech. Defaults to None.
See below for the part of speech defined by mecab
https://taku910.github.io/mecab/posid.html
"""
self._word_classes = word_classes
self._word_class_details = word_class_details
def _format_text(self, text):
"""
Formatting text before putting it in MeCab
"""
text = re.sub(r'http(s)?://[\w/:%#\$&\?\(\)~\.=\+\-…]+', "", text)
text = re.sub(r'[ -/:-@\[-~_]', "", text) #Half-width symbol
text = re.sub(r'[︰-@]', "", text) #Double-byte symbol
text = re.sub(r'\d', "", text) #Numbers
text = re.sub('\n', " ", text) #Newline character
text = re.sub('\r', " ", text) #Newline character
return text
def parse(self, text, is_base=False):
text = self._format_text(text)
#Character code conversion process. If not converted, the voiced sound mark and the semi-voiced sound mark will be separated.
text = unicodedata.normalize('NFC', text)
result = []
tagger = MeCab.Tagger(
'-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd')
#You can avoid surface read error by parsing once before parseToNode
tagger.parse('')
nodes = tagger.parseToNode(text)
while nodes:
wclass = nodes.feature.split(',')
#If no part of speech is specified, all words are separated.
if not self._word_classes:
result.append(wclass[6] if is_base else nodes.surface)
nodes = nodes.next
continue
#If no part of speech details are specified, all the part of speech is divided.
if not self._word_class_details:
if wclass[0] in self._word_classes:
result.append(wclass[6] if is_base else nodes.surface)
nodes = nodes.next
continue
#Divide according to the detailed specification of part of speech
if wclass[0] in self._word_classes and wclass[1] in self._word_class_details:
result.append(wclass[6] if is_base else nodes.surface)
nodes = nodes.next
#Remove first and last whitespace strings
if len(result) > 0:
result.pop(0)
result.pop(-1)
return result
I save this in a file called parser.py and import it from there.
Usage looks like this; it makes it relatively easy to split text into words while restricting the result to the parts of speech you specify.
>>> from parser import MecabParser
>>> mp = MecabParser(word_classes=['名詞'], word_class_details=['一般','固有名詞'])
>>> text = "I'm hungry today, so I came to eat one of the best ramen in the neighborhood."
>>> mp.parse(text)
['stomach', 'Neighborhood', 'Tenkaippin', 'ramen']
The dictionary is mecab-ipadic-neologd. Inflected words can be returned in their base form, and the text is cleaned up before parsing; I hope users will modify these parts to suit their own needs.
Recommended Posts