pip install mecab
worked fine on the first try. Note that the module is imported with `import MeCab`, not `import Mecab`.
What you get is a "long string", not a list or tuple. A little inconvenient.
So I wrote a class in Python. The constructor calls MeCab.Tagger('-Odump') and stores everything MeCab returns in a field;
each method then uses a regular expression to pull only the information it needs out of that field and returns it.
The code is as follows.
MeCab_handler.py
import re, MeCab
import numpy as np
import jaconv
from pykakasi import kakasi
class MeCab_handler:
    """Parse a sentence with MeCab once and expose each feature separately.

    The constructor runs ``MeCab.Tagger('-Odump').parse(sentence)`` and keeps
    the raw dump text in ``parse_result``. Every ``get_*`` method pulls one
    piece of information out of that text with a regular expression and
    returns it as a one-dimensional ndarray with the last match (the EOS
    node) removed.

    NOTE(review): the comma positions used below assume an IPADIC-style
    feature layout (POS, up to three sub-classes, conjugation type,
    conjugation form, base form, reading, pronunciation) -- confirm against
    the dictionary actually installed.
    """

    # Prefix shared by most patterns: newline, node index, surface form.
    # Because a preceding newline is required, the first line of the dump
    # is never matched.
    _NODE = r'\n[0-9]+ [^ ]* '

    _SURFACE = re.compile(r'\n[0-9]+ ([^ ]*)')
    _POS = re.compile(_NODE + r'([^,]*)')
    # Each optional sub-class must be a run of characters ('+'); a bare
    # character class would truncate the third sub-class to one character.
    _POS_DETAIL = re.compile(
        _NODE + r'([^,]*(?:,[^*,]+(?:,[^*,]+(?:,[^*,]+)?)?)?)'
    )
    _CONJUGATION_TYPE = re.compile(_NODE + r'(?:[^,]*,){4}([^,]*)')
    _CONJUGATION_FORM = re.compile(_NODE + r'(?:[^,]*,){5}([^,]*)')
    _BASE_FORM = re.compile(_NODE + r'(?:[^,]*,){6}([^,]*)')
    _READING = re.compile(_NODE + r'(?:[^,]*,){7}([^,]*)')
    # The last feature is terminated by a space rather than a comma.
    _PRONUNCIATION = re.compile(_NODE + r'(?:[^,]*,){8}([^ ]*)')

    def __init__(self, sentence):
        """Run MeCab on *sentence* and store the '-Odump' output text."""
        self.parse_result = MeCab.Tagger('-Odump').parse(sentence)

    def _extract(self, pattern, dtype=None):
        """Return all captures of *pattern* in the dump, minus the last one.

        The last match is dropped because it belongs to the EOS node.
        """
        found = np.array(pattern.findall(self.parse_result), dtype=dtype)
        return found[:-1]

    def get_separated(self):
        """Return the surface form of each token (word segmentation)."""
        return self._extract(self._SURFACE)

    def get_words_basic(self):
        """Return the base (uninflected) form of each token."""
        return self._extract(self._BASE_FORM)

    def get_POS(self, need_detail=False):
        """Return the part of speech of each token.

        If *need_detail* is true, the sub-classifications (up to three
        levels) are appended, comma-separated, for as long as they are not
        ``*``.
        """
        pattern = self._POS_DETAIL if need_detail else self._POS
        return self._extract(pattern)

    def get_conjugation_type(self):
        """Return the conjugation type of each token, or None where it is '*'."""
        types = np.array(
            self._CONJUGATION_TYPE.findall(self.parse_result), dtype='object'
        )
        types = np.where(types == '*', None, types)
        return types[:-1]

    def get_conjugation_form(self):
        """Return the conjugation (inflected) form of each token."""
        return self._extract(self._CONJUGATION_FORM)

    def get_katakana(self):
        """Return the katakana reading of each token."""
        return self._extract(self._READING)

    def get_hiragana(self):
        """Return the reading of each token converted to hiragana."""
        katakanas = self.get_katakana()
        # Build the result in one step instead of np.append in a loop,
        # which copies the whole array on every iteration.
        return np.array(
            [jaconv.kata2hira(katakana) for katakana in katakanas],
            dtype=katakanas.dtype,
        )

    def get_how_to_speak(self):
        """Return the pronunciation of each token in Hepburn romaji.

        This is taken from a different feature field than get_katakana /
        get_hiragana, so the result may differ from those readings.
        """
        katakanas = self._extract(self._PRONUNCIATION)
        kakac = kakasi()
        kakac.setMode("K", "a")  # katakana to ascii
        kakac.setMode("r", "Hepburn")  # use Hepburn romanization
        conv = kakac.getConverter()
        return np.array(
            [conv.do(katakana) for katakana in katakanas], dtype='object'
        )
Each method is described in its docstring in the source code above; the table below summarizes them with example output.
Method | Example: output of `print(MeCab_handler('The United States cried. Movie Doraemon "Nobita's Theory and Practice"').method())` |
---|---|
get_separated () | ['National''is''crying''. "" Movie "" Doraemon "" "" "Nobita" "" Theory "" and "" Practice "" ""] |
get_words_basic () | ['National''is''cry''". "" Movie "" Doraemon "" "" "Nobita" "" Theory "" and "" Practice "" ""] |
get_POS () | `['noun''particle'' particle''auxiliary verb''symbol''noun''noun''symbol''particle' |
get_POS (True) | ['noun, proper noun, region, one'" particle, case particle, general'' verb, independence''particle'' symbol, punctuation''noun, general' '''Noun, proper noun, person's name, first name'' Noun, generalization''Noun, general''' Noun, case particle, general''Noun, Sahen connection''Noun, parenthesis closing'] |
get_conjugation_type () | [None None'Five-stage / Kakou Ionbin''Special / Ta'None None None None None None None None None None] |
get_conjugation_form () | ['*''*''Conjugated word''Uninflected word''*''*''*''*''*''*''*' *'*' *'*' ] |
get_katakana () | ['Zenbei''Ga''Nai''Ta''. "" Aiga "" Doraemon "" "" "Nobita" "No" "Lilon" "To" "Jissen" ""] |
get_hiragana () | `['Zenbei''is''not''wa'. ''Eiga''Doraemon' ' |
get_how_to_speak() | ['zenbei' 'ga' 'nai' 'ta' '。' 'eiga' 'doraemon' '「' 'nobita' 'no' 'riron' 'to' 'jissen' '」'] |
Recommended Posts