When using MeCab / CaboCha / KNP from Python on Windows, relying on the "import mecab" or "import Cabocha" bindings is troublesome: it requires recompiling, specifying UTF-8, and so on. Therefore, this code uses Shift-JIS and calls the tools as external commands, so that it works with their default installation.
**Set homefldr to any directory you like.** (Postscript, 2016/12/17: KNP is now supported as well.)
import os
import subprocess
from tempfile import NamedTemporaryFile
class JapaneseDependencyRelation:
    """Run MeCab, CaboCha and KNP as external commands and return their output.

    The text to analyse is encoded with ``text_encoding``, written to a
    temporary file inside ``homefldr`` and handed to the tool as its standard
    input. Whatever the tool writes to standard output is decoded with the
    same encoding and returned as ``str``.
    """

    # CaboCha ``-f`` value for each ``fmt`` name understood by do_cabocha();
    # any other name falls back to ``_CABOCHA_FALLBACK``.
    _CABOCHA_FORMATS = {"xml": "3", "tree": "2"}
    _CABOCHA_FALLBACK = "1"

    # KNP switches for each ``fmt`` / ``level`` / ``output`` value understood
    # by do_knp(); unknown values fall back to the ``*_FALLBACK`` switch.
    _KNP_FORMATS = {
        "tab": "-tab",
        "td": "-td",
        "tree": "-tree",
        "tab1": "-bnsttab",
        "tree1": "-bnsttree",
        "sexp": "-sexp",
    }
    _KNP_FORMAT_FALLBACK = "-simple"
    _KNP_LEVELS = {1: "-bnst", 2: "-dpnd", 3: "-case", 5: "-ne"}
    _KNP_LEVEL_FALLBACK = "-anaphora"
    _KNP_OUTPUTS = {2: "-detail", 3: "-debug"}
    _KNP_OUTPUT_FALLBACK = "-normal"

    def __init__(self, encoding=None, homefldr=None):
        """Store the tool encoding and the temporary-file directory.

        Args:
            encoding: Encoding used by the installed tools (follow whatever
                was chosen when they were installed). Defaults to 'utf-8'.
            homefldr: Directory in which temporary input files are created.
                It must already exist when a ``do_*`` method is called.
                Defaults to 'C:\\tmp'.
        """
        if encoding is None:
            encoding = 'utf-8'
        if homefldr is None:
            homefldr = 'C:\\tmp'
        # MeCab / CaboCha / KNP encoding (follow the installation setting).
        self.text_encoding = encoding
        # Location where temporary files are created.
        self.homefldr = homefldr

    def get_encode_original(self, j_txt):
        """Encode ``j_txt`` one character at a time with ``text_encoding``.

        Args:
            j_txt: Text to encode.

        Returns:
            The concatenated encoded characters as ``bytes``.

        Raises:
            UnicodeEncodeError: If a character cannot be encoded.
        """
        # The pieces are bytes, so they must be joined with a bytes separator.
        return b''.join(ch.encode(self.text_encoding) for ch in j_txt)

    def _encode_text(self, j_txt):
        """Encode ``j_txt`` for the external tool.

        Raises:
            TypeError: If ``j_txt`` is not a ``str``.
            UnicodeTranslateError: If ``j_txt`` cannot be represented in
                ``text_encoding``.
        """
        if not isinstance(j_txt, str):
            raise TypeError(
                'j_txt must be str, not {}'.format(type(j_txt).__name__))
        try:
            return j_txt.encode(self.text_encoding)
        except UnicodeEncodeError as err:
            # Unicode*Error classes cannot be raised without arguments, so
            # build a complete exception from the details of the failure.
            raise UnicodeTranslateError(
                j_txt, err.start, err.end, err.reason) from err

    def _run_with_temp_file(self, command, j_txt, shell=False):
        """Feed ``j_txt`` to ``command`` through a temporary file.

        Args:
            command: Argument list, or a single command string when
                ``shell`` is true.
            j_txt: Text to send to the command's standard input.
            shell: Whether to run ``command`` through the shell.

        Returns:
            The command's standard output decoded with ``text_encoding``.
            The command's exit status is not checked.

        Raises:
            FileNotFoundError: If ``homefldr`` is not an existing directory.
            TypeError: If ``j_txt`` is not a ``str``.
            UnicodeTranslateError: If ``j_txt`` cannot be encoded.
        """
        if not os.path.isdir(self.homefldr):
            raise FileNotFoundError(
                'temporary directory does not exist: {}'.format(self.homefldr))
        # Encode before touching the disk so that a bad input leaves no file.
        payload = self._encode_text(j_txt)
        temp = NamedTemporaryFile(delete=False, dir=self.homefldr)
        try:
            # Close the file after writing: it has to be reopened for reading
            # below, which Windows refuses while the writer is still open.
            with temp:
                temp.write(payload)
            with open(temp.name, 'rb') as stdin_file:
                process = subprocess.Popen(
                    command, shell=shell,
                    stdin=stdin_file, stdout=subprocess.PIPE)
                raw_output = process.communicate()[0]
        finally:
            # Delete the temporary file whatever happened above.
            os.unlink(temp.name)
        return raw_output.decode(self.text_encoding)

    def do_mecab(self, j_txt):
        """Analyse ``j_txt`` with the ``mecab`` command.

        Args:
            j_txt: Text to analyse.

        Returns:
            MeCab's standard output as ``str``.

        Raises:
            FileNotFoundError: If ``homefldr`` is not an existing directory.
            UnicodeTranslateError: If ``j_txt`` cannot be encoded.
        """
        return self._run_with_temp_file(['mecab'], j_txt)

    @classmethod
    def _cabocha_command(cls, fmt):
        """Return the ``cabocha`` argument list for the format name ``fmt``."""
        return ['cabocha', '-f',
                cls._CABOCHA_FORMATS.get(fmt, cls._CABOCHA_FALLBACK)]

    def do_cabocha(self, j_txt, fmt=None):
        """Analyse ``j_txt`` with the ``cabocha`` command.

        CaboCha's ``-f, --output-format=TYPE`` option accepts:
            0 - tree (CaboCha default), 1 - lattice, 2 - tree + lattice,
            3 - XML.

        Args:
            j_txt: Text to analyse.
            fmt: ``"xml"`` (default) runs ``-f 3``, ``"tree"`` runs ``-f 2``
                and any other value runs ``-f 1``.

        Returns:
            CaboCha's standard output as ``str``.

        Raises:
            FileNotFoundError: If ``homefldr`` is not an existing directory.
            UnicodeTranslateError: If ``j_txt`` cannot be encoded.
        """
        if fmt is None:
            fmt = "xml"
        return self._run_with_temp_file(self._cabocha_command(fmt), j_txt)

    @classmethod
    def _knp_arguments(cls, fmt, level, output):
        """Return the ``juman|knp`` pipeline followed by its three switches."""
        return [
            'juman|knp',
            cls._KNP_FORMATS.get(fmt, cls._KNP_FORMAT_FALLBACK),
            cls._KNP_LEVELS.get(level, cls._KNP_LEVEL_FALLBACK),
            cls._KNP_OUTPUTS.get(output, cls._KNP_OUTPUT_FALLBACK),
        ]

    def do_knp(self, j_txt, fmt=None, level=None, output=None):
        """Analyse ``j_txt`` with the ``juman | knp`` pipeline.

        Display of the analysis result (``fmt``):
            "tab"    -tab       tabular display (default of this method)
            "simple" -simple    tabular display with less output; also used
                                for any unrecognised value
            "td"     -td        display for the "analysis result
                                general-purpose display tool"
            "tree"   -tree      tree display (KNP's own default)
            "tab1"   -bnsttab   tabular display of clauses
            "tree1"  -bnsttree  tree display of clauses
            "sexp"   -sexp      list (S-expression) format

        Level of analysis (``level``):
            1  -bnst      convert the morpheme sequence to a clause sequence
            2  -dpnd      additionally analyse dependencies between clauses
            3  -case      additionally analyse case relations (KNP's own
                          default)
            4  -anaphora  additionally analyse anaphora (default of this
                          method); also used for any unrecognised value
            5  -ne        additionally analyse named entities

        Output information (``output``):
            1  -normal  final analysis result only (default); also used for
                        any unrecognised value
            2  -detail  also show dependency candidates, the similarity
                        matrix between clauses, etc.
            3  -debug   more detailed intermediate information

        Args:
            j_txt: Text to analyse.
            fmt: Display format name, see above.
            level: Analysis level number, see above.
            output: Output detail number, see above.

        Returns:
            KNP's standard output as ``str``.

        Raises:
            FileNotFoundError: If ``homefldr`` is not an existing directory.
            UnicodeTranslateError: If ``j_txt`` cannot be encoded.
        """
        if fmt is None:
            fmt = "tab"
        if level is None:
            level = 4
        if output is None:
            output = 1
        # The ``juman|knp`` pipe needs a shell. Hand the shell one string:
        # with a list, POSIX shells would receive only the first element as
        # the command and the switches would be lost. Every token is a fixed
        # literal, so no user input ever reaches the shell command line.
        command_line = ' '.join(self._knp_arguments(fmt, level, output))
        return self._run_with_temp_file(command_line, j_txt, shell=True)
if __name__ == "__main__":
    # Demo: run every supported tool/format combination on one sentence,
    # using the Shift-JIS encoding of a default Windows installation.
    operation_japanese = JapaneseDependencyRelation('shift-jis')
    text = 'The customer next door is a customer who often eats persimmons'

    # MeCab
    print(operation_japanese.do_mecab(text))

    # CaboCha: XML, tree and (for any other name, here "word") lattice output
    for cabocha_format in ("xml", "tree", "word"):
        print(operation_japanese.do_cabocha(text, cabocha_format))

    # KNP: every display format, with the default level and output detail
    knp_formats = ("tab", "td", "simple", "tree", "tab1", "tree1", "sexp")
    for knp_format in knp_formats:
        print(operation_japanese.do_knp(text, knp_format))
Recommended Posts