# Source: https://gist.github.com/jpena930/0753edfd27e010503755ccfdaeb965bf
#coding: utf-8
from __future__ import print_function # Only needed for Python 2
import MeCab
import CaboCha
import sys
import os
# Module-level CaboCha parser; the script further down calls
# cabocha.parseToString() on the input text.
# NOTE(review): "-f1 -n1" presumably selects lattice output with named-entity
# tags -- confirm against the CaboCha option reference.
cabocha = CaboCha.Parser("-f1 -n1")
# Module-level MeCab tagger.
# NOTE(review): "-Ochasen" presumably selects ChaSen-style output, and `m` is
# not referenced anywhere else in the visible source -- confirm it is needed.
m = MeCab.Tagger ("-Ochasen")
# For reading from file
class getWords():
    """Small helper for loading the raw text that is handed to the parser."""

    def readText(self, filename):
        """Return the entire contents of *filename* decoded as UTF-8.

        Args:
            filename: Path of the text file to read.

        Returns:
            The file's contents as a single string ("" for an empty file).

        Raises:
            OSError: If the file cannot be opened.
            UnicodeDecodeError: If the file is not valid UTF-8.
        """
        # The ``with`` block closes the file on exit, so no explicit close()
        # call is needed.
        with open(filename, 'r', encoding='utf-8') as f:
            return f.read()
#Usage: python training_generator <text file>

# Long named-entity labels found in the parser output, paired with the
# three-letter forms written to the training file.  The order matches the
# original replacement sequence; each label is rewritten for both the "B-"
# and the "I-" prefix.
NE_TAG_ABBREVIATIONS = (
    ("ORGANIZATION", "ORG"),
    ("ARTIFACT", "ART"),
    ("LOCATION", "LOC"),
    ("DATE", "DAT"),
    ("TIME", "TIM"),
    ("PERSON", "PSN"),
    ("MONEY", "MNY"),
    ("PERCENT", "PNT"),
)


def shorten_ne_tags(parsed):
    """Return *parsed* with every long B-/I- entity label abbreviated.

    Args:
        parsed: Parser output as one string.

    Returns:
        The same text with e.g. "B-ORGANIZATION" replaced by "B-ORG".  This
        is a plain substring replacement, so a matching label that appears
        in the input text itself is rewritten as well.
    """
    for long_name, short_name in NE_TAG_ABBREVIATIONS:
        for prefix in ("B-", "I-"):
            parsed = parsed.replace(prefix + long_name, prefix + short_name)
    return parsed


def build_training_lines(parsed):
    """Turn (tag-shortened) parser output into the rows of the training file.

    Steps, in the same order the script has always applied them:
      1. every comma becomes a tab (this includes a literal "," token);
      2. lines starting with "*" are dropped;
      3. an empty row is inserted after each line starting with '。';
      4. the final two rows are discarded.

    Args:
        parsed: Parser output as one string.

    Returns:
        A list of rows without line terminators; empty when fewer than
        three rows would otherwise be produced.
    """
    rows = []
    for line in parsed.replace(",", "\t").splitlines():
        if line.startswith('*'):
            continue
        rows.append(line)
        if line.startswith('。'):
            rows.append("")
    # The script previously wrote everything to disk, read it back and applied
    # [:-1] twice.  NOTE(review): presumably this strips the trailing end
    # marker and the blank row in front of it -- confirm against real output.
    return rows[:-2]


def main(argv=None):
    """Generate ``<input>_generated.txt`` next to the given input file.

    Args:
        argv: Argument vector; defaults to ``sys.argv``.  ``argv[1]`` must be
            the path of a UTF-8 text file.

    Raises:
        SystemExit: With a usage message when no input path was supplied.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 2:
        sys.exit("Usage: python training_generator <text file>")

    file_output = args[1]
    # Read the input once, explicitly as UTF-8 (the earlier extra read used
    # the locale encoding and its result was thrown away).
    text = getWords().readText(file_output)
    cabocha_text = shorten_ne_tags(cabocha.parseToString(text))

    filename = file_output + '_generated.txt'
    # Mode 'w' truncates any previous result, and the file is written in one
    # pass instead of being reopened for every line.  UTF-8 matches the
    # encoding used for the input.
    with open(filename, 'w', encoding='utf-8') as f:
        f.writelines(row + "\n" for row in build_training_lines(cabocha_text))


if __name__ == "__main__":
    main()
# Next step: adjust the generated tags to suit your needs.
# References:
#   http://qiita.com/Hironsan/items/326b66711eb4196aa9d4
#   https://github.com/Hironsan/IOB2Corpus
# Recommended Posts