I struggled with this, so I am posting my solution for now. There may be a better way, but if you are a beginner like me, I hope it serves as a useful reference.
The environment is Python 3.6.9 on Ubuntu 18.04.4.
change_NER.py
# coding:utf-8
import spacy
def replace_entities_with_labels(text, spans):
    """Return *text* with every entity span replaced by its label.

    Args:
        text: The original string that the spans index into.
        spans: Iterable of ``(start_char, end_char, label)`` tuples, where
            ``text[start_char:end_char]`` is the entity's surface text.

    Returns:
        A new string in which each entity's characters are replaced by the
        entity's label; all other characters are copied unchanged.

    With no spans the text is returned unchanged. A span that begins
    before the end of an earlier one (an overlap) is skipped so that no
    character is emitted twice.
    """
    pieces = []
    cursor = 0  # index of the first character not yet copied or replaced
    for start, end, label in sorted(spans, key=lambda span: span[0]):
        if start < cursor:
            # Overlaps a span that was already replaced; ignore it.
            continue
        pieces.append(text[cursor:start])  # plain text before the entity
        pieces.append(label)
        cursor = end
    # Copy everything after the last entity (or the whole text when there
    # were no entities at all).
    pieces.append(text[cursor:])
    return "".join(pieces)


def main():
    """Replace named entities in input.txt by their labels into output.txt.

    Reads ``input.txt``, runs the ``ja_ginza`` spaCy pipeline over its
    contents, and writes the text to ``output.txt`` with each recognized
    entity replaced by its entity label.
    """
    nlp = spacy.load('ja_ginza')
    # Explicit UTF-8 so the result does not depend on the locale's default
    # encoding.
    with open('input.txt', 'r', encoding='utf-8') as f:
        data = f.read()
    doc = nlp(data)
    spans = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    with open('output.txt', 'w', encoding='utf-8') as f:
        f.write(replace_entities_with_labels(data, spans))


if __name__ == "__main__":
    main()
Recommended Posts