I wanted my text normalization to match what SudachiPy does internally, so I extracted just the normalization step from its processing pipeline.
from sudachipy import dictionary
from sudachipy.utf8inputtextbuilder import UTF8InputTextBuilder
# Build the SudachiPy dictionary once at import time so every call to
# normalize() reuses the same loaded resources.
# NOTE: this rebinds the name `dictionary` from the imported module to the
# Dictionary instance; the module itself is no longer reachable by that name.
dictionary = dictionary.Dictionary()

# Grammar object handed to the input-text builder.
grammar = dictionary.grammar

# Input-text plugins exposed by the dictionary; each one rewrites the text
# builder in turn.
input_text_plugins = dictionary.input_text_plugins
def normalize(text: str) -> str:
    """Return *text* after applying SudachiPy's input-text plugins.

    A ``UTF8InputTextBuilder`` is created for ``text`` using the module-level
    ``grammar``, every plugin in the module-level ``input_text_plugins`` is
    given the chance to rewrite it (in list order), and the builder's
    resulting text is returned.

    Which rewrites actually happen depends on the plugins the dictionary was
    loaded with; the usage example in this file shows upper-case ASCII being
    lower-cased.

    NOTE(review): ``sudachipy.utf8inputtextbuilder`` appears to exist only in
    the pure-Python SudachiPy releases -- confirm the installed version
    before relying on this.
    """
    builder = UTF8InputTextBuilder(text, grammar)
    # Plugins mutate the builder in place, so order of application matters.
    for plugin in input_text_plugins:
        plugin.rewrite(builder)
    return builder.get_text()
# Small demo, run only when the file is executed as a script.
if __name__ == '__main__':
    print(normalize('ABC123'))  # -> abc123
Recommended Posts