3. Natural language processing with Python 2-2. Co-occurrence network [mecab-ipadic-NEologd]


**1. Preparation of text data**

⑴ Reading text data

# Google Colab helper for moving files between the local machine and the runtime.
from google.colab import files
# Prompt for a file upload. The return value is kept in `uploaded` but is not
# referenced again in the code shown here; presumably the uploaded file lands
# in the working directory so that open() below can find it -- confirm.
uploaded = files.upload()


# Read the whole speech text as UTF-8. The file name is hard-coded, so the
# uploaded file must be named exactly '20200926_suga_un.txt'.
with open('20200926_suga_un.txt', mode='rt', encoding='utf-8') as f:
    read_text = f.read()
# Alias under which the raw text is passed to the cleaning step.
sugatxt = read_text


⑵ Data cleaning

#Strip characters and symbols that are not needed for the analysis
def clean(text):
    """Return *text* with line breaks, ideographic spaces and brackets removed.

    The characters dropped are: newline, the full-width space (U+3000),
    the corner brackets 「 and 」, the parentheses ( and ), and the
    Japanese comma 、. Everything else, including the full stop 。, is kept.
    """
    unwanted = "\n\u3000「」()、"
    # A deletion-only translation table removes every listed character
    # in a single pass over the string.
    return text.translate(str.maketrans("", "", unwanted))

# Remove the unwanted characters from the whole speech text.
text = clean(sugatxt)

# Split into sentences rather than lines: "。" is the Japanese full stop and
# clean() has already removed the newlines. If the text ends with "。",
# str.split leaves one empty string as the last element.
lines = text.split("。")


**2. Creating co-occurrence data**

⑶ Installation of MeCab and mecab-ipadic-NEologd

# MeCab
# Install MeCab, its development headers, the UTF-8 IPA dictionary and the
# helper tools (git, curl, file, ...) with apt; -q -y keeps it quiet and
# non-interactive, and stdout is discarded.
!apt-get -q -y install sudo file mecab libmecab-dev mecab-ipadic-utf8 git curl python-mecab > /dev/null
# Python binding that provides the `MeCab` module imported later.
!pip install mecab-python3 > /dev/null

# mecab-ipadic-NEologd
# Shallow clone (--depth 1) of the dictionary repository into the working directory.
!git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git > /dev/null 
# Run the installer shipped in the clone; "yes" is piped to its stdin,
# presumably to answer a confirmation prompt -- confirm. All output
# (stdout and stderr) is discarded, so failures here are silent.
!echo yes | mecab-ipadic-neologd/bin/install-mecab-ipadic-neologd -n > /dev/null 2>&1

#Error avoidance by symbolic links
# Makes /etc/mecabrc also reachable as /usr/local/etc/mecabrc.
# NOTE(review): presumably the Python binding looks for mecabrc under
# /usr/local/etc -- confirm. Re-running this cell reports an error if the
# link already exists, because plain `ln -s` does not overwrite.
!ln -s /etc/mecabrc /usr/local/etc/mecabrc

⑷ Create an instance by specifying mecab-ipadic-NEologd

#Check the dictionary path
# Prints MeCab's dictionary directory (from mecab-config) with
# "/mecab-ipadic-neologd" appended.
!echo `mecab-config --dicdir`"/mecab-ipadic-neologd"


import MeCab

# Option string for the tagger: "-d <dir>" selects the dictionary directory.
# NOTE(review): the directory is hard-coded; it should equal the path printed
# by the check above -- confirm on the runtime in use.
path = "-d /usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd"
# Tagger instance used for all parsing below.
m_neo = MeCab.Tagger(path)

⑸ Create a sentence-based noun list

# Tokens excluded from the noun list: digits, numerals, counters, pronouns
# and symbols ("*" is what the base-form field holds when it is undefined).
# NOTE(review): many entries look like English renderings of Japanese words
# (e.g. "Year", "Month", the mixed "周Year" and "もof") and the digit row is
# duplicated; such entries cannot match the Japanese base forms MeCab
# returns. Confirm the list against the Japanese original.
stopwords = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "0",
             "1", "2", "3", "4", "5", "6", "7", "8", "9", "0",
             "one", "two", "three", "four", "Five", "Six", "Seven", "Eight", "Nine", "〇",
             "Year", "Month", "Day", "Next", "Discount", "Times", "Target", "Disease", "that's all", "Less than", "周Year", "Case", "Every time",
             "of", "もof", "thing", "Yo", "Sama", "For", "Tend to", "this", "It", "that", "Who",
             "*", ",", ","]
# One inner list of nouns per sentence, in sentence order.
noun_list  = []

for line in lines:
    result = []
    # MeCab returns one token per line as "surface<TAB>comma-separated features".
    v1 = m_neo.parse(line)
    v2 = v1.splitlines()
    for v in v2:
        v3 = v.split("\t")
        # Lines without exactly one tab (such as the closing "EOS") are not tokens.
        if len(v3) != 2:
            continue
        v4 = v3[1].split(',')
        # With IPA-style dictionaries, field 0 is the part of speech and
        # field 6 the base form. The part-of-speech tag is Japanese, so nouns
        # are labelled "名詞"; the English word "noun" never matches.
        if len(v4) > 6 and v4[0] == "名詞" and v4[6] not in stopwords:
            result.append(v4[6])
    # Keep the (possibly empty) noun list of this sentence.
    noun_list.append(result)


⑹ Generation of co-occurrence data

import itertools #A module that collects iterator functions
from collections import Counter #A class that counts the number of times a dictionary type appears

#Generate a sentence-based noun pair list
# Every pair of nouns appearing in the same sentence is one co-occurrence.
# The guard checks the sentence itself (len(n)), not the whole corpus, so a
# text consisting of a single sentence still yields pairs.
pair_list = [
    list(itertools.combinations(n, 2))
    for n in noun_list
    if len(n) >= 2
]

#Flatten the noun pair list
all_pairs = list(itertools.chain.from_iterable(pair_list))

#Count the frequency of noun pairs
# NOTE(review): combinations() keeps sentence order, so ("a", "b") and
# ("b", "a") are counted under separate keys.
cnt_pairs = Counter(all_pairs)


**3. Draw network diagram**

⑺ Creation of drawing data

import pandas as pd
import numpy as np

#Keep the 30 most frequent pairs as (pair, count) tuples
# Named top_pairs rather than `dict` so the builtin dict type is not shadowed
# for the rest of the notebook.
top_pairs = sorted(cnt_pairs.items(), key=lambda x: x[1], reverse=True)[:30]

#Convert to a 2D array with one row [noun_a, noun_b, count] per pair
result = [[*key, value] for key, value in top_pairs]

# NOTE: because each row mixes strings with an int, NumPy stores every cell
# (including the count) as a string.
data = np.array(result)


⑻ Import of visualization library

import matplotlib.pyplot as plt
import networkx as nx
%matplotlib inline 

#Module to make matplotlib support Japanese display
!pip install japanize-matplotlib
import japanize_matplotlib

⑼ Visualization by NetworkX

#Generating a graph object
G = nx.Graph()

#Data reading
# Each row of `data` is [noun_a, noun_b, count]. The count is passed through
# int() so the edge weight is numeric even when it arrives as a string
# (np.array turns rows that mix str and int into strings).
G.add_weighted_edges_from((u, v, int(w)) for u, v, w in data)

#Drawing a graph
# Figure size is a readability choice for about 30 labelled nodes; adjust as needed.
plt.figure(figsize=(10, 10))
nx.draw_networkx(G,
                 node_shape = "s",
                 node_color = "chartreuse",
                 node_size = 800,
                 edge_color = "gray",
                 font_family = "IPAexGothic") #Japanese font specification
plt.show()



Comparison of how the same terms are tokenized by each dictionary:

| mecab-ipadic-NEologd | MeCab standard |
| --- | --- |
| "Infection" | "infection", "Disease" |
| "Developing countries" | "On the way", "Country" |
| "Association of Southeast Asian Nations" | "Southeast Asia", "Countries", "Union" |
| "Human security" | "Human", "of", "safety", "security" |

