[Co-occurrence analysis] Easy co-occurrence analysis with Python! [Python]

Co-occurrence analysis

In the previous article [https://qiita.com/osakasho/items/0a0b50fc17c38d96c45e] I only did morphological analysis, so this time I'll also do co-occurrence analysis and draw the result as a graph.

Install what you need

pip install pyvis


import spacy
nlp = spacy.load('ja_ginza_nopn')
import re
import itertools
import collections
from pyvis.network import Network
import pandas as pd
import time

"""---------Disassembly module--------"""
def sentence_separator(path, colname):
    """Split every row of one CSV column into a list of words.

    Reads the CSV at ``path``, passes each text value of column ``colname``
    to ``ginza`` and collects the combined word list of every row.

    Args:
        path: Path of the CSV file to read (decoded as ``utf_8_sig``).
        colname: Name of the column holding the text to analyse.

    Returns:
        A list with one entry per text row; each entry is the ``total_ls``
        word list that ``ginza`` returned for that row.

    Raises:
        KeyError: If ``colname`` is not a column of the CSV.
    """
    df = pd.read_csv(path, encoding="utf_8_sig")
    sentence = []
    for d in df[colname]:
        # pandas reads empty cells as NaN (a float); only real text can be
        # handed to the tokenizer, so skip anything that is not a string.
        if not isinstance(d, str):
            continue
        total_ls, _noun_ls, _verb_ls = ginza(d)
        sentence.append(total_ls)
    return sentence

def ginza(word):
    """Extract the noun chunks and verb lemmas of ``word``.

    Args:
        word: Text to analyse with the module-level ``nlp`` pipeline.

    Returns:
        A ``(total_ls, Noun_ls, Verm_ls)`` tuple where ``Noun_ls`` holds the
        text of each noun chunk, ``Verm_ls`` holds the lemma of each token
        tagged ``VERB``, and ``total_ls`` is the nouns followed by the verbs.
    """
    doc = nlp(word)
    Noun_ls = [chunk.text for chunk in doc.noun_chunks]
    Verm_ls = [token.lemma_ for token in doc if token.pos_ == "VERB"]
    # Combined word list for this text: nouns first, then verbs.
    total_ls = Noun_ls + Verm_ls
    return total_ls, Noun_ls, Verm_ls

#Acquire text data.
filename = "list.csv"
file_path = filename
colname = "lyrics"

sentences = sentence_separator(file_path, colname)
# Every unordered pair of words inside one sentence; each pair is sorted so
# that (a, b) and (b, a) are counted as the same co-occurrence.
sentence_combinations = [
    [tuple(sorted(words)) for words in itertools.combinations(sentence, 2)]
    for sentence in sentences
]
# Flatten the per-sentence pair lists into one list that can be counted.
target_combinations = list(itertools.chain.from_iterable(sentence_combinations))

#Main processing of network drawing
def kyoki_word_network(csv_path="kyoki.csv", max_edges=150):
    """Build a pyvis network from a co-occurrence CSV.

    Each row of the CSV becomes one edge between the words in its ``first``
    and ``second`` columns, weighted by its ``count`` column.

    Args:
        csv_path: CSV file with ``first``, ``second`` and ``count`` columns.
        max_edges: Only the first ``max_edges`` rows of the CSV are drawn.

    Returns:
        The populated ``Network``. Every node's hover title lists its
        neighbours and its ``value`` is set to its number of neighbours.
    """
    got_net = Network(height="1000px", width="95%", bgcolor="#FFFFFF", font_color="black", notebook=True)

    # The word columns are used as node ids/labels and joined into strings
    # below, so read them as str and stop pandas from converting words such
    # as "NA" or "null" into NaN.
    got_data = pd.read_csv(
        csv_path,
        dtype={"first": str, "second": str},
        keep_default_na=False,
        encoding="utf_8_sig",
    )[:max_edges]

    sources = got_data['first']
    targets = got_data['second']
    weights = got_data['count']

    for src, dst, w in zip(sources, targets, weights):
        got_net.add_node(src, src, title=src)
        got_net.add_node(dst, dst, title=dst)
        got_net.add_edge(src, dst, value=w)

    neighbor_map = got_net.get_adj_list()

    # add neighbor data to node hover data
    for node in got_net.nodes:
        # Sorted so the hover text does not depend on set iteration order.
        neighbors = sorted(neighbor_map[node["id"]])
        node["title"] += " Neighbors:<br>" + "<br>".join(neighbors)
        node["value"] = len(neighbors)

    return got_net

#Count how many times each word pair co-occurs
ct = collections.Counter(target_combinations)

#Temporarily save data
# Explicit columns keep the header row even when there are no pairs at all,
# so the CSV can still be read back.
pd.DataFrame(
    [{'first': pair[0], 'second': pair[1], 'count': count} for pair, count in ct.most_common()],
    columns=['first', 'second', 'count'],
).to_csv('kyoki.csv', index=False, encoding="utf_8_sig")


#Execution of processing
got_net = kyoki_word_network()
# Render the network to kyoki.html so it can be opened in a browser.
got_net.show("kyoki.html")


kyoki.html should now have been output, so open it in a browser to view the network. image.png

The end.

