**Word N-gram** uses a set of adjacent words as the unit of data; a 2-gram takes two adjacent words at a time.

**Co-occurrence (collocation)** instead counts the number of times **words appear together in the target unit (here, a sentence)**. This article uses pairs of two nouns: regardless of their positional relationship, **the combination of words that appear in the same sentence is the unit of data** (see the sketch below).
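A minimal sketch of the difference, using a toy tokenized sentence (the actual tokenization comes later with MeCab):

```python
words = ["I", "am", "a", "cat"]  # one tokenized sentence (toy data)

# Word 2-gram: adjacent pairs, position matters
bigrams = list(zip(words, words[1:]))
print(bigrams)
# [('I', 'am'), ('am', 'a'), ('a', 'cat')]

# Co-occurrence: every unordered pair of words in the same sentence
pairs = [(words[i], words[j]) for i in range(len(words)) for j in range(i + 1, len(words))]
print(pairs)
# [('I', 'am'), ('I', 'a'), ('I', 'cat'), ('am', 'a'), ('am', 'cat'), ('a', 'cat')]
```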
```python
import re
import zipfile
import urllib.request
import os.path
import glob
```
- `re`: short for Regular Expression; a module for working with regular expressions
- `zipfile`: a module for working with zip files
- `glob`: a module for getting file path names

```python
URL = 'https://www.aozora.gr.jp/cards/000148/files/772_ruby_33099.zip'

def download(URL):
    # Download the zip file
    zip_file = re.split(r'/', URL)[-1]
    urllib.request.urlretrieve(URL, zip_file)
    dir_name = os.path.splitext(zip_file)[0]

    # Unzip the file and save its contents
    with zipfile.ZipFile(zip_file) as zip_object:
        zip_object.extractall(dir_name)
    os.remove(zip_file)

    # Get the path of the text file
    path = os.path.join(dir_name, '*.txt')
    file_list = glob.glob(path)
    return file_list[0]
```
```python
def convert(download_text):
    # Read the file
    with open(download_text, 'rb') as f:
        data = f.read()
    text = data.decode('shift_jis')

    # Extract the body text
    text = re.split(r'\-{5,}', text)[2]
    text = re.split(r'底本：', text)[0]
    text = re.split(r'［＃改ページ］', text)[0]

    # Delete unnecessary parts
    text = re.sub(r'《.+?》', '', text)    # ruby (furigana) annotations
    text = re.sub(r'［＃.+?］', '', text)  # editorial notes
    text = re.sub(r'｜', '', text)         # ruby start markers
    text = re.sub(r'\r\n', '', text)
    text = re.sub(r'\u3000', '', text)     # full-width spaces
    text = re.sub(r'「', '', text)
    text = re.sub(r'」', '', text)
    return text
```
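A minimal sketch of what the cleanup does, on a made-up fragment in Aozora Bunko notation (not taken from the actual file):

```python
sample = '私は｜頑固《がんこ》だ［＃「頑固」に傍点］\r\n'
sample = re.sub(r'《.+?》', '', sample)   # remove the furigana reading
sample = re.sub(r'［＃.+?］', '', sample)  # remove the editorial note
sample = re.sub(r'｜', '', sample)         # remove the ruby start marker
sample = re.sub(r'\r\n', '', sample)       # remove the line break
print(sample)  # 私は頑固だ
```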
```python
# Get the file path
download_file = download(URL)

# Extract only the body text
text = convert(download_file)

# Split into a sentence-based list
sentences = text.split("。")
```
```
!apt install aptitude
!aptitude install mecab libmecab-dev mecab-ipadic-utf8 git make curl xz-utils file -y
!pip install mecab-python3==0.7
```
The argument to `MeCab.Tagger()` specifies the output mode; with `-Ochasen`, the result of morphological analysis is output in ChaSen format.

```python
import MeCab

mecab = MeCab.Tagger("-Ochasen")
```
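For reference, each line of `-Ochasen` output is tab-separated into roughly surface form, reading, base form, part of speech, and so on (a sketch; the exact fields depend on the dictionary):

```python
print(mecab.parse("吾輩は猫である"))
# Roughly:
# 吾輩   ワガハイ  吾輩   名詞-代名詞-一般
# は     ハ        は     助詞-係助詞
# 猫     ネコ      猫     名詞-一般
# ...
# so v.split()[2] is the base form and v.split()[3] is the part of speech.
```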
```python
# Generate a sentence-by-sentence noun list
noun_list = [
    [v.split()[2] for v in mecab.parse(sentence).splitlines()
     if (len(v.split()) >= 4 and v.split()[3][:2] == '名詞')]
    for sentence in sentences
]
```
For each `sentence` in `sentences`, morphological analysis is performed by `mecab.parse(sentence)`. The result is broken into lines with `.splitlines()`, each line becoming `v`; `v` is divided by `split()`, and its third element `[2]`, the base form, is added to the list. The condition `v.split()[3][:2] == '名詞'` extracts only the lines whose fourth element `[3]`, the part of speech, is a noun (名詞).

```python
import itertools
from collections import Counter
```

- `Counter`: a class for counting the number of occurrences of each element
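Before applying it to the real data, a quick illustration of how `itertools.combinations()` forms the unordered pairs:

```python
print(list(itertools.combinations(['A', 'B', 'C'], 2)))
# [('A', 'B'), ('A', 'C'), ('B', 'C')]
```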
```python
# Generate a sentence-based noun pair list
pair_list = [
    list(itertools.combinations(n, 2))
    for n in noun_list if len(n) >= 2
]

# Flatten the noun pair list
all_pairs = []
for u in pair_list:
    all_pairs.extend(u)

# Count the frequency of noun pairs
cnt_pairs = Counter(all_pairs)
```
Two-noun combinations are generated for each sentence with `itertools.combinations()`, turned into lists with `list()`, and stored in `pair_list`. Since `pair_list` holds pairs per sentence, it cannot be counted as is; it is flattened by sequentially appending each sentence's pairs to the newly prepared variable `all_pairs` with `extend()`. Passing this to `Counter()` generates the **dictionary-type co-occurrence data** `cnt_pairs`.
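You can peek at the result with `Counter.most_common()` (the actual pairs and counts depend on the text):

```python
# Inspect the five most frequent noun pairs
for pair, freq in cnt_pairs.most_common(5):
    print(pair, freq)
```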
```python
import pandas as pd
import numpy as np

tops = sorted(
    cnt_pairs.items(),
    key=lambda x: x[1], reverse=True
)[:50]
```
This combines `sorted()` with a `lambda` expression to sort the dictionary-type object by the element specified in `key=lambda`. Here `x[1]` is the second element of each item, i.e. the frequency, `reverse=True` sorts in descending order, and `[:50]` extracts the top 50 pairs.
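A minimal example of this sorting idiom:

```python
d = {'a': 3, 'b': 5, 'c': 1}
print(sorted(d.items(), key=lambda x: x[1], reverse=True))
# [('b', 5), ('a', 3), ('c', 1)]
```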
```python
noun_1 = []
noun_2 = []
frequency = []

# Create the data frame
for n, f in tops:
    noun_1.append(n[0])
    noun_2.append(n[1])
    frequency.append(f)

df = pd.DataFrame({'First noun': noun_1, 'Second noun': noun_2, 'Frequency': frequency})

# Set the weighted data
weighted_edges = np.array(df)
```
The data frame is converted into a NumPy array with `np.array()` to obtain `weighted_edges` (the weighted edge data).
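Each row of `weighted_edges` has the form `[noun 1, noun 2, frequency]`, i.e. the `(u, v, w)` triples that `add_weighted_edges_from()` below expects. A quick structural peek (illustrative values only; the actual contents depend on the text):

```python
print(weighted_edges[:3])
# Something like:
# [['noun-a' 'noun-b' 61]
#  ['noun-a' 'noun-c' 38]
#  ['noun-b' 'noun-d' 21]]
```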
```python
import matplotlib.pyplot as plt
import networkx as nx
%matplotlib inline
```
Install `japanize_matplotlib` and then specify a Japanese font.

```python
# Module that makes matplotlib support Japanese display
!pip install japanize-matplotlib
import japanize_matplotlib
```

The setting `font_family = "IPAexGothic"` is the key point: by **specifying a Japanese font for font_family**, the node labels are displayed correctly in Japanese.
```python
# Generate a graph object
G = nx.Graph()

# Read the weighted data
G.add_weighted_edges_from(weighted_edges)

# Draw the network diagram
plt.figure(figsize=(10, 10))
nx.draw_networkx(G,
                 node_shape="s",
                 node_color="c",
                 node_size=500,
                 edge_color="gray",
                 font_family="IPAexGothic")  # font specification
plt.show()
```
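One caveat: by default `draw_networkx()` computes a spring layout from a random initial state, so the diagram changes from run to run. If you want a reproducible picture, you can compute the layout yourself with a fixed seed (assuming a NetworkX version whose `nx.spring_layout()` accepts the `seed` parameter):

```python
# Fix the layout so repeated runs draw the same diagram
pos = nx.spring_layout(G, seed=42)
nx.draw_networkx(G, pos,
                 node_shape="s",
                 node_color="c",
                 node_size=500,
                 edge_color="gray",
                 font_family="IPAexGothic")
plt.show()
```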