In the previous post, I focused on word similarity and learned how to process and analyze data. In this post, you will learn about data analysis with a focus on word continuity.
First, as a review, here is the code that cleans the strings with regular expressions and splits them into words with Janome.
import os
import json
import pandas as pd
import re
from janome.tokenizer import Tokenizer
# Create a dataset of non-breakdown utterances
file_path = './6110_nlp_preprocessing_data/init100/'
file_dir = os.listdir(file_path)
label_text = []
for file in file_dir[:10]:
    r = open(file_path + file, 'r', encoding='utf-8')
    json_data = json.load(r)
    for turn in json_data['turns']:
        turn_index = turn['turn-index']
        speaker = turn['speaker']
        utterance = turn['utterance']
        if turn_index != 0:
            if speaker == 'U':
                u_text = ''
                u_text = utterance
            else:
                a = ''
                for annotate in turn['annotations']:
                    a = annotate['breakdown']
                    tmp1 = str(a) + '\t' + u_text
                    tmp2 = tmp1.split('\t')
                    label_text.append(tmp2)
df_label_text = pd.DataFrame(label_text)
df_label_text = df_label_text.drop_duplicates()
df_label_text_O = df_label_text[df_label_text[0] == 'O']
t = Tokenizer()
# Create an empty list to store the segmented words
wakatiO = []
tmp1 = []
tmp2 = ''
# Read the data row by row
for row in df_label_text_O.values.tolist():
    # Remove unnecessary strings with a regular expression
    reg_row = re.sub('[0-9a-zA-Z]+', '', row[1])
    reg_row = reg_row.replace('\n', '')
    # Split the text into words with Janome and store them in a list
    tmp1 = t.tokenize(reg_row, wakati=True)
    wakatiO.append(tmp1)
    tmp1 = []
# Show the segmented words
print(wakatiO)
We will now convert the words to numbers so that they are easier to work with in the subsequent analysis. For this conversion, we create a lookup table (dictionary) that assigns an ID to each word. Here, the IDs are serial numbers assigned in descending order of word frequency.
Split the dataset into words, then count the word occurrences and sort them in descending order.
Here is the full example:
import os
import json
import pandas as pd
import re
from janome.tokenizer import Tokenizer
from collections import Counter
import itertools
# Create a dataset of non-breakdown utterances
file_path = './6110_nlp_preprocessing_data/init100/'
file_dir = os.listdir(file_path)
label_text = []
for file in file_dir[:10]:
    r = open(file_path + file, 'r', encoding='utf-8')
    json_data = json.load(r)
    for turn in json_data['turns']:
        turn_index = turn['turn-index']
        speaker = turn['speaker']
        utterance = turn['utterance']
        if turn_index != 0:
            if speaker == 'U':
                u_text = ''
                u_text = utterance
            else:
                a = ''
                for annotate in turn['annotations']:
                    a = annotate['breakdown']
                    tmp1 = str(a) + '\t' + u_text
                    tmp2 = tmp1.split('\t')
                    label_text.append(tmp2)
df_label_text = pd.DataFrame(label_text)
df_label_text = df_label_text.drop_duplicates()
df_label_text_O = df_label_text[df_label_text[0] == 'O']
# Remove unnecessary strings with regular expressions and split into words
t = Tokenizer()
wakatiO = []
tmp1 = []
tmp2 = ''
for row in df_label_text_O.values.tolist():
    reg_row = re.sub('[0-9a-zA-Z]+', '', row[1])
    reg_row = reg_row.replace('\n', '')
    tmp1 = t.tokenize(reg_row, wakati=True)
    wakatiO.append(tmp1)
    tmp1 = []
# ① Count the number of occurrences of each word
word_freq = Counter(itertools.chain(*wakatiO))
# ② Sort the words by number of occurrences and add them to dic
dic = []
for word_uniq in word_freq.most_common():
    dic.append(word_uniq[0])
# ③ Create a dictionary that assigns an ID to each word
# Use enumerate(dic) in a for statement to get each index `i` and element `word_uniq` of `dic`
dic_inv = {}
for i, word_uniq in enumerate(dic, start=1):
    dic_inv.update({word_uniq: i})
# Display the dictionary
print(dic_inv)
To count the number of occurrences of each word, we use the Counter class from the Python standard library together with itertools.chain().
① Counter
Counter counts the number of elements. If you pass a list or tuple to collections.Counter(), it returns a Counter object whose keys are the elements and whose values are their occurrence counts.
Dictionary-type methods are also available: keys() returns the list of keys, values() the list of values, and items() the list of (key, value) tuples.
import collections
# Generate a list `list_`
list_ = ['a', 'c', 'a', 'c', 'b', 'b', 'c', 'c', 'a']
# Count the elements of `list_`
C = collections.Counter(list_)
# Count result
print(C)
Counter({'c': 4, 'a': 3, 'b': 2})
#If you specify an element, the count result is returned.
print(C['c'])
4
# If you specify an element that is not included, `0` is returned
print(C['d'])
0
# most_common() returns (element, count) tuples in descending order of count
print(C.most_common())
[('c', 4), ('a', 3), ('b', 2)]
# Get the second (element, count) tuple
most_common = C.most_common()
print(most_common[1])
('a', 3)
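A Counter also supports the dictionary-style methods mentioned above. A minimal sketch (assuming Python 3.7+, where insertion order is preserved):
import collections
C = collections.Counter(['a', 'c', 'a', 'c', 'b', 'b', 'c', 'c', 'a'])
# List of keys
print(list(C.keys()))
['a', 'c', 'b']
# List of values
print(list(C.values()))
[3, 4, 2]
# List of (key, count) tuples
print(list(C.items()))
[('a', 3), ('c', 4), ('b', 2)]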
② itertools.chain
itertools.chain() flattens a multidimensional list into one dimension. Even if the list looks like [[1, 2, 3], [4, 5, 6]], you can access the elements of each inner list. Because chain() expects each iterable as a separate argument, unpack the nested list with * when passing it.
import itertools
from collections import Counter
b = [['A', 'B', 'C',],['D', 'E', 'F']]
print(list(itertools.chain(*b)))
['A', 'B', 'C', 'D', 'E', 'F']
a = Counter(itertools.chain(*b))
print(a)
Counter({'A': 1, 'B': 1, 'C': 1, 'D': 1, 'E': 1, 'F': 1})
How to sort words
③ most_common(n) is used. It returns a list of (element, count) tuples sorted in descending order of count. If n is specified, the n most frequent elements are returned; if n is omitted, all elements are returned.
from collections import Counter
list_ = ['a', 'c', 'a', 'c', 'b', 'b', 'c', 'c', 'a']
C = Counter(list_)
print(C)
Counter({'c': 4, 'a': 3, 'b': 2})
# most_common() returns (element, count) tuples in descending order of count
print(C.most_common())
[('c', 4), ('a', 3), ('b', 2)]
print(C.most_common(2))
[('c', 4), ('a', 3)]
# Get the second and subsequent tuples
mc = C.most_common()
print(mc[1:])
[('a', 3), ('b', 2)]
For the non-breakdown utterance dataset, convert the words to IDs using the dictionary we created, and store the result in a new array.
wakatiO contains the lists of segmented words.
print(wakatiO[0:1])
[['Good morning', '。'], ["I'm sorry", '、', 'you', 'Is', 'Who', 'is', 'Or', '?']]
The word-to-ID mapping is stored in dic_inv.
print(dic_inv['Good morning'])
218
print(dic_inv['。'])
2
Using a for statement and dictionary lookups (explained in the previous section), convert each list of words in wakatiO to IDs with dic_inv.
wakatiO_n = [[dic_inv[word] for word in waka] for waka in wakatiO]
The first element, wakatiO_n[0], then becomes:
[218, 2]
The outer part, for waka in wakatiO, passes the word lists of wakatiO to waka one at a time.
The inner part, for word in waka, passes the words of waka to word one at a time.
Finally, dic_inv[word] looks up the ID that corresponds to word in the dictionary dic_inv.
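For reference, here is an equivalent written with explicit for loops instead of the nested comprehension; this is just a sketch that produces the same wakatiO_n.
wakatiO_n = []
for waka in wakatiO:
    # waka is one utterance: a list of words
    ids = []
    for word in waka:
        # Look up the ID of each word in dic_inv
        ids.append(dic_inv[word])
    wakatiO_n.append(ids)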
Here is the full example:
import os
import json
import pandas as pd
import re
from janome.tokenizer import Tokenizer
from collections import Counter
import itertools
# Create a dataset of non-breakdown utterances
file_path = './6110_nlp_preprocessing_data/init100/'
file_dir = os.listdir(file_path)
label_text = []
for file in file_dir[:10]:
    r = open(file_path + file, 'r', encoding='utf-8')
    json_data = json.load(r)
    for turn in json_data['turns']:
        turn_index = turn['turn-index']
        speaker = turn['speaker']
        utterance = turn['utterance']
        if turn_index != 0:
            if speaker == 'U':
                u_text = ''
                u_text = utterance
            else:
                a = ''
                for annotate in turn['annotations']:
                    a = annotate['breakdown']
                    tmp1 = str(a) + '\t' + u_text
                    tmp2 = tmp1.split('\t')
                    label_text.append(tmp2)
df_label_text = pd.DataFrame(label_text)
df_label_text = df_label_text.drop_duplicates()
df_label_text_O = df_label_text[df_label_text[0] == 'O']
# Remove unnecessary strings with regular expressions and split into words
t = Tokenizer()
wakatiO = []
tmp1 = []
tmp2 = ''
for row in df_label_text_O.values.tolist():
    reg_row = re.sub('[0-9a-zA-Z]+', '', row[1])
    reg_row = reg_row.replace('\n', '')
    tmp1 = t.tokenize(reg_row, wakati=True)
    wakatiO.append(tmp1)
    tmp1 = []
# Count word occurrences, sort them, and add the words to dic
word_freq = Counter(itertools.chain(*wakatiO))
dic = []
for word_uniq in word_freq.most_common():
    dic.append(word_uniq[0])
# Create a dictionary that assigns an ID to each word
# Use enumerate(dic) in a for statement to get each index `i` and element `word_uniq` of `dic`
dic_inv = {}
for i, word_uniq in enumerate(dic, start=1):
    dic_inv.update({word_uniq: i})
# ① Convert words to IDs
wakatiO_n = [[dic_inv[word] for word in waka] for waka in wakatiO]
print(wakatiO_n)
dic_inv[word] is used to look up each segmented word.
To repeat, this converts the lists of words in wakatiO to IDs with dic_inv.
wakatiO_n = [[dic_inv[word] for word in waka] for waka in wakatiO]
The first element, wakatiO_n[0], then becomes:
[218, 2]
The outer part, for waka in wakatiO, passes the word lists of wakatiO to waka one at a time.
The inner part, for word in waka, passes the words of waka to word one at a time.
Finally, dic_inv[word] looks up the ID that corresponds to word in the dictionary dic_inv.
When analyzing natural language data, if the goal is to classify text, you create a word-document matrix from the word data. If the goal is to extract topics from text, you create an N-gram model.
An N-gram is a method of dividing text into sequences of N consecutive characters. A familiar example is its use in search engine indexes. When N = 1 it is called a 1-gram (unigram), when N = 2 a 2-gram (bigram), and when N = 3 a 3-gram (trigram).
[Text] aiueo
[1-gram] a | i | u | e | o
[2-gram] ai | iu | ue | eo
[3-gram] aiu | iue | ueo
You can also split the text into N consecutive words.
[Text] It is sunny today.
[1-gram] It | is | sunny | today | .
[2-gram] It-is | is-sunny | sunny-today | today-.
[3-gram] It-is-sunny | is-sunny-today | sunny-today-.
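As a sketch, the same splitting can be written as a small helper that produces character or word N-grams; the function name ngrams and the sample inputs below are just for illustration.
def ngrams(seq, n):
    # Return every run of n consecutive elements (characters of a string or words of a list)
    return [seq[i:i+n] for i in range(len(seq) - n + 1)]

print(ngrams('aiueo', 2))
>>>Output result
['ai', 'iu', 'ue', 'eo']
print(ngrams(['It', 'is', 'sunny', 'today', '.'], 3))
>>>Output result
[['It', 'is', 'sunny'], ['is', 'sunny', 'today'], ['sunny', 'today', '.']]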
Whereas a word-document matrix represents the co-occurrence of words (whether they appear in the same sentence), an N-gram represents the continuity of words (the order in which they appear).
Here, since we want to extract topics from the words and analyze them, we create the latter: an N-gram model, specifically a 2-gram model.
First, let's create a 2-gram model with a simple example.
word = ['It', 'is', 'sunny', 'today', '.']
bigram = []
for i in range(len(word)-1):
    bigram.append([word[i], word[i+1]])
print(bigram)
>>>Output result
[['It', 'is'], ['is', 'sunny'], ['sunny', 'today'], ['today', '.']]
Now create a 2-gram list from the ID-converted dataset built in "Convert words to numbers".
To create the 2-gram list, we use DataFrame.groupby() to group the rows and sum() to total the values.
First, let's check how groupby() and sum() are used.
from pandas import DataFrame
# Prepare a DataFrame
df = DataFrame([['AA', 'Camela', 150000, 20000],
                ['BB', 'Camera', 70000, 10000],
                ['AA', 'earphone', 2000, 200],
                ['AA', 'Video', 3000, 150],
                ['BB', 'earphone', 200000, 8000],
                ['BB', 'Camera', 50000, 5000],
                ['AA', 'Video', 1000, 200]],
               columns=['CUSTOMER', 'PRODUCT', 'PRICE', 'DISCOUNT'])
df
>>>Output result
CUSTOMER PRODUCT PRICE DISCOUNT
0 AA Camela 150000 20000
1 BB Camera 70000 10000
2 AA earphone 2000 200
3 AA Video 3000 150
4 BB earphone 200000 8000
5 BB Camera 50000 5000
6 AA Video 1000 200
# Group by a single key and calculate the sum
grouped = df.groupby('CUSTOMER').sum()
grouped
>>>Output result
PRICE DISCOUNT
CUSTOMER
AA 156000 20550
BB 320000 23000
# Group by multiple keys and calculate the sum
grouped = df.groupby(['CUSTOMER','PRODUCT']).sum()
grouped
>>>Output result
PRICE DISCOUNT
CUSTOMER PRODUCT
AA Camela 150000 20000
Video 4000 350
earphone 2000 200
BB Camera 120000 15000
earphone 200000 8000
When creating the 2-gram list, duplicates matter: a 2-gram that appears multiple times is an important word combination.
We therefore group with groupby() and total with sum() to calculate the number of occurrences (= weight), as in the sketch below.
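Before the full example, here is a minimal sketch of this counting idea on a toy bigram list; the IDs are made up for illustration.
import pandas as pd
# The pair (1, 2) appears twice and (3, 4) once
toy = pd.DataFrame([[1, 2], [1, 2], [3, 4]], columns=['node1', 'node2'])
toy['weight'] = 1
# Summing the weight column per (node1, node2) pair gives the number of occurrences
toy = toy.groupby(['node1', 'node2'], as_index=False).sum()
print(toy)
>>>Output result
   node1  node2  weight
0      1      2       2
1      3      4       1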
Here is the full example:
import os
import json
import pandas as pd
import re
from janome.tokenizer import Tokenizer
from collections import Counter
import itertools
# Create a dataset of non-breakdown utterances
file_path = './6110_nlp_preprocessing_data/init100/'
file_dir = os.listdir(file_path)
label_text = []
for file in file_dir[:10]:
    r = open(file_path + file, 'r', encoding='utf-8')
    json_data = json.load(r)
    for turn in json_data['turns']:
        turn_index = turn['turn-index']
        speaker = turn['speaker']
        utterance = turn['utterance']
        if turn_index != 0:
            if speaker == 'U':
                u_text = ''
                u_text = utterance
            else:
                a = ''
                for annotate in turn['annotations']:
                    a = annotate['breakdown']
                    tmp1 = str(a) + '\t' + u_text
                    tmp2 = tmp1.split('\t')
                    label_text.append(tmp2)
df_label_text = pd.DataFrame(label_text)
df_label_text = df_label_text.drop_duplicates()
df_label_text_O = df_label_text[df_label_text[0] == 'O']
# Remove unnecessary strings with regular expressions and split into words
t = Tokenizer()
wakatiO = []
tmp1 = []
tmp2 = ''
for row in df_label_text_O.values.tolist():
    reg_row = re.sub('[0-9a-zA-Z]+', '', row[1])
    reg_row = reg_row.replace('\n', '')
    tmp1 = t.tokenize(reg_row, wakati=True)
    wakatiO.append(tmp1)
    tmp1 = []
# Count word occurrences, sort them, and add the words to dic
word_freq = Counter(itertools.chain(*wakatiO))
dic = []
for word_uniq in word_freq.most_common():
    dic.append(word_uniq[0])
# Create a dictionary that assigns an ID to each word
dic_inv = {}
for i, word_uniq in enumerate(dic, start=1):
    dic_inv.update({word_uniq: i})
# Convert words to IDs
wakatiO_n = [[dic_inv[word] for word in waka] for waka in wakatiO]
# Create a 2-gram list
tmp = []
bigramO = []
for i in range(0, len(wakatiO_n)):
    row = wakatiO_n[i]
    # Create the 2-grams of this utterance
    for j in range(len(row)-1):
        tmp.append([row[j], row[j+1]])
    bigramO.extend(tmp)
    tmp = []
# Count duplicate 2-grams and express the result as a DataFrame
# Convert the array `bigramO` to a DataFrame and set the column names
df_bigramO = pd.DataFrame(bigramO)
df_bigramO = df_bigramO.rename(columns={0: 'node1', 1: 'node2'})
# Add a `weight` column and initialize every value to 1
df_bigramO['weight'] = 1
# Count the number of occurrences of each 2-gram
df_bigramO = df_bigramO.groupby(['node1', 'node2'], as_index=False).sum()
# Extract the 2-grams that appear more than once
# Number of occurrences = the summed `weight` value
df_bigramO = df_bigramO[df_bigramO['weight'] > 1]
# Display the 2-grams
df_bigramO.head(10)
Previously, we created an undirected graph with words as nodes and the similarity between words as the edges and their weights.
Here, we create a directed graph (directed network) with words as nodes and the number of occurrences of each word pair as the edges and their weights.
Directed graphs are meaningful where the order of appearance matters, for example the order in which words appear, or the exchange of communications within an organization. First, let's create and visualize a directed graph using the 2-gram list created in "2-gram list creation".
Here is the full example:
import os
import json
import pandas as pd
import re
from janome.tokenizer import Tokenizer
from collections import Counter
import itertools
import networkx as nx
import matplotlib.pyplot as plt
# Create a dataset of non-breakdown utterances
file_path = './6110_nlp_preprocessing_data/init100/'
file_dir = os.listdir(file_path)
label_text = []
for file in file_dir[:10]:
    r = open(file_path + file, 'r', encoding='utf-8')
    json_data = json.load(r)
    for turn in json_data['turns']:
        turn_index = turn['turn-index']
        speaker = turn['speaker']
        utterance = turn['utterance']
        if turn_index != 0:
            if speaker == 'U':
                u_text = ''
                u_text = utterance
            else:
                a = ''
                for annotate in turn['annotations']:
                    a = annotate['breakdown']
                    tmp1 = str(a) + '\t' + u_text
                    tmp2 = tmp1.split('\t')
                    label_text.append(tmp2)
df_label_text = pd.DataFrame(label_text)
df_label_text = df_label_text.drop_duplicates()
df_label_text_O = df_label_text[df_label_text[0] == 'O']
# Remove unnecessary strings with regular expressions and split into words
t = Tokenizer()
wakatiO = []
tmp1 = []
tmp2 = ''
for row in df_label_text_O.values.tolist():
    reg_row = re.sub('[0-9a-zA-Z]+', '', row[1])
    reg_row = reg_row.replace('\n', '')
    tmp1 = t.tokenize(reg_row, wakati=True)
    wakatiO.append(tmp1)
    tmp1 = []
# Count word occurrences, sort them, and add the words to dic
word_freq = Counter(itertools.chain(*wakatiO))
dic = []
for word_uniq in word_freq.most_common():
    dic.append(word_uniq[0])
# Create a dictionary that assigns an ID to each word
dic_inv = {}
for i, word_uniq in enumerate(dic, start=1):
    dic_inv.update({word_uniq: i})
# Convert words to IDs
wakatiO_n = [[dic_inv[word] for word in waka] for waka in wakatiO]
# Create a 2-gram list
tmp = []
bigramO = []
for i in range(0, len(wakatiO_n)):
    row = wakatiO_n[i]
    # Create the 2-grams of this utterance
    for j in range(len(row)-1):
        tmp.append([row[j], row[j+1]])
    bigramO.extend(tmp)
    tmp = []
# Convert the array `bigramO` to a DataFrame and set the column names
df_bigramO = pd.DataFrame(bigramO)
df_bigramO = df_bigramO.rename(columns={0: 'node1', 1: 'node2'})
# Add a `weight` column and initialize every value to 1
df_bigramO['weight'] = 1
# Count the number of occurrences of each 2-gram
df_bigramO = df_bigramO.groupby(['node1', 'node2'], as_index=False).sum()
# Extract the 2-grams that appear more than once
df_bigramO = df_bigramO[df_bigramO['weight'] > 1]
#Create directed graph
G_bigramO = nx.from_pandas_edgelist(df_bigramO, 'node1', 'node2', ['weight'], nx.DiGraph)
#Visualize the created graph
#Layout settings
pos = nx.spring_layout(G_bigramO)
nx.draw_networkx(G_bigramO, pos)
plt.show()
To create a directed graph, add nx.DiGraph to the arguments of nx.from_pandas_edgelist(), the same function used to create the undirected graph. For details on the other arguments, see 2.2.2 Creating a similarity network.
G_bigramO = nx.from_pandas_edgelist(df_bigramO, 'node1', 'node2', ['weight'], nx.DiGraph)
Visualization works exactly as it does for an undirected graph.
# Import `pyplot` from the `Matplotlib` library
from matplotlib import pyplot as plt
# Calculate a suitable display position for each node
pos = nx.spring_layout(graph)
# Draw the graph
nx.draw_networkx(graph, pos)
# Display the graph with Matplotlib
plt.show()
Even for a directed graph, the idea is the same as for an undirected graph. Since it is hard to grasp the characteristics of the 2-gram network just by looking at the visualized graph, we grasp them quantitatively using indicators.
Here again, let's calculate the average clustering coefficient and the betweenness centrality.
Here is the full example:
import os
import json
import pandas as pd
import re
from janome.tokenizer import Tokenizer
from collections import Counter
import itertools
import networkx as nx
import matplotlib.pyplot as plt
# Create a dataset of non-breakdown utterances
file_path = './6110_nlp_preprocessing_data/init100/'
file_dir = os.listdir(file_path)
label_text = []
for file in file_dir[:10]:
    r = open(file_path + file, 'r', encoding='utf-8')
    json_data = json.load(r)
    for turn in json_data['turns']:
        turn_index = turn['turn-index']
        speaker = turn['speaker']
        utterance = turn['utterance']
        if turn_index != 0:
            if speaker == 'U':
                u_text = ''
                u_text = utterance
            else:
                a = ''
                for annotate in turn['annotations']:
                    a = annotate['breakdown']
                    tmp1 = str(a) + '\t' + u_text
                    tmp2 = tmp1.split('\t')
                    label_text.append(tmp2)
df_label_text = pd.DataFrame(label_text)
df_label_text = df_label_text.drop_duplicates()
df_label_text_O = df_label_text[df_label_text[0] == 'O']
# Remove unnecessary strings with regular expressions and split into words
t = Tokenizer()
wakatiO = []
tmp1 = []
tmp2 = ''
for row in df_label_text_O.values.tolist():
    reg_row = re.sub('[0-9a-zA-Z]+', '', row[1])
    reg_row = reg_row.replace('\n', '')
    tmp1 = t.tokenize(reg_row, wakati=True)
    wakatiO.append(tmp1)
    tmp1 = []
# Count word occurrences, sort them, and add the words to dic
word_freq = Counter(itertools.chain(*wakatiO))
dic = []
for word_uniq in word_freq.most_common():
    dic.append(word_uniq[0])
# Create a dictionary that assigns an ID to each word
dic_inv = {}
for i, word_uniq in enumerate(dic, start=1):
    dic_inv.update({word_uniq: i})
# Convert words to IDs
wakatiO_n = [[dic_inv[word] for word in waka] for waka in wakatiO]
# Create a 2-gram list
tmp = []
bigramO = []
for i in range(0, len(wakatiO_n)):
    row = wakatiO_n[i]
    # Create the 2-grams of this utterance
    for j in range(len(row)-1):
        tmp.append([row[j], row[j+1]])
    bigramO.extend(tmp)
    tmp = []
# Convert the array `bigramO` to a DataFrame and set the column names
df_bigramO = pd.DataFrame(bigramO)
df_bigramO = df_bigramO.rename(columns={0: 'node1', 1: 'node2'})
# Add a `weight` column and initialize every value to 1
df_bigramO['weight'] = 1
# Count the number of occurrences of each 2-gram
df_bigramO = df_bigramO.groupby(['node1', 'node2'], as_index=False).sum()
# Extract the 2-grams that appear more than once
df_bigramO = df_bigramO[df_bigramO['weight'] > 1]
# Create a directed graph
G_bigramO = nx.from_pandas_edgelist(
    df_bigramO, 'node1', 'node2', ['weight'], nx.DiGraph)
# Network of non-breakdown utterances
# ① Calculate the average clustering coefficient
print('Average clustering coefficient')
print(nx.average_clustering(G_bigramO, weight='weight'))
print()
# ② Calculate betweenness centrality
bc = nx.betweenness_centrality(G_bigramO, weight='weight')
print('Betweenness centrality')
for k, v in sorted(bc.items(), key=lambda x: -x[1]):
    print(str(k) + ': ' + str(v))
The higher the average of the clustering coefficients of all nodes, the denser the network. The average clustering coefficient is calculated with nx.average_clustering().
nx.average_clustering(G, weight=None)
G
Specify the graph (here, the directed graph G_bigramO created in the previous section).
weight
Specify the name of the edge attribute to use as the weight. If None, the weight of every edge is 1.
Betweenness centrality is determined by how many of the shortest paths between all pairs of nodes pass through a given node. In other words, the nodes that are used most to convey information efficiently are the most central intermediaries.
nx.betweenness_centrality(G, weight=None)
G
Specify the graph (here, the directed graph G_bigramO created in the previous section).
weight
Specify the name of the edge attribute to use as the weight. If None, all edge weights are considered equal.
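As a quick sanity check, both functions can be tried on a toy directed graph first; the three nodes and weights below are made up for illustration, and only the calling pattern matters.
import networkx as nx
G = nx.DiGraph()
G.add_weighted_edges_from([(1, 2, 1), (2, 3, 2), (1, 3, 1)])
# Average clustering coefficient of the toy graph
print(nx.average_clustering(G, weight='weight'))
# Betweenness centrality of each node, as a dictionary {node: centrality}
print(nx.betweenness_centrality(G, weight='weight'))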
We have now computed several indicators to quantitatively understand the characteristics of the network. In addition, let's use frequency distributions of the degrees to see how the words influence each other.
Because the word nodes in a directed network have direction, we consider the two separately:
In-degree: how much a word is influenced by other words
Out-degree: how much a word influences other words
Here is the full example:
import os
import json
import pandas as pd
import re
from janome.tokenizer import Tokenizer
from collections import Counter
import itertools
import networkx as nx
import matplotlib.pyplot as plt
# Create a dataset of non-breakdown utterances
file_path = './6110_nlp_preprocessing_data/init100/'
file_dir = os.listdir(file_path)
label_text = []
for file in file_dir[:100]:
    r = open(file_path + file, 'r', encoding='utf-8')
    json_data = json.load(r)
    for turn in json_data['turns']:
        turn_index = turn['turn-index']
        speaker = turn['speaker']
        utterance = turn['utterance']
        if turn_index != 0:
            if speaker == 'U':
                u_text = ''
                u_text = utterance
            else:
                a = ''
                for annotate in turn['annotations']:
                    a = annotate['breakdown']
                    tmp1 = str(a) + '\t' + u_text
                    tmp2 = tmp1.split('\t')
                    label_text.append(tmp2)
df_label_text = pd.DataFrame(label_text)
df_label_text = df_label_text.drop_duplicates()
df_label_text_O = df_label_text[df_label_text[0] == 'O']
# Remove unnecessary strings with regular expressions and split into words
t = Tokenizer()
wakatiO = []
tmp1 = []
tmp2 = ''
for row in df_label_text_O.values.tolist():
    reg_row = re.sub('[0-9a-zA-Z]+', '', row[1])
    reg_row = reg_row.replace('\n', '')
    tmp1 = t.tokenize(reg_row, wakati=True)
    wakatiO.append(tmp1)
    tmp1 = []
# Count word occurrences, sort them, and add the words to dic
word_freq = Counter(itertools.chain(*wakatiO))
dic = []
for word_uniq in word_freq.most_common():
    dic.append(word_uniq[0])
# Create a dictionary that assigns an ID to each word
dic_inv = {}
for i, word_uniq in enumerate(dic, start=1):
    dic_inv.update({word_uniq: i})
# Convert words to IDs
wakatiO_n = [[dic_inv[word] for word in waka] for waka in wakatiO]
# Create a 2-gram list
tmp = []
bigramO = []
for i in range(0, len(wakatiO_n)):
    row = wakatiO_n[i]
    # Create the 2-grams of this utterance
    for j in range(len(row)-1):
        tmp.append([row[j], row[j+1]])
    bigramO.extend(tmp)
    tmp = []
# Convert the array `bigramO` to a DataFrame and set the column names
df_bigramO = pd.DataFrame(bigramO)
df_bigramO = df_bigramO.rename(columns={0: 'node1', 1: 'node2'})
# Add a `weight` column and initialize every value to 1
df_bigramO['weight'] = 1
# Count the number of occurrences of each 2-gram
df_bigramO = df_bigramO.groupby(['node1', 'node2'], as_index=False).sum()
# Extract the 2-grams that appear more than once
df_bigramO = df_bigramO[df_bigramO['weight'] > 1]
# Create a directed graph
G_bigramO = nx.from_pandas_edgelist(
    df_bigramO, 'node1', 'node2', ['weight'], nx.DiGraph)
# Network of non-breakdown utterances
# Find the in-degree of each node
indegree = sorted([d for n, d in G_bigramO.in_degree(weight='weight')], reverse=True)
indegreeCount = Counter(indegree)
indeg, in_cnt = zip(*indegreeCount.items())
# Find the out-degree of each node
outdegree = sorted([d for n, d in G_bigramO.out_degree(weight='weight')], reverse=True)
outdegreeCount = Counter(outdegree)
outdeg, out_cnt = zip(*outdegreeCount.items())
# Plot the frequency distributions
plt.subplot(1, 2, 1)
plt.bar(indeg, in_cnt, color='r')
plt.title('in_degree')
plt.subplot(1, 2, 2)
plt.bar(outdeg, out_cnt, color='b')
plt.title('out_degree')
plt.show()
The in-degree is, in a directed graph, the number of edges entering a node (in an undirected graph it is simply the number of connected edges).
To find the in-degrees of the network, use the in_degree() method. The result is returned in the form (node number, in-degree).
# Check the in-degrees of G_bigramO
print(G_bigramO.in_degree(weight='weight'))
>>>Output result
[(1, 208), (2, 155), (4, 148), (5, 126), (7, 47)・ ・ ・]
The out-degree is, in a directed graph, the number of edges leaving a node (in an undirected graph it is simply the number of connected edges).
To find the out-degrees, use the out_degree() method. The result is returned in the form (node number, out-degree).
# Check the out-degrees of G_bigramO
print(G_bigramO.out_degree(weight='weight'))
>>>Output result
[(1, 248), (2, 12), (4, 83), (5, 65), (7, 57)・ ・ ・]
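To make the difference between in-degree and out-degree concrete, here is a toy directed graph with made-up edges and its degrees.
import networkx as nx
G = nx.DiGraph()
G.add_edges_from([(1, 2), (1, 3), (2, 3)])
# Node 3 has two incoming edges, node 1 has none
print(dict(G.in_degree()))
>>>Output result
{1: 0, 2: 1, 3: 2}
# Node 1 has two outgoing edges, node 3 has none
print(dict(G.out_degree()))
>>>Output result
{1: 2, 2: 1, 3: 0}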