This article is a continuation of Language Processing 100 Knock 2020 [Chapter 6: Machine Learning Answers].
This article covers word vectors in Chapter 7 (problems 60-69).
Only the code is included in this article. Please refer to the link below for the problem statements and explanations of how to solve them.
Language Processing 100 Knock 2020 Chapter 7: Word Vectors
import gensim

# Knock 60: load the pretrained Google News word2vec model (300-dim, binary format).
model = gensim.models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin', binary=True)

# Bare expressions only display their value in a REPL/notebook; in a plain
# script they are silent no-ops, so print the results explicitly.

# Knock 60: the vector for a single word.
print(model['United_States'])
# Knock 61: cosine similarity between two words.
print(model.similarity('United_States', 'U.S.'))
# Knock 62: the 10 words most similar to "United_States".
print(model.most_similar('United_States', topn=10))
# Knock 63: analogy — vec("Spain") - vec("Madrid") + vec("Athens").
print(model.most_similar(positive=['Spain', 'Athens'], negative=['Madrid'], topn=10))
# Knock 64: for every 4-word analogy line in questions-words.txt, append the
# word predicted by vec(b) - vec(a) + vec(c) and its similarity score, then
# save the augmented lines to 64.txt. Category header lines pass through as-is.
with open('questions-words.txt') as f:
    questions = f.readlines()

with open('64.txt', 'w') as f:
    for i, question in enumerate(questions):
        tokens = question.split()
        if len(tokens) == 4:
            # Top-1 analogy prediction; most_similar returns (word, score) pairs.
            word, score = model.most_similar(
                positive=[tokens[1], tokens[2]], negative=[tokens[0]], topn=1)[0]
            f.write(' '.join(tokens + [word, str(score)]) + '\n')
        else:
            # Not an analogy line (e.g. a ": category" header) — copy verbatim.
            f.write(question)
        if i % 100 == 0:
            print(i)  # coarse progress indicator; the full run is slow
def analogy_accuracy(path='64.txt'):
    """Compute knock-65 analogy accuracy from the file written by knock 64.

    Each augmented analogy line has 6 tokens: the 4 question words, the
    predicted word, and its score. A prediction is correct when the predicted
    word (index 4) matches the gold answer (index 3).

    Returns the fraction of correct predictions, or 0.0 when the file
    contains no 6-token lines (avoids ZeroDivisionError on empty input).
    """
    total = 0
    correct = 0
    with open(path, 'r') as f:
        for line in f:
            tokens = line.split()
            if len(tokens) == 6:
                total += 1
                if tokens[3] == tokens[4]:
                    correct += 1
    return correct / total if total else 0.0


if __name__ == '__main__':
    print(analogy_accuracy())
import pandas as pd

# Knock 66: Spearman correlation between human similarity judgements
# (WordSim-353) and word2vec cosine similarity.
df = pd.read_csv('wordsim353/combined.csv')
# Iterate the two word columns directly instead of the row-wise
# `for i in range(len(df)): df.iloc[i]` anti-pattern (iloc per row is slow
# and unidiomatic).
df['w2v'] = [model.similarity(w1, w2) for w1, w2 in zip(df['Word 1'], df['Word 2'])]
# print() so the correlation matrix is visible when run as a script,
# not only in a notebook.
print(df[['Human (mean)', 'w2v']].corr(method='spearman'))
from sklearn.cluster import KMeans

# Knock 67: k-means clustering of country name vectors.
with open('country.txt', 'r') as f:
    lines = f.readlines()

# The country name is the last space-separated field on each line.
countries = []
for line in lines:
    countries.append(line.split(' ')[-1].replace('\n', ''))

# Map long-form names onto the keys the word2vec vocabulary actually uses.
dic = {'United States of America': 'United_States', 'Russian Federation': 'Russia'}

ng = 0  # count of countries missing from the embedding vocabulary
vec = []
target_countries = []
for c in countries:
    for long_name, vocab_key in dic.items():
        c = c.replace(long_name, vocab_key)
    # Normalize to word2vec's underscore-joined multi-word tokens.
    c = c.replace(' ', '_').replace('-', '_').replace('_and_', '_')
    try:
        vec.append(model[c])
        target_countries.append(c)
    except KeyError:
        # Narrowed from a bare `except:`, which would also have hidden
        # unrelated errors (KeyboardInterrupt, typos, ...). Out-of-vocabulary
        # lookups raise KeyError; skip those countries.
        ng += 1

kmeans = KMeans(n_clusters=5, random_state=0)
kmeans.fit(vec)
for c, l in zip(target_countries, kmeans.labels_):
    print(c, l)
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage

# Knock 68: Ward hierarchical clustering of the country vectors,
# visualized as a dendrogram.
link = linkage(vec, method='ward')
plt.figure(figsize=(32.0, 24.0))
dendrogram(link, labels=target_countries, leaf_rotation=90, leaf_font_size=10)
plt.show()
from sklearn.manifold import TSNE

# Knock 69: project the country vectors to 2-D with t-SNE and plot them
# with their country labels.
vec_embedded = TSNE(n_components=2).fit_transform(vec)

fig, ax = plt.subplots(figsize=(16, 12))
# zip(*rows) transposes the (n, 2) embedding into x and y coordinate columns.
xs, ys = zip(*vec_embedded)
ax.scatter(xs, ys)
for (x, y), country in zip(vec_embedded, target_countries):
    ax.annotate(country, (x, y))
# Missing in the original: without show() the scatter never appears when run
# as a script (the dendrogram cell above does call it — made consistent here).
plt.show()
Recommended Posts