| | New font | Old font |
|---|---|---|
| Source character | ① U+658E 斎 (542,000 people): **the source character** | ② U+9F4B 齋 (86,800 people): old-font form of the source character (①) |
| Actually a writing mistake | ③ U+6589 斉 (323,000 people): miswriting of the source character (①) | ④ U+9F4A 齊 (37,300 people): miswriting of the old-font form (②) |
After all, I want the **"sai" of ① (the source character)** to turn out to be the **representative (the one in the middle) of all four "sai" variants**.
from umap import UMAP

# Reduce rows 1, 12, 31 and 32 of all.T (the four Saito characters, per the
# original note) to two dimensions with UMAP.
decomp = UMAP(n_components=2, random_state=42)
embedding4 = decomp.fit_transform(all.T[[1, 12, 31, 32]])

from sklearn.cluster import KMeans

# A single cluster: its centre is the centre of gravity of the four points.
clustering = KMeans(n_clusters=1, random_state=42)
cl_y = clustering.fit_predict(embedding4)

# Draw the result (showScatter is implemented later in this file).
showScatter(
    embedding4,
    cl_y,
    clustering.cluster_centers_,
    imgs=all.T[[1, 12, 31, 32]].reshape(-1, h, w),
)
Order of proximity from the center of gravity | Letter | Distance from the center of gravity | Note |
---|---|---|---|
1st place | 齋 | 0.6281 | ② Source character (old font) |
2nd place | 斉 | 0.6889 | ③ Writing mistake (new font) |
3rd place | 斎 | 0.7339 | ① Source character (new font) |
4th place | 齊 | 0.8743 | ④ Writing mistake (old font) |
from umap import UMAP

# Reduce every column of `all` (all 33 characters, per the original note)
# to two dimensions with UMAP.
decomp = UMAP(n_components=2, random_state=42)
embeddings = decomp.fit_transform(all.T)

from sklearn.cluster import KMeans

# A single cluster, so the one centre is the centre of gravity of all points.
clustering = KMeans(n_clusters=1, random_state=42)
cl_y = clustering.fit_predict(embeddings)

# Draw the glyphs together with the cluster centre.
showScatter(
    embeddings=embeddings,
    clusterlabels=cl_y,
    centers=clustering.cluster_centers_,
)
Order of proximity from the center of gravity | Letter | Distance from the center of gravity | Note |
---|---|---|---|
1st place | | 0.494 | |
2nd place | | 0.787 | |
3rd place | | 1.013 | |
4th place | | 1.014 | |
from sklearn.cluster import KMeans

# Split the existing 2-D embedding into 4 clusters.
clustering = KMeans(n_clusters=4, random_state=42)
cl_y = clustering.fit_predict(embeddings)

# Draw the glyphs coloured by cluster, with the 4 cluster centres.
showScatter(
    embeddings=embeddings,
    clusterlabels=cl_y,
    centers=clustering.cluster_centers_,
)
No | cluster | Center of gravity | Other characters included |
---|---|---|---|
1 | Red | ||
2 | orange | ||
3 | Blue | ||
4 | Green |
from sklearn.cluster import KMeans

# Split the existing 2-D embedding into 8 clusters.
clustering = KMeans(n_clusters=8, random_state=42)
cl_y = clustering.fit_predict(embeddings)

# Draw the glyphs coloured by cluster, with the 8 cluster centres.
showScatter(
    embeddings=embeddings,
    clusterlabels=cl_y,
    centers=clustering.cluster_centers_,
)
No | Cluster | Characters in the cluster |
---|---|---|
1 | Pink | |
2 | Red | |
3 | Brown | |
4 | Gray | |
5 | Orange | |
6 | Blue | |
7 | Purple | |
8 | Green | |
Elbow Chart
from yellowbrick.cluster import KElbowVisualizer

# Elbow chart over the widest range of cluster counts.
vis = KElbowVisualizer(
    KMeans(random_state=42),
    k=(1, 34),  # number of clusters (range shown on the horizontal axis)
)
vis.fit(embeddings)
vis.show()
from yellowbrick.cluster import KElbowVisualizer

# Same elbow chart, narrowed to a smaller range of cluster counts.
vis = KElbowVisualizer(
    KMeans(random_state=42),
    k=(4, 19),  # number of clusters (range shown on the horizontal axis)
)
vis.fit(embeddings)
vis.show()
Silhouette Chart
import matplotlib.pyplot as plt
from yellowbrick.cluster import silhouette_visualizer

# One tall figure holding a silhouette chart for each cluster count 4..9.
fig = plt.figure(figsize=(15, 25))
for i in range(4, 10):
    # i - 1 maps k = 4..9 onto slots 3..8 of the 4x2 grid; the first row
    # stays empty, exactly as in the original layout.
    ax = fig.add_subplot(4, 2, i - 1)
    # Hand the freshly created axes to the visualizer instead of relying on
    # pyplot's "current axes", and seed KMeans like every other clustering in
    # this file so the chart is reproducible from run to run.
    # NOTE(review): depending on the yellowbrick version the quick method may
    # call show() itself on every iteration -- confirm the panels still end
    # up in one figure in your environment.
    silhouette_visualizer(KMeans(i, random_state=42), embeddings, ax=ax)
dendrogram
from scipy.cluster.hierarchy import linkage, dendrogram

# Build the hierarchical-clustering linkage matrix from the 2-D embedding.
Z = linkage(embeddings, method="weighted", metric="euclidean")

# Draw the tree; the colour threshold controls how many clusters are shown
# as separately coloured branches.
R = dendrogram(
    Z,
    color_threshold=1.2,  # adjust the number of clusters with this threshold
    show_contracted=False,
)
Number of clusters | Dendrogram | Comment |
---|---|---|
4 | | The red cluster is just a little too tall |
5 | | The heights are uniform. I'm a little worried that the purple cluster has so few members, but it feels pretty good |
8 | | The heights and the member counts are even, but is it divided too finely? |
from sklearn.cluster import KMeans

# Split the existing 2-D embedding into 5 clusters.
clustering = KMeans(n_clusters=5, random_state=42)
cl_y = clustering.fit_predict(embeddings)

# Draw the glyphs coloured by cluster, with the 5 cluster centres.
showScatter(
    embeddings=embeddings,
    clusterlabels=cl_y,
    centers=clustering.cluster_centers_,
)
No | cluster | Center of gravity | Other characters included |
---|---|---|---|
1 | Blue | ||
2 | purple | ||
3 | Green | ||
4 | Red | ||
5 | orange |
No | How to choose | Representative Saito |
---|---|---|
1 | If you choose 1 character from the 4 recognized kanji, the representative is | |
2 | If you choose 1 character from all 33 kanji | |
3 | If you choose 4 characters from all 33 kanji | |
4 | If you choose 8 characters from all 33 kanji | |
5 | How many clusters should all 33 kanji be divided into? | About 5 clusters looks good |
6 | If you choose 5 characters from all 33 kanji | |
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
from matplotlib import offsetbox
from sklearn.preprocessing import MinMaxScaler
from PIL import Image
import matplotlib.patches as patches
# Matplotlib rc overrides handed to seaborn below.
rc = {
    # Fonts
    'font.family': ['sans-serif'],
    'font.sans-serif': ['Open Sans', 'Arial Unicode MS'],
    'font.size': 12,
    # Figure, grid and legend
    'figure.figsize': (8, 6),
    'grid.linewidth': 0.5,
    'legend.fontsize': 10,
    'legend.frameon': True,
    'legend.framealpha': 0.6,
    'legend.handletextpad': 0.2,
    'lines.linewidth': 1,
    # Axes and ticks
    'axes.facecolor': '#fafafa',
    'axes.labelsize': 10,
    'axes.titlesize': 14,
    'axes.linewidth': 0.5,
    'xtick.labelsize': 10,
    'xtick.minor.visible': True,
    'ytick.labelsize': 10,
    'figure.titlesize': 14
}

# "notebook" context, "whitegrid" style, plus the overrides above.
sns.set(context='notebook', style='whitegrid', rc=rc)
def colorize(d, color, alpha=1.0):
    """Tint a single-channel image and return it as an RGBA ``uint8`` array.

    Args:
        d: 2-D array of pixel intensities (presumably in 0-255, since the
            result is cast to ``uint8`` -- TODO confirm against the loader).
        color: Three per-channel multipliers (R, G, B) applied to ``d``.
        alpha: Multiplier applied to ``d`` to build the alpha channel.

    Returns:
        Array of shape ``d.shape + (4,)`` and dtype ``uint8``. The cast
        truncates fractional values rather than rounding them.
    """
    # Replicate the intensities into three channels and scale each by its
    # colour component.
    rgb = np.dstack((d, d, d)) * color
    # The alpha channel follows the intensity too, so zero-valued pixels end
    # up fully transparent.
    return np.dstack((rgb, d * alpha)).astype(np.uint8)
# Seaborn "tab10" palette; showScatter indexes it with each point's cluster label.
colors = sns.color_palette('tab10')
def showScatter(
    embeddings,
    clusterlabels,
    centers=(),
    imgs=all.T.reshape(-1, h, w),
):
    """Draw each glyph image at its embedded position, tinted by cluster.

    Args:
        embeddings: Array of 2-D points, one row per glyph. It is min-max
            scaled to the unit square before drawing.
        clusterlabels: One integer label per point, used as an index into
            the module-level ``colors`` palette.
        centers: Optional cluster centres in the same (unscaled) coordinates
            as ``embeddings``. Each one is drawn as an "X" marker surrounded
            by three concentric circles. Nothing is drawn when it is empty.
        imgs: One image per point, in the same order as ``embeddings``.
            The default is evaluated once, when this function is defined,
            from the module-level ``all``, ``h`` and ``w``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=(15, 15))

    # Scale to [0, 1] so the plot layout does not depend on the raw range of
    # the embedding.
    scaler = MinMaxScaler()
    embeddings = scaler.fit_transform(embeddings)

    # Glyphs are drawn slightly inset from the unit square. The same mapping
    # is applied to the centres below so that the markers line up with the
    # glyphs they summarise.
    inset, span = 0.03, 0.94

    # Draw every glyph on the scatter plot.
    for pos, glyph, label in zip(embeddings, imgs, clusterlabels):
        img = colorize(glyph, colors[label], 0.5)
        ab = offsetbox.AnnotationBbox(
            offsetbox.OffsetImage(img),
            inset + pos * span,
            frameon=False,
        )
        ax.add_artist(ab)

    # Draw concentric circles around each centre of gravity.
    # len() is used rather than truthiness because centres usually arrive as
    # a NumPy array.
    if len(centers) != 0:
        for c in inset + scaler.transform(centers) * span:
            # Largest circle first so the smaller ones are painted on top.
            for r in np.arange(3, 0, -1) * 0.05:
                circle = patches.Circle(
                    xy=(c[0], c[1]),
                    radius=r,
                    fc='#FFFFFF',
                    ec='black',
                )
                circle.set_alpha(0.3)
                ax.add_patch(circle)
            ax.scatter(c[0], c[1], s=300, marker="X")

    # Axis drawing range: the unit square plus a small margin.
    limit = [-0.1, 1.1]
    plt.xlim(limit)
    plt.ylim(limit)
    plt.show()
Recommended Posts