I was once shown a principal component analysis (PCA) plot and told that "the distributions of these two datasets are not so different." That made me wonder, "Is that really true?", so I tried to construct data for which that conclusion does not hold.
Create two datasets, data1 and data2, each consisting of the three dimensions x, y, and z, as follows.
import numpy as np
# Draw 500 samples per axis. x and y are wide (std 10) while z is narrow
# (std 1), so z contributes very little to the total variance.
x = np.random.normal(50, 10, size=500)
y = np.random.normal(50, 10, size=500)
z = np.random.normal(50, 1, size=500)

# Split the samples into two groups using z only: group 1 keeps the rows
# with z > 51, group 2 the rows with z < 49. Rows in between are dropped.
upper = z > 51
lower = z < 49
x1, y1, z1 = x[upper], y[upper], z[upper]
x2, y2, z2 = x[lower], y[lower], z[lower]

# Assemble (n_samples, 3) arrays with columns x, y, z.
data1 = np.column_stack([x1, y1, z1])
data2 = np.column_stack([x2, y2, z2])
data = np.concatenate([data1, data2])

# One colour per row of data: data1 rows are blue, data2 rows are red.
# (The previous `i > len(data1)` test coloured the first data2 row blue.)
colors = ["blue"] * len(data1) + ["red"] * len(data2)
Run principal component analysis: "See? These two datasets don't differ much in distribution, do they?" (smug face)
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA #Principal component analyzer
# Fit a PCA model on the combined data (data1 rows followed by data2 rows).
pca = PCA()
pca.fit(data)

# Project the data onto the principal component axes.
feature = pca.transform(data)

# Scatter the first two principal components, one series per dataset.
# The first len(data1) rows of feature belong to data1, the rest to data2.
n_first = len(data1)
plt.figure(figsize=(8, 8))
for group in (feature[:n_first], feature[n_first:]):
    plt.scatter(group[:, 0], group[:, 1], alpha=0.8)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.grid()
plt.show()
"In that case, let me show you the cumulative contribution ratio. Isn't the cumulative contribution ratio up to the second principal component (PC2) almost 100%?" (smug face)
#Illustrate the cumulative contribution rate
import matplotlib.ticker as ticker
import numpy as np
# Running total of the explained-variance ratios, prefixed with 0 so the
# curve starts at "zero components".
cumulative_ratio = np.cumsum(pca.explained_variance_ratio_)

# Force integer ticks on the x axis (it counts components).
x_axis = plt.gca().get_xaxis()
x_axis.set_major_locator(ticker.MaxNLocator(integer=True))

plt.plot([0, *cumulative_ratio], "-o")
plt.xlabel("Number of principal components")
plt.ylabel("Cumulative contribution ratio")
plt.grid()
plt.show()
"Can you show me the scatter plot matrix?"
(Gulp!!)
import pandas as pd
# pandas.tools.plotting was deprecated in pandas 0.20 and later removed;
# the plotting helpers now live in pandas.plotting.
from pandas import plotting

# Pairwise scatter plots of all three principal components, coloured per row
# by the colors list.
pc_frame = pd.DataFrame(feature, columns=['PC1', 'PC2', 'PC3'])
plotting.scatter_matrix(pc_frame, figsize=(8, 8), color=colors)
plt.show()
"Aren't they completely different?! Show me the scatter plot matrix of the original data!"
"S-s-sorry..."
import pandas as pd
# pandas.tools.plotting was deprecated in pandas 0.20 and later removed;
# the plotting helpers now live in pandas.plotting.
from pandas import plotting

# Pairwise scatter plots of the original x, y, z columns, coloured per row
# by the colors list.
raw_frame = pd.DataFrame(data, columns=['x', 'y', 'z'])
plotting.scatter_matrix(raw_frame, figsize=(8, 8), color=colors)
plt.show()
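You have to look at the data from multiple angles. One way to do that is to standardize the data; that alone changes how the data looks. Here z has a much smaller standard deviation (1) than x and y (10), so without standardization the first two principal components are dominated by x and y and hide the separation along z.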
import numpy as np
def zscore(x, axis=None):
    """Standardize *x* to zero mean and unit standard deviation.

    Parameters
    ----------
    x : array_like
        Input values. Converted with ``np.asarray`` so lists are accepted
        as well as arrays.
    axis : int, tuple of int or None, optional
        Axis (or axes) along which the mean and standard deviation are
        computed. ``None`` (the default) uses the statistics of the whole
        array.

    Returns
    -------
    numpy.ndarray
        ``(x - mean) / std``, with the same shape as *x*.

    Notes
    -----
    The standard deviation uses NumPy's default settings. No guard is
    applied for a zero standard deviation, so a constant slice produces
    ``nan``/``inf`` values.
    """
    x = np.asarray(x)
    # keepdims=True keeps the reduced axes so the statistics broadcast
    # against x regardless of which axis was chosen.
    center = x.mean(axis=axis, keepdims=True)
    spread = x.std(axis=axis, keepdims=True)
    return (x - center) / spread
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA #Principal component analyzer
# Standardize each column once and reuse the result for fit and transform.
standardized = zscore(data, axis=0)

# Fit a PCA model on the standardized data.
pca = PCA()
pca.fit(standardized)

# Project the standardized data onto the principal component axes.
feature = pca.transform(standardized)

# Scatter the first two principal components, one series per dataset.
# The first len(data1) rows of feature belong to data1, the rest to data2.
n_first = len(data1)
plt.figure(figsize=(8, 8))
for group in (feature[:n_first], feature[n_first:]):
    plt.scatter(group[:, 0], group[:, 1], alpha=0.8)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.grid()
plt.show()
#Illustrate the cumulative contribution rate
import matplotlib.ticker as ticker
import numpy as np
# Running total of the explained-variance ratios, prefixed with 0 so the
# curve starts at "zero components".
cumulative_ratio = np.cumsum(pca.explained_variance_ratio_)

# Force integer ticks on the x axis (it counts components).
x_axis = plt.gca().get_xaxis()
x_axis.set_major_locator(ticker.MaxNLocator(integer=True))

plt.plot([0, *cumulative_ratio], "-o")
plt.xlabel("Number of principal components")
plt.ylabel("Cumulative contribution ratio")
plt.grid()
plt.show()
import pandas as pd
# pandas.tools.plotting was deprecated in pandas 0.20 and later removed;
# the plotting helpers now live in pandas.plotting.
from pandas import plotting

# Pairwise scatter plots of all three principal components, coloured per row
# by the colors list.
pc_frame = pd.DataFrame(feature, columns=['PC1', 'PC2', 'PC3'])
plotting.scatter_matrix(pc_frame, figsize=(8, 8), color=colors)
plt.show()
import pandas as pd
# pandas.tools.plotting was deprecated in pandas 0.20 and later removed;
# the plotting helpers now live in pandas.plotting.
from pandas import plotting

# Pairwise scatter plots of the column-wise standardized x, y, z values,
# coloured per row by the colors list.
standardized_frame = pd.DataFrame(zscore(data, axis=0), columns=['x', 'y', 'z'])
plotting.scatter_matrix(standardized_frame, figsize=(8, 8), color=colors)
plt.show()
/Users/kot/miniconda3/envs/py3new/lib/python3.6/site-packages/ipykernel_launcher.py:3: FutureWarning: 'pandas.tools.plotting.scatter_matrix' is deprecated, import 'pandas.plotting.scatter_matrix' instead.
This is separate from the ipykernel package so we can avoid doing imports until