I was once shown a principal component analysis (PCA) plot and told that "the distributions of these two datasets are not so different." That made me wonder, "Is that really true?", so I tried constructing data for which that claim does not hold.

Data creation

Create two datasets, data1 and data2, each consisting of the three dimensions x, y, and z, as follows.

import numpy as np

# Draw 500 points: x and y spread widely (std 10) while z varies only
# slightly (std 1), so z contributes very little to the total variance.
x = np.random.normal(50, 10, size=500)
y = np.random.normal(50, 10, size=500)
z = np.random.normal(50, 1, size=500)

# Split the points into two groups using z alone; points with
# 49 <= z <= 51 belong to neither group and are dropped.
high_z = z > 51
low_z = z < 49

x1, y1, z1 = x[high_z], y[high_z], z[high_z]
x2, y2, z2 = x[low_z], y[low_z], z[low_z]

# One row per point, columns ordered (x, y, z).
data1 = np.column_stack([x1, y1, z1])
data2 = np.column_stack([x2, y2, z2])

# data1 rows come first, followed by data2 rows.
data = np.concatenate([data1, data2])

# One colour per row of `data`: data1 rows are blue, data2 rows are red.
# (The previous `i > len(data1)` test coloured the first data2 row blue.)
colors = ["blue"] * len(data1) + ["red"] * len(data2)

Principal component analysis

Run the principal component analysis: "See? These two datasets don't differ much in distribution, do they?" (smug face)

%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA #Principal component analyzer

# Fit PCA on the raw (unstandardized) data; fit() returns the estimator itself
pca = PCA().fit(data)
# Project the data onto the principal-component axes
feature = pca.transform(data)
# Scatter PC1 against PC2, drawing data1 rows first and data2 rows second
n_group1 = len(data1)
plt.figure(figsize=(8, 8))
for group in (feature[:n_group1], feature[n_group1:]):
    plt.scatter(group[:, 0], group[:, 1], alpha=0.8)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.grid()
plt.show()

Cumulative contribution rate

"In that case, let me show you the cumulative contribution rate. The cumulative contribution rate up to the second principal component (PC2) is almost 100%, isn't it?" (smug face)

# Plot the cumulative explained-variance ratio against the number of components
import matplotlib.ticker as ticker
import numpy as np

# Restrict x-axis ticks to integers, since the axis counts components
plt.gca().get_xaxis().set_major_locator(ticker.MaxNLocator(integer=True))
# Prepend 0 so the curve starts at "zero components explain nothing"
plt.plot([0, *np.cumsum(pca.explained_variance_ratio_)], "-o")
plt.xlabel("Number of principal components")
plt.ylabel("Cumulative contribution ratio")
plt.grid()
plt.show()

Scatterplot matrix

"Can you show me the scatterplot matrix?"

(Dockin !!)

import pandas as pd
# `pandas.tools.plotting` is deprecated and gone from current pandas;
# `pandas.plotting` provides the same scatter_matrix function.
from pandas import plotting

# Pairwise scatter plots of all three principal components, one colour per row
plotting.scatter_matrix(pd.DataFrame(feature, columns=['PC1', 'PC2', 'PC3']), figsize=(8, 8), color=colors)
plt.show()

"Aren't they completely different?! Show me the scatterplot matrix of the original data!"

"S-s-sorry..."

import pandas as pd
# `pandas.tools.plotting` is deprecated and gone from current pandas;
# `pandas.plotting` provides the same scatter_matrix function.
from pandas import plotting

# Pairwise scatter plots of the original x, y, z columns, one colour per row
plotting.scatter_matrix(pd.DataFrame(data, columns=['x', 'y', 'z']), figsize=(8, 8), color=colors)
plt.show()

Data standardization

You have to look at the data from multiple angles. One of the methods is data standardization. Just doing that will change the way the data looks.

import numpy as np
def zscore(x, axis = None):
    """Standardize *x* to zero mean and unit standard deviation.

    Args:
        x: Array or array-like (e.g. a nested list) of numbers.
        axis: Axis along which the mean and standard deviation are
            computed. ``None`` (the default) uses all elements at once;
            ``axis=0`` standardizes each column independently.

    Returns:
        An array with the same shape as *x* holding ``(x - mean) / std``,
        where ``std`` is NumPy's default (population, ``ddof=0``) standard
        deviation. A zero standard deviation is not special-cased, so the
        division then yields ``nan``/``inf``.
    """
    # Accept lists and other array-likes, not only ndarrays
    values = np.asanyarray(x)
    # keepdims=True keeps the reduced axis so the statistics broadcast
    # back against `values`
    center = values.mean(axis=axis, keepdims=True)
    spread = values.std(axis=axis, keepdims=True)
    return (values - center) / spread

%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA #Principal component analyzer

#Perform principal component analysis on the standardized data
# Standardize each column once and reuse the result for both fitting and
# projection, instead of recomputing zscore(data) for each call.
standardized = zscore(data, axis=0)
pca = PCA()
pca.fit(standardized)
#Mapping data to principal component space=Dimensional compression
feature = pca.transform(standardized)
#Plot with the first and second principal components
plt.figure(figsize=(8, 8))
# Rows before len(data1) come from data1, the remaining rows from data2
plt.scatter(feature[:len(data1), 0], feature[:len(data1), 1], alpha=0.8)
plt.scatter(feature[len(data1):, 0], feature[len(data1):, 1], alpha=0.8)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.grid()
plt.show()

# Plot the cumulative explained-variance ratio against the number of components
import matplotlib.ticker as ticker
import numpy as np

# Restrict x-axis ticks to integers, since the axis counts components
plt.gca().get_xaxis().set_major_locator(ticker.MaxNLocator(integer=True))
# Prepend 0 so the curve starts at "zero components explain nothing"
plt.plot([0, *np.cumsum(pca.explained_variance_ratio_)], "-o")
plt.xlabel("Number of principal components")
plt.ylabel("Cumulative contribution ratio")
plt.grid()
plt.show()

import pandas as pd
# `pandas.tools.plotting` is deprecated and gone from current pandas;
# `pandas.plotting` provides the same scatter_matrix function.
from pandas import plotting

# Pairwise scatter plots of all three principal components, one colour per row
plotting.scatter_matrix(pd.DataFrame(feature, columns=['PC1', 'PC2', 'PC3']), figsize=(8, 8), color=colors)
plt.show()

import pandas as pd
# `pandas.tools.plotting` is deprecated and gone from current pandas;
# `pandas.plotting` provides the same scatter_matrix function.
from pandas import plotting

# Pairwise scatter plots of the column-wise standardized x, y, z data
plotting.scatter_matrix(pd.DataFrame(zscore(data, axis=0), columns=['x', 'y', 'z']), figsize=(8, 8), color=colors)
plt.show()

/Users/kot/miniconda3/envs/py3new/lib/python3.6/site-packages/ipykernel_launcher.py:3: FutureWarning: 'pandas.tools.plotting.scatter_matrix' is deprecated, import 'pandas.plotting.scatter_matrix' instead.
  This is separate from the ipykernel package so we can avoid doing imports until

When you are shown a plot such as a principal component analysis and told that "the distributions of these two datasets are not so different", is that really true?

Data creation

Principal component analysis

Cumulative contribution rate

Scatterplot matrix

Data standardization