- Outlier detection using One-Class SVM
- Sample data (a single normal distribution and mixtures of normal distributions) generated with random numbers
- Based on "2.7. Novelty and Outlier Detection" in the scikit-learn User Guide (http://scikit-learn.org/stable/user_guide.html) and its outlier detection example (http://scikit-learn.org/stable/auto_examples/covariance/plot_outlier_detection.html)
The same user guide page also introduces the Robust Covariance Estimator (an outlier detection method based on the Mahalanobis distance, which assumes the normal data follow a Gaussian distribution), but it is omitted this time.
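For reference, a minimal sketch of that omitted approach in scikit-learn; the data and the contamination value here are made up for illustration:

from sklearn.covariance import EllipticEnvelope
import numpy as np

X = 0.3 * np.random.randn(100, 2)           # made-up 2D data assumed Gaussian
ee = EllipticEnvelope(contamination=0.05)   # assumed fraction of outliers
ee.fit(X)
print(ee.predict(X)[:10])                   # +1 = inlier, -1 = outlier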
One-Class SVM is an outlier detection method based on the SVM. The kernel maps the data into a feature space in which points that are isolated in the original space end up near the origin. The kernel is left at its default, rbf, and we vary nu (range (0, 1], default 0.5), which controls the assumed fraction of abnormal data. See the scikit-learn OneClassSVM documentation (http://scikit-learn.org/stable/modules/generated/sklearn.svm.OneClassSVM.html).
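As a minimal usage sketch (the training data and test points here are made up): the model is fit on data assumed to be normal, and predict labels points as inliers (+1) or outliers (-1):

import numpy as np
from sklearn import svm

rng = np.random.RandomState(0)
X_train = 0.3 * rng.randn(200, 2)                 # made-up training data around the origin
clf = svm.OneClassSVM(nu=0.05, kernel="rbf", gamma="auto")
clf.fit(X_train)
print(clf.predict([[0.0, 0.0], [5.0, 5.0]]))      # expected: [ 1 -1] (near point in, far point out)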
Prepare three data sets whose normal data follow a mixture of normal distributions. With offset 0 the four components coincide in a single normal distribution; the other two data sets are superpositions of four normal distributions with increasing separation. The anomalous data are drawn from a uniform distribution.
Unlike the Mahalanobis-distance approach, a decision boundary forms around each cluster that is separated in the original space. The smaller nu is, the more tightly and intricately the boundary wraps each cluster; at the default of 0.5 the boundary is very loose.
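nu also acts as an upper bound on the fraction of training samples that end up flagged as outliers and a lower bound on the fraction of support vectors, which can be checked empirically; a quick sketch with made-up single-cluster data:

import numpy as np
from sklearn import svm

rng = np.random.RandomState(42)
X = 0.3 * rng.randn(400, 2)                        # made-up single-cluster data
for nu in [0.05, 0.1, 0.5]:
    clf = svm.OneClassSVM(nu=nu, kernel="rbf", gamma="auto").fit(X)
    frac = (clf.predict(X) == -1).mean()           # fraction of training points flagged
    print("nu=%.2f  flagged=%.3f  support vectors=%d" % (nu, frac, len(clf.support_)))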
The code below is based on the user guide example.
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.font_manager
from scipy import stats
from sklearn import svm
# Example settings
n_samples = 400                  # number of samples
outliers_fraction = 0.05         # fraction of outliers in the data set
clusters_separation = [0, 1, 2]  # offsets that control how far apart the clusters are

# Grid data for 2D plotting
xx, yy = np.meshgrid(np.linspace(-7, 7, 500), np.linspace(-7, 7, 500))

# Numbers of normal and abnormal samples
n_inliers = int((1. - outliers_fraction) * n_samples)   # number of normal samples
n_outliers = int(outliers_fraction * n_samples)         # number of abnormal samples
ground_truth = np.ones(n_samples, dtype=int)            # label data: 1 = inlier
ground_truth[-n_outliers:] = 0                          # last n_outliers samples are outliers (label 0)
nu_l = [0.05, 0.1, 0.5]  # values of nu to compare
plt.figure(figsize=(10, 12))

# Fit the problem with varying cluster separation
# (enumerate loops with the index: http://python.civic-apps.com/zip-enumerate/)
for i, offset in enumerate(clusters_separation):
    np.random.seed(42)
    # Normal data generation: a mixture of four normal distributions
    n = n_inliers // 4  # samples per cluster (randn needs an int, not 0.25 * n_inliers)
    X1 = 0.3 * np.random.randn(n, 2) - offset  # N(mu=-offset, sigma=0.3)
    X2 = 0.3 * np.random.randn(n, 2) + offset  # N(mu=+offset, sigma=0.3)
    X3 = np.c_[0.3 * np.random.randn(n, 1) - 3 * offset,   # N(mu=-3*offset, sigma=0.3)
               0.3 * np.random.randn(n, 1) + 3 * offset]   # N(mu=+3*offset, sigma=0.3)
    X4 = np.c_[0.3 * np.random.randn(n, 1) + 3 * offset,   # N(mu=+3*offset, sigma=0.3)
               0.3 * np.random.randn(n, 1) - 3 * offset]   # N(mu=-3*offset, sigma=0.3)
    X = np.r_[X1, X2, X3, X4]  # stack the clusters row-wise
    # Outlier data generation: uniform distribution on -6 <= x, y <= +6
    X = np.r_[X, np.random.uniform(low=-6, high=6, size=(n_outliers, 2))]

    # Fit a One-Class SVM for each value of nu
    for j, nu in enumerate(nu_l):
        clf = svm.OneClassSVM(nu=nu, kernel="rbf", gamma='auto')
        clf.fit(X)
        y_pred = clf.decision_function(X).ravel()  # signed distance of each sample to the hyperplane; ravel() flattens to 1D
        threshold = stats.scoreatpercentile(y_pred, 100 * outliers_fraction)  # percentile threshold for the anomaly decision
        y_pred = y_pred > threshold
        n_errors = (y_pred != ground_truth).sum()  # number of misclassified samples
        # Plot the level lines and the points
        Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])  # decision function evaluated on the grid
        Z = Z.reshape(xx.shape)
        subplot = plt.subplot(3, 3, i * 3 + j + 1)
        subplot.set_title("Outlier detection nu=%s" % nu)
        # Predicted outlier region
        subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7), cmap=plt.cm.Blues_r)
        # Decision boundary at the threshold
        a = subplot.contour(xx, yy, Z, levels=[threshold], linewidths=2, colors='red')
        # Predicted normal region
        subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()], colors='orange')
        # True inliers
        b = subplot.scatter(X[:-n_outliers, 0], X[:-n_outliers, 1], c='white')
        # True outliers
        c = subplot.scatter(X[-n_outliers:, 0], X[-n_outliers:, 1], c='black')
        subplot.axis('tight')
        subplot.legend(
            [a.collections[0], b, c],
            ['learned decision function', 'true inliers', 'true outliers'],
            prop=matplotlib.font_manager.FontProperties(size=11))
        subplot.set_xlabel("%d. One-Class SVM (errors: %d)" % (i + 1, n_errors))
        subplot.set_xlim((-7, 7))
        subplot.set_ylim((-7, 7))

plt.subplots_adjust(0.04, 0.1, 1.2, 0.84, 0.1, 0.26)
plt.show()
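Note that the percentile threshold above forces exactly outliers_fraction of the points to be flagged, following the user guide example. Alternatively, the fitted model's own boundary can be used directly via predict; a sketch, reusing clf, X, and ground_truth from the last loop iteration above:

# Alternative error count using the model's own boundary
y_model = clf.predict(X)                         # +1 = inlier, -1 = outlier
truth_pm1 = np.where(ground_truth == 1, 1, -1)   # map labels 1/0 to +1/-1
n_errors_model = (y_model != truth_pm1).sum()
print("errors using clf.predict:", n_errors_model)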