for i, (estimator, features) in enumerate(
        zip(estimators, estimators_features)):
    if hasattr(estimator, 'decision_function'):
        estimator_score = estimator.decision_function(
            X[:, features])
        scores[:, i] = estimator_score
    else:
        raise NotImplementedError(
            'current base detector has no decision_function')
return scores
# TODO: should support parallelization at the model level
# TODO: detector score combination through BFS should be implemented
# See https://github.com/yzhao062/pyod/issues/59
class FeatureBagging(BaseDetector):
    """A feature bagging detector is a meta estimator that fits a number
    of base detectors on various sub-samples of the dataset and uses
    averaging or other combination methods to improve predictive accuracy
    and control over-fitting.

    The sub-sample size is always the same as the original input sample
    size, but the features are randomly sampled from half of the features
    to all features.

    By default, LOF is used as the base estimator. However, any estimator
    could be used as the base estimator, such as kNN and ABOD.

    Feature bagging first constructs n subsamples by randomly selecting
    subsets of features, which induces diversity among the base
    estimators. Finally, the prediction score is generated by averaging
    or taking the maximum of all base detectors' scores.
    """
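# A minimal usage sketch (added for illustration, not part of the original
# module): assumes pyod is installed; FeatureBagging, LOF, and the
# `combination` parameter are from pyod's public API.
import numpy as np
from pyod.models.feature_bagging import FeatureBagging
from pyod.models.lof import LOF

rng = np.random.RandomState(42)
X_train = rng.randn(200, 5)

# combine 10 LOF detectors, each fit on a random feature subset
clf = FeatureBagging(base_estimator=LOF(), n_estimators=10,
                     combination='average')
clf.fit(X_train)
scores = clf.decision_scores_  # averaged outlier scores on X_train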
wcos_list = []
curr_pair_inds = list(combinations(X_ind, 2))
for j, (a_ind, b_ind) in enumerate(curr_pair_inds):
    a = X[a_ind, :]
    b = X[b_ind, :]
    # skip if no angle can be formed
    if np.array_equal(a, curr_pt) or np.array_equal(b, curr_pt):
        continue
    # add the weighted cosine to the list
    wcos_list.append(_wcos(curr_pt, a, b))
return np.var(wcos_list)
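# A self-contained sketch of the quantity computed above (added for
# illustration): the angle-based outlier factor is the variance of
# distance-weighted cosines over all pairs of other points. The inline
# expression mirrors what pyod's internal _wcos helper (not shown in
# this snippet) computes.
import numpy as np
from itertools import combinations

def abof(curr_pt, X):
    """Variance of weighted cosines between curr_pt and all point pairs."""
    wcos_list = []
    for a, b in combinations(X, 2):
        pa, pb = a - curr_pt, b - curr_pt
        if not pa.any() or not pb.any():
            continue  # skip duplicates of curr_pt: no angle can be formed
        # cosine of the angle, down-weighted by the squared distances
        wcos_list.append(pa.dot(pb) / (pa.dot(pa) * pb.dot(pb)))
    return np.var(wcos_list)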
# noinspection PyPep8Naming
class ABOD(BaseDetector):
    """ABOD class for Angle-based Outlier Detection.

    For an observation, the variance of its weighted cosine scores to all
    neighbors could be viewed as the outlying score.
    See :cite:`kriegel2008angle` for details.

    Two versions of ABOD are supported:

    - Fast ABOD: uses k nearest neighbors to approximate the score.
    - Original ABOD: considers all training points, with high time
      complexity O(n^3).

    Parameters
    ----------
    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set, i.e.
        the proportion of outliers in the data set. Used when fitting to
        define the threshold on the decision function.
    """
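# A minimal usage sketch (added for illustration): assumes pyod is
# installed; ABOD and its `method` parameter are from pyod's public API.
import numpy as np
from pyod.models.abod import ABOD

rng = np.random.RandomState(42)
X_train = rng.randn(200, 4)

# fast ABOD approximates the score using k nearest neighbors
clf_fast = ABOD(n_neighbors=10, method='fast').fit(X_train)

# original ABOD uses all training points -- O(n^3), small data only
clf_full = ABOD(method='default').fit(X_train)

scores = clf_fast.decision_scores_  # higher means more outlying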
from xgboost.sklearn import XGBClassifier
from .base import BaseDetector
from .knn import KNN
from .lof import LOF
from .iforest import IForest
from .hbos import HBOS
from .ocsvm import OCSVM
from ..utils.utility import check_parameter
from ..utils.utility import check_detector
from ..utils.utility import standardizer
from ..utils.utility import precision_n_scores
class XGBOD(BaseDetector):
    """XGBOD class for outlier detection.

    It first uses the passed-in unsupervised outlier detectors to extract
    a richer representation of the data, then concatenates the newly
    generated features to the original features to construct an augmented
    feature space. An XGBoost classifier is then applied on this augmented
    feature space. Read more in :cite:`zhao2018xgbod`.

    Parameters
    ----------
    estimator_list : list, optional (default=None)
        The list of pyod detectors passed in for unsupervised learning.

    standardization_flag_list : list, optional (default=None)
        The list of boolean flags indicating whether to apply
        standardization for each detector.
    """
n = ind.shape[0]
_count = np.zeros(shape=(n, ref_set_), dtype=nb.uint16)
for i in nb.prange(n):
    # shared-neighbor counts between point i and every other point
    temp = np.empty(n, dtype=nb.uint16)
    test_element_set = set(ind[i])
    for j in nb.prange(n):
        temp[j] = len(set(ind[j]).intersection(test_element_set))
    # give the point itself the largest count so the descending sort
    # places it first, then skip it when taking the reference set
    temp[i] = np.iinfo(np.uint16).max
    _count[i] = np.argsort(temp)[::-1][1:ref_set_ + 1]
return _count
class SOD(BaseDetector):
    """Subspace outlier detection (SOD) aims to detect outliers in
    varying subspaces of a high-dimensional feature space. For each data
    object, SOD explores the axis-parallel subspace spanned by the data
    object's neighbors and determines how much the object deviates from
    the neighbors in this subspace.
    See :cite:`kriegel2009outlier` for details.

    Parameters
    ----------
    n_neighbors : int, optional (default=20)
        Number of neighbors to use by default for k neighbors queries.

    ref_set : int, optional (default=10)
        Specifies the number of shared nearest neighbors to create the
        reference set. Note that ref_set must be smaller than n_neighbors.
    """
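# A minimal usage sketch (added for illustration): assumes pyod is
# installed; SOD and its parameters are from pyod's public API.
import numpy as np
from pyod.models.sod import SOD

rng = np.random.RandomState(42)
X_train = rng.randn(200, 10)

# reference set of 10 shared nearest neighbors out of 20 neighbors
clf = SOD(n_neighbors=20, ref_set=10)
clf.fit(X_train)
scores = clf.decision_scores_  # subspace deviation per training point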
if isinstance(dis_measure, str):
    if dis_measure not in ('aad', 'var', 'iqr'):
        raise ValueError("Unknown dissimilarity measure type, "
                         "dis_measure should be in "
                         "('aad', 'var', 'iqr'), "
                         "got %s" % dis_measure)
    # TO-DO: 'mad': Median Absolute Deviation to be added
    # once Scipy stats version 1.3.0 is released
else:
    raise TypeError("dis_measure should be str, got %s" % dis_measure)

return check_random_state(random_state), _aad if dis_measure == 'aad' \
    else (np.var if dis_measure == 'var' else (
        stats.iqr if dis_measure == 'iqr' else None))
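# The _aad referenced above is not shown in this snippet; a plausible
# definition (average absolute deviation from the mean) would be:
import numpy as np

def _aad(x):
    """Average absolute deviation: mean distance to the sample mean."""
    return np.mean(np.abs(x - np.mean(x)))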
class LMDD(BaseDetector):
    """Linear Method for Deviation-based Outlier Detection.

    LMDD employs the concept of a smoothing factor, which indicates how
    much the dissimilarity of the data set can be reduced by removing a
    subset of its elements.
    Read more in :cite:`arning1996linear`.

    Note: this implementation has minor modifications to make it output
    scores instead of labels.

    Parameters
    ----------
    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set, i.e.
        the proportion of outliers in the data set. Used when fitting to
        define the threshold on the decision function.
    """
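# A minimal usage sketch (added for illustration): assumes pyod is
# installed; LMDD and its dis_measure parameter are from pyod's API.
import numpy as np
from pyod.models.lmdd import LMDD

rng = np.random.RandomState(42)
X_train = rng.randn(100, 3)

# measure dissimilarity with the average absolute deviation ('aad')
clf = LMDD(contamination=0.1, dis_measure='aad')
clf.fit(X_train)
labels = clf.labels_  # 0 for inliers, 1 for outliers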
return indices
# record the timestamp for logging purposes
today = datetime.datetime.now()
timestamp = today.strftime("%Y%m%d_%H%M%S")
# set numpy parameters
np.set_printoptions(suppress=True, precision=4)
# TODO: Clean up unnecessary residual comments
# TODO: Add proper documentation
# TODO: Design unit tests
class LSCP(BaseDetector):

    def __init__(self, estimator_list, n_iterations=20,
                 local_region_size=30, local_max_features=1.0,
                 n_bins=10, random_state=42, contamination=0.1):
        super(LSCP, self).__init__(contamination=contamination)
        self.estimator_list = estimator_list
        self.n_clf = len(self.estimator_list)
        self.n_iterations = n_iterations
        # bounds on the size of the local region around a test instance
        self.local_region_size = local_region_size
        self.local_region_min = 30
        self.local_region_max = 100
        # fraction of features sampled when defining the local region
        self.local_max_features = local_max_features
        self.local_min_features = 0.5
        # a training point joins the local region if it appears in more
        # than half of the sampled feature subspaces
        self.local_region_iterations = 20
        self.local_region_threshold = int(self.local_region_iterations / 2)
        self.n_bins = n_bins
import warnings
import numpy as np
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.validation import check_array
from sklearn.utils.estimator_checks import check_estimator
from .base import BaseDetector
from ..utils.utility import check_parameter
from ..utils.stat_models import pairwise_distances_no_broadcast
__all__ = ['CBLOF']
class CBLOF(BaseDetector):
    """The CBLOF operator calculates the outlier score based on the
    cluster-based local outlier factor.

    CBLOF takes as input the data set and the cluster model that was
    generated by a clustering algorithm. It classifies the clusters into
    small clusters and large clusters using the parameters alpha and beta.
    The anomaly score is then calculated based on the size of the cluster
    the point belongs to, as well as the distance to the nearest large
    cluster.

    Use weighting for the outlier factor based on the sizes of the
    clusters, as proposed in the original publication. Since this might
    lead to unexpected behavior (outliers close to small clusters are not
    found), it is disabled by default. Outlier scores are then computed
    solely based on the distance to the closest large cluster center.

    By default, kMeans is used as the clustering algorithm instead of the
    Squeezer algorithm mentioned in the original paper.
    """
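# A minimal usage sketch (added for illustration): assumes pyod is
# installed; CBLOF and its alpha/beta parameters are from pyod's API.
import numpy as np
from pyod.models.cblof import CBLOF

rng = np.random.RandomState(42)
X_train = rng.randn(300, 4)

# alpha: fraction of points large clusters must cover;
# beta: minimum size ratio separating large from small clusters
clf = CBLOF(n_clusters=8, alpha=0.9, beta=5)
clf.fit(X_train)
scores = clf.decision_scores_  # distance to the nearest large cluster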
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.regularizers import l2
from keras.losses import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted
from ..utils.utility import check_parameter
from ..utils.stat_models import pairwise_distances_no_broadcast
from .base import BaseDetector
# noinspection PyUnresolvedReferences,PyPep8Naming,PyTypeChecker
class AutoEncoder(BaseDetector):
    """Auto Encoder (AE) is a type of neural network for learning useful
    data representations in an unsupervised manner. Similar to PCA, AE
    could be used to detect outlying objects in the data by calculating
    the reconstruction errors. See :cite:`aggarwal2015outlier` Chapter 3
    for details.

    Parameters
    ----------
    hidden_neurons : list, optional (default=[64, 32, 32, 64])
        The number of neurons per hidden layer.

    hidden_activation : str, optional (default='relu')
        Activation function to use for hidden layers.
        All hidden layers are forced to use the same type of activation.
        See https://keras.io/activations/

    output_activation : str, optional (default='sigmoid')
        Activation function to use for the output layer.
        See https://keras.io/activations/
    """
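# A minimal usage sketch (added for illustration): assumes pyod with a
# working keras backend; AutoEncoder is from pyod's public API.
import numpy as np
from pyod.models.auto_encoder import AutoEncoder

rng = np.random.RandomState(42)
X_train = rng.rand(500, 20)

# symmetric bottleneck; reconstruction error serves as the outlier score
clf = AutoEncoder(hidden_neurons=[16, 8, 8, 16], epochs=10, verbose=0)
clf.fit(X_train)
scores = clf.decision_scores_  # per-sample reconstruction error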
"""Compute the perplexity and the A-row for a specific value of the
precision of a Gaussian distribution.
Parameters
----------
D : array, shape (n_samples, )
The dissimilarity matrix of the training samples.
"""
A = np.exp(-D * beta)
sumA = np.sum(A)
H = np.log(sumA) + beta * np.sum(D * A) / sumA
return H, A
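# A sketch of how such a routine is typically used (added for
# illustration): binary-search beta per point until the entropy H
# matches log(perplexity), as in SNE-style affinity computations.
import numpy as np

def find_beta(D, perplexity, tol=1e-5, max_iter=50):
    """Find beta such that the affinity entropy equals log(perplexity)."""
    log_perp = np.log(perplexity)
    beta, beta_min, beta_max = 1.0, 0.0, np.inf
    for _ in range(max_iter):
        A = np.exp(-D * beta)
        sumA = np.sum(A)
        H = np.log(sumA) + beta * np.sum(D * A) / sumA
        if abs(H - log_perp) < tol:
            break
        if H > log_perp:  # entropy too high -> sharpen the Gaussian
            beta_min = beta
            beta = beta * 2 if beta_max == np.inf else (beta + beta_max) / 2
        else:             # entropy too low -> widen the Gaussian
            beta_max = beta
            beta = (beta + beta_min) / 2
    return beta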
class SOS(BaseDetector):
    """Stochastic Outlier Selection.

    SOS employs the concept of affinity to quantify the relationship from
    one data point to another. Affinity is proportional to the similarity
    between two data points, so a data point has little affinity with a
    dissimilar data point. A data point is selected as an outlier when all
    the other data points have insufficient affinity with it.
    Read more in :cite:`janssens2012stochastic`.

    Parameters
    ----------
    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set, i.e.
        the proportion of outliers in the data set. Used when fitting to
        define the threshold on the decision function.
    """
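# A minimal usage sketch (added for illustration): assumes pyod is
# installed; SOS and its perplexity parameter are from pyod's public API.
import numpy as np
from pyod.models.sos import SOS

rng = np.random.RandomState(42)
X_train = rng.randn(200, 3)

# perplexity controls the effective number of neighbors per point
clf = SOS(perplexity=4.5, contamination=0.1)
clf.fit(X_train)
probs = clf.decision_scores_  # outlier probabilities in [0, 1]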
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.validation import check_random_state
# PyOD imports
from pyod.models.base import BaseDetector
from pyod.utils.stat_models import pearsonr
from pyod.utils.utility import argmaxn
from pyod.utils.utility import generate_bagging_indices
from pyod.utils.utility import standardizer
from pyod.utils.utility import check_detector
# TODO: find random state that is causing runtime warning in pearson
class LSCP(BaseDetector):
    """Locally Selective Combination in Parallel Outlier Ensembles (LSCP).

    LSCP is an unsupervised parallel outlier detection ensemble which
    selects competent detectors in the local region of a test instance.
    This implementation uses an Average of Maximum strategy: first, a
    heterogeneous list of base detectors is fit to the training data, and
    a pseudo ground truth for each training instance is generated by
    taking the maximum outlier score.

    For each test instance:

    1) The local region is defined to be the set of nearest training
       points in randomly sampled feature subspaces which occur more
       frequently than a defined threshold over multiple iterations.

    2) Using the local region, a local pseudo ground truth is defined and
       the Pearson correlation is calculated between each base detector's
       training outlier scores and the pseudo ground truth.
    """
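# A minimal usage sketch (added for illustration): assumes pyod is
# installed and that this LSCP version takes the detector pool as its
# first argument, as in the __init__ shown earlier.
import numpy as np
from pyod.models.lof import LOF
from pyod.models.lscp import LSCP

rng = np.random.RandomState(42)
X_train = rng.randn(300, 6)

# a heterogeneous pool of base detectors for the ensemble
detectors = [LOF(n_neighbors=k) for k in (5, 15, 25, 35)]
clf = LSCP(detectors, local_region_size=30, random_state=42)
clf.fit(X_train)
scores = clf.decision_scores_  # combined scores of competent detectors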