How to use the pyod.models.base.BaseDetector class in pyod

To help you get started, we’ve selected a few pyod examples based on popular ways it is used in public projects.
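
Every detector below is a subclass of pyod.models.base.BaseDetector, so they all share the same core interface. A minimal sketch of that shared workflow, assuming X_train and X_test are numpy arrays you already have (KNN is just one interchangeable subclass):

from pyod.models.knn import KNN  # any BaseDetector subclass works the same way

clf = KNN(contamination=0.1)
clf.fit(X_train)                              # unsupervised fit
train_scores = clf.decision_scores_           # raw outlier scores on X_train
test_scores = clf.decision_function(X_test)   # higher means more abnormal
test_labels = clf.predict(X_test)             # binary labels via the contamination threshold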


github yzhao062 / pyod / pyod / models / feature_bagging.py
for i, (estimator, features) in enumerate(
        zip(estimators, estimators_features)):
    # 'scores' is preallocated earlier in the function with shape
    # (n_samples, n_estimators); column i holds detector i's scores
    if hasattr(estimator, 'decision_function'):
        estimator_score = estimator.decision_function(X[:, features])
        scores[:, i] = estimator_score
    else:
        raise NotImplementedError(
            'current base detector has no decision_function')
return scores


# TODO: should support parallelization at the model level
# TODO: detector score combination through BFS should be implemented
# See https://github.com/yzhao062/pyod/issues/59
class FeatureBagging(BaseDetector):
    """ A feature bagging detector is a meta estimator that fits a number of
    base detectors on various sub-samples of the dataset and use averaging
    or other combination methods to improve the predictive accuracy and
    control over-fitting.

    The sub-sample size is always the same as the original input sample size
    but the features are randomly sampled from half of the features to all
    features.

    By default, LOF is used as the base estimator. However, any estimator
    could be used as the base estimator, such as kNN and ABOD.

    Feature bagging first construct n subsamples by random selecting a subset
    of features, which induces the diversity of base estimators.

    Finally, the prediction score is generated by averaging/taking the maximum
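
A minimal usage sketch of FeatureBagging, assuming X_train and X_test numpy arrays (LOF is the default base estimator, as noted above):

from pyod.models.feature_bagging import FeatureBagging

clf = FeatureBagging(n_estimators=10, combination='average')
clf.fit(X_train)                        # fit 10 detectors on random feature subsets
scores = clf.decision_function(X_test)  # combined outlier scores
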
github yzhao062 / pyod / pyod / models / abod.py
wcos_list = []
curr_pair_inds = list(combinations(X_ind, 2))
for j, (a_ind, b_ind) in enumerate(curr_pair_inds):
    a = X[a_ind, :]
    b = X[b_ind, :]

    # skip if no angle can be formed
    if np.array_equal(a, curr_pt) or np.array_equal(b, curr_pt):
        continue
    # add the weighted cosine to the list
    wcos_list.append(_wcos(curr_pt, a, b))
return np.var(wcos_list)


# noinspection PyPep8Naming
class ABOD(BaseDetector):
    """ABOD class for Angle-base Outlier Detection.
    For an observation, the variance of its weighted cosine scores to all
    neighbors could be viewed as the outlying score.
    See :cite:`kriegel2008angle` for details.

    Two version of ABOD are supported:

    - Fast ABOD: use k nearest neighbors to approximate.
    - Original ABOD: consider all training points with high time complexity at
      O(n^3).

    Parameters
    ----------
    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set, i.e.
        the proportion of outliers in the data set. Used when fitting to
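
A minimal usage sketch, assuming X_train and X_test numpy arrays; method='fast' selects the kNN approximation described above:

from pyod.models.abod import ABOD

clf = ABOD(n_neighbors=10, method='fast')  # method='default' gives the O(n^3) version
clf.fit(X_train)
labels = clf.predict(X_test)  # 0 for inliers, 1 for outliers
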
github yzhao062 / pyod / pyod / models / xgbod.py
from xgboost.sklearn import XGBClassifier

from .base import BaseDetector
from .knn import KNN
from .lof import LOF
from .iforest import IForest
from .hbos import HBOS
from .ocsvm import OCSVM

from ..utils.utility import check_parameter
from ..utils.utility import check_detector
from ..utils.utility import standardizer
from ..utils.utility import precision_n_scores


class XGBOD(BaseDetector):
    """XGBOD class for outlier detection.
    It first use the passed in unsupervised outlier detectors to extract
    richer representation of the data and then concatenate the newly
    generated features to the original feature for constructing the augmented
    feature space. An XGBoost classifier is then applied on this augmented
    feature space. Read more in the :cite:`zhao2018xgbod`.

    Parameters
    ----------
    estimator_list : list, optional (default=None)
        The list of pyod detectors passed in for unsupervised learning

    standardization_flag_list : list, optional (default=None)
        The list of boolean flags for indicating whether to take
        standardization for each detector.
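
Unlike the unsupervised detectors on this page, XGBOD is semi-supervised and needs labels at fit time. A minimal sketch, assuming labeled arrays X_train, y_train and test data X_test:

from pyod.models.xgbod import XGBOD

clf = XGBOD()              # None -> a default set of unsupervised detectors
clf.fit(X_train, y_train)  # labels are required for the XGBoost stage
scores = clf.decision_function(X_test)
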
github yzhao062 / pyod / pyod / models / sod.py
"""
    n = ind.shape[0]
    _count = np.zeros(shape=(n, ref_set_), dtype=nb.uint16)
    for i in nb.prange(n):
        temp = np.empty(n, dtype=nb.int16)
        test_element_set = set(ind[i])
        for j in nb.prange(n):
            temp[j] = len(set(ind[j]).intersection(test_element_set))
        temp[i] = np.iinfo(np.uint16).max
        _count[i] = np.argsort(temp)[::-1][1:ref_set_ + 1]

    return _count


class SOD(BaseDetector):
    """Subspace outlier detection (SOD) schema aims to detect outlier in
    varying subspaces of a high dimensional feature space. For each data
    object, SOD explores the axis-parallel subspace spanned by the data
    object's neighbors and determines how much the object deviates from the
    neighbors in this subspace.

    See :cite:`kriegel2009outlier` for details.

    Parameters
    ----------
    n_neighbors : int, optional (default=20)
        Number of neighbors to use by default for k neighbors queries.

    ref_set: int, optional (default=10)
        specifies the number of shared nearest neighbors to create the
        reference set. Note that ref_set must be smaller than n_neighbors.
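
A minimal usage sketch, assuming an X_train numpy array and keeping ref_set below n_neighbors as required:

from pyod.models.sod import SOD

clf = SOD(n_neighbors=20, ref_set=10)
clf.fit(X_train)
train_scores = clf.decision_scores_  # outlier scores of the training data
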
github yzhao062 / pyod / pyod / models / lmdd.py
if isinstance(dis_measure, str):
    if dis_measure not in ('aad', 'var', 'iqr'):
        raise ValueError("Unknown dissimilarity measure type, "
                         "dis_measure should be in "
                         "('aad', 'var', 'iqr'), "
                         "got %s" % dis_measure)
    # TO-DO: 'mad': Median Absolute Deviation to be added
    # once Scipy stats version 1.3.0 is released
else:
    raise TypeError("dis_measure should be str, got %s" % dis_measure)

# map the chosen measure name to the corresponding score function
return check_random_state(random_state), _aad if dis_measure == 'aad' \
    else (np.var if dis_measure == 'var' else (
        stats.iqr if dis_measure == 'iqr' else None))


class LMDD(BaseDetector):
    """Linear Method for Deviation-based Outlier Detection.

    LMDD employs the concept of the smoothing factor which
    indicates how much the dissimilarity can be reduced by
    removing a subset of elements from the data-set.
    Read more in the :cite:`arning1996linear`.

    Note: this implementation has minor modification to make it output scores
    instead of labels.

    Parameters
    ----------
    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set, i.e.
        the proportion of outliers in the data set. Used when fitting to
        define the threshold on the decision function.
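
A minimal usage sketch, assuming an X_train numpy array; dis_measure takes one of the three values validated in the helper above:

from pyod.models.lmdd import LMDD

clf = LMDD(dis_measure='aad')  # 'aad', 'var' or 'iqr'
clf.fit(X_train)
train_labels = clf.labels_  # binary labels assigned to the training data
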
github yzhao062 / pyod / examples / temp_do_not_use_lscp.py
return indices


# access the timestamp for logging purposes
today = datetime.datetime.now()
timestamp = today.strftime("%Y%m%d_%H%M%S")

# set numpy parameters
np.set_printoptions(suppress=True, precision=4)


# TODO: Clean up unnecessary residual comments
# TODO: Add proper documentation
# TODO: Design unit tests

class LSCP(BaseDetector):

    def __init__(self, estimator_list, n_iterations=20, local_region_size=30,
                 local_max_features=1.0, n_bins=10, random_state=42,
                 contamination=0.1):

        super(LSCP, self).__init__(contamination=contamination)
        self.estimator_list = estimator_list
        self.n_clf = len(self.estimator_list)
        self.n_iterations = n_iterations
        self.local_region_size = local_region_size
        self.local_region_min = 30
        self.local_region_max = 100
        self.local_max_features = local_max_features
        self.local_min_features = 0.5
        self.local_region_iterations = 20
        self.local_region_threshold = int(self.local_region_iterations / 2)
        self.n_bins = n_bins
github yzhao062 / pyod / pyod / models / cblof.py
import warnings
import numpy as np
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.validation import check_array
from sklearn.utils.estimator_checks import check_estimator

from .base import BaseDetector
from ..utils.utility import check_parameter
from ..utils.stat_models import pairwise_distances_no_broadcast

__all__ = ['CBLOF']


class CBLOF(BaseDetector):
    """The CBLOF operator calculates the outlier score based on cluster-based
    local outlier factor.

    CBLOF takes as an input the data set and the cluster model that was
    generated by a clustering algorithm. It classifies the clusters into small
    clusters and large clusters using the parameters alpha and beta.
    The anomaly score is then calculated based on the size of the cluster the
    point belongs to as well as the distance to the nearest large cluster.

    Use weighting for outlier factor based on the sizes of the clusters as
    proposed in the original publication. Since this might lead to unexpected
    behavior (outliers close to small clusters are not found), it is disabled
    by default.Outliers scores are solely computed based on their distance to
    the closest large cluster center.

    By default, kMeans is used for clustering algorithm instead of
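
A minimal usage sketch, assuming an X_train/X_test pair; alpha and beta are the small/large cluster parameters from the docstring:

from pyod.models.cblof import CBLOF

clf = CBLOF(n_clusters=8, alpha=0.9, beta=5)  # KMeans clustering by default
clf.fit(X_train)
scores = clf.decision_function(X_test)
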
github yzhao062 / pyod / pyod / models / auto_encoder.py
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.regularizers import l2
from keras.losses import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted

from ..utils.utility import check_parameter
from ..utils.stat_models import pairwise_distances_no_broadcast

from .base import BaseDetector


# noinspection PyUnresolvedReferences,PyPep8Naming,PyTypeChecker
class AutoEncoder(BaseDetector):
    """Auto Encoder (AE) is a type of neural networks for learning useful data
    representations unsupervisedly. Similar to PCA, AE could be used to
    detect outlying objects in the data by calculating the reconstruction
    errors. See :cite:`aggarwal2015outlier` Chapter 3 for details.

    Parameters
    ----------
    hidden_neurons : list, optional (default=[64, 32, 32, 64])
        The number of neurons per hidden layers.

    hidden_activation : str, optional (default='relu')
        Activation function to use for hidden layers.
        All hidden layers are forced to use the same type of activation.
        See https://keras.io/activations/

    output_activation : str, optional (default='sigmoid')
        Activation function to use for the output layer.
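
A minimal usage sketch, assuming Keras is installed and X_train/X_test are numpy arrays (epochs is part of the full signature, not shown in this snippet):

from pyod.models.auto_encoder import AutoEncoder

clf = AutoEncoder(hidden_neurons=[64, 32, 32, 64], epochs=100)
clf.fit(X_train)                        # scores come from reconstruction error
scores = clf.decision_function(X_test)
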
github yzhao062 / pyod / pyod / models / sos.py
"""Compute the perplexity and the A-row for a specific value of the
    precision of a Gaussian distribution.

    Parameters
    ----------
    D : array, shape (n_samples, )
        The dissimilarity matrix of the training samples.
    """

    A = np.exp(-D * beta)
    sumA = np.sum(A)
    H = np.log(sumA) + beta * np.sum(D * A) / sumA
    return H, A


class SOS(BaseDetector):
    """Stochastic Outlier Selection.
    
    SOS employs the concept of affinity to quantify
    the relationship from one data point to another data point. Affinity is 
    proportional to the similarity between two data points. So, a data point 
    has little affinity with a dissimilar data point. A data point is 
    selected as an outlier when all the other data points have insufficient
    affinity with it.
    Read more in :cite:`janssens2012stochastic`.
    
    Parameters
    ----------
    contamination : float in (0., 0.5), optional (default=0.1) 
        The amount of contamination of the data set, i.e.
        the proportion of outliers in the data set. Used when fitting to
        define the threshold on the decision function.
github yzhao062 / pyod / pyod / models / lscp.py
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.validation import check_random_state

# PyOD imports
from pyod.models.base import BaseDetector
from pyod.utils.stat_models import pearsonr
from pyod.utils.utility import argmaxn
from pyod.utils.utility import generate_bagging_indices
from pyod.utils.utility import standardizer
from pyod.utils.utility import check_detector


# TODO: find random state that is causing runtime warning in pearson

class LSCP(BaseDetector):
    """ Locally Selection Combination in Parallel Outlier Ensembles

    LSCP is an unsupervised parallel outlier detection ensemble which selects
    competent detectors in the local region of a test instance. This
    implementation uses an Average of Maximum strategy. First, a heterogeneous
    list of base detectors is fit to the training data and then generates a
    pseudo ground truth for each train instance is generated by
    taking the maximum outlier score.

    For each test instance:
    1) The local region is defined to be the set of nearest training points in
    randomly sampled feature subspaces which occur more frequently than
    a defined threshold over multiple iterations.

    2) Using the local region, a local pseudo ground truth is defined and the
    pearson correlation is calculated between each base detector's training
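
A minimal usage sketch, assuming X_train/X_test numpy arrays and using a pool of LOF detectors as the heterogeneous base list:

from pyod.models.lof import LOF
from pyod.models.lscp import LSCP

detector_list = [LOF(n_neighbors=15), LOF(n_neighbors=25), LOF(n_neighbors=35)]
clf = LSCP(detector_list, local_region_size=30, random_state=42)
clf.fit(X_train)
scores = clf.decision_function(X_test)  # average-of-maximum combination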