How to use the sklearn.base.BaseEstimator class in sklearn

To help you get started, we’ve selected a few sklearn.base.BaseEstimator examples, based on popular ways it is used in public projects.

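The pattern shared by every snippet below is the same: derive from BaseEstimator (usually together with a mixin such as TransformerMixin or ClassifierMixin), store each constructor argument under an attribute of the same name, and keep learned state in attributes ending with an underscore. Here is a minimal, hypothetical transformer illustrating that contract; the class name and the centering logic are made up purely for illustration.

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin


class MeanCenterer(BaseEstimator, TransformerMixin):
    """Toy transformer: subtract the per-column mean (illustrative only)."""

    def __init__(self, copy=True):
        # BaseEstimator convention: store constructor args unchanged, under
        # the same names, so the inherited get_params()/set_params() work.
        self.copy = copy

    def fit(self, X, y=None):
        X = np.asarray(X, dtype=float)
        self.mean_ = X.mean(axis=0)   # learned state gets a trailing underscore
        return self

    def transform(self, X):
        X = np.asarray(X, dtype=float)
        if self.copy:
            X = X.copy()
        X -= self.mean_
        return X


centerer = MeanCenterer()
print(centerer.get_params())          # {'copy': True}, provided by BaseEstimator
Xc = centerer.fit_transform([[1.0, 2.0], [3.0, 4.0]])

Because the constructor only stores its arguments, utilities such as sklearn.base.clone and GridSearchCV can rebuild the estimator from get_params() alone, which is exactly what the projects below rely on.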

github scikit-learn / scikit-learn / sklearn / preprocessing / __init__.py
                return Y
            else:
                # Lists of tuples format
                return [tuple(self.classes_[np.flatnonzero(Y[i])])
                        for i in range(Y.shape[0])]

        if len(Y.shape) == 1 or Y.shape[1] == 1:
            y = np.array(Y.ravel() > threshold, dtype=int)

        else:
            y = Y.argmax(axis=1)

        return self.classes_[y]


class KernelCenterer(BaseEstimator, TransformerMixin):
    """Center a kernel matrix

    This is equivalent to centering phi(X) with
    sklearn.preprocessing.Scaler(with_std=False).
    """

    def fit(self, K):
        """Fit KernelCenterer

        Parameters
        ----------
        K : numpy array of shape [n_samples, n_samples]
            Kernel matrix.

        Returns
        -------
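Typical usage of KernelCenterer, sketched with current scikit-learn imports (the snippet above shows an older version of the file, so details may differ slightly): build a Gram matrix, fit the centerer on it, then transform it.

import numpy as np
from sklearn.preprocessing import KernelCenterer
from sklearn.metrics.pairwise import pairwise_kernels

X = np.random.RandomState(0).rand(5, 3)
K = pairwise_kernels(X, metric='linear')     # (5, 5) kernel matrix

centerer = KernelCenterer().fit(K)           # fit takes the kernel, not the raw X
K_centered = centerer.transform(K)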
github theislab / scanpy / scanpy / neighbors / umap / umap_.py
    best matches an offset exponential decay.
    """

    def curve(x, a, b):
        return 1.0 / (1.0 + a * x ** (2 * b))

    xv = np.linspace(0, spread * 3, 300)
    yv = np.zeros(xv.shape)
    yv[xv < min_dist] = 1.0
    yv[xv >= min_dist] = np.exp(
        -(xv[xv >= min_dist] - min_dist) / spread)
    params, covar = curve_fit(curve, xv, yv)
    return params[0], params[1]


class UMAP(BaseEstimator):
    """Uniform Manifold Approximation and Projection

    Finds a low dimensional embedding of the data that approximates
    an underlying manifold.

    Parameters
    ----------
    n_neighbors: float (optional, default 15)
        The size of local neighborhood (in terms of number of neighboring
        sample points) used for manifold approximation. Larger values
        result in more global views of the manifold, while smaller
        values result in more local data being preserved. In general
        values should be in the range 2 to 100.

    n_components: int (optional, default 2)
        The dimension of the space to embed into. This defaults to 2 to
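The class above is scanpy's vendored copy of umap-learn; because it derives from BaseEstimator, it exposes the familiar fit/fit_transform interface and can sit inside sklearn pipelines. A rough usage sketch with the standalone umap-learn package (the parameter values are arbitrary):

import numpy as np
from umap import UMAP          # pip install umap-learn

X = np.random.RandomState(42).rand(200, 30)

reducer = UMAP(n_neighbors=15, n_components=2, min_dist=0.1)
embedding = reducer.fit_transform(X)    # array of shape (200, 2)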
github pescadores / pescador / pescador / core.py
        This function will force an infinite stream, restarting
        the generator even if a StopIteration is raised.

        Yields
        ------
        batch
            Items from the contained generator.
        '''
        # ??? What more does this need?
        while True:
            for item in self.generate():
                yield item


class StreamLearner(sklearn.base.BaseEstimator):
    '''A class to facilitate iterative learning from a generator.

    Attributes
    ----------
    estimator : sklearn.base.BaseEstimator
        An estimator object to wrap.  Must implement `partial_fit()`

    max_steps : None or int > 0
        The maximum number of calls to issue to `partial_fit()`.
        If `None`, run until the generator is exhausted.
    '''

    def __init__(self, estimator, max_steps=None):
        '''Learning on generators

        Parameters
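StreamLearner's job is to drive partial_fit() from a (possibly infinite) generator of batches, stopping after max_steps calls. The snippet does not show its fit method, so here is the underlying pattern written directly against a scikit-learn estimator; the batches generator is a hypothetical stand-in for a pescador stream.

import numpy as np
from sklearn.linear_model import SGDClassifier

def batches(rng, n_batches=100, batch_size=32, n_features=5):
    """Hypothetical generator yielding (X, y) mini-batches."""
    for _ in range(n_batches):
        X = rng.rand(batch_size, n_features)
        y = (X.sum(axis=1) > n_features / 2).astype(int)
        yield X, y

est = SGDClassifier()
max_steps = 8                                   # mirrors StreamLearner's max_steps
for step, (X, y) in enumerate(batches(np.random.RandomState(0))):
    if step >= max_steps:
        break
    est.partial_fit(X, y, classes=[0, 1])       # classes is required on the first call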
github arnefmeyer / lnpy / lnpy / lnp / glm.py
        grid_params['param_info'] = param_info

        model = _GaussianGLM(alpha=self.alpha,
                             verbose=self.verbose, tolerance=self.tolerance,
                             optimize=self.optimize, prior=self.prior,
                             scorer=self.metric, **grid_params)

        t0 = time.time()
        model.fit(X, Y)
        self.t_fit = time.time() - t0

        self._split_coef_spikefilt(model.coef_)
        self.intercept_ = model.intercept_


class _SGDGLM(SKBaseEstimator):
    """SGD-based GLM"""

    def __init__(self, metric='logli_poissonexp', family='poisson',
                 link='log', alpha=1.0,
                 n_epochs=1, algorithm='sgd', weighting='permutation',
                 avg_decay=2., warm_start=False, eta0=.1,
                 bias_multiplier=1.):

        self.family = family
        self.metric = metric
        self.link = link
        self.n_epochs = n_epochs
        self.coef_ = None
        self.intercept_ = None
        self.algorithm = algorithm
        self.weighting = weighting
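The __init__ above follows the BaseEstimator convention: constructor arguments are stored under attributes of the same name and heavy validation is deferred to fit(). That is what lets the inherited get_params()/set_params() and sklearn.base.clone work. A small illustration with a made-up stand-in class:

from sklearn.base import BaseEstimator, clone

class TinyGLM(BaseEstimator):
    """Hypothetical stand-in for estimators such as _SGDGLM above."""
    def __init__(self, alpha=1.0, link='log', n_epochs=1):
        self.alpha = alpha          # stored as-is, no validation here
        self.link = link
        self.n_epochs = n_epochs

model = TinyGLM(alpha=0.5)
print(model.get_params())           # {'alpha': 0.5, 'link': 'log', 'n_epochs': 1}
model.set_params(n_epochs=5)        # returns the estimator itself
fresh = clone(model)                # unfitted copy rebuilt from get_params()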
github FreeDiscovery / FreeDiscovery / freediscovery / feature_weighting.py
        raise ValueError(('Document frequency weighting {} '
                          'not supported, must be one of ntp')
                         .format(scheme_d))
    if scheme_n not in ['n', 'c', 'l', 'u', 'cp', 'lp', 'up', 'b']:
        raise ValueError(('Document normalization {} '
                          'not supported, must be of the form [nclub][p]?')
                         .format(scheme_n))
    if scheme_n not in ['n', 'c', 'l', 'u', 'cp', 'lp', 'up', ]:
        raise NotImplementedError(
                   ('Document normalization {} '
                    'is not yet implemented, must be of the form [nclu][p]?')
                   .format(scheme_n))
    return scheme_t, scheme_d, scheme_n


class SmartTfidfTransformer(BaseEstimator, TransformerMixin):
    """TF-IDF weighting and normalization with the SMART IR notation

    This class is similar to
    :class:`sklearn.feature_extraction.text.TfidfTransformer` but supports
    a larger number of TF-IDF weighting and normalization schemes.
    It should be fitted on the document-term matrix computed by
    :class:`sklearn.feature_extraction.text.CountVectorizer`.

    The TF-IDF transform consists of three subsequent operations, determined
    by the ``weighting`` parameter,

    1. Term frequency weighting:

       natural (``n``), log (``l``), augmented
       (``a``),  boolean (``b``), log average (``L``)
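A usage sketch based on the docstring above: fit the transformer on the output of CountVectorizer and pick a SMART weighting triple via the weighting parameter. The import path follows the file shown above, and the 'ntc' value is an assumption that may differ between FreeDiscovery versions.

from sklearn.feature_extraction.text import CountVectorizer
from freediscovery.feature_weighting import SmartTfidfTransformer  # path assumed from the snippet

docs = ["the cat sat on the mat",
        "the dog sat on the log"]

X_counts = CountVectorizer().fit_transform(docs)        # document-term matrix
tfidf = SmartTfidfTransformer(weighting='ntc')          # natural tf, idf, cosine normalization
X_tfidf = tfidf.fit_transform(X_counts)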
github nilearn / nilearn / nilearn / decoding / searchlight.py
                percent = float(i) / len(list_rows)
                percent = round(percent * 100, 2)
                dt = time.time() - t0
                # We use a max to avoid a division by zero
                remaining = (100. - percent) / max(0.01, percent) * dt
                sys.stderr.write(
                    "Job #%d, processed %d/%d voxels "
                    "(%0.2f%%, %i seconds remaining)%s"
                    % (thread_id, i, len(list_rows), percent, remaining, crlf))
    return par_scores


##############################################################################
# Class for search_light #####################################################
##############################################################################
class SearchLight(BaseEstimator):
    """Implement search_light analysis using an arbitrary type of classifier.

    Parameters
    -----------
    mask_img : Niimg-like object
        See http://nilearn.github.io/manipulating_images/input_output.html
        boolean image giving location of voxels containing usable signals.

    process_mask_img : Niimg-like object, optional
        See http://nilearn.github.io/manipulating_images/input_output.html
        boolean image giving voxels on which searchlight should be
        computed.

    radius : float, optional
        radius of the searchlight ball, in millimeters. Defaults to 2.
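A hedged usage sketch based on the parameters documented above; mask_img, process_mask_img, fmri_img and y are placeholders for Niimg-like images and labels you would load yourself, and the exact constructor signature can vary between nilearn versions.

from nilearn.decoding import SearchLight
from sklearn.model_selection import KFold

# mask_img, process_mask_img, fmri_img and y are assumed to be loaded already
searchlight = SearchLight(mask_img,
                          process_mask_img=process_mask_img,
                          radius=5.6,              # sphere radius in millimeters
                          n_jobs=1,
                          cv=KFold(n_splits=4))
searchlight.fit(fmri_img, y)
scores = searchlight.scores_                       # per-voxel cross-validation scores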
github geomstats / geomstats / geomstats / learning / kmeans.py
"""K-means clustering."""

import logging
from random import randint

from sklearn.base import BaseEstimator, ClusterMixin

import geomstats.backend as gs
from geomstats.learning._template import TransformerMixin
from geomstats.learning.frechet_mean import FrechetMean


class RiemannianKMeans(TransformerMixin, ClusterMixin, BaseEstimator):
    """Class for k-means clustering on manifolds.

    K-means algorithm using Riemannian manifolds.

    Parameters
    ----------
    n_clusters : int
        Number of clusters (k value of the k-means).
        Optional, default: 8.
    riemannian_metric : object of class RiemannianMetric
        The geomstats Riemannian metric associated with the space used.
    init : str
        How to initialize centroids at the beginning of the algorithm. The
        choice 'random' will select training points as initial centroids
        uniformly at random.
        Optional, default: 'random'.
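A usage sketch on the unit sphere; the riemannian_metric argument matches the docstring above, while newer geomstats releases pass a space object instead, so adjust to your installed version.

from geomstats.geometry.hypersphere import Hypersphere
from geomstats.learning.kmeans import RiemannianKMeans

sphere = Hypersphere(dim=2)
data = sphere.random_uniform(n_samples=100)        # points on the 2-sphere

kmeans = RiemannianKMeans(riemannian_metric=sphere.metric,
                          n_clusters=4,
                          init='random')
kmeans.fit(data)
labels = kmeans.predict(data)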
github AndriyMulyar / sklearn-oblique-tree / sklearn_oblique_tree / oblique / oblique.py
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted, check_random_state
from sklearn.utils.multiclass import unique_labels
from ._oblique import Tree

class ObliqueTree(BaseEstimator, ClassifierMixin):


    def __init__(self, splitter="oc1, axis_parallel", number_of_restarts=20, max_perturbations=5, random_state=1):
        """

        :param splitter: 'oc1' for stochastic hill climbing, 'cart' for CART multivariate, 'axis_parallel' for traditional.
        'oc1, axis_parallel' will also consider axis parallel splits when computing best oblique split. Setting 'cart' overrides other options.
        :param number_of_restarts: number of times to restart in an effort to escape local minima
        :param max_perturbations: number of random vector perturbations
        :param random_state: an integer serving as the seed (NOT a numpy random state object)
        """
        self.random_state = random_state
        self.splitter = splitter
        self.number_of_restarts = number_of_restarts
        self.max_perturbations = max_perturbations
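Because the class mixes in ClassifierMixin, it is used like any scikit-learn classifier. The constructor arguments below come straight from the __init__ shown above; the fit/predict calls are the standard estimator interface and are assumed to be implemented by the class.

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn_oblique_tree.oblique import ObliqueTree

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

tree = ObliqueTree(splitter="oc1, axis_parallel",   # consider oblique and axis-parallel splits
                   number_of_restarts=20,
                   max_perturbations=5,
                   random_state=2)
tree.fit(X_train, y_train)
predictions = tree.predict(X_test)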
github automl / auto-sklearn / autosklearn / pipeline / implementations / OneHotEncoder.py
    elif n_selected == n_features:
        # All features selected.
        return transform(X)
    else:
        X_sel = transform(X[:, ind[sel]])
        X_not_sel = X[:, ind[not_sel]]

        if sparse.issparse(X_sel) or sparse.issparse(X_not_sel):
            # This is pretty memory-intense, making the memory usage for OpenML
            # task 146810 go over 3GB
            return sparse.hstack((X_sel, X_not_sel), format='csr')
        else:
            return np.hstack((X_sel, X_not_sel))


class OneHotEncoder(BaseEstimator, TransformerMixin):
    """Encode categorical integer features using a one-hot aka one-of-K scheme.

    The input to this transformer should be a matrix of integers, denoting
    the values taken on by categorical (discrete) features. The output will be
    a sparse matrix where each column corresponds to one possible value of one
    feature. It is assumed that input features take on values in the range
    [0, n_values).

    This encoding is needed for feeding categorical data to many scikit-learn
    estimators, notably linear models and SVMs with the standard kernels.

    Parameters
    ----------

    categorical_features: "all" or array of indices or mask
        Specify what features are treated as categorical.
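A usage sketch for this vendored encoder, which mirrors the old scikit-learn OneHotEncoder API: pass an integer matrix, mark which columns are categorical, and get back a (typically sparse) one-hot matrix. The import path follows the file shown above and the choice of columns is arbitrary.

import numpy as np
from autosklearn.pipeline.implementations.OneHotEncoder import OneHotEncoder

X = np.array([[0, 1, 2],
              [1, 0, 0],
              [0, 2, 1]])

encoder = OneHotEncoder(categorical_features=[0, 1])    # treat columns 0 and 1 as categorical
X_encoded = encoder.fit_transform(X)                    # remaining columns are passed through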
github scikit-learn / scikit-learn / sklearn / naive_bayes.py
from .base import BaseEstimator, ClassifierMixin
from .preprocessing import binarize
from .preprocessing import LabelBinarizer
from .preprocessing import label_binarize
from .utils import check_X_y, check_array, deprecated
from .utils.extmath import safe_sparse_dot
from .utils.fixes import logsumexp
from .utils.multiclass import _check_partial_fit_first_call
from .utils.validation import check_is_fitted, check_non_negative, column_or_1d
from .utils.validation import _check_sample_weight

__all__ = ['BernoulliNB', 'GaussianNB', 'MultinomialNB', 'ComplementNB',
           'CategoricalNB']


class _BaseNB(ClassifierMixin, BaseEstimator, metaclass=ABCMeta):
    """Abstract base class for naive Bayes estimators"""

    @abstractmethod
    def _joint_log_likelihood(self, X):
        """Compute the unnormalized posterior log probability of X

        I.e. ``log P(c) + log P(x|c)`` for all rows x of X, as an array-like of
        shape [n_classes, n_samples].

        Input is passed to _joint_log_likelihood as-is by predict,
        predict_proba and predict_log_proba.
        """

    @abstractmethod
    def _check_X(self, X):
        """Validate input X