How to use the sklearn.cluster.KMeans class in sklearn

To help you get started, we've selected a few KMeans examples based on popular ways it is used in public projects.

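Before the project snippets, here is a minimal, self-contained sketch of the typical KMeans workflow; the data and the choice of n_clusters are illustrative assumptions, not taken from any of the projects below.

import numpy as np
from sklearn.cluster import KMeans

# toy 2-D data: two well-separated blobs (illustrative only)
X = np.vstack([np.random.randn(50, 2), np.random.randn(50, 2) + 5])

# fit with an explicit random_state for reproducibility
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0).fit(X)

print(kmeans.labels_)                # cluster index for each sample
print(kmeans.cluster_centers_)       # one centroid per cluster
print(kmeans.predict([[0.0, 0.0]]))  # assign new points to the nearest centroid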

github onnx / sklearn-onnx / tests / test_algebra_onnx_operator_mixin_syntax.py
def test_way4_mixin_fit(self):
    # 10 samples with 2 features each
    X = np.arange(20).reshape(10, 2)
    try:
        tr = wrap_as_onnx_mixin(KMeans(n_clusters=2))
    except KeyError as e:
        # wrap_as_onnx_mixin can fail if the converter registry is
        # incomplete; the test tolerates that specific failure
        assert "SklearnGaussianProcessRegressor" in str(e)
        return
    tr.fit(X)

    # export the fitted model to an ONNX graph
    onx = tr.to_onnx(X.astype(np.float32))

    dump_data_and_model(
        X.astype(np.float32), tr, onx,
        basename="MixinWay4OnnxMixin2")

github scikit-tda / kepler-mapper / examples / breast-cancer / breast-cancer.py
# We create a 1-D lens with the Isolation Forest anomaly score
model = ensemble.IsolationForest(random_state=1729)
model.fit(X)
lens1 = model.decision_function(X).reshape((X.shape[0], 1))

# We create another 1-D lens with L2-norm
mapper = km.KeplerMapper(verbose=3)
lens2 = mapper.fit_transform(X, projection="l2norm")

# Combine both lenses to create a 2-D [Isolation Forest, L^2-Norm] lens
lens = np.c_[lens1, lens2]

# Create the simplicial complex
graph = mapper.map(lens,
                   X,
                   cover=km.Cover(n_cubes=15, perc_overlap=0.7),
                   clusterer=sklearn.cluster.KMeans(n_clusters=2,
                                                    random_state=1618033))

# Visualization
mapper.visualize(graph,
                 path_html="breast-cancer.html",
                 title="Wisconsin Breast Cancer Dataset",
                 custom_tooltips=y)
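The snippet assumes the usual kepler-mapper preamble. A sketch of the imports and data it relies on, here loading the Wisconsin dataset through scikit-learn's built-in copy rather than the CSV file used in the repository:

import numpy as np
import sklearn.cluster
from sklearn import ensemble
from sklearn.datasets import load_breast_cancer
import kmapper as km

# X is the feature matrix, y the diagnosis labels used for tooltips
data = load_breast_cancer()
X, y = data.data, data.target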

github chappers / scikit-feature / skfeast / utility / unsupervised_evaluation.py
    Input
    -----
    X_selected: {numpy array}, shape (n_samples, n_selected_features)
        input data on the selected features
    n_clusters: {int}
        number of clusters
    y: {numpy array}, shape (n_samples,)
        true labels

    Output
    ------
    nmi: {float}
        Normalized Mutual Information
    acc: {float}
        Accuracy
    """
    # note: precompute_distances and n_jobs were removed from KMeans in
    # scikit-learn 1.0; drop them when running against a modern release
    k_means = KMeans(n_clusters=n_clusters, init='k-means++', n_init=10, max_iter=300,
                     tol=0.0001, precompute_distances=True, verbose=0,
                     random_state=None, copy_x=True, n_jobs=1)

    k_means.fit(X_selected)
    y_predict = k_means.labels_

    # calculate NMI
    nmi = normalized_mutual_info_score(y, y_predict)

    # calculate ACC
    y_permuted_predict = best_map(y, y_predict)
    acc = accuracy_score(y, y_permuted_predict)

    return nmi, acc
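The same evaluation pattern with plain scikit-learn, as a minimal sketch; best_map, which permutes predicted labels to maximize accuracy, is the project's own helper, so only the NMI half is reproduced here and the data is illustrative.

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score

X_selected = np.random.rand(100, 5)    # illustrative selected features
y = np.random.randint(0, 3, size=100)  # illustrative true labels

labels = KMeans(n_clusters=3, n_init=10).fit(X_selected).labels_
print(normalized_mutual_info_score(y, labels))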

github junlulocky / AirTicketPredicting / Classification / ClassficationBase.py
def kmeansRemovingOutlierForClassifier():
    """
    use k-means to do outlier removal
    :return: NA
    """
    # load data
    X_train = np.load('inputClf_small/X_train.npy')
    y_train = np.load('inputClf_small/y_train.npy')
    y_train_price = np.load('inputClf_small/y_train_price.npy')

    # cluster initializing
    X_train1 = X_train[np.where(y_train==0)[0], :]
    X_train2 = X_train[np.where(y_train==1)[0], :]
    cluster1 = KMeans(init='random', n_clusters=1, random_state=0).fit(X_train1)
    cluster1 = cluster1.cluster_centers_
    cluster2 = KMeans(init='random', n_clusters=1, random_state=0).fit(X_train2)
    cluster2 = cluster2.cluster_centers_
    clusters = np.concatenate((cluster1, cluster2), axis=0)

    # predict cluster membership for every training sample
    y_pred = KMeans(init='random', n_clusters=2, random_state=2).fit_predict(X_train)
    y_pred = y_pred.reshape((y_pred.shape[0], 1))
    tmp = np.concatenate((y_train, y_pred), axis=1)

    # keep only the samples whose cluster assignment agrees with their class label
    sam = y_train == y_pred
    print("# total: {}".format(y_train.shape[0]))
    print("# samples left: {}".format(np.sum(sam)))
    # Keep 63.62% data.
    print("Keep {}% data.".format(round(np.sum(sam)*100.0/y_train.shape[0], 2)))

github dragonfly / dragonfly / dragonfly / exd / exd_utils.py
def random_sampling_kmeans_cts(dim, num_samples):
  """ Picks a large number of points uniformly at random and then runs k-means to
      select num_samples points. """
  try:
    from sklearn.cluster import KMeans
    # oversample candidates, then keep the k-means centroids as the
    # num_samples representative points
    num_candidates = np.clip(100*(dim**2), 4*num_samples, 20*num_samples)
    candidates = random_sampling_cts(dim, num_candidates)
    centres = KMeans(n_clusters=num_samples).fit(candidates)
    return centres.cluster_centers_
  except ImportError:
    # fall back to plain uniform sampling if scikit-learn is unavailable
    return random_sampling_cts(dim, num_samples)
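The same oversample-then-cluster idea without dragonfly's helpers, as a self-contained sketch; random_sampling_cts just draws uniform points in the unit cube, which numpy can do directly.

import numpy as np
from sklearn.cluster import KMeans

def spread_out_points(dim, num_samples, oversample=20):
    # draw many uniform candidates in [0, 1]^dim ...
    candidates = np.random.uniform(size=(oversample * num_samples, dim))
    # ... and keep the k-means centroids as well-spread representatives
    return KMeans(n_clusters=num_samples, n_init=10).fit(candidates).cluster_centers_

print(spread_out_points(dim=3, num_samples=10).shape)  # (10, 3)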

github benedekrozemberczki / GEMSEC / src / gemsec / calculation_helper.py
def classical_modularity_calculator(graph, embedding, args):
    """
    Function to calculate the DeepWalk cluster centers and assignments.
    """    
    kmeans = KMeans(n_clusters=args.cluster_number, random_state=0, n_init = 1).fit(embedding)
    assignments = {str(i): int(kmeans.labels_[i]) for i in range(0, embedding.shape[0])}
    modularity = community.modularity(assignments,graph)
    return modularity, assignments
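Here community is the python-louvain package, whose modularity(partition, graph) expects a node-to-community dict and a networkx graph. A tiny illustrative check with a hypothetical random embedding:

import community  # python-louvain
import networkx as nx
import numpy as np
from sklearn.cluster import KMeans

graph = nx.karate_club_graph()
# hypothetical 8-dimensional embedding, one row per node
embedding = np.random.rand(graph.number_of_nodes(), 8)

labels = KMeans(n_clusters=2, n_init=10).fit(embedding).labels_
partition = {node: int(labels[node]) for node in graph.nodes()}
print(community.modularity(partition, graph))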

github tyiannak / pyAudioAnalysis / pyAudioAnalysis / audioSegmentation.py
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * st_win / LDAstepRatio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(n_components=lda_dim)
        clf.fit(mt_feats_to_red.T, Labels)
        mt_feats_norm = (clf.transform(mt_feats_norm.T)).T

    if n_speakers <= 0:
        s_range = range(2, 10)
    else:
        s_range = [n_speakers]
    clsAll = []
    sil_all = []
    centersAll = []
    
    for iSpeakers in s_range:
        k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
        k_means.fit(mt_feats_norm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_

        # Y = distance.squareform(distance.pdist(mt_feats_norm.T))
        clsAll.append(cls)
        centersAll.append(means)
        sil_1 = []
        sil_2 = []
        for c in range(iSpeakers):
            # for each speaker (i.e. for each extracted cluster)
            clust_per_cent = numpy.nonzero(cls == c)[0].shape[0] / \
                             float(len(cls))
            if clust_per_cent < 0.020:
                sil_1.append(0.0)
                sil_2.append(0.0)
            else:
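The snippet is cut off mid-loop; the surrounding code scores each candidate speaker count with a silhouette-style criterion computed by hand. A compact sketch of the same model-selection idea using scikit-learn's built-in silhouette_score, on illustrative data:

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

X = np.random.rand(200, 10)  # illustrative (samples x features) matrix

scores = {}
for k in range(2, 10):
    labels = KMeans(n_clusters=k, n_init=10).fit_predict(X)
    scores[k] = silhouette_score(X, labels)

best_k = max(scores, key=scores.get)
print(best_k, scores[best_k])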

github jiali-ms / JLM / train / comp.py
    Output will be compressed codes (e.g. 0-255 for 8-bit) and the centroids as the codebook.

    It takes about 1 hour to process an embedding of size (50k, 512) with all 8 CPU cores running.

    :param weight:
    :param bit:
    :return: code, centroids(codebook)
    """

    shape = weight.shape
    weight = weight.reshape(-1, 1)

    assert bit <= 32
    clusters = 2 ** bit
    print('{} clusters'.format(clusters))
    # note: n_jobs was removed from KMeans in scikit-learn 1.0; omit it on modern releases
    kmeans = KMeans(n_clusters=clusters, n_jobs=4)
    kmeans.fit(weight)
    code = kmeans.predict(weight)

    if bit <= 8:
        code = code.astype(np.uint8)

    centroids = kmeans.cluster_centers_
    return code.reshape(shape), centroids.astype('f')
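A sketch of the round trip: decompressing is just a codebook lookup, restoring an approximation of the original weights. The dequantize name is hypothetical; the truncated snippet does not show the project's counterpart.

import numpy as np

def dequantize(code, centroids):
    # look up each code's centroid; the result keeps code's shape
    return centroids.squeeze()[code.astype(np.int64)]

codebook = np.array([[0.1], [0.9]], dtype='f')      # 1-bit codebook
codes = np.array([[0, 1], [1, 0]], dtype=np.uint8)  # quantized weights
print(dequantize(codes, codebook))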

github cylance / NMAP-Cluster / clusteringnmap / Divisive_Cluster.py
def _divide(self, clustering_method="kmeans"):
        if clustering_method == "kmeans":
            kmeans = KMeans(n_clusters=2)
            cluster_result = kmeans.fit_predict(self.data)
            IdsA = list(np.where(cluster_result == 1)[0])
            IdsB = list(np.where(cluster_result == 0)[0])
        elif clustering_method == "divide":
            # splinter method: peel off the point that is, on average,
            # farther from the rest of A than from B, until none remains
            A = set(range(self.data.shape[0]))
            B = set()
            IdsA = list(A)
            IdsB = list(B)
            # seed with the point farthest from all others
            Ids = np.argmax(np.sum(self.distance_matrix, axis=0))
            while self.d_point_set(Ids, A) > self.d_point_set(Ids, B) and len(A) > 1:
                A.remove(IdsA[Ids])
                B.add(IdsA[Ids])
                IdsA = list(A)
                IdsB = list(B)
                Ids = np.argmax(np.sum(self.distance_matrix[IdsA][:, IdsA], axis=1) / (len(A) - 1)
                                - np.sum(self.distance_matrix[IdsA][:, IdsB], axis=1) / len(B))
github Rocionightwater / ML-NIDS-for-SCADA / src / preprocess-data_blstm.py View on Github external
  # factory for a k-means model with k clusters; score() returns negative inertia
  KM = lambda k: KMeans(n_clusters=k)
  score_func = lambda km, Xs: km.score(Xs)
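These two lambdas are the building blocks of an elbow-style search over k; a minimal sketch of that pattern on illustrative data:

import numpy as np
from sklearn.cluster import KMeans

KM = lambda k: KMeans(n_clusters=k, n_init=10)
score_func = lambda km, Xs: km.score(Xs)  # negative inertia; higher is better

Xs = np.random.rand(300, 4)  # illustrative data
for k in range(2, 11):
    # watch for the 'elbow' where the score stops improving quickly
    print(k, score_func(KM(k).fit(Xs), Xs))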