def test_way4_mixin_fit(self):
    X = np.arange(20).reshape(10, 2)
    try:
        tr = wrap_as_onnx_mixin(KMeans(n_clusters=2))
    except KeyError as e:
        assert "SklearnGaussianProcessRegressor" in str(e)
        return
    tr.fit(X)
    onx = tr.to_onnx(X.astype(np.float32))
    dump_data_and_model(
        X.astype(np.float32), tr, onx,
        basename="MixinWay4OnnxMixin2")
model = ensemble.IsolationForest(random_state=1729)
model.fit(X)
lens1 = model.decision_function(X).reshape((X.shape[0], 1))

# We create another 1-D lens with the L2-norm
mapper = km.KeplerMapper(verbose=3)
lens2 = mapper.fit_transform(X, projection="l2norm")

# Combine both lenses to create a 2-D [Isolation Forest, L^2-Norm] lens
lens = np.c_[lens1, lens2]

# Create the simplicial complex
graph = mapper.map(lens,
                   X,
                   cover=km.Cover(n_cubes=15, perc_overlap=0.7),
                   clusterer=sklearn.cluster.KMeans(n_clusters=2,
                                                    random_state=1618033))

# Visualization
mapper.visualize(graph,
                 path_html="breast-cancer.html",
                 title="Wisconsin Breast Cancer Dataset",
                 custom_tooltips=y)
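# The snippet above assumes X, y, and the imports already exist. A minimal
# sketch of that setup (illustrative; the kmapper alias is an assumption):
import numpy as np
import sklearn.cluster
from sklearn import ensemble
from sklearn.datasets import load_breast_cancer
import kmapper as km

X, y = load_breast_cancer(return_X_y=True)  # Wisconsin Breast Cancer data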
    -----
    X_selected: {numpy array}, shape (n_samples, n_selected_features)
        input data on the selected features
    n_clusters: {int}
        number of clusters
    y: {numpy array}, shape (n_samples,)
        true labels

    Output
    ------
    nmi: {float}
        Normalized Mutual Information
    acc: {float}
        Accuracy
    """
    # precompute_distances and n_jobs were removed from KMeans in scikit-learn 1.0,
    # so they are omitted here.
    k_means = KMeans(n_clusters=n_clusters, init='k-means++', n_init=10, max_iter=300,
                     tol=0.0001, verbose=0, random_state=None, copy_x=True)
    k_means.fit(X_selected)
    y_predict = k_means.labels_

    # calculate NMI
    nmi = normalized_mutual_info_score(y, y_predict)

    # calculate ACC
    y_permuted_predict = best_map(y, y_predict)
    acc = accuracy_score(y, y_permuted_predict)

    return nmi, acc
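# A sketch of calling the evaluation routine above. Its def line is cut off in
# the snippet, so the name `evaluation` is an assumption; the parameter names
# come from the docstring:
from sklearn.datasets import load_digits

X_digits, y_digits = load_digits(return_X_y=True)
nmi, acc = evaluation(X_selected=X_digits, n_clusters=10, y=y_digits)
print("NMI: {:.3f}, ACC: {:.3f}".format(nmi, acc))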
def kmeansRemovingOutlierForClassifier():
    """
    use k-means to do outlier removal
    :return: NA
    """
    # load data
    X_train = np.load('inputClf_small/X_train.npy')
    y_train = np.load('inputClf_small/y_train.npy')
    y_train_price = np.load('inputClf_small/y_train_price.npy')

    # cluster initializing
    X_train1 = X_train[np.where(y_train == 0)[0], :]
    X_train2 = X_train[np.where(y_train == 1)[0], :]
    cluster1 = KMeans(init='random', n_clusters=1, random_state=0).fit(X_train1)
    cluster1 = cluster1.cluster_centers_
    cluster2 = KMeans(init='random', n_clusters=1, random_state=0).fit(X_train2)
    cluster2 = cluster2.cluster_centers_
    clusters = np.concatenate((cluster1, cluster2), axis=0)

    y_pred = KMeans(init='random', n_clusters=2, random_state=2).fit_predict(X_train)
    y_pred = y_pred.reshape((y_pred.shape[0], 1))
    tmp = np.concatenate((y_train, y_pred), axis=1)
    sam = y_train == y_pred

    print("# total: {}".format(y_train.shape[0]))
    print("# samples left: {}".format(np.sum(sam)))
    # Keep 63.62% data.
    print("Keep {}% data.".format(round(np.sum(sam) * 100.0 / y_train.shape[0], 2)))
def random_sampling_kmeans_cts(dim, num_samples):
    """ Picks a large number of points uniformly at random and then runs k-means to
        select num_samples points. """
    try:
        from sklearn.cluster import KMeans
        num_candidates = np.clip(100 * (dim ** 2), 4 * num_samples, 20 * num_samples)
        candidates = random_sampling_cts(dim, num_candidates)
        centres = KMeans(n_clusters=num_samples).fit(candidates)
        return centres.cluster_centers_
    except ImportError:
        return random_sampling_cts(dim, num_samples)
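# random_sampling_cts is not shown in the snippet. A plausible stand-in that
# draws points uniformly from the unit hypercube (an assumption about its
# behaviour), plus a usage example:
import numpy as np

def random_sampling_cts(dim, num_samples):
    return np.random.random((num_samples, dim))

representatives = random_sampling_kmeans_cts(dim=3, num_samples=10)
print(representatives.shape)  # (10, 3)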
def classical_modularity_calculator(graph, embedding, args):
    """
    Function to calculate the DeepWalk cluster centers and assignments.
    """
    kmeans = KMeans(n_clusters=args.cluster_number, random_state=0, n_init=1).fit(embedding)
    assignments = {str(i): int(kmeans.labels_[i]) for i in range(0, embedding.shape[0])}
    modularity = community.modularity(assignments, graph)
    return modularity, assignments
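# A sketch of driving the function above, assuming `community` is the
# python-louvain package, that KMeans is imported as in the original module,
# and that graph nodes are string ids matching the str(i) keys (illustrative only):
from types import SimpleNamespace
import numpy as np
import networkx as nx
import community
from sklearn.cluster import KMeans

graph = nx.karate_club_graph()
graph = nx.relabel_nodes(graph, {n: str(n) for n in graph.nodes()})
embedding = np.random.random((graph.number_of_nodes(), 16))  # stand-in for DeepWalk vectors
args = SimpleNamespace(cluster_number=4)
modularity, assignments = classical_modularity_calculator(graph, embedding, args)
print(modularity)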
for i in range(Labels.shape[0]):
    Labels[i] = int(i * st_win / LDAstepRatio)
clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(n_components=lda_dim)
clf.fit(mt_feats_to_red.T, Labels)
mt_feats_norm = (clf.transform(mt_feats_norm.T)).T

if n_speakers <= 0:
    s_range = range(2, 10)
else:
    s_range = [n_speakers]
clsAll = []
sil_all = []
centersAll = []

for iSpeakers in s_range:
    k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
    k_means.fit(mt_feats_norm.T)
    cls = k_means.labels_
    means = k_means.cluster_centers_
    # Y = distance.squareform(distance.pdist(mt_feats_norm.T))
    clsAll.append(cls)
    centersAll.append(means)
    sil_1 = []
    sil_2 = []
    for c in range(iSpeakers):
        # for each speaker (i.e. for each extracted cluster)
        clust_per_cent = numpy.nonzero(cls == c)[0].shape[0] / \
                         float(len(cls))
        if clust_per_cent < 0.020:
            sil_1.append(0.0)
            sil_2.append(0.0)
        else:
    Output will be the compressed codes (e.g. values 0-255 for 8-bit quantization)
    and the cluster centroids as the codebook.
    It takes about 1 hour to quantize an embedding of size (50k, 512) with 8 CPU cores all running.
    :param weight:
    :param bit:
    :return: code, centroids (codebook)
    """
    shape = weight.shape
    weight = weight.reshape(-1, 1)
    assert bit <= 32
    clusters = 2 ** bit
    print('{} clusters'.format(clusters))
    # n_jobs was removed from KMeans in scikit-learn 1.0, so it is omitted here.
    kmeans = KMeans(n_clusters=clusters)
    kmeans.fit(weight)
    code = kmeans.predict(weight)
    if bit <= 8:
        code = code.astype(np.uint8)
    centroids = kmeans.cluster_centers_
    return code.reshape(shape), centroids.astype('f')
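# A sketch of using the quantization routine above. Its def line is cut off in
# the snippet, so the name `quantize_weight` is an assumption:
import numpy as np

weights = np.random.randn(64, 32).astype(np.float32)
code, codebook = quantize_weight(weights, bit=4)  # 2**4 = 16 centroids
reconstructed = codebook[code.astype(np.int64), 0].reshape(weights.shape)
print(np.abs(weights - reconstructed).mean())  # mean quantization error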
def _divide(self, clustering_method="kmeans"):
    if clustering_method == "kmeans":
        kmeans = KMeans(n_clusters=2)
        cluster_result = kmeans.fit_predict(self.data)
        IdsA = list(np.where(cluster_result == 1)[0])
        IdsB = list(np.where(cluster_result == 0)[0])
    elif clustering_method == "divide":
        A = set(range(self.data.shape[0]))
        B = set()
        IdsA = list(A)
        IdsB = list(B)
        Ids = np.argmax(sum(self.distance_matrix))
        while self.d_point_set(Ids, A) > self.d_point_set(Ids, B) and len(A) > 1:
            A.remove(IdsA[Ids])
            B.add(IdsA[Ids])
            IdsA = list(A)
            IdsB = list(B)
            Ids = np.argmax(np.sum(self.distance_matrix[IdsA][:, IdsA], axis=1) / (len(A) - 1)
                            - np.sum(self.distance_matrix[IdsA][:, IdsB], axis=1) / len(B))
KM = lambda k: KMeans(n_clusters=k)
score_func = lambda km, Xs: km.score(Xs)
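# A sketch of using the two helpers above to sweep candidate cluster counts;
# the data array `Xs` is an assumption. KMeans.score returns negative inertia,
# which generally improves as k grows, so in practice one looks for an elbow
# rather than simply taking the maximum:
import numpy as np
from sklearn.cluster import KMeans

Xs = np.random.random((200, 5))
scores = {k: score_func(KM(k).fit(Xs), Xs) for k in range(2, 8)}
for k, s in sorted(scores.items()):
    print(k, s)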