How to use the fastcluster.linkage function in fastcluster

To help you get started, we’ve selected a few fastcluster examples, based on popular ways it is used in public projects.
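fastcluster.linkage is a drop-in replacement for scipy.cluster.hierarchy.linkage: it accepts either a condensed distance vector or a raw observation matrix and returns the same (n-1) x 4 linkage matrix, just faster. A minimal sketch of the basic call pattern, using made-up data (all names below are illustrative):

import numpy as np
import fastcluster
from scipy.spatial.distance import pdist

X = np.random.rand(50, 4)                 # 50 observations, 4 features
dists = pdist(X, metric='euclidean')      # condensed distance vector
Z = fastcluster.linkage(dists, method='average')
print(Z.shape)                            # (49, 4): one row per merge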


github mwaskom / seaborn / seaborn / matrix.py
def _calculate_linkage_fastcluster(self):
    import fastcluster
    # Fastcluster has a memory-saving vectorized version, but only
    # with certain linkage methods, and mostly with the euclidean metric
    vector_methods = ('single', 'centroid', 'median', 'ward')
    euclidean_methods = ('centroid', 'median', 'ward')
    euclidean = self.metric == 'euclidean' and self.method in \
        euclidean_methods
    if euclidean or self.method == 'single':
        return fastcluster.linkage_vector(self.array,
                                          method=self.method,
                                          metric=self.metric)
    else:
        # `distance` is scipy.spatial.distance in seaborn's imports
        pairwise_dists = distance.pdist(self.array, metric=self.metric)
        linkage = fastcluster.linkage(pairwise_dists, method=self.method)
        del pairwise_dists
        return linkage
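seaborn routes to fastcluster.linkage_vector whenever the method/metric combination allows it, because the vectorized routine clusters the observation matrix directly and never materializes the O(n^2) distance matrix. A minimal sketch of that call on stand-in data:

import numpy as np
import fastcluster

X = np.random.rand(10000, 16)   # stand-in observation matrix
# 'ward', 'centroid' and 'median' require the euclidean metric;
# 'single' also works with other metrics.
Z = fastcluster.linkage_vector(X, method='ward')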
github GallupGovt / multivac / src / rdf_graph / rdf_graph.py
def cluster_entities(self, embeddings_path, link_method='average'):
    embeddings_dict = self.load_embeddings(embeddings_path,
                                           self.unique_entities)
    # Create a condensed distance matrix (vector) using cosine distance
    # between all entity strings
    embeddings_array = np.array(list(embeddings_dict.values()))
    dist_vec = pdist(embeddings_array, 'cosine')

    # Cluster the distance matrix to find co-referring entities
    Z = fastcluster.linkage(dist_vec, method=link_method)
    cluster_labels = fcluster(Z, t=self.clust_dist_thres,
                              criterion='distance')
    cluster_members_all = []

    entity_list = np.array(list(embeddings_dict.keys()))
    for clus_label in np.unique(cluster_labels):
        clus_indx = cluster_labels == clus_label
        cluster_members = list(entity_list[clus_indx])
        cluster_members_all.append(cluster_members)

    output = {'cluster_members': cluster_members_all,
              'cluster_labels': cluster_labels,
              'cluster_rep': self.get_cluster_representatives(cluster_members_all)}

    self.entity_cluster_results = output
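The pipeline above (condensed cosine distances, then linkage, then scipy's fcluster with a distance threshold) is a common recipe for threshold-based flat clustering. A self-contained sketch with made-up embeddings and an arbitrary cutoff:

import numpy as np
import fastcluster
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import fcluster

embeddings = np.random.rand(100, 300)              # stand-in for real embeddings
Z = fastcluster.linkage(pdist(embeddings, 'cosine'), method='average')
labels = fcluster(Z, t=0.4, criterion='distance')  # cut the tree at cosine distance 0.4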
github skutac / InCHlib.js / inchlib_clust / inchlib_clust_dev.py
def __cluster_columns__(self, column_distance, column_linkage):
    # transpose so that columns become rows for clustering
    self.data = [list(col) for col in zip(*self.data)]
    if self.missing_value is not False:
        self.data, missing_values_indexes = self.__impute_missing_values__(self.data)

    self.column_clustering = fastcluster.linkage(self.data, method=column_linkage, metric=column_distance)
    self.data_order = hcluster.leaves_list(self.column_clustering)

    if self.missing_value is not False:
        self.data = self.__return_missing_values__(self.data, missing_values_indexes)

    # transpose back; materialize the zip iterator for Python 3 compatibility
    self.data = [list(row) for row in zip(*self.data)]
    self.data = self.__reorder_data__(self.data, self.data_order)
    self.original_data = self.__reorder_data__(self.original_data, self.data_order)
    if self.header:
        self.header = self.__reorder_data__([self.header], self.data_order)[0]
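The reordering trick here is scipy.cluster.hierarchy.leaves_list, which converts a linkage matrix into the left-to-right leaf order of the dendrogram; indexing the data by that order places similar rows (or, after transposing, columns) next to each other. A minimal sketch on demo data:

import numpy as np
import fastcluster
from scipy.cluster.hierarchy import leaves_list

data = np.random.rand(8, 20)                   # 8 rows to cluster
Z = fastcluster.linkage(data, method='ward', metric='euclidean')
order = leaves_list(Z)                         # dendrogram leaf order
data_sorted = data[order]                      # rows reordered for display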
github victor-gil-sepulveda / pyProCT / pyproct / clustering / algorithms / hierarchical / hierarchicalAlgorithm.py
hie_mat = None

try:
    method = kwargs["method"]
except KeyError:
    method = 'complete'

if hie_mat is not None:
    self.hie_mat = hie_mat          # matrix provided by the caller
else:
    if self.hie_mat is None:
        # calculate the linkage matrix (fastcluster, imported as hcluster_fast)
        self.hie_mat = hcluster_fast.linkage(self.condensed_matrix.get_data(), method=method)
    # else: the matrix was already stored

algorithm_details = "Hierarchical with " + method + " method (cutoff = " + str(cutoff) + ")"

if cutoff is not None:
    # Then apply the cutoff; this doesn't work much as expected
    group_list = hcluster.fcluster(self.hie_mat, cutoff)
    # Then generate the clusters from the flat class list
    clusters = gen_clusters_from_class_list(group_list)
    return Clustering(clusters, details=algorithm_details)
else:
    return None
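One likely reason the cutoff "doesn't work much as expected": scipy's fcluster defaults to criterion='inconsistent', so a bare fcluster(Z, cutoff) compares the threshold against inconsistency coefficients rather than merge heights. Passing the criterion explicitly makes the cut predictable; a sketch on demo data:

import numpy as np
import fastcluster
from scipy.cluster.hierarchy import fcluster

X = np.random.rand(30, 3)                                    # illustrative data
Z = fastcluster.linkage(X, method='complete', metric='euclidean')
labels_default = fcluster(Z, t=1.0)                          # criterion='inconsistent'
labels_by_height = fcluster(Z, t=1.0, criterion='distance')  # cut at merge height 1.0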
github LinkageIO / Camoco / camoco / COB.py
)
        else:
            # Fetch the Expr Matrix
            dm = self.expr(
                genes=genes,
                accessions=accessions,
                raw=raw,
                gene_normalize=gene_normalize,
            )
        # set the outliers to the maximum value for the heatmap
        dm[dm > expr_boundaries] = expr_boundaries
        dm[dm < -1 * expr_boundaries] = -1 * expr_boundaries
        # Get the Gene clustering order
        if cluster_method in hier_cluster_methods:
            self.log("Ordering rows by leaf")
            expr_linkage = fastcluster.linkage(dm.fillna(0), method=cluster_method)
            order = leaves_list(expr_linkage)
            dm = dm.iloc[order, :]
        elif cluster_method == "mcl":
            self.log("Ordering rows by MCL cluster")
            order = (
                self.clusters.loc[dm.index]
                .fillna(np.inf)
                .sort_values(by="cluster")
                .index.values
            )
            dm = dm.loc[order, :]
        else:
            # No cluster order.
            self.log("Unknown gene ordering: {}, no ordering performed", cluster_method)

        # Get leaves of accessions
github pauldb89 / OxLM / scripts / cluster_word_vectors.py
help="File containing the vocabulary")
  parser.add_option("-v", "--vectors-file", dest="vectors_file",
                    help="File containing the word_vectors")
  parser.add_option("-m", "--method", dest="method", help="Clustering method")
  parser.add_option("-s", "--metric", dest="metric", help="Clusterig metric")
  parser.add_option("-o", "--output-file", dest="output_file",
                    help="File containing the word hierarchy")
  options, _ = parser.parse_args()

  words = read_words(options.vocab_file)
  print "Read", len(words), "words..."

  vectors = read_vectors(options.vectors_file)
  print "Read", len(vectors), "vectors..."

  cluster_data = fastcluster.linkage(
      vectors, method=options.method, metric=options.metric)
  hierarchy = convert(cluster_data)
  print "Tree depth:", depth(words, hierarchy)

  write_tree(words, hierarchy, options.output_file)
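The convert step above relies on the layout of the linkage matrix: row i records that clusters Z[i, 0] and Z[i, 1] were merged at height Z[i, 2] into a new cluster of Z[i, 3] members, and the new cluster gets index n + i (indices below n are original points). convert itself is project-specific, but a sketch of walking that structure:

import numpy as np
import fastcluster

n = 5
Z = fastcluster.linkage(np.random.rand(n, 2), method='average', metric='euclidean')
for i, (left, right, dist, size) in enumerate(Z):
    # indices < n are leaves; the cluster formed here is node n + i
    print(f"merge {i}: {int(left)} + {int(right)} at {dist:.3f} "
          f"-> node {n + i} ({int(size)} members)")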
github rsrandhawa / Vec2Topic / vec2topic.py
else:
                                model_comb[w]=model_wiki_vec[str(w)]
                        model_comb_vocab.append(w)


        sentences=sentences_bigrammed

        ##Create a frequency count of words in email
        words=[w for text in sentences_nouns for w in text]
        Vocab=set(words)

        #Run Agglomerative clustering
        logger.info('Clustering for depth...')

        data_d2v,word_d2v=create_word_list(model_comb,model_comb_vocab,25*local_vec+300,sentences_nouns,repeat=False,normalized=True,min_count=0,l2_threshold=0)
        spcluster=fastcluster.linkage(data_d2v,method='average',metric='cosine')


        ##Calculate depth of words
        num_points=len(data_d2v)
        depth=calculate_depth(spcluster,word_d2v,num_points)

        logger.info('Computing co-occurrence graph')

        T=[' '.join(w) for w in sentences_nouns]

        ##Co-occurrence matrix
        cv=CountVectorizer(token_pattern=u'(?u)\\b([^\\s]+)')
        bow_matrix = cv.fit_transform(T)
        id2word={}
        for key, value in cv.vocabulary_.items():
            id2word[value]=key
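Note that, as here, fastcluster.linkage also accepts the raw observation matrix and computes the pairwise distances internally when a metric is supplied, so the explicit pdist call of the earlier examples is optional. A minimal sketch with stand-in vectors:

import numpy as np
import fastcluster

vectors = np.random.rand(200, 325)   # stand-in for the combined word vectors
Z = fastcluster.linkage(vectors, method='average', metric='cosine')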
github xiezhq / ISEScan / pred.py
# len(ids) * 2 matrix, where only intersected hits are retained and each row is a hit with
	# two features (genome coordinates of a hit).
	# 
	data = []
	for id in idsList:
		data.append(hits[id]['bd'])

	Y = numpy.array(data, int)
	print('data: {}\n{}'.format(Y.shape, Y))

	distMatrix = scipy.spatial.distance.pdist(Y, tools.distFunction)
	#distMatrix = scipy.spatial.distance.pdist(Y, metric='euclidean')
	#print('distMatrix: {}\n{}'.format(distMatrix.shape, distMatrix))

	# fastcluster requires a dissimilarity (distance) matrix, not a similarity matrix!
	# preserve_input must be the boolean False; the string 'False' is truthy
	hclusters = fastcluster.linkage(distMatrix, method='single', preserve_input=False)
	del distMatrix
	#cophenet = scipy.cluster.hierarchy.cophenet(hclusters, distMatrix)
	#print('cophenetCorrelation = {}'.format(cophenet[0]))
	#nids = len(ids)
	#print('nids={} timesOfMergingCluster={}'.format(nids, len(hclusters)))
	#for i, cluster in enumerate(hclusters):
	#	print('cluster {:>3} {:>6} {:>6} {:>9.2g} {:>6}'.format(i, int(cluster[0]), int(cluster[1]), cluster[2], int(cluster[3])))
	for i, id in enumerate(idsList):
		print('intersected hits', i, hits[id]['bd'], hits[id]['orf'], hits[id]['occurence'], hits[id]['hmmhit'], hits[id]['tirs'])

	# dendrogram of hierarchical clustering
	#scipy.cluster.hierarchy.dendrogram(hclusters)

	# form flat clusters from the hierarchical clustering
	# Note: t=1.1 instead of 1.0 ensures that the intersected hits with only 1 bp intersect are included in same cluster. 
	t = 1.1
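Two details worth copying from this snippet: scipy's pdist accepts any Python callable as the metric, and preserve_input=False (the boolean, not the string) lets fastcluster overwrite the distance array in place to save memory. A sketch with a hypothetical interval-gap metric standing in for tools.distFunction:

import numpy as np
import fastcluster
from scipy.spatial.distance import pdist

def interval_gap(u, v):
    # hypothetical metric: gap between two [start, end] intervals (0 if they overlap)
    return max(0.0, max(u[0], v[0]) - min(u[1], v[1]))

Y = np.array([[10, 50], [40, 90], [200, 260]], dtype=float)
dists = pdist(Y, interval_gap)
Z = fastcluster.linkage(dists, method='single', preserve_input=False)
# dists may be clobbered after this call; recompute it if it is needed again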
github hnolCol / instantclue / modules / plots / hierarchical_clustering.py
def cluster_data(self, dataFrame, metric, method):
	'''
	Clusters the data
	'''
	try:
		if metric == 'euclidean':
			linkage = fastcluster.linkage(dataFrame, method=method, metric=metric)
		else:
			distanceMatrix = scd.pdist(dataFrame, metric=metric)
			linkage = sch.linkage(distanceMatrix, method=method)
			del distanceMatrix
	except Exception:
		tk.messagebox.showinfo('Error ..',
			'Data could not be clustered. This might be due to rows that contain exactly the same values.')
		return None, None

	maxD = 0.7 * max(linkage[:, 2])
	return linkage, maxD
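The 0.7 * max(linkage[:, 2]) value mirrors scipy's default color_threshold, so maxD is the height below which scipy.cluster.hierarchy.dendrogram assigns each subtree its own color. A minimal sketch of feeding it back into the plot (requires matplotlib for drawing):

import numpy as np
import fastcluster
from scipy.cluster.hierarchy import dendrogram

data = np.random.rand(20, 4)
Z = fastcluster.linkage(data, method='ward', metric='euclidean')
max_d = 0.7 * max(Z[:, 2])            # scipy's default color threshold
dendrogram(Z, color_threshold=max_d)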