Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def _read_json(path):
    """Return the parsed contents of the JSON file at ``path``."""
    with open(path, 'r') as handle:
        return json.load(handle)


# Stored vectors, query vectors, and (presumably) the expected answers
# that the queries are scored against further down the script.
data = _read_json("recall_vectors.json")
queries = _read_json("recall_queries.json")
truths = _read_json("recall_truths.json")

# One integer label per stored vector: 0 .. num_elements - 1.
num_elements = len(data)
dim = len(data[0])
data_labels = np.arange(num_elements)

# Declare a cosine-space index (other options are l2 or ip). The maximum
# number of elements has to be known when the index is initialised.
p = hnswlib.Index(space='cosine', dim=dim)
p.init_index(max_elements=num_elements, ef_construction=2000, M=100)

# Time the element insertion (add_items can be called several times).
before = time.time()
p.add_items(data, data_labels)
print("import took {}".format(time.time() - before))

# ef controls the recall and should always be > k.
p.set_ef(100)

# Look up the single closest element (k=1) for every query; the call
# returns two numpy arrays.
results, distances = p.knn_query(queries, k=1)

relevant = 0
def fit(self, X, metric="l2", M=16, ef=100, ef_construction=100, random_state=0):
try:
import hnswlib
except ImportError:
print(
"In order to use fast approx neighbor search, "
"you need to `pip install hnswlib`\n"
)
ef_c, ef = max(ef_construction, self.n_neighbors), max(self.n_neighbors, ef)
metric = "l2" if metric == "euclidean" else metric
X = X.A if issparse(X) else X
ns, dim = X.shape
knn = hnswlib.Index(space=metric, dim=dim)
knn.init_index(
max_elements=ns, ef_construction=ef_c, M=M, random_seed=random_state
)
knn.add_items(X)
knn.set_ef(ef)
knn_indices, knn_distances = knn.knn_query(
X, k=self.n_neighbors, num_threads=self.num_threads
)
n_neighbors = self.n_neighbors
if metric == "l2":
knn_distances = np.sqrt(knn_distances)
self.distances, self.connectivities = compute_connectivities_umap(
# Insert the first batch, then query every vector for itself: the returned
# label is compared against the vector's position 0 .. len(data1) - 1.
print("Adding first batch of %d elements" % (len(data1)))
p.add_items(data1)
labels, distances = p.knn_query(data1, k=1)
print("Recall for the first batch:", np.mean(labels.reshape(-1) == np.arange(len(data1))), "\n")

# Serialize the index and delete the in-memory object.
# index_path is the single source of truth for the file name, so the save,
# the load and both log messages cannot drift apart.
index_path = 'first_half.bin'
print("Saving index to '%s'" % index_path)
p.save_index(index_path)
del p

# Re-create the index object and load the saved data.
# The space can be changed - this keeps the data but alters the distance function.
p = hnswlib.Index(space='l2', dim=dim)
print("\nLoading index from '%s'\n" % index_path)
# Increase the total capacity (max_elements) so that it can hold the new data.
p.load_index(index_path, max_elements=num_elements)

# Insert the second batch and repeat the self-query over the full data set.
print("Adding the second batch of %d elements" % (len(data2)))
p.add_items(data2)
labels, distances = p.knn_query(data, k=1)
print("Recall for two batches:", np.mean(labels.reshape(-1) == np.arange(len(data))), "\n")
def _load_cache(self):
    """Read the cache file and build a cosine HNSW index over its embeddings.

    Sets ``self.ef`` from ``self.model_config``, fills the cache text,
    embedding and label attributes from ``self.cache_file``, and leaves the
    ready-to-query index in ``self.index_nms``.
    """
    self.ef = self.model_config.get('ef')

    loaded = self.get_embedding_from_file(self.cache_file)
    self.cache_texts, self.cache_embeddings, self.cache_labels = loaded
    self.num_cache, self.embedding_dim = self.cache_embeddings.shape

    # Declare the index. Its capacity must be known beforehand, so it is
    # sized to exactly the number of cached rows.
    index = hnswlib.Index(space='cosine', dim=self.embedding_dim)
    self.index_nms = index
    index.init_index(max_elements=self.num_cache, ef_construction=100, M=100)

    # Insert every cached embedding under ids 0 .. num_cache - 1
    # (add_items can be called several times).
    index.add_items(data=self.cache_embeddings, ids=range(self.num_cache))

    # ef controls the recall and should always be > k (knn).
    index.set_ef(ef=self.ef)
def load_cache(self, cache_file=None):
    """Build the cosine HNSW cache index and persist it beside the model.

    Args:
        cache_file: Optional path to a cache file. When given, it replaces
            ``self.cache_file``; when omitted, the path already stored in
            ``self.cache_file`` is used.

    Does nothing when no cache file is available. Otherwise it fills the
    cache text, embedding and label attributes, builds ``self.index_nms``,
    saves the index as ``cache.index`` and copies the cache file to
    ``cache.txt``, both under ``self.pb_model_file``.

    NOTE(review): ``self.ef`` is read here but not assigned by this
    method, so it must have been set before the call.
    """
    if cache_file:
        self.cache_file = cache_file
    if not self.cache_file:
        return

    # Always work from self.cache_file: the ``cache_file`` argument is None
    # whenever the caller relies on the previously stored path.
    source_file = self.cache_file

    self.cache_texts, self.cache_embeddings, self.cache_labels = self.get_embedding_from_file(source_file)
    self.num_cache, self.embedding_dim = self.cache_embeddings.shape

    # Declare the index. The maximum number of elements should be known
    # beforehand, so it is sized to the number of cached rows.
    self.index_nms = hnswlib.Index(space='cosine', dim=self.embedding_dim)
    self.index_nms.init_index(max_elements=self.num_cache, ef_construction=100, M=100)

    # Element insertion (can be called several times).
    self.index_nms.add_items(data=self.cache_embeddings, ids=range(self.num_cache))
    self.index_nms.save_index(os.path.join(self.pb_model_file, 'cache.index'))

    # ef controls the recall and should always be > k (knn).
    self.index_nms.set_ef(ef=self.ef)

    # Copy the cache file to cache.txt under pb_model_file.
    shutil.copy(source_file, os.path.join(self.pb_model_file, 'cache.txt'))
os.makedirs(data_dir)
index_url = os.environ.get('INDEX_URL', INDEX_URL)
idx_to_ASIN_url = os.environ.get('IDX_ASIN_URL', IDX_ASIN_URL)
ASIN_to_data_url = os.environ.get('ASIN_DATA_URL', ASIN_DATA_URL)
mx.test_utils.download(index_url, dirname=data_dir)
mx.test_utils.download(idx_to_ASIN_url, dirname=data_dir)
mx.test_utils.download(ASIN_to_data_url, dirname=data_dir)
############################################
############################################
logging.info('Loading Resources files')
self.idx_ASIN = pickle.load(open(os.path.join(data_dir, 'idx_ASIN.pkl'), 'rb'))
self.ASIN_data = pickle.load(open(os.path.join(data_dir,'ASIN_data.pkl'), 'rb'))
self.p = hnswlib.Index(space = 'l2', dim = EMBEDDING_SIZE)
self.p.load_index(os.path.join(data_dir,'index.idx'))
############################################
logging.info('Resources files loaded')
self.p.set_ef(EF)
self.k = K