Returns
-------
int
    the KMeans K
"""
check_types([
    ("X", X, [list], False),
    ("input_relation", input_relation, [str], False),
    ("n_cluster", n_cluster, [list, tuple], False),
    ("init", init, ["kmeanspp", "random"], True),
    ("max_iter", max_iter, [int, float], False),
    ("tol", tol, [int, float], False),
    ("elbow_score_stop", elbow_score_stop, [int, float], False)])
# Open an automatic connection when no cursor is supplied.
if not(cursor):
    conn = read_auto_connect()
    cursor = conn.cursor()
else:
    conn = False
    check_cursor(cursor)
# A (start, stop) tuple is expanded into the candidate K values; a list is used as-is.
if not isinstance(n_cluster, list):
    L = range(n_cluster[0], n_cluster[1])
else:
    L = n_cluster
    L.sort()
schema, relation = schema_relation(input_relation)
schema = str_column(schema)
relation_alpha = ''.join(ch for ch in relation if ch.isalnum())
# Fit one KMeans model per candidate K until the elbow score threshold is reached.
for i in L:
    cursor.execute("DROP MODEL IF EXISTS {}.__vpython_kmeans_tmp_model_{}__".format(schema, relation_alpha))
    model = KMeans("{}.__vpython_kmeans_tmp_model_{}__".format(schema, relation_alpha), cursor, i, init, max_iter, tol)
    model.fit(input_relation, X)
    score = model.metrics.values["value"][3]
    if (score > elbow_score_stop):
        return i
    score_prev = score
if (conn):
    conn.close()
print("\u26A0 The K was not found. The last K (= {}) is returned with an elbow score of {}".format(i, score))
return i
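# A minimal usage sketch, assuming this fragment is the body of a helper such
# as verticapy's best_k(X, input_relation, cursor=None, n_cluster=(1, 15),
# init="kmeanspp", ...); the table, columns, and cursor are hypothetical:
#
#     k = best_k(X=["PetalLengthCm", "PetalWidthCm"],
#                input_relation="public.iris",
#                cursor=cur, n_cluster=(1, 15), elbow_score_stop=0.8)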
raise Exception("No auto connection is available. To create an auto connection, use the new_auto_connection function of the verticapy.connections.connect module.")
dsn = file.read()
dsn = dsn.split("\n")
if (dsn[0] == "vertica_python"):
import vertica_python
conn = vertica_python.connect(** {"host": dsn[1], "port": dsn[2], "database": dsn[3], "user": dsn[4], "password": dsn[5]}, autocommit = True)
elif (dsn[0] == "pyodbc"):
import pyodbc
conn = pyodbc.connect(dsn[1], autocommit = True)
elif (dsn[0] == "jaydebeapi"):
import jaydebeapi
jdbc_driver_name = "com.vertica.jdbc.Driver"
jdbc_driver_loc = os.path.dirname(verticapy.__file__) + "/connections/vertica-jdbc-9.3.1-0.jar"
conn = jaydebeapi.connect(jdbc_driver_name, dsn[1], {'user': dsn[2], 'password': dsn[3]}, jars = jdbc_driver_loc)
else:
raise Exception("The auto connection format is incorrect. To create a new auto connection, use the new_auto_connection function of the verticapy.connections.connect module.")
return(conn)
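# For reference, the parser above expects a plain-text auto-connection file
# whose first line names the connector; for "vertica_python" the next lines
# are, in order, host, port, database, user, and password. A hypothetical
# example of the file contents:
#
#     vertica_python
#     localhost
#     5433
#     testdb
#     dbadmin
#     my_password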
#---#
parameters = info[1].split(",")
if (model_type != "svd"):
    parameters = [item.split("=") for item in parameters]
    parameters_dict = {}
    for item in parameters:
        parameters_dict[item[0]] = item[1]
info = info[0]
# Strip the quotes that Vertica keeps around string-valued parameters.
for elem in parameters_dict:
    if type(parameters_dict[elem]) == str:
        parameters_dict[elem] = parameters_dict[elem].replace("'", "")
# Rebuild the matching VerticaPy estimator from the in-database model type.
if (model_type == "rf_regressor"):
    from verticapy.learn.ensemble import RandomForestRegressor
    model = RandomForestRegressor(name, cursor, int(parameters_dict['ntree']), int(parameters_dict['mtry']), int(parameters_dict['max_breadth']), float(parameters_dict['sampling_size']), int(parameters_dict['max_depth']), int(parameters_dict['min_leaf_size']), float(parameters_dict['min_info_gain']), int(parameters_dict['nbins']))
elif (model_type == "rf_classifier"):
    from verticapy.learn.ensemble import RandomForestClassifier
    model = RandomForestClassifier(name, cursor, int(parameters_dict['ntree']), int(parameters_dict['mtry']), int(parameters_dict['max_breadth']), float(parameters_dict['sampling_size']), int(parameters_dict['max_depth']), int(parameters_dict['min_leaf_size']), float(parameters_dict['min_info_gain']), int(parameters_dict['nbins']))
elif (model_type == "logistic_reg"):
    from verticapy.learn.linear_model import LogisticRegression
    model = LogisticRegression(name, cursor, parameters_dict['regularization'], float(parameters_dict['epsilon']), float(parameters_dict['lambda']), int(parameters_dict['max_iterations']), parameters_dict['optimizer'], float(parameters_dict['alpha']))
elif (model_type == "linear_reg"):
    from verticapy.learn.linear_model import ElasticNet
    model = ElasticNet(name, cursor, parameters_dict['regularization'], float(parameters_dict['epsilon']), float(parameters_dict['lambda']), int(parameters_dict['max_iterations']), parameters_dict['optimizer'], float(parameters_dict['alpha']))
elif (model_type == "naive_bayes"):
    from verticapy.learn.naive_bayes import MultinomialNB
    model = MultinomialNB(name, cursor, float(parameters_dict['alpha']))
elif (model_type == "svm_regressor"):
    from verticapy.learn.svm import LinearSVR
    model = LinearSVR(name, cursor, float(parameters_dict['epsilon']), float(parameters_dict['C']), True, float(parameters_dict['intercept_scaling']), parameters_dict['intercept_mode'], float(parameters_dict['error_tolerance']), int(parameters_dict['max_iterations']))
elif (model_type == "svm_classifier"):
    from verticapy.learn.svm import LinearSVC
    model = LinearSVC(name, cursor, float(parameters_dict['epsilon']), float(parameters_dict['C']), True, float(parameters_dict['intercept_scaling']), parameters_dict['intercept_mode'], [float(item) for item in parameters_dict['class_weights'].split(",")], int(parameters_dict['max_iterations']))
elif (model_type == "kmeans"):
    from verticapy.learn.cluster import KMeans
    model = KMeans(name, cursor, -1, parameters_dict['init_method'], int(parameters_dict['max_iterations']), float(parameters_dict['epsilon']))
elif (model_type == "pca"):
    from verticapy.learn.decomposition import PCA
    model = PCA(name, cursor, 0, bool(parameters_dict['scale']))
elif (model_type == "svd"):
    from verticapy.learn.decomposition import SVD
    model = SVD(name, cursor)
elif (model_type == "one_hot_encoder_fit"):
    from verticapy.learn.preprocessing import OneHotEncoder
    model = OneHotEncoder(name, cursor)
# Restore the relations and columns the model was trained on.
model.input_relation = info.split(",")[1].replace("'", '').replace('\\', '')
model.test_relation = test_relation if (test_relation) else model.input_relation
if (model_type not in ("kmeans", "pca", "svd", "one_hot_encoder_fit")):
    model.X = info.split(",")[3:]
    model.X = [item.replace("'", '').replace('\\', '') for item in model.X]
    model.y = info.split(",")[2].replace("'", '').replace('\\', '')
#---#
if (method == "smart"):
    rand_int = random.randint(0, 10000000)
    temp_information = ("{}.VERTICAPY_TEMP_VIEW_{}".format(schema, rand_int), "{}.VERTICAPY_TEMP_MODEL_{}".format(schema, rand_int))
    if (bins < 2):
        raise ValueError("Parameter 'bins' must be greater than or equal to 2 when discretizing with the method 'smart'")
    columns_check([response], self.parent)
    response = vdf_columns_names([response], self.parent)[0]
    # Drop any leftover temporary view/model from a previous run.
    try:
        self.parent._VERTICAPY_VARIABLES_["cursor"].execute("DROP VIEW IF EXISTS {}".format(temp_information[0]))
    except:
        try:
            self.parent._VERTICAPY_VARIABLES_["cursor"].execute("DROP MODEL IF EXISTS {}".format(temp_information[1]))
        except:
            pass
    self.parent.to_db(temp_information[0])
    from verticapy.learn.ensemble import RandomForestClassifier
    model = RandomForestClassifier(temp_information[1], self.parent._VERTICAPY_VARIABLES_["cursor"], n_estimators = 20, max_depth = 3, nbins = 100, min_samples_leaf = min_bin_size)
    model.fit(temp_information[0], [self.alias], response)
    # Collect the split values of all 20 trees and keep the most frequent ones as bin edges.
    query = ["(SELECT READ_TREE(USING PARAMETERS model_name = '{}', tree_id = {}, format = 'tabular'))".format(temp_information[1], i) for i in range(20)]
    query = "SELECT split_value FROM (SELECT split_value, COUNT(*) FROM ({}) x WHERE split_value IS NOT NULL GROUP BY 1 ORDER BY 2 DESC LIMIT {}) y ORDER BY split_value::float".format(" UNION ALL ".join(query), bins - 1)
    self.parent.__executeSQL__(query = query, title = "Computes the optimized histogram bins using Random Forest.")
    result = self.parent._VERTICAPY_VARIABLES_["cursor"].fetchall()
    result = [elem[0] for elem in result]
    self.parent._VERTICAPY_VARIABLES_["cursor"].execute("DROP VIEW IF EXISTS {}".format(temp_information[0]))
    self.parent._VERTICAPY_VARIABLES_["cursor"].execute("DROP MODEL IF EXISTS {}".format(temp_information[1]))
    result = [self.min()] + result + [self.max()]
elif (method == "topk"):
    if (k < 2):
        raise ValueError("Parameter 'k' must be greater than or equal to 2 when discretizing with the method 'topk'")
    distinct = self.topk(k).values["index"]
    trans = ("(CASE WHEN {} IN ({}) THEN {} || '' ELSE '{}' END)".format(convert_special_type(self.category(), False), ', '.join(["'{}'".format(str(elem).replace("'", "''")) for elem in distinct]), convert_special_type(self.category(), False), new_category.replace("'", "''")), "varchar", "text")
elif (self.isnum() and method == "same_freq"):
    if (bins < 2):
        raise ValueError("Parameter 'bins' must be greater than or equal to 2 when discretizing with the method 'same_freq'")
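# For intuition: with k = 2, top categories 'A' and 'B', and new_category =
# 'Others', the 'topk' branch above produces a SQL expression of the form
# (the column name "color" is hypothetical):
#
#     (CASE WHEN color IN ('A', 'B') THEN color || '' ELSE 'Others' END)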
#---#
def read_dsn(dsn: str):
    """
    ---------------------------------------------------------------------------
    Reads the DSN information from the ODBCINI environment variable.

    Parameters
    ----------
    dsn: str
        DSN name

    Returns
    -------
    dict
        dictionary with all the credentials
    """
    check_types([("dsn", dsn, [str], False)])
    f = open(os.environ['ODBCINI'], "r")
    odbc = f.read()
    f.close()
    if ("[{}]".format(dsn) not in odbc):
        raise ValueError("The DSN '{}' doesn't exist".format(dsn))
    odbc = odbc.split("[{}]\n".format(dsn))[1].split("\n\n")[0].split("\n")
    dsn = {}
    for elem in odbc:
        info = elem.replace(' ', '').split('=')
        dsn[info[0].lower()] = info[1]
    return (dsn)
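# A minimal usage sketch (assumes the ODBCINI environment variable points at
# an odbc.ini file containing a hypothetical section like the following):
#
#     [VerticaDSN]
#     host = localhost
#     port = 5433
#     database = testdb
#
#     credentials = read_dsn("VerticaDSN")
#     # -> {'host': 'localhost', 'port': '5433', 'database': 'testdb'}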
#---#
if not(cursor):
    conn = read_auto_connect()
    cursor = conn.cursor()
else:
    conn = False
    check_cursor(cursor)
schema, relation = schema_relation(input_relation)
schema = str_column(schema)
relation_alpha = ''.join(ch for ch in relation if ch.isalnum())
all_within_cluster_SS = []
if not isinstance(n_cluster, list):
    L = [i for i in range(n_cluster[0], n_cluster[1])]
else:
    L = n_cluster
    L.sort()
# Train one KMeans model per candidate K and record its sum-of-squares metric.
for i in L:
    cursor.execute("DROP MODEL IF EXISTS {}.VERTICAPY_KMEANS_TMP_{}".format(schema, relation_alpha))
    model = KMeans("{}.VERTICAPY_KMEANS_TMP_{}".format(schema, relation_alpha), cursor, i, init, max_iter, tol)
    model.fit(input_relation, X)
    all_within_cluster_SS += [float(model.metrics.values["value"][3])]
    model.drop()
if (conn):
    conn.close()
plt.figure(figsize = (10,8))
plt.rcParams['axes.facecolor'] = '#F4F4F4'
plt.grid()
plt.plot(L, all_within_cluster_SS, marker = "s", color = "#FE5016")
plt.title("Elbow Curve")
plt.xlabel('Number of Clusters')
plt.ylabel('Between-Cluster SS / Total SS')
plt.subplots_adjust(left = 0.2)
plt.show()
values = {"index": L, "Within-Cluster SS": all_within_cluster_SS}
return tablesample(values = values, table_info = False)
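# A minimal usage sketch, assuming this fragment is the body of an elbow-curve
# helper such as verticapy's elbow(X, input_relation, cursor=None,
# n_cluster=(1, 15), ...); the table, columns, and cursor are hypothetical:
#
#     res = elbow(X=["col1", "col2"], input_relation="public.my_table",
#                 cursor=cur, n_cluster=(1, 15))
#     # res["index"] holds the tested K values alongside their scores.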