Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_model_tfidf_vectorizer22(self):
    """Convert a bigram-only, un-normalised TfidfVectorizer to ONNX.

    The vectorizer is fitted on a four-sentence corpus with
    ``ngram_range=(2, 2)`` and ``norm=None``; the converted model is then
    handed to ``dump_data_and_model`` together with the corpus and the
    fitted vectorizer.
    """
    # One document per row: the array is reshaped to (4, 1).
    corpus = numpy.array([
        "This is the first document.",
        "This document is the second document.",
        "And this is the third one.",
        "Is this the first document?",
    ]).reshape((4, 1))
    vect = TfidfVectorizer(ngram_range=(2, 2), norm=None)
    # fit() is given the flattened 1-D view of the corpus.
    vect.fit(corpus.ravel())
    model_onnx = convert_sklearn(vect, "TfidfVectorizer",
                                 [("input", StringTensorType([1]))],
                                 options=self.get_options())
    self.assertIsNotNone(model_onnx)
    # NOTE(review): allow_failure is passed as an expression string;
    # presumably the helper evaluates it to tolerate failures on
    # onnxruntime <= 0.4.0 -- confirm against dump_data_and_model.
    dump_data_and_model(
        corpus,
        vect,
        model_onnx,
        basename="SklearnTfidfVectorizer22-OneOff-SklCol",
        allow_failure="StrictVersion(onnxruntime.__version__)"
                      " <= StrictVersion('0.4.0')",
    )
def test_model_tfidf_vectorizer12_normL2(self):
    """Convert a uni+bigram, L2-normalised TfidfVectorizer to ONNX.

    The vectorizer is fitted on a four-sentence corpus with
    ``ngram_range=(1, 2)`` and ``norm="l2"``; the converted model is then
    handed to ``dump_data_and_model`` together with the corpus and the
    fitted vectorizer.
    """
    # One document per row: the array is reshaped to (4, 1).
    corpus = numpy.array([
        "This is the first document.",
        "This document is the second document.",
        "And this is the third one.",
        "Is this the first document?",
    ]).reshape((4, 1))
    vect = TfidfVectorizer(ngram_range=(1, 2), norm="l2")
    # fit() is given the flattened 1-D view of the corpus.
    vect.fit(corpus.ravel())
    model_onnx = convert_sklearn(vect, "TfidfVectorizer",
                                 [("input", StringTensorType([1]))],
                                 options=self.get_options())
    self.assertIsNotNone(model_onnx)
    # NOTE(review): allow_failure is passed as an expression string;
    # presumably the helper evaluates it to tolerate failures on
    # onnxruntime <= 0.4.0 -- confirm against dump_data_and_model.
    dump_data_and_model(
        corpus,
        vect,
        model_onnx,
        basename="SklearnTfidfVectorizer22L2-OneOff-SklCol",
        allow_failure="StrictVersion(onnxruntime.__version__)"
                      " <= StrictVersion('0.4.0')",
    )
def test_model_label_encoder(self):
model = LabelEncoder()
data = ["str3", "str2", "str0", "str1", "str3"]
model.fit(data)
model_onnx = convert_sklearn(
model,
"scikit-learn label encoder",
[("input", StringTensorType([None]))],
)
self.assertTrue(model_onnx is not None)
self.assertTrue(model_onnx.graph.node is not None)
dump_data_and_model(
np.array(data),
model,
model_onnx,
basename="SklearnLabelEncoder",
allow_failure="StrictVersion("
"onnxruntime.__version__)"
TruncatedSVD(n_components=1, algorithm="arpack", tol=1e-4),
),
])
preprocessor = ColumnTransformer(transformers=[
("num", numeric_transformer, numeric_features),
("cat", categorical_transformer, categorical_features),
])
model = Pipeline(steps=[("precprocessor",
preprocessor), ("classifier", classifier)])
model.fit(X_train, y_train)
initial_type = [
("numfeat", FloatTensorType([None, 3])),
("strfeat", StringTensorType([None, 2])),
]
X_train = X_train[:11]
model_onnx = convert_sklearn(model, initial_types=initial_type)
dump_data_and_model(
X_train,
model,
model_onnx,
basename="SklearnPipelineColumnTransformerPipeliner",
allow_failure="StrictVersion(onnx.__version__)"
" < StrictVersion('1.3') or "
"StrictVersion(onnxruntime.__version__)"
" <= StrictVersion('0.4.0')",
)
def test_model_tfidf_vectorizer11_compose(self):
    """Check two TfidfVectorizers inside a ColumnTransformer after conversion.

    The same four sentences are placed in two columns, each column gets
    its own default ``TfidfVectorizer``, and the output of the converted
    ONNX model is compared with ``ColumnTransformer.transform``.
    """
    sentences = numpy.array([
        "This is the first document.",
        "This document is the second document.",
        "And this is the third one.",
        "Is this the first document?",
    ]).reshape((4, 1))
    # Duplicate the single text column so that columns 0 and 1 both exist.
    two_columns = numpy.hstack([sentences, sentences])
    labels = numpy.array([0, 1, 0, 1])

    transformers = [
        ('a', TfidfVectorizer(), 0),
        ('b', TfidfVectorizer(), 1),
    ]
    composed = ColumnTransformer(transformers)
    composed.fit(two_columns, labels)

    input_types = [("input", StringTensorType([4, 2]))]
    onnx_model = convert_sklearn(composed, "TfIdfcomp", input_types,
                                 options=self.get_options())

    # Run the serialized model and keep its first output only.
    session = InferenceSession(onnx_model.SerializeToString())
    onnx_outputs = session.run(None, {'input': two_columns})
    actual = onnx_outputs[0]
    expected = composed.transform(two_columns)
    assert_almost_equal(actual, expected)
def test_model_tfidf_vectorizer11_empty_string(self):
    """Convert a unigram TfidfVectorizer fitted on a corpus with an empty row.

    The last of the four documents is the empty string. The vectorizer
    uses ``ngram_range=(1, 1)`` and ``norm=None``; the converted model is
    passed to ``dump_data_and_model`` with the corpus and the vectorizer.
    """
    documents = [
        'This is the first document.',
        'This document is the second document.',
        'And this is the third one.',
        '',
    ]
    corpus = numpy.array(documents).reshape((4, 1))

    vect = TfidfVectorizer(ngram_range=(1, 1), norm=None)
    vect.fit(corpus.ravel())

    input_types = [('input', StringTensorType([1]))]
    model_onnx = convert_sklearn(vect, 'TfidfVectorizer', input_types,
                                 options=self.get_options())
    self.assertTrue(model_onnx is not None)

    # TfidfVectorizer in onnxruntime fails with empty strings, hence the
    # version-dependent allow_failure expression below.
    failure_condition = ("StrictVersion(onnxruntime.__version__) "
                         "<= StrictVersion('0.4.0')")
    dump_data_and_model(
        corpus,
        vect,
        model_onnx,
        basename="SklearnTfidfVectorizer11EmptyStringRegex-OneOff-SklCol",
        allow_failure=failure_condition,
    )
def test_guess_type(self):
    """Check that ``_guess_type`` maps numpy dtypes to tensor types.

    For int32, int64, float32 and string arrays of shape (3, 3) the
    result must be an instance of the paired tensor type. A float64
    array is also passed through ``_guess_type``; its result is not
    inspected, so that case only verifies the call does not raise.
    """
    # ``np.str`` was a deprecated alias of the builtin ``str`` (NumPy 1.20)
    # and was removed in NumPy 1.24; ``np.str_`` is the supported string
    # scalar type and yields the same unicode array dtype.
    dtypes = [
        (np.int32, Int32TensorType),
        (np.int64, Int64TensorType),
        (np.float32, FloatTensorType),
        (np.str_, StringTensorType),
    ]
    for dtype, exp in dtypes:
        if dtype is np.str_:
            # Strings have no zeros(): allocate, then fill with "".
            mat = np.empty((3, 3), dtype=dtype)
            mat[:, :] = ""
        else:
            mat = np.zeros((3, 3), dtype=dtype)
        res = _guess_type(mat)
        assert isinstance(res, exp)

    dtypes = [np.float64]
    for dtype in dtypes:
        mat = np.zeros((3, 3), dtype=dtype)
        _guess_type(mat)
def convert_dataframe_schema(df, drop=None):
    """Describe each DataFrame column as a ``(name, tensor type)`` pair.

    Columns whose dtype equals ``'int64'`` map to ``Int64TensorType``,
    ``'float64'`` maps to ``FloatTensorType`` and every other dtype maps
    to ``StringTensorType``; each type is created with shape
    ``[None, 1]``.

    :param df: object exposing ``columns`` and ``dtypes``
    :param drop: optional container of column names to leave out
    :return: list of ``(column name, tensor type)`` tuples in column order
    """
    schema = []
    for name, dtype in zip(df.columns, df.dtypes):
        excluded = drop is not None and name in drop
        if not excluded:
            if dtype == 'int64':
                tensor_type = Int64TensorType([None, 1])
            elif dtype == 'float64':
                tensor_type = FloatTensorType([None, 1])
            else:
                tensor_type = StringTensorType([None, 1])
            schema.append((name, tensor_type))
    return schema
def convert_dataframe_schema(df, drop=None):
    """Build the list of ``(column, tensor type)`` inputs for a DataFrame.

    ``'int64'`` columns become ``Int64TensorType``, ``'float64'`` columns
    become ``FloatTensorType`` and anything else becomes
    ``StringTensorType``, all with shape ``[None, 1]``. Column names
    found in *drop* are omitted.

    :param df: object exposing ``columns`` and ``dtypes``
    :param drop: optional container of column names to skip
    :return: list of ``(column name, tensor type)`` tuples in column order
    """
    def _tensor_type_for(dtype):
        # Same precedence as an if/elif chain: int64, float64, fallback.
        if dtype == 'int64':
            return Int64TensorType([None, 1])
        if dtype == 'float64':
            return FloatTensorType([None, 1])
        return StringTensorType([None, 1])

    # An empty tuple excludes nothing, mirroring ``drop is None``.
    skipped = () if drop is None else drop
    return [
        (column, _tensor_type_for(dtype))
        for column, dtype in zip(df.columns, df.dtypes)
        if column not in skipped
    ]
def calculate_sklearn_label_encoder_output_shapes(operator):
    """
    Set the output type of a label encoder operator.

    The label encoder only alters the values of its input, not the
    shape, so the first output is declared as an ``Int64TensorType``
    carrying a copy of the first input's shape. The operator is first
    validated with ``check_input_and_output_numbers`` and
    ``check_input_and_output_types`` (float, int64 or string inputs).
    """
    check_input_and_output_numbers(operator, output_count_range=1)
    accepted_inputs = [FloatTensorType, Int64TensorType, StringTensorType]
    check_input_and_output_types(operator, good_input_types=accepted_inputs)

    # Copy the shape so the output type does not share it with the input.
    source_shape = copy.deepcopy(operator.inputs[0].type.shape)
    encoded_type = Int64TensorType(copy.deepcopy(source_shape))
    operator.outputs[0].type = encoded_type