Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
for option in reader_options.items():
reader = reader.option(*option)
if reader_method == ReaderMethods.CSV:
df = reader.csv(path)
elif reader_method == ReaderMethods.parquet:
df = reader.parquet(path)
elif reader_method == ReaderMethods.delta:
df = reader.format("delta").load(path)
else:
raise BatchKwargsError("Unsupported reader: %s" % reader_method.name, batch_kwargs)
elif "query" in batch_kwargs:
df = self.spark.sql(batch_kwargs["query"])
elif "dataset" in batch_kwargs and isinstance(batch_kwargs["dataset"], (DataFrame, SparkDFDataset)):
df = batch_kwargs.get("dataset")
# We don't want to store the actual dataframe in kwargs; copy the remaining batch_kwargs
batch_kwargs = {k: batch_kwargs[k] for k in batch_kwargs if k != 'dataset'}
if isinstance(df, SparkDFDataset):
# Grab just the spark_df reference, since we want to override everything else
df = df.spark_df
# Record this in the kwargs *and* the id
batch_kwargs["SparkDFRef"] = True
batch_id["SparkDFRef"] = True
else:
raise BatchKwargsError("Unrecognized batch_kwargs for spark_source", batch_kwargs)
if "limit" in batch_kwargs:
df = df.limit(batch_kwargs['limit'])
def __smvSelectMinusFn(df, *cols):
    """Drop the given columns from *df* by delegating to the SMV JVM helper."""
    jdf = helper(df).smvSelectMinus(df._jdf, smv_copy_array(df._sc, *cols))
    return DataFrame(jdf, df.sql_ctx)
DataFrame.smvSelectMinus = __smvSelectMinusFn
def dedup_top_n(df, n, group_col, order_cols=None):
    """
    Used to get the top N records (after ordering according to the provided order columns) in each group.

    :param df: DataFrame to operate on
    :param n: number of records to return from each group
    :param group_col: column to group by the records
    :param order_cols: columns to order the records according to (defaults to no ordering columns)
    :return: DataFrame representing the data after the operation
    """
    # Avoid a mutable default argument ([] is shared across calls); use a
    # None sentinel and create a fresh list per invocation instead.
    if order_cols is None:
        order_cols = []
    java_cols = _cols_to_java_cols(order_cols)
    # Delegate the actual dedup/top-N work to the JVM-side utility.
    jdf = _get_utils(df).dedupTopN(df._jdf, n, group_col._jc, java_cols)
    return DataFrame(jdf, df.sql_ctx)
def to_spark(self, t, flatten):
    """Convert *t* into a Spark DataFrame, expanding its types first.

    :param t: object exposing ``expand_types``/``flatten``/``_tir``
    :param flatten: when truthy, flatten *t* before conversion
    :return: a ``pyspark.sql.DataFrame`` backed by the converted IR
    """
    expanded = t.expand_types()
    if flatten:
        expanded = expanded.flatten()
    jdf = self._to_java_ir(expanded._tir).pyToDF()
    return pyspark.sql.DataFrame(jdf, Env.spark_session()._wrapped)
from pyspark.ml.param.shared import *
from pyspark.sql import DataFrame
def streamToAzureSearch(df, **options):
    """Start a streaming write of *df* to Azure Search via the JVM writer.

    Returns whatever the JVM-side ``AzureSearchWriter.stream`` call returns.
    """
    gateway = SparkContext.getOrCreate()._jvm
    return gateway.com.microsoft.ml.spark.cognitive.AzureSearchWriter.stream(df._jdf, options)

# Expose the helper as a DataFrame method.
pyspark.sql.DataFrame.streamToAzureSearch = streamToAzureSearch
def writeToAzureSearch(df, **options):
    """Batch-write *df* to Azure Search via the JVM writer (no return value)."""
    gateway = SparkContext.getOrCreate()._jvm
    gateway.com.microsoft.ml.spark.cognitive.AzureSearchWriter.write(df._jdf, options)

# Expose the helper as a DataFrame method.
pyspark.sql.DataFrame.writeToAzureSearch = writeToAzureSearch
# NOTE(review): closure fragment — ``self`` and ``orderCols`` come from an
# enclosing function that is not visible here; the trailing ``return __doFill``
# belongs to that enclosing scope as well, so the code is left untouched.
def __doFill(*valueCols):
# Delegates to the JVM-side grouped-data helper: presumably fills nulls in the
# given value columns using the previous value in ``orderCols`` order — confirm
# against the SMV smvFillNullWithPrevValue implementation.
return DataFrame(self.sgd.smvFillNullWithPrevValue(smv_copy_array(self.df._sc, *orderCols), smv_copy_array(self.df._sc, *valueCols)), self.df.sql_ctx)
return __doFill
def _smvConcatHist(df, cols):
    """Build a concatenated histogram over *cols* via the SMV JVM helper."""
    return helper(df).smvConcatHist(df._jdf, smv_copy_array(df._sc, *cols))
def _smvFreqHist(df, *cols):
    """Build a frequency histogram over *cols* via the DataFrame helper."""
    return dfhelper(df)._smvFreqHist(_to_seq(cols))
def _smvCountHist(df, keys, binSize):
    """Build a count histogram over *keys* with the given bin size."""
    return dfhelper(df)._smvCountHist(_to_seq(keys), binSize)
def _smvBinHist(df, *colWithBin):
    """Build a bin histogram via the SMV JVM helper.

    :param df: DataFrame to operate on
    :param colWithBin: (column name, bin size) tuples, one per histogram
    :return: result of the JVM-side ``smvBinHist`` call
    """
    for elem in colWithBin:
        # isinstance instead of ``type(...) is tuple`` so tuple subclasses
        # (e.g. namedtuples) are accepted too; also fixes the "paraeter" typo
        # that was in the first assertion message.
        assert isinstance(elem, tuple), "smvBinHist takes a list of tuple(string, double) as parameter"
        assert len(elem) == 2, "smvBinHist takes a list of tuple(string, double) as parameter"
    # Coerce every bin size to float so the JVM side always receives doubles.
    with_double_bins = [(name, bin_size * 1.0) for name, bin_size in colWithBin]
    return helper(df).smvBinHist(df._jdf, smv_copy_array(df._sc, *with_double_bins))
def _smvEddCompare(df, df2, ignoreColName):
    """Compare the EDD results of *df* and *df2* via the DataFrame helper."""
    return dfhelper(df)._smvEddCompare(df2._jdf, ignoreColName)
# Register the print-flavored SMV helpers as DataFrame methods.
def __printSmvEdd(df, *cols):
    return println(_smvEdd(df, *cols))

def __printSmvHist(df, *cols):
    return println(_smvHist(df, *cols))

def __printSmvConcatHist(df, cols):
    return println(_smvConcatHist(df, cols))

def __printSmvFreqHist(df, *cols):
    return println(_smvFreqHist(df, *cols))

def __printSmvEddCompare(df, df2, ignoreColName=False):
    return println(_smvEddCompare(df, df2, ignoreColName))

DataFrame.smvEdd = __printSmvEdd
DataFrame.smvHist = __printSmvHist
DataFrame.smvConcatHist = __printSmvConcatHist
DataFrame.smvFreqHist = __printSmvFreqHist
DataFrame.smvEddCompare = __printSmvEddCompare
def __smvCountHistFn(df, keys, binSize=1):
    """Print a count histogram for *keys*; a bare column name is wrapped in a list.

    ``basestring`` is the py2/py3-compat alias defined elsewhere in this module.
    """
    key_list = [keys] if isinstance(keys, basestring) else keys
    return println(_smvCountHist(df, key_list, binSize))

DataFrame.smvCountHist = __smvCountHistFn
def __printSmvBinHist(df, *colWithBin):
    """Print a bin histogram for the given (column, bin size) pairs."""
    return println(_smvBinHist(df, *colWithBin))
DataFrame.smvBinHist = __printSmvBinHist
def __smvDiscoverPK(df, n):
    """Run SMV primary-key discovery over at most *n* rows and print the result."""
    res = helper(df).smvDiscoverPK(df._jdf, n)
    cols = ", ".join(map(str, res._1()))
    println("[{}], {}".format(cols, res._2()))

def __smvDiscoverPKFn(df, n=10000):
    return __smvDiscoverPK(df, n)
DataFrame.smvDiscoverPK = __smvDiscoverPKFn
def copy(self):
    """Return a copy that shares the JVM DataFrame but owns a copy of the metadata."""
    duplicate = DataFrame(self._jdf, self.sql_ctx)
    duplicate._metadata = self._metadata.copy()
    return duplicate
def _transform(self, dataset):
    """Push current params to the JVM object, transform *dataset*, and wrap the result."""
    self._transfer_params_to_java()
    transformed_jdf = self._java_obj.transform(dataset._jdf)
    return DataFrame(transformed_jdf, dataset.sql_ctx)
# Py2/3 compat: alias ``basestring`` to ``str`` on Python 3.
# Compare version_info (a tuple), not sys.version (a string): the string
# comparison is lexicographic and would misclassify a hypothetical
# Python "10.x" (since "10..." < "3" as strings).
if sys.version_info[0] >= 3:
    basestring = str
import pyspark
from pyspark import SparkContext
from pyspark import sql
from pyspark.ml.param.shared import *
from pyspark.sql import DataFrame
def streamToAzureSearch(df, **options):
    """Start a streaming write of *df* to Azure Search via the JVM writer.

    Returns whatever the JVM-side ``AzureSearchWriter.stream`` call returns.
    """
    gateway = SparkContext.getOrCreate()._jvm
    return gateway.com.microsoft.ml.spark.cognitive.AzureSearchWriter.stream(df._jdf, options)

# Expose the helper as a DataFrame method.
pyspark.sql.DataFrame.streamToAzureSearch = streamToAzureSearch
def writeToAzureSearch(df, **options):
    """Batch-write *df* to Azure Search via the JVM writer (no return value)."""
    gateway = SparkContext.getOrCreate()._jvm
    gateway.com.microsoft.ml.spark.cognitive.AzureSearchWriter.write(df._jdf, options)

# Expose the helper as a DataFrame method.
pyspark.sql.DataFrame.writeToAzureSearch = writeToAzureSearch