How to use the pyspark.sql.DataFrame class in pyspark

To help you get started, we’ve selected a few pyspark.sql.DataFrame examples, based on popular ways it is used in public projects.

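Most of the snippets below follow the same pattern: reach into a DataFrame's internal _jdf handle, call a JVM-side helper through py4j, and wrap the returned Java DataFrame back into a pyspark.sql.DataFrame. The following is a minimal, self-contained sketch of that round trip; it is not taken from any of the projects below and relies on internal attributes (_jdf, sql_ctx) that are not public API.

from pyspark.sql import DataFrame, SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "label"])

# df._jdf is the underlying Java DataFrame; rewrapping it yields an
# equivalent Python DataFrame. The projects below do the same thing after
# passing _jdf to a JVM-side helper.
rewrapped = DataFrame(df._jdf, df.sql_ctx)
rewrapped.show()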

github great-expectations / great_expectations / great_expectations / datasource / sparkdf_datasource.py View on Github external
            for option in reader_options.items():
                reader = reader.option(*option)

            if reader_method == ReaderMethods.CSV:
                df = reader.csv(path)
            elif reader_method == ReaderMethods.parquet:
                df = reader.parquet(path)
            elif reader_method == ReaderMethods.delta:
                df = reader.format("delta").load(path)
            else:
                raise BatchKwargsError("Unsupported reader: %s" % reader_method.name, batch_kwargs)

        elif "query" in batch_kwargs:
            df = self.spark.sql(batch_kwargs["query"])

        elif "dataset" in batch_kwargs and isinstance(batch_kwargs["dataset"], (DataFrame, SparkDFDataset)):
            df = batch_kwargs.get("dataset")
            # We don't want to store the actual dataframe in kwargs; copy the remaining batch_kwargs
            batch_kwargs = {k: batch_kwargs[k] for k in batch_kwargs if k != 'dataset'}
            if isinstance(df, SparkDFDataset):
                # Grab just the spark_df reference, since we want to override everything else
                df = df.spark_df
            # Record this in the kwargs *and* the id
            batch_kwargs["SparkDFRef"] = True
            batch_id["SparkDFRef"] = True

        else:
            raise BatchKwargsError("Unrecognized batch_kwargs for spark_source", batch_kwargs)

        if "limit" in batch_kwargs:
            df = df.limit(batch_kwargs['limit'])
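The Great Expectations snippet above builds a Spark DataFrame from one of three kinds of batch_kwargs: a file path plus a reader method, a SQL query, or an already-constructed DataFrame. Below is a rough standalone sketch of the same dispatch idea; the function and key names are illustrative only and are not the Great Expectations API.

from pyspark.sql import DataFrame

def load_dataframe(spark, batch_kwargs):
    """Illustrative only: resolve 'path', 'query', or 'dataset' kwargs to a DataFrame."""
    if "path" in batch_kwargs:
        reader = spark.read.options(**batch_kwargs.get("reader_options", {}))
        fmt = batch_kwargs.get("reader_method", "csv")
        return reader.format(fmt).load(batch_kwargs["path"])
    if "query" in batch_kwargs:
        return spark.sql(batch_kwargs["query"])
    if "dataset" in batch_kwargs and isinstance(batch_kwargs["dataset"], DataFrame):
        return batch_kwargs["dataset"]
    raise ValueError("Unrecognized batch_kwargs: %s" % batch_kwargs)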
github TresAmigosSD / SMV / server / smv.py View on Github external
DataFrame.smvSelectMinus = lambda df, *cols: DataFrame(helper(df).smvSelectMinus(df._jdf, smv_copy_array(df._sc, *cols)), df.sql_ctx)
github apache / datafu / datafu-spark / src / main / resources / pyspark_utils / df_utils.py View on Github external
def dedup_top_n(df, n, group_col, order_cols = []):
    """
    Used to get the top N records (after ordering according to the provided order columns) in each group.
    :param df: DataFrame to operate on
    :param n: number of records to return from each group
    :param group_col: column to group by the records
    :param order_cols: columns to order the records according to
    :return: DataFrame representing the data after the operation
    """
    java_cols = _cols_to_java_cols(order_cols)
    jdf = _get_utils(df).dedupTopN(df._jdf, n, group_col._jc, java_cols)
    return DataFrame(jdf, df.sql_ctx)
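A hypothetical call might look like the following; the column names and data are made up for illustration, and the call still requires the datafu-spark JVM helpers (_get_utils, _cols_to_java_cols) to be available on the classpath.

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("a", 1), ("a", 3), ("a", 2), ("b", 5)],
    ["user_id", "score"],
)

# Keep the top 2 rows per user_id, ordered by score descending.
top2 = dedup_top_n(df, 2, F.col("user_id"), [F.col("score").desc()])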
github hail-is / hail / hail / python / hail / backend / backend.py View on Github external
    def to_spark(self, t, flatten):
        t = t.expand_types()
        if flatten:
            t = t.flatten()
        return pyspark.sql.DataFrame(self._to_java_ir(t._tir).pyToDF(), Env.spark_session()._wrapped)
github TresAmigosSD / SMV / server / smv.py View on Github external
        def __doFill(*valueCols):
            return DataFrame(self.sgd.smvFillNullWithPrevValue(smv_copy_array(self.df._sc, *orderCols), smv_copy_array(self.df._sc, *valueCols)), self.df.sql_ctx)
        return __doFill
github TresAmigosSD / SMV / server / smv.py View on Github external
def _smvConcatHist(df, cols): return helper(df).smvConcatHist(df._jdf, smv_copy_array(df._sc, *cols))
def _smvFreqHist(df, *cols): return dfhelper(df)._smvFreqHist(_to_seq(cols))
def _smvCountHist(df, keys, binSize): return dfhelper(df)._smvCountHist(_to_seq(keys), binSize)
def _smvBinHist(df, *colWithBin):
    for elem in colWithBin:
        assert type(elem) is tuple, "smvBinHist takes a list of tuple(string, double) as parameter"
        assert len(elem) == 2, "smvBinHist takes a list of tuple(string, double) as parameter"
    insureDouble = map(lambda t: (t[0], t[1] * 1.0), colWithBin)
    return helper(df).smvBinHist(df._jdf, smv_copy_array(df._sc, *insureDouble))

def _smvEddCompare(df, df2, ignoreColName): return dfhelper(df)._smvEddCompare(df2._jdf, ignoreColName)

DataFrame.smvEdd = lambda df, *cols: println(_smvEdd(df, *cols))
DataFrame.smvHist = lambda df, *cols: println(_smvHist(df, *cols))
DataFrame.smvConcatHist = lambda df, cols: println(_smvConcatHist(df, cols))
DataFrame.smvFreqHist = lambda df, *cols: println(_smvFreqHist(df, *cols))
DataFrame.smvEddCompare = lambda df, df2, ignoreColName=False: println(_smvEddCompare(df, df2, ignoreColName))

def __smvCountHistFn(df, keys, binSize = 1):
    if (isinstance(keys, basestring)):
        return println(_smvCountHist(df, [keys], binSize))
    else:
        return println(_smvCountHist(df, keys, binSize))
DataFrame.smvCountHist = __smvCountHistFn

DataFrame.smvBinHist = lambda df, *colWithBin: println(_smvBinHist(df, *colWithBin))

def __smvDiscoverPK(df, n):
    res = helper(df).smvDiscoverPK(df._jdf, n)
    println("[{}], {}".format(", ".join(map(str, res._1())), res._2()))

DataFrame.smvDiscoverPK = lambda df, n=10000: __smvDiscoverPK(df, n)
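After these assignments the SMV helpers read like ordinary DataFrame methods. A hypothetical session might look like the following; the column names are invented, and every call still depends on the SMV JVM helpers (helper, dfhelper, println) referenced above.

df.smvHist("state")                             # frequency histogram of one column
df.smvCountHist(["state", "zip"], binSize=10)   # binned count histogram over key columns
df.smvDiscoverPK(n=10000)                       # look for candidate primary-key columns in a sample of n rows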
github databricks / koalas / databricks / koala / structures.py View on Github external
    def copy(self):
        df = DataFrame(self._jdf, self.sql_ctx)
        df._metadata = self._metadata.copy()
        return df
github qubole / spark-on-lambda / python / pyspark / ml / wrapper.py View on Github external
    def _transform(self, dataset):
        self._transfer_params_to_java()
        return DataFrame(self._java_obj.transform(dataset._jdf), dataset.sql_ctx)
github Azure / mmlspark / src / main / python / mmlspark / cognitive / AzureSearchWriter.py View on Github external
import sys

if sys.version >= '3':
    basestring = str

import pyspark
from pyspark import SparkContext
from pyspark import sql
from pyspark.ml.param.shared import *
from pyspark.sql import DataFrame

def streamToAzureSearch(df, **options):
    jvm = SparkContext.getOrCreate()._jvm
    writer = jvm.com.microsoft.ml.spark.cognitive.AzureSearchWriter
    return writer.stream(df._jdf, options)

setattr(pyspark.sql.DataFrame, 'streamToAzureSearch', streamToAzureSearch)

def writeToAzureSearch(df, **options):
    jvm = SparkContext.getOrCreate()._jvm
    writer = jvm.com.microsoft.ml.spark.cognitive.AzureSearchWriter
    writer.write(df._jdf, options)

setattr(pyspark.sql.DataFrame, 'writeToAzureSearch', writeToAzureSearch)
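
Both the SMV and MMLSpark examples extend DataFrame the same way: assign a plain function as an attribute of the class so it can be called like a built-in method. Below is a minimal, self-contained sketch of that pattern; the helper name rowCountWithTag is invented for illustration and is not part of any of the libraries above.

import pyspark
from pyspark.sql import SparkSession

def row_count_with_tag(df, tag):
    """Illustrative helper: return the tag together with the row count."""
    return tag, df.count()

# Attach the helper so it can be called like a built-in DataFrame method,
# the same way streamToAzureSearch/writeToAzureSearch are attached above.
setattr(pyspark.sql.DataFrame, "rowCountWithTag", row_count_with_tag)

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1,), (2,), (3,)], ["x"])
print(df.rowCountWithTag("my_table"))  # ('my_table', 3)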