How to use the pyspark.sql.functions.lit function in pyspark

To help you get started, we’ve selected a few pyspark.sql.functions.lit examples, based on popular ways it is used in public projects.

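At its simplest, lit wraps a plain Python value in a Column so it can be used anywhere Spark expects a column expression. Before the project examples below, here is a minimal, self-contained sketch (the DataFrame and column names are invented for illustration):

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()

# Toy data, invented for this sketch.
df = spark.createDataFrame([("alice", 3), ("bob", 5)], ["name", "score"])

df = (
    df
    .withColumn("source", F.lit("unit_test"))          # constant string column
    .withColumn("passed", F.col("score") >= F.lit(4))  # literal inside a comparison
)
df.show()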

github MrPowers / quinn / quinn / functions.py
def week_end_date(col, week_end_day='Sat'):
    _raise_if_invalid_day(week_end_day)
    # these are the default Spark mappings.  Spark considers Sunday the first day of the week.
    day_of_week_mapping = {'Sun': 1, 'Mon': 2, 'Tue': 3, 'Wed': 4, 'Thu': 5, 'Fri': 6, 'Sat': 7}
    return F.when(
        F.dayofweek(col).eqNullSafe(F.lit(day_of_week_mapping[week_end_day])), col
    ).otherwise(F.next_day(col, week_end_day))
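
A hedged usage sketch for the function above, assuming quinn is installed so that week_end_date can be imported from quinn.functions, and with invented input dates:

from pyspark.sql import SparkSession, functions as F
from quinn.functions import week_end_date  # assumes the quinn package is installed

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame([("2023-03-15",), ("2023-03-18",)], ["d"]) \
    .withColumn("d", F.col("d").cast("date"))

# 2023-03-15 is a Wednesday, so it rolls forward to Saturday 2023-03-18;
# 2023-03-18 is already a Saturday, so it is returned unchanged.
df.withColumn("week_end", week_end_date(F.col("d"), "Sat")).show()
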
github wikimedia / search-MjoLniR / mjolnir / utilities / make_folds.py
def make_folds(sc, sqlContext, input_dir, output_dir, wikis, zero_features,
               num_folds, num_workers, max_executors):
    hdfs_mkdir(output_dir)
    df = sqlContext.read.parquet(input_dir) \
        .select('wikiid', 'query', 'features', 'label', 'norm_query_id')
    if wikis:
        df = df.where(F.col('wikiid').isin(wikis))

    counts = df.groupBy('wikiid').agg(F.count(F.lit(1)).alias('n_obs')).collect()
    counts = {row.wikiid: row.n_obs for row in counts}

    if not wikis:
        wikis = list(counts.keys())
    else:
        missing = set(wikis).difference(counts.keys())
        for wiki in missing:
            print('No observations available for ' + wiki)
        wikis = list(set(wikis).intersection(counts.keys()))
    if not wikis:
        raise Exception('No wikis provided')

    # Sort by descending size so the largest wikis are processed first
    wikis.sort(reverse=True, key=lambda wiki: counts[wiki])

    if zero_features:
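
The F.count(F.lit(1)) aggregation above is a common way to count rows per group without tying the count to any particular column. A minimal sketch of just that step, with invented data and an existing SparkSession assumed:

from pyspark.sql import functions as F

df = spark.createDataFrame(
    [("enwiki", "q1"), ("enwiki", "q2"), ("dewiki", "q3")],
    ["wikiid", "query"],
)

counts = (
    df.groupBy("wikiid")
      .agg(F.count(F.lit(1)).alias("n_obs"))  # count rows, not non-null values of a column
      .collect()
)
counts = {row.wikiid: row.n_obs for row in counts}
# {'enwiki': 2, 'dewiki': 1}
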
github databricks / koalas / databricks / koalas / indexing.py
                if len(cond) > 0:
                    return reduce(lambda x, y: x & y, cond), None
            else:
                LocIndexer._raiseNotImplemented("Cannot use slice for MultiIndex with Spark.")
        elif isinstance(rows_sel, str):
            LocIndexer._raiseNotImplemented(
                "Cannot use a scalar value for row selection with Spark.")
        else:
            try:
                rows_sel = list(rows_sel)
            except TypeError:
                LocIndexer._raiseNotImplemented(
                    "Cannot use a scalar value for row selection with Spark.")
            if len(rows_sel) == 0:
                return F.lit(False), None
            elif len(self._internal.index_columns) == 1:
                index_column = self._kdf_or_kser.index.to_series()
                index_data_type = index_column.spark_type
                if len(rows_sel) == 1:
                    return index_column._scol == F.lit(rows_sel[0]).cast(index_data_type), None
                else:
                    return index_column._scol.isin(
                        [F.lit(r).cast(index_data_type) for r in rows_sel]), None
            else:
                LocIndexer._raiseNotImplemented("Cannot select with MultiIndex with Spark.")
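
The row-selection branches above all reduce to building a boolean Column: F.lit(False) for an empty selection, an equality against F.lit(value).cast(dtype) for a single value, and isin over a list of cast literals otherwise. A standalone sketch of the same idea on an ordinary DataFrame (names and data invented, SparkSession assumed):

from pyspark.sql import functions as F
from pyspark.sql.types import LongType

df = spark.createDataFrame([(1, "a"), (2, "b"), (3, "c")], ["idx", "val"])

rows_sel = [1, 3]
index_data_type = LongType()

if len(rows_sel) == 0:
    cond = F.lit(False)  # an empty selection matches nothing
elif len(rows_sel) == 1:
    cond = F.col("idx") == F.lit(rows_sel[0]).cast(index_data_type)
else:
    cond = F.col("idx").isin([F.lit(r).cast(index_data_type) for r in rows_sel])

df.where(cond).show()
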
github Azure-Samples / MachineLearningSamples-BigData / Code / O16Npreprocessing.py
# aggregate per five minutes
joindf.createOrReplaceTempView("joindf")
sqlStatement = """
    SELECT ServerIP, SessionStartFiveMin , min(SessionStartHour) SessionStartHour,
    sum(TotalLoad) SumTotalLoad, count(*) NumSession,
    sum(MbytesTransferred) SumMBytes, 
    sum(SubService_1_Load) SumLoad1, sum(SubSerivce_2_Load) SumLoad2, sum(SubSerivce_3_Load) SumLoad3, 
    sum(SubSerivce_4_Load) SumLoad4, sum(SubSerivce_5_Load) SumLoad5, sum(SecureBytes_Load) SumLoadSecure
    FROM joindf group by ServerIP, SessionStartFiveMin
"""
aggregatedf = spark.sql(sqlStatement)
from pyspark.sql.functions import col, concat, lit, udf
aggregatedf = aggregatedf.withColumn('SessionStartHourTime', col('SessionStartHour').cast('timestamp'))

aggregatedf = aggregatedf.withColumn("key", concat(aggregatedf.ServerIP, lit("_"), aggregatedf.SessionStartHourTime.cast('string')))

aggregatedf = aggregatedf.fillna(0, subset=['SumTotalLoad'])


maxByGroup = (aggregatedf.rdd
  .map(lambda x: (x[-1], x))  # convert to a pair RDD keyed on the last column ("key")
  # For each key, keep the row with the largest SumTotalLoad (column index 3),
  # i.e. lambda x1, x2: x1 if x1[3] > x2[3] else x2
  .reduceByKey(lambda x1, x2: max(x1, x2, key=lambda x: x[3]))
  .values())  # drop the keys
aggregatemaxdf = maxByGroup.toDF()
# get the peakload every five minutes (non-overlapping) per hour
featureeddf = None
aggregatemaxdf.createOrReplaceTempView("aggregatemaxdf")
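
Here lit("_") supplies the separator when two columns are concatenated into a composite key. A trimmed-down sketch of that step (data invented, SparkSession assumed):

from pyspark.sql.functions import col, concat, lit

df = spark.createDataFrame(
    [("10.0.0.1", "2016-06-01 12:00:00")],
    ["ServerIP", "SessionStartHourTime"],
)

df = df.withColumn(
    "key",
    concat(col("ServerIP"), lit("_"), col("SessionStartHourTime"))
)
# key -> "10.0.0.1_2016-06-01 12:00:00"
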
github databricks / koalas / databricks / koalas / base.py
        >>> df = ks.Series([True, False, None]).rename("a").to_frame()
        >>> df.set_index("a").index.all()
        False
        """
        axis = validate_axis(axis)
        if axis != 0:
            raise ValueError('axis should be either 0 or "index" currently.')

        sdf = self._internal._sdf.select(self._scol)
        col = scol_for(sdf, sdf.columns[0])

        # Note that we're ignoring `None`s here for now.
        # `any` and `every` were added in Spark 3.0, e.g.:
        # ret = sdf.select(F.expr("every(CAST(`%s` AS BOOLEAN))" % sdf.columns[0])).collect()[0][0]
        # Here we use min as an alternative:
        ret = sdf.select(F.min(F.coalesce(col.cast('boolean'), F.lit(True)))).collect()[0][0]
        if ret is None:
            return True
        else:
            return ret
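
The expression above emulates an every()/all aggregate on Spark versions before 3.0: coalesce replaces nulls with a literal True so they are effectively ignored, and min over booleans then behaves like "all" (False sorts before True). A standalone sketch with made-up data (SparkSession assumed):

from pyspark.sql import functions as F

sdf = spark.createDataFrame([(True,), (True,), (None,)], "a: boolean")
col = sdf["a"]

# Nulls become True via coalesce, so only a real False can pull the min down.
ret = sdf.select(F.min(F.coalesce(col.cast("boolean"), F.lit(True)))).collect()[0][0]
print(ret)  # True
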
github mozilla / telemetry-airflow / jobs / update_orphaning_dashboard_etl.py
    def merge_enumerated_histogram_col(df, col_name, n_values):
        return df.withColumn(
            col_name + "_merged",
            merge_enumerated_histograms_udf(col_name, F.lit(col_name), F.lit(n_values))
        ).drop(col_name).withColumnRenamed(col_name + "_merged", col_name)
    def merge_enumerated_histogram_columns(df, cols_n_values):
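
A UDF only accepts column arguments, so scalar parameters such as the histogram name and n_values have to be wrapped in F.lit(...) before they are passed in, as above. A minimal sketch of that calling convention using a hypothetical stand-in UDF (merge_enumerated_histograms_udf itself is defined elsewhere in the original job; SparkSession assumed):

from pyspark.sql import functions as F
from pyspark.sql.types import StringType

# Hypothetical stand-in for merge_enumerated_histograms_udf: it just formats its inputs.
label_udf = F.udf(lambda value, name, n: "{}[{}]={}".format(name, n, value), StringType())

df = spark.createDataFrame([(7,), (11,)], ["histogram"])

df = df.withColumn(
    "histogram_merged",
    label_udf("histogram", F.lit("histogram"), F.lit(3))  # scalars wrapped as literal columns
)
df.show(truncate=False)
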
github Sotera / watchman / services / silk-specter / fast_text_modeler.py
# u_add_hashtags = F.udf(add_hashtags, ArrayType(StringType()))

        df_topics = df_topics\
        .withColumn('hashtags', u_flatten('all_hashtags'))\
        .drop('all_hashtags')
        # .withColumn('top_hashtags', u_add_hashtags('topic')) # hack to add literal array for each row

        df_topics = df_topics\
        .withColumn('_post_ids', u_trunc_array('post_ids'))

        df_topics = df_topics\
        .drop('post_ids')\
        .withColumnRenamed('_post_ids', 'post_ids')

        df_topics = df_topics.select('*',
            F.lit(datetime.now()).alias('created'),
            F.lit(start_time).alias('start_time'),
            F.lit(end_time).alias('end_time')
        )

        self.counts['topics*campaigns'] = df_topics.count()

        # toLocalIterator has bug: https://issues.apache.org/jira/browse/SPARK-18281
        # topics_iter = df_topics.toLocalIterator()
        topics = list(map(lambda s: json.loads(s), df_topics.toJSON().collect()))

        # for row in topics_iter:
        for row in topics:
            deliver(row, kafka_url, kafka_topic)

        self.save(df_topics)
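
The select('*', F.lit(...).alias(...)) pattern above stamps every row with the same metadata values for the run. A reduced sketch with invented values, showing that lit accepts Python datetime objects as well as strings (SparkSession assumed):

from datetime import datetime
from pyspark.sql import functions as F

df_topics = spark.createDataFrame([("topic-1",), ("topic-2",)], ["topic"])

start_time = datetime(2023, 1, 1)
end_time = datetime(2023, 1, 2)

df_topics = df_topics.select(
    "*",
    F.lit(datetime.now()).alias("created"),  # same timestamp on every row
    F.lit(start_time).alias("start_time"),
    F.lit(end_time).alias("end_time")
)
df_topics.show(truncate=False)
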
github wikimedia / search-MjoLniR / mjolnir / utilities / feature_selection.py
})))
            .write.parquet(output_dir))
    finally:
        try:
            mjolnir.utils.hdfs_rmdir(exploded_output_dir)
        except Exception:
            pass

    # Write out some extra stats that our spark utility will use to size
    # executors for fold generation and training.  The filename must start
    # with _ to be ignored by the parquet loader.
    stats_path = os.path.join(output_dir, '_stats.json')
    counts = (
        sqlContext.read.parquet(output_dir)
        .groupBy('wikiid')
        .agg(F.count(F.lit(1)).alias('num_obs'))
        .collect())
    with mjolnir.utils.as_output_file(stats_path, 'w') as f:
        f.write(json.dumps({
            'num_features': len(pre_selected) if pre_selected else num_features,
            'num_obs': {row.wikiid: row.num_obs for row in counts}
        }))
github devlace / datadevops / src / ddo_transform / ddo_transform / transform.py
    fact_parking = sensordata_sdf\
        .join(dim_parkingbay_sdf.alias("pb"), "bay_id", "left_outer")\
        .join(dim_location_sdf.alias("l"), ["lat", "lon"], "left_outer")\
        .join(dim_st_marker_sdf.alias("st"), "st_marker_id", "left_outer")\
        .select(
            lit(dim_date_id).alias("dim_date_id"),
            lit(dim_time_id).alias("dim_time_id"),
            when(col("pb.dim_parking_bay_id").isNull(), lit(EMPTY_UUID))
            .otherwise(col("pb.dim_parking_bay_id")).alias("dim_parking_bay_id"),
            when(col("l.dim_location_id").isNull(), lit(EMPTY_UUID))
            .otherwise(col("l.dim_location_id")).alias("dim_location_id"),
            when(col("st.dim_st_marker_id").isNull(), lit(EMPTY_UUID))
            .otherwise(col("st.dim_st_marker_id")).alias("dim_st_marker_id"),
            "status",
            lit(load_id).alias("load_id"),
            lit(loaded_on.isoformat()).cast("timestamp").alias("loaded_on")
        )
    return fact_parking
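
Two lit idioms carry the fact-table build above: when(col(...).isNull(), lit(EMPTY_UUID)).otherwise(col(...)) substitutes a sentinel for unmatched dimension keys, and lit(value).cast("timestamp") turns a Python value into a typed audit column. A condensed sketch of both, with invented names and values (SparkSession assumed):

from datetime import datetime
from pyspark.sql.functions import col, lit, when

EMPTY_UUID = "00000000-0000-0000-0000-000000000000"
load_id = "load-123"
loaded_on = datetime(2023, 1, 1, 12, 0, 0)

sdf = spark.createDataFrame([("bay-1",), (None,)], ["dim_parking_bay_id"])

fact = sdf.select(
    when(col("dim_parking_bay_id").isNull(), lit(EMPTY_UUID))
    .otherwise(col("dim_parking_bay_id")).alias("dim_parking_bay_id"),
    lit(load_id).alias("load_id"),
    lit(loaded_on.isoformat()).cast("timestamp").alias("loaded_on")
)
fact.show(truncate=False)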