How to use the pyspark.sql.functions.col function in PySpark

To help you get started, we’ve selected a few pyspark.sql.functions.col examples, based on popular ways the function is used in public projects.

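In every example below, col(name) builds a Column expression that you can cast, rename with .alias(), compare, and combine inside select, filter, and withColumn. Here is a minimal, self-contained sketch of those basics; the DataFrame and column names are invented for illustration and are not taken from the projects listed further down.

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.getOrCreate()

# Toy data: a nullable id, a latitude stored as a string, and a status flag
df = spark.createDataFrame(
    [("b1", "41.2", "OPEN"), ("b2", "39.8", "CLOSED"), (None, "40.1", "OPEN")],
    ["bay_id", "lat", "status"],
)

parsed = (
    df.select(
        col("bay_id"),
        col("lat").cast("float").alias("lat"),   # cast and keep the same name
        col("status"),
    )
    .filter(col("bay_id").isNotNull())           # col() expressions work as filter predicates too
    .withColumn("is_open", col("status") == "OPEN")
)
parsed.show()

The projects below apply the same pattern at scale: casting and aliasing during standardization, building comparison predicates, splitting strings, and aggregating.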

github devlace / datadevops / src / ddo_transform / ddo_transform / standardize.py
def standardize_sensordata(sensordata_sdf: DataFrame, load_id, loaded_on):
    t_sensordata_sdf = (
        sensordata_sdf
        .select(
            col("bay_id").cast("int").alias("bay_id"),
            "st_marker_id",
            col("lat").cast("float").alias("lat"),
            col("lon").cast("float").alias("lon"),
            "location",
            "status",
            lit(load_id).alias("load_id"),
            lit(loaded_on.isoformat()).alias("loaded_on")
        )
    ).cache()
    # Data Validation
    good_records = t_sensordata_sdf.filter(col("bay_id").isNotNull())
    bad_records = t_sensordata_sdf.filter(col("bay_id").isNull())
    return good_records, bad_records
github capitalone / datacompy / datacompy / sparkcompare.py
def helper(c):
    # Create a predicate for each match type, comparing column values to the match type value
    predicates = [F.col(c) == k.value for k in MatchType]
    # Return a struct of per-match-type counts (how many rows matched each match type) for this column
    return F.struct([F.lit(F.sum(pred.cast("integer"))) for pred in predicates]).alias(c)
github dmwm / CMSSpark / src / python / CMSSpark / data_aggregation.py
def split_dataset_col(df, dcol):
    """
    Split the dataset name in the DataFrame into primary_name, processing_name, and data_tier components.
    Keep the original column.
    """
    ndf = df.withColumn("primary_name", split(col(dcol), "/").alias('primary_name').getItem(1))\
            .withColumn("processing_name", split(col(dcol), "/").alias('processing_name').getItem(2))\
            .withColumn("data_tier", split(col(dcol), "/").alias('data_tier').getItem(3))
    return ndf
github jgperrin / net.jgp.books.spark.ch03 / src / main / python / lab220_json_ingestion_schema_manipulation / jsonIngestionSchemaManipulationApp.py
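# Promote the nested "fields" attributes of the ingested JSON into flat, renamed top-level columns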
df = df.withColumn("county", lit("Durham")) \
        .withColumn("datasetId", col("fields.id")) \
        .withColumn("name", col("fields.premise_name")) \
        .withColumn("address1", col("fields.premise_address1")) \
        .withColumn("address2", col("fields.premise_address2")) \
        .withColumn("city", col("fields.premise_city")) \
        .withColumn("state", col("fields.premise_state")) \
        .withColumn("zip", col("fields.premise_zip")) \
        .withColumn("tel", col("fields.premise_phone")) \
        .withColumn("dateStart", col("fields.opening_date")) \
        .withColumn("dateEnd", col("fields.closing_date")) \
        .withColumn("type", split(col("fields.type_description"), " - ").getItem(1)) \
        .withColumn("geoX", col("fields.geolocation").getItem(0)) \
        .withColumn("geoY", col("fields.geolocation").getItem(1))

df = df.withColumn("id", concat(col("state"), lit("_"), col("county"), lit("_"), col("datasetId")))

print("*** Dataframe transformed")
df.select('id',"state", "county", "datasetId").show(5)
df.printSchema()

print("*** Looking at partitions")
partitionCount = df.rdd.getNumPartitions()
print("Partition count before repartition: {}".format(partitionCount))

df = df.repartition(4)
print("Partition count after repartition: {}".format(df.rdd.getNumPartitions()))
github dmwm / CMSSpark / src / python / CMSSpark / dbs_hdfs_crab.py
CRAB_DataBlock is not NULL  AND
                              timestamp >= {} AND
                              timestamp <= {}""".format(
                start, end
            )
        )
        .repartition("CRAB_DataBlock")
        .drop_duplicates(["GlobalJobId"])
        .withColumnRenamed("CMSPrimaryPrimaryDataset", "PrimaryDataset")
        .withColumn("Dataset", regexp_extract("CRAB_DataBlock","^(.*)/([^/]*)#.*$",1))
        .withColumn("Datatier", regexp_extract("CRAB_DataBlock","^(.*)/([^/]*)#.*$",2))
    )
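    # Aggregate per (Datatier, PrimaryDataset, Dataset): record-time range, job and workflow counts, and bytes read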
    dfs_crabdb = (
        dfs_crabdb.groupBy("Datatier","PrimaryDataset","Dataset")
        .agg(
            _max(col("RecordTime")),
            _min(col("RecordTime")),
            count(lit(1)),
            countDistinct("CRAB_Workflow"),
            _sum(col("ChirpCMSSWReadBytes")),
        )
        .withColumnRenamed("count(1)", "job_count")
        .withColumnRenamed("count(DISTINCT CRAB_Workflow)", "workflow_count")
        .withColumnRenamed("sum(ChirpCMSSWReadBytes)", "ChirpCMSSWReadBytes")
        .na.fill("Unknown",["Datatier","PrimaryDataset","Dataset"])
    )
    return dfs_crabdb.toPandas()
github wikimedia / search-MjoLniR / mjolnir / utilities / collect_features.py
    if len(num_rows_collected) != 1:
        raise ValueError("Not all features were collected properly: " + str(fnames_accu.value))
    num_rows_collected = num_rows_collected.pop()
    print('Collected %d datapoints' % (num_rows_collected))
    # TODO: count and check that this value is sane, this would require computing the number
    # of request sent

    features = list(fnames_accu.value.keys())
    df_hits_with_features = (
        df_hits
        # TODO: We almost don't need the join and could defer it to something
        # that does, basically make_folds. But it's convenient to think of each
        # utility as adding to the previous and having a simple line as the
        # dependency chain.
        .join(df_features, how='inner', on=['wikiid', 'query', 'hit_page_id'])
        .withColumn('features', mjolnir.spark.add_meta(sc, F.col('features'), {
            'features': features,
            'feature_definitions': ltr_feature_definitions,
            'collected_at': datetime.datetime.now().isoformat(),
            'used_kafka': brokers is not None,
            'search_cluster': search_cluster if search_cluster else "",
            # TODO: Where does this metadata go? It seems a bit more top-level
            # but could be useful to remember.
            'input_dir': input_dir,
            'output_dir': output_dir,
        })))

    df_hits_with_features.write.parquet(output_dir)

    # Emit some statistics that will allow the spark utility to automatically
    # size the executors for building datasets out of this.
    counts = (
github NYUBigDataProject / SparkClean / sparkclean / df_transformer.py
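# Trim leading/trailing whitespace in the requested (and valid) string columns; other columns pass through unchanged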
def col_trim(columns):
    exprs = [trim(col(c)).alias(c)
             if (c in columns) and (c in valid_cols)
             else c
             for (c, t) in self._df.dtypes]
    self._df = self._df.select(*exprs)
github wikimedia / search-MjoLniR / mjolnir / utilities / feature_selection.py
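# For one wiki: pick (or reuse) a feature subset and assemble it into a single 'features' vector column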
def process_wiki(wiki):
    df_wiki = df_exploded.where(F.col('wikiid') == wiki)
    if pre_selected:
        selected = pre_selected
    else:
        selected = mjolnir.feature_engineering.select_features(
            sc, df_wiki, all_features, num_features, quantile_pool,
            algo=algo)
    df_selected = df_wiki.select(*(default_cols + selected))
    assembler = VectorAssembler(
        inputCols=selected, outputCol='features')
    wiki_features[wiki] = selected
    return assembler.transform(df_selected).drop(*selected)
github yinyajun / Details-In-Recommendation / data / RecSys18_causal_embedding_skew_dataset / spark_skew_dataset.py
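    # Binarize the rating score against the configured threshold (>= threshold -> 1.0, otherwise 0.0)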
    def _set_label(self, data):
        return data.ratings.withColumn('score', F.when(F.col('score') >= self.threshold, 1.0).otherwise(0.0))

    def _load(self):
github NYUBigDataProject / SparkClean / sparkclean / df_transformer.py
"""
        # Assert that columns is a list of (old_name, new_name) tuples:
        assert isinstance(columns, list) and isinstance(columns[0], tuple), \
            "Error: Column argument must be a list of tuples"

        col_not_valids = (
            set([column[0] for column in columns]).difference(set([column for column in self._df.columns])))

        assert (col_not_valids == set()), 'Error: The following columns do not exist in the DataFrame: %s' % col_not_valids

        old_names = [column[0] for column in columns]

        not_in_type = filter(lambda c: c not in old_names, self._df.columns)

        exprs = [col(column[0]).alias(column[1]) for column in columns] + [col(column) for column in not_in_type]

        self._add_transformation()  # checkpoint in case

        self._df = self._df.select(*exprs)
        return self