from pyspark.sql import DataFrame
from pyspark.sql.functions import col, lit


def standardize_sensordata(sensordata_sdf: DataFrame, load_id, loaded_on):
    t_sensordata_sdf = (
        sensordata_sdf
        .select(
            col("bay_id").cast("int").alias("bay_id"),
            "st_marker_id",
            col("lat").cast("float").alias("lat"),
            col("lon").cast("float").alias("lon"),
            "location",
            "status",
            lit(load_id).alias("load_id"),
            lit(loaded_on.isoformat()).alias("loaded_on"),
        )
    ).cache()
    # Data validation: split records on whether bay_id survived the integer cast
    good_records = t_sensordata_sdf.filter(col("bay_id").isNotNull())
    bad_records = t_sensordata_sdf.filter(col("bay_id").isNull())
    return good_records, bad_records
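
# Usage sketch (assumption, not from the original pipeline): requires an active
# SparkSession named `spark`; the sample rows below are illustrative only.
import uuid
from datetime import datetime, timezone

sample_sdf = spark.createDataFrame(
    [("7", "S1", "-37.81", "144.96", "Flinders St", "Present"),
     (None, "S2", "-37.82", "144.97", "Collins St", "Unoccupied")],
    ["bay_id", "st_marker_id", "lat", "lon", "location", "status"],
)
good, bad = standardize_sensordata(sample_sdf, str(uuid.uuid4()), datetime.now(timezone.utc))
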
def helper(c):
    # Assumes `F` is pyspark.sql.functions and `MatchType` is an Enum defined elsewhere.
    # Build one predicate per match type, comparing this column's values to the match type's value.
    predicates = [F.col(c) == k.value for k in MatchType]
    # Return a struct of per-match-type counts for this column (one sum per predicate),
    # meant to be evaluated inside an aggregation.
    return F.struct([F.sum(pred.cast("integer")) for pred in predicates]).alias(c)
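
# Usage sketch (assumption): evaluate the per-column structs in a single
# aggregation pass; `df` and `match_columns` are placeholders for the compared
# DataFrame and its compared columns.
match_counts = df.agg(*[helper(c) for c in match_columns]).collect()[0]
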
from pyspark.sql.functions import col, split


def split_dataset_col(df, dcol):
    """
    Split the dataset name in DataFrame column `dcol` into primary_name,
    processing_name and data_tier components. Keep the original column.
    """
    ndf = df.withColumn("primary_name", split(col(dcol), "/").getItem(1)) \
        .withColumn("processing_name", split(col(dcol), "/").getItem(2)) \
        .withColumn("data_tier", split(col(dcol), "/").getItem(3))
    return ndf
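
# Usage sketch (assumption): dataset names have the form
# "/PrimaryName/ProcessingName/DATA-TIER", so splitting on "/" leaves an empty
# element at index 0 and the components start at index 1. Requires a
# SparkSession named `spark`; the sample value is illustrative only.
sample = spark.createDataFrame([("/ZMM/Winter09/GEN-SIM",)], ["dataset"])
split_dataset_col(sample, "dataset").show(truncate=False)
# -> primary_name="ZMM", processing_name="Winter09", data_tier="GEN-SIM"
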
df = df.withColumn("county", lit("Durham")) \
.withColumn("datasetId", col("fields.id")) \
.withColumn("name", col("fields.premise_name")) \
.withColumn("address1", col("fields.premise_address1")) \
.withColumn("address2", col("fields.premise_address2")) \
.withColumn("city", col("fields.premise_city")) \
.withColumn("state", col("fields.premise_state")) \
.withColumn("zip", col("fields.premise_zip")) \
.withColumn("tel", col("fields.premise_phone")) \
.withColumn("dateStart", col("fields.opening_date")) \
.withColumn("dateEnd", col("fields.closing_date")) \
.withColumn("type", split(col("fields.type_description"), " - ").getItem(1)) \
.withColumn("geoX", col("fields.geolocation").getItem(0)) \
.withColumn("geoY", col("fields.geolocation").getItem(1))
df = df.withColumn("id", concat(col("state"), lit("_"), col("county"), lit("_"), col("datasetId")))
print("*** Dataframe transformed")
df.select('id',"state", "county", "datasetId").show(5)
df.printSchema()
print("*** Looking at partitions")
partitionCount = df.rdd.getNumPartitions()
print("Partition count before repartition: {}".format(partitionCount))
df = df.repartition(4)
print("Partition count after repartition: {}".format(df.rdd.getNumPartitions()))
        CRAB_DataBlock is not NULL AND
        timestamp >= {} AND
        timestamp <= {}""".format(
            start, end
        )
    )
    .repartition("CRAB_DataBlock")
    .drop_duplicates(["GlobalJobId"])
    .withColumnRenamed("CMSPrimaryPrimaryDataset", "PrimaryDataset")
    .withColumn("Dataset", regexp_extract("CRAB_DataBlock", "^(.*)/([^/]*)#.*$", 1))
    .withColumn("Datatier", regexp_extract("CRAB_DataBlock", "^(.*)/([^/]*)#.*$", 2))
)
dfs_crabdb = (
    dfs_crabdb.groupBy("Datatier", "PrimaryDataset", "Dataset")
    .agg(
        _max(col("RecordTime")),
        _min(col("RecordTime")),
        count(lit(1)),
        countDistinct("CRAB_Workflow"),
        _sum(col("ChirpCMSSWReadBytes")),
    )
    .withColumnRenamed("count(1)", "job_count")
    .withColumnRenamed("count(DISTINCT CRAB_Workflow)", "workflow_count")
    .withColumnRenamed("sum(ChirpCMSSWReadBytes)", "ChirpCMSSWReadBytes")
    .na.fill("Unknown", ["Datatier", "PrimaryDataset", "Dataset"])
)
return dfs_crabdb.toPandas()
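
# The withColumnRenamed calls above rely on Spark's default aggregate column names
# ("count(1)", "count(DISTINCT CRAB_Workflow)", "sum(ChirpCMSSWReadBytes)").
# Equivalent sketch (assumption, not from the original job): alias the aggregates
# directly on the pre-aggregation DataFrame, here called `df_raw` as a placeholder,
# reusing the same aliased imports (_max, _min, _sum, count, countDistinct, col, lit).
aggregated = df_raw.groupBy("Datatier", "PrimaryDataset", "Dataset").agg(
    _max(col("RecordTime")).alias("max_RecordTime"),
    _min(col("RecordTime")).alias("min_RecordTime"),
    count(lit(1)).alias("job_count"),
    countDistinct("CRAB_Workflow").alias("workflow_count"),
    _sum(col("ChirpCMSSWReadBytes")).alias("ChirpCMSSWReadBytes"),
)
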
if len(num_rows_collected) != 1:
    raise ValueError("Not all features were collected properly: " + str(fnames_accu.value))
num_rows_collected = num_rows_collected.pop()
print('Collected %d datapoints' % (num_rows_collected))
# TODO: count and check that this value is sane, this would require computing the number
# of requests sent
features = list(fnames_accu.value.keys())

df_hits_with_features = (
    df_hits
    # TODO: We almost don't need the join and could defer it to something
    # that does, basically make_folds. But it's convenient to think of each
    # utility as adding to the previous and having a simple line as the
    # dependency chain.
    .join(df_features, how='inner', on=['wikiid', 'query', 'hit_page_id'])
    .withColumn('features', mjolnir.spark.add_meta(sc, F.col('features'), {
        'features': features,
        'feature_definitions': ltr_feature_definitions,
        'collected_at': datetime.datetime.now().isoformat(),
        'used_kafka': brokers is not None,
        'search_cluster': search_cluster if search_cluster else "",
        # TODO: Where does this metadata go? It seems a bit more top-level
        # but could be useful to remember.
        'input_dir': input_dir,
        'output_dir': output_dir,
    })))

df_hits_with_features.write.parquet(output_dir)
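
# Sketch (assumption): the column-level metadata attached via add_meta is stored
# with the schema and can be inspected after reading the parquet output back,
# assuming a SparkSession named `spark`.
df_back = spark.read.parquet(output_dir)
print(df_back.schema['features'].metadata)
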
# Emit some statistics that will allow the spark utility to automatically
# size the executors for building datasets out of this.
counts = (
def col_trim(columns):
    # Note: `self` and `valid_cols` come from the enclosing scope; this is a nested helper.
    exprs = [trim(col(c)).alias(c)
             if (c in columns) and (c in valid_cols)
             else c
             for (c, t) in self._df.dtypes]
    self._df = self._df.select(*exprs)
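
# The same trimming pattern as a standalone sketch (assumption: plain
# pyspark.sql.functions; the function name is illustrative).
from pyspark.sql.functions import col, trim


def trim_string_columns(df, columns):
    # Trim only the requested columns that are string-typed; pass the rest through unchanged.
    string_cols = {c for c, t in df.dtypes if t == "string"}
    exprs = [trim(col(c)).alias(c) if c in columns and c in string_cols else col(c)
             for c, _ in df.dtypes]
    return df.select(*exprs)
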
def process_wiki(wiki):
    df_wiki = df_exploded.where(F.col('wikiid') == wiki)
    if pre_selected:
        selected = pre_selected
    else:
        selected = mjolnir.feature_engineering.select_features(
            sc, df_wiki, all_features, num_features, quantile_pool,
            algo=algo)
    df_selected = df_wiki.select(*(default_cols + selected))
    assembler = VectorAssembler(
        inputCols=selected, outputCol='features')
    wiki_features[wiki] = selected
    return assembler.transform(df_selected).drop(*selected)
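
# For reference, a minimal standalone VectorAssembler sketch (assumption: the
# column names and the `df_numeric` DataFrame are illustrative placeholders).
from pyspark.ml.feature import VectorAssembler

assembler_demo = VectorAssembler(inputCols=["clicks", "dwell_time"], outputCol="features")
df_vec = assembler_demo.transform(df_numeric).drop("clicks", "dwell_time")
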
def _set_label(self, data):
    return data.ratings.withColumn(
        'score', F.when(F.col('score') >= self.threshold, 1.0).otherwise(0.0))
def _load(self):
"""
# Assert that `columns` is a list of (old_name, new_name) tuples:
assert isinstance(columns, list) and isinstance(columns[0], tuple), \
    "Error: Column argument must be a list of tuples"
col_not_valids = (
    set([column[0] for column in columns]).difference(set([column for column in self._df.columns])))
assert (col_not_valids == set()), 'Error: The following columns do not exist in dataFrame: %s' % col_not_valids

old_names = [column[0] for column in columns]
not_in_type = filter(lambda c: c not in old_names, self._df.columns)
exprs = [col(column[0]).alias(column[1]) for column in columns] + [col(column) for column in not_in_type]

self._add_transformation()  # checkpoint in case
self._df = self._df.select(*exprs)
return self
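
# Standalone sketch (assumption) of the same renaming pattern, taking
# (old_name, new_name) tuples; the function name is illustrative.
from pyspark.sql.functions import col


def rename_columns(df, columns):
    # Rename the listed columns and pass every other column through unchanged.
    old_names = [old for old, _ in columns]
    exprs = ([col(old).alias(new) for old, new in columns] +
             [col(c) for c in df.columns if c not in old_names])
    return df.select(*exprs)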