def week_end_date(col, week_end_day='Sat'):
    _raise_if_invalid_day(week_end_day)
    # These are the default Spark mappings; Spark considers Sunday the first day of the week.
    day_of_week_mapping = {'Sun': 1, 'Mon': 2, 'Tue': 3, 'Wed': 4, 'Thu': 5, 'Fri': 6, 'Sat': 7}
    return F.when(
        F.dayofweek(col).eqNullSafe(F.lit(day_of_week_mapping[week_end_day])), col
    ).otherwise(F.next_day(col, week_end_day))
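# A minimal usage sketch (an assumption, not part of the original snippet): applying
# week_end_date to a tiny DataFrame. The session and column names below are illustrative.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
demo = spark.createDataFrame([('2021-01-06',), ('2021-01-09',)], ['d']) \
    .withColumn('d', F.col('d').cast('date'))
# 2021-01-06 (a Wednesday) rolls forward to Saturday 2021-01-09;
# 2021-01-09 is already a Saturday, so it is returned unchanged.
demo.withColumn('week_end', week_end_date(F.col('d'))).show()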
def make_folds(sc, sqlContext, input_dir, output_dir, wikis, zero_features, num_folds, num_workers, max_executors):
    hdfs_mkdir(output_dir)
    df = sqlContext.read.parquet(input_dir) \
        .select('wikiid', 'query', 'features', 'label', 'norm_query_id')
    if wikis:
        df = df.where(F.col('wikiid').isin(wikis))
    counts = df.groupBy('wikiid').agg(F.count(F.lit(1)).alias('n_obs')).collect()
    counts = {row.wikiid: row.n_obs for row in counts}
    if not wikis:
        # list() so the in-place sort below works (dict_keys has no .sort())
        wikis = list(counts.keys())
    else:
        missing = set(wikis).difference(counts.keys())
        for wiki in missing:
            print('No observations available for ' + wiki)
        wikis = list(set(wikis).intersection(counts.keys()))
    if not wikis:
        raise Exception('No wikis provided')
    # Sort into descending size so that mapping over the wikis handles the largest first
    wikis.sort(reverse=True, key=lambda wiki: counts[wiki])
    if zero_features:
            if len(cond) > 0:
                return reduce(lambda x, y: x & y, cond), None
            else:
                LocIndexer._raiseNotImplemented(
                    "Cannot use slice for MultiIndex with Spark.")
        elif isinstance(rows_sel, str):
            LocIndexer._raiseNotImplemented(
                "Cannot use a scalar value for row selection with Spark.")
        else:
            try:
                rows_sel = list(rows_sel)
            except TypeError:
                LocIndexer._raiseNotImplemented(
                    "Cannot use a scalar value for row selection with Spark.")
            if len(rows_sel) == 0:
                return F.lit(False), None
            elif len(self._internal.index_columns) == 1:
                index_column = self._kdf_or_kser.index.to_series()
                index_data_type = index_column.spark_type
                if len(rows_sel) == 1:
                    return index_column._scol == F.lit(rows_sel[0]).cast(index_data_type), None
                else:
                    return index_column._scol.isin(
                        [F.lit(r).cast(index_data_type) for r in rows_sel]), None
            else:
                LocIndexer._raiseNotImplemented("Cannot select with MultiIndex with Spark.")
# aggregate per five minutes
joindf.createOrReplaceTempView("joindf")
sqlStatement = """
SELECT ServerIP, SessionStartFiveMin , min(SessionStartHour) SessionStartHour,
sum(TotalLoad) SumTotalLoad, count(*) NumSession,
sum(MbytesTransferred) SumMBytes,
sum(SubService_1_Load) SumLoad1, sum(SubSerivce_2_Load) SumLoad2, sum(SubSerivce_3_Load) SumLoad3,
sum(SubSerivce_4_Load) SumLoad4, sum(SubSerivce_5_Load) SumLoad5, sum(SecureBytes_Load) SumLoadSecure
FROM joindf group by ServerIP, SessionStartFiveMin
"""
aggregatedf = spark.sql(sqlStatement)
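# For reference only (an assumption, not part of the original notebook): the core of the same
# five-minute aggregation expressed with the DataFrame API instead of SQL.
from pyspark.sql import functions as F

aggregated_alt = (joindf
                  .groupBy('ServerIP', 'SessionStartFiveMin')
                  .agg(F.min('SessionStartHour').alias('SessionStartHour'),
                       F.sum('TotalLoad').alias('SumTotalLoad'),
                       F.count(F.lit(1)).alias('NumSession'),
                       F.sum('MbytesTransferred').alias('SumMBytes')))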
from pyspark.sql.functions import col, concat, lit, udf
aggregatedf = aggregatedf.withColumn('SessionStartHourTime', col('SessionStartHour').cast('timestamp'))
aggregatedf = aggregatedf.withColumn("key", concat(aggregatedf.ServerIP,lit("_"),aggregatedf.SessionStartHourTime.cast('string')))
aggregatedf = aggregatedf.fillna(0, subset=['SumTotalLoad'])
maxByGroup = (aggregatedf.rdd
              .map(lambda x: (x[-1], x))  # convert to a pair RDD keyed by the last column ("key")
              # For each key keep the row with the larger SumTotalLoad (column index 3);
              # equivalent to: lambda x1, x2: x1 if x1[3] > x2[3] else x2
              .reduceByKey(lambda x1, x2: max(x1, x2, key=lambda x: x[3]))
              .values())  # drop the keys
aggregatemaxdf = maxByGroup.toDF()
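# Alternative sketch (an assumption, not from the original notebook): the same "row with the
# largest SumTotalLoad per key" selection using a window function instead of an RDD reduceByKey.
from pyspark.sql import functions as F
from pyspark.sql.window import Window

w = Window.partitionBy('key').orderBy(F.col('SumTotalLoad').desc())
aggregatemaxdf_alt = (aggregatedf
                      .withColumn('_rn', F.row_number().over(w))
                      .where(F.col('_rn') == 1)
                      .drop('_rn'))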
# get the peak load per non-overlapping five-minute interval within each hour
featureeddf = None
aggregatemaxdf.createOrReplaceTempView("aggregatemaxdf")
    >>> df = ks.Series([True, False, None]).rename("a").to_frame()
    >>> df.set_index("a").index.all()
    False
    """
    axis = validate_axis(axis)
    if axis != 0:
        raise ValueError('axis should be either 0 or "index" currently.')
    sdf = self._internal._sdf.select(self._scol)
    col = scol_for(sdf, sdf.columns[0])
    # Note that we're ignoring `None`s here for now.
    # `any` and `every` were only added in Spark 3.0, i.e.:
    # ret = sdf.select(F.expr("every(CAST(`%s` AS BOOLEAN))" % sdf.columns[0])).collect()[0][0]
    # so we use min as an alternative:
    ret = sdf.select(F.min(F.coalesce(col.cast('boolean'), F.lit(True)))).collect()[0][0]
    if ret is None:
        return True
    else:
        return ret
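# A small illustration (an assumption, not part of the koalas source): min() over a boolean
# column behaves like an "every"/all() aggregate because False sorts before True.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
bools = spark.createDataFrame([(True,), (False,), (None,)], ['b'])
# Nulls are coalesced to True first, so only a genuine False can make the result False.
bools.select(F.min(F.coalesce(F.col('b'), F.lit(True))).alias('all_true')).show()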
def merge_enumerated_histogram_col(df, col_name, n_values):
    return (df
            .withColumn(col_name + "_merged",
                        merge_enumerated_histograms_udf(col_name, F.lit(col_name), F.lit(n_values)))
            .drop(col_name)
            .withColumnRenamed(col_name + "_merged", col_name))


def merge_enumerated_histogram_columns(df, cols_n_values):
# u_add_hashtags = F.udf(add_hashtags, ArrayType(StringType()))
df_topics = df_topics \
    .withColumn('hashtags', u_flatten('all_hashtags')) \
    .drop('all_hashtags')
# .withColumn('top_hashtags', u_add_hashtags('topic'))  # hack to add a literal array for each row
df_topics = df_topics \
    .withColumn('_post_ids', u_trunc_array('post_ids'))
df_topics = df_topics \
    .drop('post_ids') \
    .withColumnRenamed('_post_ids', 'post_ids')
df_topics = df_topics.select(
    '*',
    F.lit(datetime.now()).alias('created'),
    F.lit(start_time).alias('start_time'),
    F.lit(end_time).alias('end_time'),
)
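# Side note (an assumption, not part of the original job): F.lit(datetime.now()) bakes the
# driver's wall-clock time into the plan as a constant. If the stamp should instead be taken
# when the query actually runs, Spark's built-in current_timestamp() is an alternative:
# df_topics = df_topics.withColumn('created', F.current_timestamp())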
self.counts['topics*campaigns'] = df_topics.count()
# toLocalIterator has a bug: https://issues.apache.org/jira/browse/SPARK-18281
# topics_iter = df_topics.toLocalIterator()
topics = [json.loads(s) for s in df_topics.toJSON().collect()]
# for row in topics_iter:
for row in topics:
    deliver(row, kafka_url, kafka_topic)
self.save(df_topics)
    })))
    .write.parquet(output_dir))
finally:
    try:
        mjolnir.utils.hdfs_rmdir(exploded_output_dir)
    except Exception:
        pass
# Write out some extra stats that our spark utility will use to size
# executors for fold generation and training. The filename must start
# with _ to be ignored by the parquet loader.
stats_path = os.path.join(output_dir, '_stats.json')
counts = (
    sqlContext.read.parquet(output_dir)
    .groupBy('wikiid')
    .agg(F.count(F.lit(1)).alias('num_obs'))
    .collect())
with mjolnir.utils.as_output_file(stats_path, 'w') as f:
    f.write(json.dumps({
        'num_features': len(pre_selected) if pre_selected else num_features,
        'num_obs': {row.wikiid: row.num_obs for row in counts},
    }))
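# Consumer-side sketch (an assumption; the original only shows the writer): the stats file can
# be read back with plain json to derive per-wiki executor sizing. The path is illustrative.
import json

with open('output_dir/_stats.json') as f:  # hypothetical local path, for illustration only
    stats = json.load(f)
largest = max(stats['num_obs'].values())  # observations in the biggest wiki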
fact_parking = sensordata_sdf \
    .join(dim_parkingbay_sdf.alias("pb"), "bay_id", "left_outer") \
    .join(dim_location_sdf.alias("l"), ["lat", "lon"], "left_outer") \
    .join(dim_st_marker_sdf.alias("st"), "st_marker_id", "left_outer") \
    .select(
        lit(dim_date_id).alias("dim_date_id"),
        lit(dim_time_id).alias("dim_time_id"),
        when(col("pb.dim_parking_bay_id").isNull(), lit(EMPTY_UUID))
            .otherwise(col("pb.dim_parking_bay_id")).alias("dim_parking_bay_id"),
        when(col("l.dim_location_id").isNull(), lit(EMPTY_UUID))
            .otherwise(col("l.dim_location_id")).alias("dim_location_id"),
        when(col("st.dim_st_marker_id").isNull(), lit(EMPTY_UUID))
            .otherwise(col("st.dim_st_marker_id")).alias("dim_st_marker_id"),
        "status",
        lit(load_id).alias("load_id"),
        lit(loaded_on.isoformat()).cast("timestamp").alias("loaded_on")
    )
return fact_parking
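# Side note (an assumption, not part of the original transform): each
# when(col(...).isNull(), lit(EMPTY_UUID)).otherwise(col(...)) expression above can also be
# written with pyspark.sql.functions.coalesce, which reads a little more directly, e.g.:
# coalesce(col("pb.dim_parking_bay_id"), lit(EMPTY_UUID)).alias("dim_parking_bay_id")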