def test_simple_key_uncompressed(self):
df = self.spark.createDataFrame(
data=[
('k1', 'k14', [1, 14, 141]),
('k1', 'k12', [1, 12, 121]),
('k1', 'k11', [1, 11, 111]),
('k1', 'k13', [1, 13, 131]),
],
schema=T.StructType([
T.StructField('key_1', T.StringType()),
T.StructField('key_2', T.StringType()),
T.StructField('aux_data', T.ArrayType(T.IntegerType())),
])
)
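# Write each row to Redis keyed by its 'key_2' value, batching commands in
# pipelines of at most 3 operations (per the write options used below).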
df.write_ext.redis(
key_by=['key_2'],
max_pipeline_size=3,
host='redis.docker',
)
redis_client = redis.StrictRedis('redis.docker')
self.assertRowsEqual(
redis_client.keys(),
[b'k11', b'k12', b'k13', b'k14'],
ignore_order=True,
)
dataset = dataset.select(col(tenant_id + ".*"))
dataset.show()
# Filter the data
timeframe = str(config_properties.get("timeframe"))
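# dict-style .get returns None when 'timeframe' is not configured, so str(...) becomes
# the literal string 'None'; the comparison below relies on that.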
if timeframe != 'None':
filterByTime = str(datetime.datetime.now() - datetime.timedelta(minutes=int(timeframe)))
dataset = dataset.filter(dataset["date"] >= lit(str(filterByTime)))
print("Number of rows after filtering : " + str(dataset.count()))
# Convert isHoliday boolean value to Int
pd = dataset.withColumn("isHoliday", col("isHoliday").cast(IntegerType()))
# Get the week and year from date
pd = pd.withColumn("week", date_format(to_date("date", "MM/dd/yy"), "w").cast(IntegerType()))
pd = pd.withColumn("year", date_format(to_date("date", "MM/dd/yy"), "Y").cast(IntegerType()))
# Convert the date to TimestampType
pd = pd.withColumn("tx_date", to_date(unix_timestamp(pd["date"], "MM/dd/yy").cast("timestamp")))
# Convert categorical data
indexer = StringIndexer(inputCol="storeType", outputCol="storeTypeIndex")
pd = indexer.fit(pd).transform(pd)
# Get the WeeklySalesAhead and WeeklySalesLag column values
window = Window.orderBy("tx_date").partitionBy("store")
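# Within each store's date-ordered partition, lag(col, 1) reads the previous row
# (prior week) and lag(col, -1) reads the following row, i.e. it behaves like lead().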
pd = pd.withColumn("weeklySalesLag", lag("weeklySales", 1).over(window)).na.drop(subset=["weeklySalesLag"])
pd = pd.withColumn("weeklySalesAhead", lag("weeklySales", -1).over(window)).na.drop(subset=["weeklySalesAhead"])
pd = pd.withColumn("weeklySalesScaled", lag("weeklySalesAhead", -1).over(window)).na.drop(subset=["weeklySalesScaled"])
pd = pd.withColumn("weeklySalesDiff", (pd['weeklySales'] - pd['weeklySalesLag'])/pd['weeklySalesLag'])
pd = pd.na.drop()
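# Catalyst schema backing the matrix UDT: dense matrices populate only 'values', while
# sparse (CSC) matrices also carry 'colPtrs' and 'rowIndices', hence those fields are nullable.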
@classmethod
def sqlType(cls):
return StructType([
StructField("type", ByteType(), False),
StructField("numRows", IntegerType(), False),
StructField("numCols", IntegerType(), False),
StructField("colPtrs", ArrayType(IntegerType(), False), True),
StructField("rowIndices", ArrayType(IntegerType(), False), True),
StructField("values", ArrayType(DoubleType(), False), True),
StructField("isTransposed", BooleanType(), False)])
def extract_from_sds_sort_cols(self, ms_sds):
return DataCatalogTransformer.generate_idx_for_df(ms_sds, 'SD_ID', 'sortColumns',
col_schema=StructType([
StructField('column', StringType(), True),
StructField('order', IntegerType(), True)
]))\
.select('SD_ID', 'INTEGER_IDX', 'col.*')\
.withColumnRenamed('column', 'COLUMN_NAME')\
.withColumnRenamed('order', 'ORDER')
def get_shows():
shows = spark.load('query_show').alias('shows')
show_hosts = spark.load('query_show_hosts')
show_id_to_host_count = Counter()
for e in show_hosts.collect():
show_id_to_host_count[e.show_id] += 1
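# num_hosts_helper closes over the driver-side Counter, so Spark serializes the
# counts together with the UDF and ships them to the executors.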
def num_hosts_helper(show_id):
return int(show_id_to_host_count[show_id])
my_udf = func.udf(num_hosts_helper, IntegerType())
shows = shows.withColumn('num_hosts', my_udf('id'))
return shows
@staticmethod
def generate_idx_for_df(df, id_name, col_name, col_schema):
"""
generate_idx_for_df explodes rows that contain an array column into one new row per
element of the array, with 'INTEGER_IDX' indicating the element's index in the original array.
:param df: dataframe with array columns
:param id_name: the id field of df
:param col_name: the col of df to explode
:param col_schema: the schema of each element in col_name array
:return: new df with exploded rows.
"""
idx_udf = UserDefinedFunction(
DataCatalogTransformer.udf_array_to_map,
MapType(IntegerType(), col_schema, True))
return df.withColumn('idx_columns', idx_udf(col(col_name)))\
.select(id_name, explode('idx_columns').alias("INTEGER_IDX", "col"))
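# Illustrative example (values are hypothetical, not from the source): a row
#   (SD_ID=1, sortColumns=[Row(column='a', order=1), Row(column='b', order=0)])
# explodes into
#   (SD_ID=1, INTEGER_IDX=0, col=Row(column='a', order=1))
#   (SD_ID=1, INTEGER_IDX=1, col=Row(column='b', order=0))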
parameters = getattr(tuple_type, "__args__")
return _DataFrame([as_spark_type(t) for t in parameters])
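# For a single (non-tuple) hint, fall back to _Unknown when there is no matching
# Spark type; otherwise wrap the resolved type in _Scalar.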
inner = as_spark_type(tpe)
if inner is None:
return _Unknown(tpe)
else:
return _Scalar(inner)
# First element of the list is the python base type
_base = {
types.StringType(): [str, 'str', 'string'],
types.BinaryType(): [bytes],
types.ByteType(): [np.int8, 'int8', 'byte'],
types.ShortType(): [np.int16, 'int16', 'short'],
types.IntegerType(): [int, 'int', np.int32],
types.LongType(): [np.int64, 'int64', 'long', 'bigint'],
types.FloatType(): [float, 'float'],
types.DoubleType(): [np.float64, 'float64', 'double'],
types.TimestampType(): [datetime.datetime, np.datetime64],
types.DateType(): [datetime.date],
types.BooleanType(): [bool, 'boolean', 'bool'],
types.ArrayType(types.StringType()): []
}
def _build_type_dict():
return dict([(other_type, spark_type) for (spark_type, l) in _base.items() for other_type in l]
+ [(spark_type, spark_type) for (spark_type, _) in _base.items()])
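# Sketch of the resulting lookup (keys taken from _base above):
#   _build_type_dict()[int]      -> types.IntegerType()
#   _build_type_dict()['double'] -> types.DoubleType()
#   _build_type_dict()[types.LongType()] -> types.LongType()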
def _build_py_type_dict():
    # assumption: reverse of _build_type_dict -- map each Spark type back to its python
    # base type, i.e. the first element of its list in _base (types with an empty list are skipped)
    return dict([(spark_type, l[0]) for (spark_type, l) in _base.items() if len(l) > 0])
from pyspark import RDD, SparkContext
from pyspark.sql import SparkSession, Row, DataFrame
from pyspark.sql.types import StructField, StructType, StringType, IntegerType
from qanta import logging
from qanta.util.constants import BUZZ_FOLDS, FEATURE_NAMES
from qanta.datasets.quiz_bowl import QuestionDatabase
from qanta.preprocess import format_guess
log = logging.get(__name__)
SCHEMA = StructType([
StructField('fold', StringType(), False),
StructField('qnum', IntegerType(), False),
StructField('sentence', IntegerType(), False),
StructField('token', IntegerType(), False),
StructField('guess', StringType(), False),
StructField('feature_name', StringType(), False),
StructField('feature_value', StringType(), False)
])
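# One row per observed (fold, qnum, sentence, token, guess, feature_name) combination;
# feature values are stored as strings.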
def create_output(path: str):
df = read_dfs(path).cache()
question_db = QuestionDatabase()
answers = question_db.all_answers()
for qnum in answers:
answers[qnum] = format_guess(answers[qnum])
sc = SparkContext.getOrCreate() # type: SparkContext
b_answers = sc.broadcast(answers)
return points
def map_partitions_handler(records_iterator):
records_with_address_info = []
for record in records_iterator:
records_with_address_info.append(handle_record(record))
return records_with_address_info
schema = StructType([
StructField("partition", IntegerType()),
StructField("lat", FloatType()),
StructField("long", FloatType()),
StructField("street_name", StringType()),
StructField("building_no", IntegerType()),
StructField("zip_code", StringType()),
StructField("city", StringType()),
StructField("state", StringType()),
StructField("country", StringType())
])
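# Hypothetical wiring (input_rdd is an assumption; handle_record is defined elsewhere):
#   enriched_rdd = input_rdd.mapPartitions(map_partitions_handler)
#   df = spark_session.createDataFrame(enriched_rdd, schema)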
def main():
spark_session = SparkSession.builder \
.enableHiveSupport() \
.getOrCreate()
with open("sample-data.csv", "wb") as sample_data_file:
sample_data_file.write("lat,long\n")
for i in xrange(10):
sample_data_file.write("%d,40.714224,-73.961452\n" % (i % 10))