How to use the pyspark.sql.types.IntegerType class in PySpark

To help you get started, we’ve selected a few PySpark examples that show common ways IntegerType is used in public projects.

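IntegerType represents a 32-bit signed integer column (Spark SQL's INT). Before the project examples below, here is a minimal, self-contained sketch of the two most common uses, declaring the type in a schema and casting an existing column to it; the session, column names, and data are illustrative only.

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

spark = SparkSession.builder.getOrCreate()

# Declare an IntegerType column explicitly in a schema.
schema = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
])
df = spark.createDataFrame([("alice", 23), ("bob", 31)], schema)
df.printSchema()  # age: integer

# Or cast an existing string column to IntegerType.
df2 = spark.createDataFrame([("alice", "23")], ["name", "age_str"])
df2 = df2.withColumn("age", col("age_str").cast(IntegerType()))
df2.printSchema()  # age: integer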

github tubular / sparkly / tests / integration / test_writer.py View on Github external
def test_simple_key_uncompressed(self):
        df = self.spark.createDataFrame(
            data=[
                ('k1', 'k14', [1, 14, 141]),
                ('k1', 'k12', [1, 12, 121]),
                ('k1', 'k11', [1, 11, 111]),
                ('k1', 'k13', [1, 13, 131]),
            ],
            schema=T.StructType([
                T.StructField('key_1', T.StringType()),
                T.StructField('key_2', T.StringType()),
                T.StructField('aux_data', T.ArrayType(T.IntegerType())),
            ])
        )

        df.write_ext.redis(
            key_by=['key_2'],
            max_pipeline_size=3,
            host='redis.docker',
        )

        redis_client = redis.StrictRedis('redis.docker')

        self.assertRowsEqual(
            redis_client.keys(),
            [b'k11', b'k12', b'k13', b'k14'],
            ignore_order=True,
        )
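In this sparkly test, T.IntegerType() is used as the element type of the ArrayType column (aux_data) in the schema passed to createDataFrame. A stripped-down sketch of the same pattern, with illustrative names and data rather than the project's:

from pyspark.sql import SparkSession
from pyspark.sql import types as T

spark = SparkSession.builder.getOrCreate()

# An array column whose elements are 32-bit integers.
schema = T.StructType([
    T.StructField("key", T.StringType()),
    T.StructField("values", T.ArrayType(T.IntegerType())),
])
df = spark.createDataFrame([("k1", [1, 2, 3])], schema=schema)
df.printSchema()  # values: array (element: integer)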
github adobe / experience-platform-dsw-reference / recipes / pyspark / pysparkretailapp / helper.py View on Github external
        dataset = dataset.select(col(tenant_id + ".*"))
        dataset.show()

    # Filter the data
    timeframe = str(config_properties.get("timeframe"))
    if timeframe != 'None':
        filterByTime = str(datetime.datetime.now() - datetime.timedelta(minutes=int(timeframe)))
        dataset = dataset.filter(dataset["date"] >= lit(str(filterByTime)))
        print("Number of rows after filtering : " + str(dataset.count()))

    # Convert isHoliday boolean value to Int
    pd = dataset.withColumn("isHoliday", col("isHoliday").cast(IntegerType()))

    # Get the week and year from date
    pd = pd.withColumn("week", date_format(to_date("date", "MM/dd/yy"), "w").cast(IntegerType()))
    pd = pd.withColumn("year", date_format(to_date("date", "MM/dd/yy"), "Y").cast(IntegerType()))

    # Convert the date to TimestampType
    pd = pd.withColumn("tx_date", to_date(unix_timestamp(pd["date"], "MM/dd/yy").cast("timestamp")))

    # Convert categorical data
    indexer = StringIndexer(inputCol="storeType", outputCol="storeTypeIndex")
    pd = indexer.fit(pd).transform(pd)

    # Get the WeeklySalesAhead and WeeklySalesLag column values
    window = Window.orderBy("tx_date").partitionBy("store")
    pd = pd.withColumn("weeklySalesLag", lag("weeklySales", 1).over(window)).na.drop(subset=["weeklySalesLag"])
    pd = pd.withColumn("weeklySalesAhead", lag("weeklySales", -1).over(window)).na.drop(subset=["weeklySalesAhead"])
    pd = pd.withColumn("weeklySalesScaled", lag("weeklySalesAhead", -1).over(window)).na.drop(subset=["weeklySalesScaled"])
    pd = pd.withColumn("weeklySalesDiff", (pd['weeklySales'] - pd['weeklySalesLag'])/pd['weeklySalesLag'])

    pd = pd.na.drop()
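The helper above relies on Column.cast(IntegerType()) to turn a boolean flag and the string output of date_format into integers. A short sketch of that casting behaviour on an illustrative DataFrame; cast("int") is shorthand for the same thing:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(True, "12")], ["isHoliday", "week_str"])

# Booleans cast to 1/0; numeric strings cast to their integer value (null if unparsable).
df = df.withColumn("isHoliday", col("isHoliday").cast(IntegerType()))
df = df.withColumn("week", col("week_str").cast("int"))  # equivalent string form
df.show()  # isHoliday=1, week=12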
github qubole / spark-on-lambda / python / pyspark / mllib / linalg / __init__.py View on Github external
def sqlType(cls):
        return StructType([
            StructField("type", ByteType(), False),
            StructField("numRows", IntegerType(), False),
            StructField("numCols", IntegerType(), False),
            StructField("colPtrs", ArrayType(IntegerType(), False), True),
            StructField("rowIndices", ArrayType(IntegerType(), False), True),
            StructField("values", ArrayType(DoubleType(), False), True),
            StructField("isTransposed", BooleanType(), False)])
github aws-samples / aws-glue-samples / utilities / Hive_metastore_migration / src / hive_metastore_migration.py View on Github external
def extract_from_sds_sort_cols(self, ms_sds):

        return DataCatalogTransformer.generate_idx_for_df(ms_sds, 'SD_ID', 'sortColumns',
                                                          col_schema=StructType([
                                                              StructField('column', StringType(), True),
                                                              StructField('order', IntegerType(), True)
                                                          ]))\
            .select('SD_ID', 'INTEGER_IDX', 'col.*')\
            .withColumnRenamed('column', 'COLUMN_NAME')\
            .withColumnRenamed('order', 'ORDER')
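generate_idx_for_df (shown in full further down) uses a UDF keyed by IntegerType to attach an index to each exploded array element. When only the integer position is needed, Spark's built-in posexplode (Spark 2.1+) does the same job without a UDF; a sketch, assuming an illustrative DataFrame with an array column:

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, ["a", "b", "c"])], ["SD_ID", "sortColumns"])

# posexplode yields an integer position plus the element itself.
exploded = df.select("SD_ID", F.posexplode("sortColumns").alias("INTEGER_IDX", "col"))
exploded.show()  # rows: (1, 0, a), (1, 1, b), (1, 2, c)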
github scanner-research / esper-tv / app / esper / spark_util.py View on Github external
def get_shows():
    shows = spark.load('query_show').alias('shows')

    show_hosts = spark.load('query_show_hosts')
    show_id_to_host_count = Counter()
    for e in show_hosts.collect():
        show_id_to_host_count[e.show_id] += 1

    def num_hosts_helper(show_id):
        return int(show_id_to_host_count[show_id])

    my_udf = func.udf(num_hosts_helper, IntegerType())
    shows = shows.withColumn('num_hosts', my_udf('id'))
    return shows
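Here IntegerType is the declared return type of a Python UDF: func.udf(num_hosts_helper, IntegerType()) tells Spark what column type to produce, while the wrapped function is still responsible for returning plain Python ints (values of another type typically come back as null). A self-contained sketch with an illustrative UDF:

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("a",), ("bbb",)], ["name"])

# The UDF should return int (or None); the declared type makes the column an integer.
name_len = F.udf(lambda s: len(s) if s is not None else None, IntegerType())
df = df.withColumn("name_len", name_len("name"))
df.show()  # name_len = 1, 3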
github aws-samples / aws-glue-samples / utilities / Hive_metastore_migration / src / hive_metastore_migration.py View on Github external
    @staticmethod
    def generate_idx_for_df(df, id_name, col_name, col_schema):
        """
        Explode rows that have an array column into one new row per array element,
        with 'INTEGER_IDX' indicating the element's index in the original array.
        :param df: dataframe with array columns
        :param id_name: the id field of df
        :param col_name: the col of df to explode
        :param col_schema: the schema of each element in col_name array
        :return: new df with exploded rows.
        """
        idx_udf = UserDefinedFunction(
            DataCatalogTransformer.udf_array_to_map,
            MapType(IntegerType(), col_schema, True))

        return df.withColumn('idx_columns', idx_udf(col(col_name)))\
            .select(id_name, explode('idx_columns').alias("INTEGER_IDX", "col"))
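In this transformer, IntegerType is the key type of MapType(IntegerType(), col_schema, True): the UDF turns each array into a map from element index to element, which explode then splits into (INTEGER_IDX, col) pairs. A minimal illustration of a MapType keyed by IntegerType:

from pyspark.sql.types import MapType, IntegerType, StringType

# Map from a 32-bit int key to a (possibly null) string value.
idx_to_value = MapType(IntegerType(), StringType(), True)
print(idx_to_value.simpleString())  # map<int,string>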
github databricks / koalas / databricks / koalas / typedef.py View on Github external
        parameters = getattr(tuple_type, "__args__")
        return _DataFrame([as_spark_type(t) for t in parameters])
    inner = as_spark_type(tpe)
    if inner is None:
        return _Unknown(tpe)
    else:
        return _Scalar(inner)


# First element of the list is the python base type
_base = {
    types.StringType(): [str, 'str', 'string'],
    types.BinaryType(): [bytes],
    types.ByteType(): [np.int8, 'int8', 'byte'],
    types.ShortType(): [np.int16, 'int16', 'short'],
    types.IntegerType(): [int, 'int', np.int, np.int32],
    types.LongType(): [np.int64, 'int64', 'long', 'bigint'],
    types.FloatType(): [float, 'float', np.float],
    types.DoubleType(): [np.float64, 'float64', 'double'],
    types.TimestampType(): [datetime.datetime, np.datetime64],
    types.DateType(): [datetime.date],
    types.BooleanType(): [bool, 'boolean', 'bool', np.bool],
    types.ArrayType(types.StringType()): []
}


def _build_type_dict():
    return dict([(other_type, spark_type) for (spark_type, l) in _base.items() for other_type in l]
                + [(spark_type, spark_type) for (spark_type, _) in _base.items()])


def _build_py_type_dict():
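One detail worth keeping in mind alongside mappings like the one above: when PySpark infers a schema on its own, a plain Python int becomes LongType (bigint), so IntegerType only shows up when you request it via an explicit schema or a cast. (The np.int, np.float and np.bool aliases in this snippet have since been removed from recent NumPy releases.) A short sketch of the difference, with illustrative data:

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType

spark = SparkSession.builder.getOrCreate()

inferred = spark.createDataFrame([(1,)], ["x"])
inferred.printSchema()  # x: long  -- Python int is inferred as LongType

explicit = spark.createDataFrame([(1,)], StructType([StructField("x", IntegerType())]))
explicit.printSchema()  # x: integer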
github Pinafore / qb / qanta / util / spark_features.py View on Github external
from pyspark import RDD, SparkContext
from pyspark.sql import SparkSession, Row, DataFrame
from pyspark.sql.types import StructField, StructType, StringType, IntegerType

from qanta import logging
from qanta.util.constants import BUZZ_FOLDS, FEATURE_NAMES
from qanta.datasets.quiz_bowl import QuestionDatabase
from qanta.preprocess import format_guess

log = logging.get(__name__)

SCHEMA = StructType([
    StructField('fold', StringType(), False),
    StructField('qnum', IntegerType(), False),
    StructField('sentence', IntegerType(), False),
    StructField('token', IntegerType(), False),
    StructField('guess', StringType(), False),
    StructField('feature_name', StringType(), False),
    StructField('feature_value', StringType(), False)
])


def create_output(path: str):
    df = read_dfs(path).cache()
    question_db = QuestionDatabase()
    answers = question_db.all_answers()
    for qnum in answers:
        answers[qnum] = format_guess(answers[qnum])

    sc = SparkContext.getOrCreate()  # type: SparkContext
    b_answers = sc.broadcast(answers)
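The SCHEMA above marks qnum, sentence and token as non-nullable IntegerType fields (nullable=False). With createDataFrame's default verifySchema=True, supplying None for such a field is typically rejected before the DataFrame is built; a minimal sketch with an illustrative one-field schema:

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType

spark = SparkSession.builder.getOrCreate()
schema = StructType([StructField("qnum", IntegerType(), False)])

spark.createDataFrame([(1,)], schema).show()  # fine
# spark.createDataFrame([(None,)], schema) would fail schema verification here,
# because the non-nullable 'qnum' field cannot receive None.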
github Rookout / deployment-examples / python-spark / example.py View on Github external
    return points


def map_partitions_handler(records_iterator):
    records_with_address_info = []
    for record in records_iterator:
        records_with_address_info.append(handle_record(record))
    return records_with_address_info


schema = StructType([
    StructField("partition", IntegerType()),
    StructField("lat", FloatType()),
    StructField("long", FloatType()),
    StructField("street_name", StringType()),
    StructField("building_no", IntegerType()),
    StructField("zip_code", StringType()),
    StructField("city", StringType()),
    StructField("state", StringType()),
    StructField("country", StringType())
])


def main():

    spark_session = SparkSession.builder \
        .enableHiveSupport() \
        .getOrCreate()
    with open("sample-data.csv", "wb") as sample_data_file:
        sample_data_file.write("lat,long\n")
        for i in xrange(10):
            sample_data_file.write("%d,40.714224,-73.961452\n" % (i % 10))