How to use the pyspark.sql.types.StructField class in pyspark

To help you get started, we’ve selected a few StructField examples based on popular ways it is used in public projects.

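If you are new to the API, here is a minimal, self-contained sketch of the basic pattern before the project excerpts below: each StructField names a column, assigns it a data type, and flags whether it may contain nulls, and a StructType gathers the fields into a schema. The session and sample rows here are illustrative only, not taken from any of the projects below.

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# StructField(name, dataType, nullable) describes a single column
schema = StructType([
    StructField('name', StringType(), True),   # nullable string column
    StructField('age', IntegerType(), False),  # required integer column
])

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([('alice', 34), ('bob', 25)], schema)
df.printSchema()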

github apache / incubator-spot / spot-ingest / pipelines / flow / streaming.py (view on GitHub)
# excerpt: part of a flow-record schema; the enclosing StructType([ and
# the earlier fields are truncated in this excerpt
StructField('stos', IntegerType(), True),
StructField('ipkt', LongType(), True),
StructField('ibyt', LongType(), True),
StructField('opkt', LongType(), True),
StructField('obyt', LongType(), True),
StructField('input', IntegerType(), True),
StructField('output', IntegerType(), True),
StructField('sas', IntegerType(), True),
StructField('das', IntegerType(), True),
StructField('dtos', IntegerType(), True),
StructField('dir', IntegerType(), True),
StructField('rip', StringType(), True),
StructField('y', ShortType(), True),
StructField('m', ShortType(), True),
StructField('d', ShortType(), True),
StructField('h', ShortType(), True)
]
github narayave / Insight-GDELT-Feed / src / spark / gdelt_schema_v2.py (view on GitHub)
gdeltSchema = StructType([
    StructField('GLOBALEVENTID', StringType(), True),
    StructField('SQLDATE', StringType(), True),
    StructField('MonthYear', StringType(), True),
    StructField('Year', StringType(), True),
    StructField('FractionDate', StringType(), True),
    StructField('Actor1Code', StringType(), True),
    StructField('Actor1Name', StringType(), True),
    StructField('Actor1CountryCode', StringType(), True),
    StructField('Actor1KnownGroupCode', StringType(), True),
    StructField('Actor1EthnicCode', StringType(), True),
    StructField('Actor1Religion1Code', StringType(), True),
    StructField('Actor1Religion2Code', StringType(), True),
    StructField('Actor1Type1Code', StringType(), True),
    StructField('Actor1Type2Code', StringType(), True),
    StructField('Actor1Type3Code', StringType(), True),
    StructField('Actor2Code', StringType(), True),
    StructField('Actor2Name', StringType(), True),
    StructField('Actor2CountryCode', StringType(), True),
    StructField('Actor2KnownGroupCode', StringType(), True),
    StructField('Actor2EthnicCode', StringType(), True),
    StructField('Actor2Religion1Code', StringType(), True),
    StructField('Actor2Religion2Code', StringType(), True),
    StructField('Actor2Type1Code', StringType(), True),
    StructField('Actor2Type2Code', StringType(), True),
    StructField('Actor2Type3Code', StringType(), True),
    StructField('IsRootEvent', StringType(), True),
    StructField('EventCode', StringType(), True),
    StructField('EventBaseCode', StringType(), True),
    StructField('EventRootCode', StringType(), True),
    StructField('QuadClass', StringType(), True),
    # ... the remaining GDELT fields are truncated in this excerpt
])
github dmwm / CMSSpark / src / python / CMSSpark / dbs_spark.py (view on GitHub)
    """(excerpt: the function definition and the start of this DDL
    docstring are truncated)
    ACQUISITION_ERA_ID NUMBER(38)
    PROCESSING_ERA_ID NUMBER(38)
    PHYSICS_GROUP_ID NUMBER(38)
    XTCROSSSECTION FLOAT(126)
    PREP_ID VARCHAR2(256)
    CREATION_DATE NUMBER(38)
    CREATE_BY VARCHAR2(500)
    LAST_MODIFICATION_DATE NUMBER(38)
    LAST_MODIFIED_BY VARCHAR2(500)
    """
    return StructType([
            StructField("d_dataset_id", IntegerType(), True),
            StructField("d_dataset", StringType(), True),
            StructField("d_is_dataset_valid", IntegerType(), True),
            StructField("d_primary_ds_id", IntegerType(), True),
            StructField("d_processed_ds_id", IntegerType(), True),
            StructField("d_data_tier_id", IntegerType(), True),
            StructField("d_dataset_access_type_id", IntegerType(), True),
            StructField("d_acquisition_era_id", IntegerType(), True),
            StructField("d_processing_era_id", IntegerType(), True),
            StructField("d_physics_group_id", IntegerType(), True),
            StructField("d_xtcrosssection", DoubleType(), True),
            StructField("d_prep_id", StringType(), True),
            StructField("d_creation_date", DoubleType(), True),
            StructField("d_create_by", StringType(), True),
            StructField("d_last_modification_date", DoubleType(), True),
            StructField("d_last_modified_by", StringType(), True)
        ])
github databricks / koalas / databricks / koalas / groupby.py (view on GitHub)
            # excerpt: from inside a groupby transform method; the enclosing
            # definition and the preceding lines are truncated
            kdf = DataFrame(pdf)
            return_schema = kdf._sdf.schema
            if len(pdf) <= limit:
                return kdf

            sdf = self._spark_group_map_apply(
                pandas_transform, return_schema, retain_index=True)
            # If schema is inferred, we can restore indexes too.
            internal = kdf._internal.copy(sdf=sdf,
                                          column_scols=[scol_for(sdf, col)
                                                        for col in kdf._internal.data_columns])
        else:
            return_type = _infer_return_type(func).tpe
            data_columns = self._kdf._internal.data_columns
            return_schema = StructType([
                StructField(c, return_type) for c in data_columns if c not in input_groupnames])

            sdf = self._spark_group_map_apply(
                pandas_transform, return_schema, retain_index=False)
            # Otherwise, it loses index.
            internal = _InternalFrame(sdf=sdf)

        return DataFrame(internal)
github aws-samples / aws-glue-samples / utilities / Hive_metastore_migration / src / hive_metastore_migration.py (view on GitHub)
        # excerpt: the enclosing StructType([ assignment and the earlier
        # fields are truncated
        StructField('name', StringType(), False),
        StructField('tableType', StringType(), True),
        StructField('viewExpandedText', StringType(), True),
        StructField('viewOriginalText', StringType(), True),
        StructField('parameters', MapType(StringType(), StringType(), True), True),
        StructField('partitionKeys', ArrayType(StructType([
            StructField('name', StringType(), True),
            StructField('type', StringType(), True),
            StructField('comment', StringType(), True)
        ]), True), True),
        StructField('storageDescriptor', DATACATALOG_STORAGE_DESCRIPTOR_SCHEMA, True)
    ])

DATACATALOG_PARTITION_ITEM_SCHEMA = \
    StructType([
        StructField('creationTime', StringType(), True),
        StructField('lastAccessTime', StringType(), True),
        StructField('namespaceName', StringType(), True),
        StructField('tableName', StringType(), True),
        StructField('parameters', MapType(StringType(), StringType(), True), True),
        StructField('storageDescriptor', DATACATALOG_STORAGE_DESCRIPTOR_SCHEMA, True),
        StructField('values', ArrayType(StringType(), False), False)
    ])

DATACATALOG_DATABASE_SCHEMA = \
    StructType([
        StructField('items', ArrayType(
            DATACATALOG_DATABASE_ITEM_SCHEMA, False),
                    True),
        StructField('type', StringType(), False)
    ])
github openstack / monasca-transform / monasca_transform / transform / transform_utils.py (view on GitHub)
        # excerpt: from inside a schema-building method; the enclosing
        # definition is truncated
        # FIXME: change when pre_transform_specs df is finalized

        event_type = StructField("event_type", StringType(), True)

        metric_id_list = StructField("metric_id_list",
                                     ArrayType(StringType(),
                                               containsNull=False),
                                     True)
        required_raw_fields_list = StructField("required_raw_fields_list",
                                               ArrayType(StringType(),
                                                         containsNull=False),
                                               True)

        event_processing_params = \
            StructField("event_processing_params",
                        StructType([StructField("set_default_zone_to",
                                                StringType(), True),
                                    StructField("set_default_geolocation_to",
                                                StringType(), True),
                                    StructField("set_default_region_to",
                                                StringType(), True),
                                    ]), True)

        schema = StructType([event_processing_params, event_type,
                             metric_id_list, required_raw_fields_list])

        return schema
github aws-samples / aws-glue-samples / utilities / Hive_metastore_migration / src / hive_metastore_migration.py (view on GitHub)
from pyspark.sql.types import IntegerType, LongType, StringType, StructField, StructType


def construct_struct_schema(schema_tuples_list):
    struct_fields = []
    atomic_types_dict = {
        'int': IntegerType(),
        'long': LongType(),
        'string': StringType()
    }
    for (col_name, col_type, nullable) in schema_tuples_list:
        field_type = atomic_types_dict[col_type]
        struct_fields.append(StructField(name=col_name, dataType=field_type, nullable=nullable))
    return StructType(struct_fields)
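As a quick usage sketch (the column tuples below are invented for illustration), the helper turns (name, type, nullable) tuples into a ready-made schema:

schema = construct_struct_schema([
    ('id', 'long', False),      # hypothetical columns, not from the source
    ('name', 'string', True),
    ('age', 'int', True),
])
# equivalent to StructType([StructField('id', LongType(), False), ...])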
github apache / incubator-spot / spot-ingest / pipelines / proxy / streaming.py (view on GitHub)
# excerpt: part of a proxy-log schema; the enclosing StructType([ and
# the earlier fields are truncated in this excerpt
StructField('urischeme', StringType(), True),
StructField('uriport', StringType(), True),
StructField('uripath', StringType(), True),
StructField('uriquery', StringType(), True),
StructField('uriextension', StringType(), True),
StructField('serverip', StringType(), True),
StructField('scbytes', IntegerType(), True),
StructField('csbytes', IntegerType(), True),
StructField('virusid', StringType(), True),
StructField('bcappname', StringType(), True),
StructField('bcappoper', StringType(), True),
StructField('fulluri', StringType(), True),
StructField('y', StringType(), True),
StructField('m', StringType(), True),
StructField('d', StringType(), True),
StructField('h', StringType(), True)
]
github openstack / monasca-transform / monasca_transform / transform / transform_utils.py (view on GitHub)
"event_date", "event_hour", "event_minute",
                   "event_second", "metric_group", "metric_id"]

        columns_struct_fields = [StructField(field_name, StringType(), True)
                                 for field_name in columns]

        # Add columns for the non-string fields
        columns_struct_fields.insert(0,
                                     StructField("event_timestamp_unix",
                                                 DoubleType(), True))
        columns_struct_fields.insert(0,
                                     StructField("event_quantity",
                                                 DoubleType(), True))

        # map to metric meta
        columns_struct_fields.append(StructField("meta",
                                                 MapType(StringType(),
                                                         StringType(),
                                                         True),
                                                 True))
        # map to dimensions
        columns_struct_fields.append(StructField("dimensions",
                                                 MapType(StringType(),
                                                         StringType(),
                                                         True),
                                                 True))
        # map to value_meta
        columns_struct_fields.append(StructField("value_meta",
                                                 MapType(StringType(),
                                                         StringType(),
                                                         True),
                                                 True))
github Pinafore / qb / qanta / util / spark_features.py (view on GitHub)
from pyspark import RDD, SparkContext
from pyspark.sql import SparkSession, Row, DataFrame
from pyspark.sql.types import StructField, StructType, StringType, IntegerType

from qanta import logging
from qanta.util.constants import BUZZ_FOLDS, FEATURE_NAMES
from qanta.datasets.quiz_bowl import QuestionDatabase
from qanta.preprocess import format_guess

log = logging.get(__name__)

SCHEMA = StructType([
    StructField('fold', StringType(), False),
    StructField('qnum', IntegerType(), False),
    StructField('sentence', IntegerType(), False),
    StructField('token', IntegerType(), False),
    StructField('guess', StringType(), False),
    StructField('feature_name', StringType(), False),
    StructField('feature_value', StringType(), False)
])


def create_output(path: str):
    df = read_dfs(path).cache()
    question_db = QuestionDatabase()
    answers = question_db.all_answers()
    for qnum in answers:
        answers[qnum] = format_guess(answers[qnum])

    sc = SparkContext.getOrCreate()  # type: SparkContext
    b_answers = sc.broadcast(answers)
    # ... (the rest of create_output is truncated in this excerpt)
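Finally, a hedged sketch of how a schema like SCHEMA above is typically applied when building a DataFrame. The rows here are made up, and positional tuples are used so the values line up with the declared field order:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
# (fold, qnum, sentence, token, guess, feature_name, feature_value)
rows = [('dev', 1, 0, 0, 'Paris', 'lm_score', '0.87')]  # illustrative data
df = spark.createDataFrame(rows, SCHEMA)
df.show()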