How to use the pyspark.sql.types.StructField class in pyspark

To help you get started, we’ve selected a few pyspark examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

apache / incubator-spot / spot-ingest / pipelines / flow / streaming.py View on Github

StructField('stos', IntegerType(), True),
                StructField('ipkt', LongType(), True),
                StructField('ibyt', LongType(), True),
                StructField('opkt', LongType(), True),
                StructField('obyt', LongType(), True),
                StructField('input', IntegerType(), True),
                StructField('output', IntegerType(), True),
                StructField('sas', IntegerType(), True),
                StructField('das', IntegerType(), True),
                StructField('dtos', IntegerType(), True),
                StructField('dir', IntegerType(), True),
                StructField('rip', StringType(), True),
                StructField('y', ShortType(), True),
                StructField('m', ShortType(), True),
                StructField('d', ShortType(), True),
                StructField('h', ShortType(), True)
            ]

narayave / Insight-GDELT-Feed / src / spark / gdelt_schema_v2.py View on Github

gdeltSchema = StructType([
    StructField('GLOBALEVENTID', StringType(), True),
    StructField('SQLDATE', StringType(), True),
    StructField('MonthYear', StringType(), True),
    StructField('Year', StringType(), True),
    StructField('FractionDate', StringType(), True),
    StructField('Actor1Code', StringType(), True),
    StructField('Actor1Name', StringType(), True),
    StructField('Actor1CountryCode', StringType(), True),
    StructField('Actor1KnownGroupCode', StringType(), True),
    StructField('Actor1EthnicCode', StringType(), True),
    StructField('Actor1Religion1Code', StringType(), True),
    StructField('Actor1Religion2Code', StringType(), True),
    StructField('Actor1Type1Code', StringType(), True),
    StructField('Actor1Type2Code', StringType(), True),
    StructField('Actor1Type3Code', StringType(), True),
    StructField('Actor2Code', StringType(), True),
    StructField('Actor2Name', StringType(), True),
    StructField('Actor2CountryCode', StringType(), True),
    StructField('Actor2KnownGroupCode', StringType(), True),
    StructField('Actor2EthnicCode', StringType(), True),
    StructField('Actor2Religion1Code', StringType(), True),
    StructField('Actor2Religion2Code', StringType(), True),
    StructField('Actor2Type1Code', StringType(), True),
    StructField('Actor2Type2Code', StringType(), True),
    StructField('Actor2Type3Code', StringType(), True),
    StructField('IsRootEvent', StringType(), True),
    StructField('EventCode', StringType(), True),
    StructField('EventBaseCode', StringType(), True),
    StructField('EventRootCode', StringType(), True),
    StructField('QuadClass', StringType(), True),

dmwm / CMSSpark / src / python / CMSSpark / dbs_spark.py View on Github

ACQUISITION_ERA_ID NUMBER(38)
 PROCESSING_ERA_ID NUMBER(38)
 PHYSICS_GROUP_ID NUMBER(38)
 XTCROSSSECTION FLOAT(126)
 PREP_ID VARCHAR2(256)
 CREATION_DATE NUMBER(38)
 CREATE_BY VARCHAR2(500)
 LAST_MODIFICATION_DATE NUMBER(38)
 LAST_MODIFIED_BY VARCHAR2(500)
    """
    return StructType([
            StructField("d_dataset_id", IntegerType(), True),
            StructField("d_dataset", StringType(), True),
            StructField("d_is_dataset_valid", IntegerType(), True),
            StructField("d_primary_ds_id", IntegerType(), True),
            StructField("d_processed_ds_id", IntegerType(), True),
            StructField("d_data_tier_id", IntegerType(), True),
            StructField("d_dataset_access_type_id", IntegerType(), True),
            StructField("d_acquisition_era_id", IntegerType(), True),
            StructField("d_processing_era_id", IntegerType(), True),
            StructField("d_physics_group_id", IntegerType(), True),
            StructField("d_xtcrosssection", DoubleType(), True),
            StructField("d_prep_id", StringType(), True),
            StructField("d_creation_date", DoubleType(), True),
            StructField("d_create_by", StringType(), True),
            StructField("d_last_modification_date", DoubleType(), True),
            StructField("d_last_modified_by", StringType(), True)
        ])

databricks / koalas / databricks / koalas / groupby.py View on Github

kdf = DataFrame(pdf)
            return_schema = kdf._sdf.schema
            if len(pdf) <= limit:
                return kdf

            sdf = self._spark_group_map_apply(
                pandas_transform, return_schema, retain_index=True)
            # If schema is inferred, we can restore indexes too.
            internal = kdf._internal.copy(sdf=sdf,
                                          column_scols=[scol_for(sdf, col)
                                                        for col in kdf._internal.data_columns])
        else:
            return_type = _infer_return_type(func).tpe
            data_columns = self._kdf._internal.data_columns
            return_schema = StructType([
                StructField(c, return_type) for c in data_columns if c not in input_groupnames])

            sdf = self._spark_group_map_apply(
                pandas_transform, return_schema, retain_index=False)
            # Otherwise, it loses index.
            internal = _InternalFrame(sdf=sdf)

        return DataFrame(internal)

aws-samples / aws-glue-samples / utilities / Hive_metastore_migration / src / hive_metastore_migration.py View on Github

StructField('name', StringType(), False),
        StructField('tableType', StringType(), True),
        StructField('viewExpandedText', StringType(), True),
        StructField('viewOriginalText', StringType(), True),
        StructField('parameters', MapType(StringType(), StringType(), True), True),
        StructField('partitionKeys', ArrayType(StructType([
            StructField('name', StringType(), True),
            StructField('type', StringType(), True),
            StructField('comment', StringType(), True)
        ]), True), True),
        StructField('storageDescriptor', DATACATALOG_STORAGE_DESCRIPTOR_SCHEMA, True)
    ])

# Schema of one partition item: nullable string bookkeeping columns, a
# string-to-string parameter map, the shared storage-descriptor struct, and a
# required array of non-null string partition values.
DATACATALOG_PARTITION_ITEM_SCHEMA = StructType([
    StructField(name='creationTime', dataType=StringType(), nullable=True),
    StructField(name='lastAccessTime', dataType=StringType(), nullable=True),
    StructField(name='namespaceName', dataType=StringType(), nullable=True),
    StructField(name='tableName', dataType=StringType(), nullable=True),
    StructField(
        name='parameters',
        dataType=MapType(keyType=StringType(), valueType=StringType(), valueContainsNull=True),
        nullable=True),
    StructField(
        name='storageDescriptor',
        dataType=DATACATALOG_STORAGE_DESCRIPTOR_SCHEMA,
        nullable=True),
    StructField(
        name='values',
        dataType=ArrayType(elementType=StringType(), containsNull=False),
        nullable=False),
])

# Schema of a database listing: a nullable 'items' array whose elements are
# non-null DATACATALOG_DATABASE_ITEM_SCHEMA structs, plus a required string
# 'type' column.
DATACATALOG_DATABASE_SCHEMA = StructType([
    StructField(
        name='items',
        dataType=ArrayType(elementType=DATACATALOG_DATABASE_ITEM_SCHEMA, containsNull=False),
        nullable=True),
    StructField(name='type', dataType=StringType(), nullable=False),
])

openstack / monasca-transform / monasca_transform / transform / transform_utils.py View on Github

# FIXME: change when pre_transform_specs df is finalized

        event_type = StructField("event_type", StringType(), True)

        metric_id_list = StructField("metric_id_list",
                                     ArrayType(StringType(),
                                               containsNull=False),
                                     True)
        required_raw_fields_list = StructField("required_raw_fields_list",
                                               ArrayType(StringType(),
                                                         containsNull=False),
                                               True)

        event_processing_params = \
            StructField("event_processing_params",
                        StructType([StructField("set_default_zone_to",
                                                StringType(), True),
                                    StructField("set_default_geolocation_to",
                                                StringType(), True),
                                    StructField("set_default_region_to",
                                                StringType(), True),
                                    ]), True)

        schema = StructType([event_processing_params, event_type,
                             metric_id_list, required_raw_fields_list])

        return schema

aws-samples / aws-glue-samples / utilities / Hive_metastore_migration / src / hive_metastore_migration.py View on Github

def construct_struct_schema(schema_tuples_list):
    """Build a ``StructType`` from a list of column descriptions.

    :param schema_tuples_list: iterable of ``(col_name, col_type, nullable)``
        triples, where ``col_type`` is one of ``'int'``, ``'long'`` or
        ``'string'``.
    :return: a ``StructType`` with one ``StructField`` per triple, in the
        order given.
    :raises KeyError: if a ``col_type`` is not one of the supported names.
    """
    # Only these atomic type names are supported; lookup fails on any other.
    spark_type_by_name = {
        'string': StringType(),
        'long': LongType(),
        'int': IntegerType(),
    }
    return StructType([
        StructField(name=column, dataType=spark_type_by_name[type_name], nullable=is_nullable)
        for (column, type_name, is_nullable) in schema_tuples_list
    ])

apache / incubator-spot / spot-ingest / pipelines / proxy / streaming.py View on Github

StructField('urischeme', StringType(), True),
                StructField('uriport', StringType(), True),
                StructField('uripath', StringType(), True),
                StructField('uriquery', StringType(), True),
                StructField('uriextension', StringType(), True),
                StructField('serverip', StringType(), True),
                StructField('scbytes', IntegerType(), True),
                StructField('csbytes', IntegerType(), True),
                StructField('virusid', StringType(), True),
                StructField('bcappname', StringType(), True),
                StructField('bcappoper', StringType(), True),
                StructField('fulluri', StringType(), True),
                StructField('y', StringType(), True),
                StructField('m', StringType(), True),
                StructField('d', StringType(), True),
                StructField('h', StringType(), True)
            ]

openstack / monasca-transform / monasca_transform / transform / transform_utils.py View on Github

"event_date", "event_hour", "event_minute",
                   "event_second", "metric_group", "metric_id"]

        columns_struct_fields = [StructField(field_name, StringType(), True)
                                 for field_name in columns]

        # Add columns for the non-string fields
        columns_struct_fields.insert(0,
                                     StructField("event_timestamp_unix",
                                                 DoubleType(), True))
        columns_struct_fields.insert(0,
                                     StructField("event_quantity",
                                                 DoubleType(), True))

        # map to metric meta
        columns_struct_fields.append(StructField("meta",
                                                 MapType(StringType(),
                                                         StringType(),
                                                         True),
                                                 True))
        # map to dimensions
        columns_struct_fields.append(StructField("dimensions",
                                                 MapType(StringType(),
                                                         StringType(),
                                                         True),
                                                 True))
        # map to value_meta
        columns_struct_fields.append(StructField("value_meta",
                                                 MapType(StringType(),
                                                         StringType(),
                                                         True),
                                                 True))

Pinafore / qb / qanta / util / spark_features.py View on Github

from pyspark import RDD, SparkContext
from pyspark.sql import SparkSession, Row, DataFrame
from pyspark.sql.types import StructField, StructType, StringType, IntegerType

from qanta import logging
from qanta.util.constants import BUZZ_FOLDS, FEATURE_NAMES
from qanta.datasets.quiz_bowl import QuestionDatabase
from qanta.preprocess import format_guess

log = logging.get(__name__)

# Row schema: every column is non-nullable. Columns are listed as
# (name, Spark type class) pairs, in output order.
SCHEMA = StructType([
    StructField(column, spark_type(), False)
    for column, spark_type in (
        ('fold', StringType),
        ('qnum', IntegerType),
        ('sentence', IntegerType),
        ('token', IntegerType),
        ('guess', StringType),
        ('feature_name', StringType),
        ('feature_value', StringType),
    )
])


def create_output(path: str):
    df = read_dfs(path).cache()
    question_db = QuestionDatabase()
    answers = question_db.all_answers()
    for qnum in answers:
        answers[qnum] = format_guess(answers[qnum])

    sc = SparkContext.getOrCreate()  # type: SparkContext
    b_answers = sc.broadcast(answers)

How to use the pyspark.sql.types.StructField class in pyspark

To help you get started, we’ve selected a few pyspark examples, based on popular ways it is used in public projects.

pyspark

Package Health Score

Popular pyspark functions

Similar packages