Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
StructField('stos', IntegerType(), True),
StructField('ipkt', LongType(), True),
StructField('ibyt', LongType(), True),
StructField('opkt', LongType(), True),
StructField('obyt', LongType(), True),
StructField('input', IntegerType(), True),
StructField('output', IntegerType(), True),
StructField('sas', IntegerType(), True),
StructField('das', IntegerType(), True),
StructField('dtos', IntegerType(), True),
StructField('dir', IntegerType(), True),
StructField('rip', StringType(), True),
StructField('y', ShortType(), True),
StructField('m', ShortType(), True),
StructField('d', ShortType(), True),
StructField('h', ShortType(), True)
]
gdeltSchema = StructType([
StructField('GLOBALEVENTID', StringType(), True),
StructField('SQLDATE', StringType(), True),
StructField('MonthYear', StringType(), True),
StructField('Year', StringType(), True),
StructField('FractionDate', StringType(), True),
StructField('Actor1Code', StringType(), True),
StructField('Actor1Name', StringType(), True),
StructField('Actor1CountryCode', StringType(), True),
StructField('Actor1KnownGroupCode', StringType(), True),
StructField('Actor1EthnicCode', StringType(), True),
StructField('Actor1Religion1Code', StringType(), True),
StructField('Actor1Religion2Code', StringType(), True),
StructField('Actor1Type1Code', StringType(), True),
StructField('Actor1Type2Code', StringType(), True),
StructField('Actor1Type3Code', StringType(), True),
StructField('Actor2Code', StringType(), True),
StructField('Actor2Name', StringType(), True),
StructField('Actor2CountryCode', StringType(), True),
StructField('Actor2KnownGroupCode', StringType(), True),
StructField('Actor2EthnicCode', StringType(), True),
StructField('Actor2Religion1Code', StringType(), True),
StructField('Actor2Religion2Code', StringType(), True),
StructField('Actor2Type1Code', StringType(), True),
StructField('Actor2Type2Code', StringType(), True),
StructField('Actor2Type3Code', StringType(), True),
StructField('IsRootEvent', StringType(), True),
StructField('EventCode', StringType(), True),
StructField('EventBaseCode', StringType(), True),
StructField('EventRootCode', StringType(), True),
StructField('QuadClass', StringType(), True),
ACQUISITION_ERA_ID NUMBER(38)
PROCESSING_ERA_ID NUMBER(38)
PHYSICS_GROUP_ID NUMBER(38)
XTCROSSSECTION FLOAT(126)
PREP_ID VARCHAR2(256)
CREATION_DATE NUMBER(38)
CREATE_BY VARCHAR2(500)
LAST_MODIFICATION_DATE NUMBER(38)
LAST_MODIFIED_BY VARCHAR2(500)
"""
return StructType([
StructField("d_dataset_id", IntegerType(), True),
StructField("d_dataset", StringType(), True),
StructField("d_is_dataset_valid", IntegerType(), True),
StructField("d_primary_ds_id", IntegerType(), True),
StructField("d_processed_ds_id", IntegerType(), True),
StructField("d_data_tier_id", IntegerType(), True),
StructField("d_dataset_access_type_id", IntegerType(), True),
StructField("d_acquisition_era_id", IntegerType(), True),
StructField("d_processing_era_id", IntegerType(), True),
StructField("d_physics_group_id", IntegerType(), True),
StructField("d_xtcrosssection", DoubleType(), True),
StructField("d_prep_id", StringType(), True),
StructField("d_creation_date", DoubleType(), True),
StructField("d_create_by", StringType(), True),
StructField("d_last_modification_date", DoubleType(), True),
StructField("d_last_modified_by", StringType(), True)
])
kdf = DataFrame(pdf)
return_schema = kdf._sdf.schema
if len(pdf) <= limit:
return kdf
sdf = self._spark_group_map_apply(
pandas_transform, return_schema, retain_index=True)
# If schema is inferred, we can restore indexes too.
internal = kdf._internal.copy(sdf=sdf,
column_scols=[scol_for(sdf, col)
for col in kdf._internal.data_columns])
else:
return_type = _infer_return_type(func).tpe
data_columns = self._kdf._internal.data_columns
return_schema = StructType([
StructField(c, return_type) for c in data_columns if c not in input_groupnames])
sdf = self._spark_group_map_apply(
pandas_transform, return_schema, retain_index=False)
# Otherwise, it loses index.
internal = _InternalFrame(sdf=sdf)
return DataFrame(internal)
StructField('name', StringType(), False),
StructField('tableType', StringType(), True),
StructField('viewExpandedText', StringType(), True),
StructField('viewOriginalText', StringType(), True),
StructField('parameters', MapType(StringType(), StringType(), True), True),
StructField('partitionKeys', ArrayType(StructType([
StructField('name', StringType(), True),
StructField('type', StringType(), True),
StructField('comment', StringType(), True)
]), True), True),
StructField('storageDescriptor', DATACATALOG_STORAGE_DESCRIPTOR_SCHEMA, True)
])
# Schema of a single partition item record.
# Every field is nullable except `values`, which must be present and is an
# array whose string elements may not be null. Both timestamps are carried
# as strings, and `parameters` is a string-to-string map with nullable values.
DATACATALOG_PARTITION_ITEM_SCHEMA = StructType([
    StructField(name='creationTime', dataType=StringType(), nullable=True),
    StructField(name='lastAccessTime', dataType=StringType(), nullable=True),
    StructField(name='namespaceName', dataType=StringType(), nullable=True),
    StructField(name='tableName', dataType=StringType(), nullable=True),
    StructField(
        name='parameters',
        dataType=MapType(StringType(), StringType(), valueContainsNull=True),
        nullable=True,
    ),
    StructField(
        name='storageDescriptor',
        dataType=DATACATALOG_STORAGE_DESCRIPTOR_SCHEMA,
        nullable=True,
    ),
    StructField(
        name='values',
        dataType=ArrayType(StringType(), containsNull=False),
        nullable=False,
    ),
])
# Schema of a database record: a nullable `items` array whose elements
# (each a DATACATALOG_DATABASE_ITEM_SCHEMA struct) may not be null, plus a
# required string `type` field.
DATACATALOG_DATABASE_SCHEMA = StructType([
    StructField(
        name='items',
        dataType=ArrayType(DATACATALOG_DATABASE_ITEM_SCHEMA, containsNull=False),
        nullable=True,
    ),
    StructField(name='type', dataType=StringType(), nullable=False),
])
# FIXME: change when pre_transform_specs df is finalized
event_type = StructField("event_type", StringType(), True)
metric_id_list = StructField("metric_id_list",
ArrayType(StringType(),
containsNull=False),
True)
required_raw_fields_list = StructField("required_raw_fields_list",
ArrayType(StringType(),
containsNull=False),
True)
event_processing_params = \
StructField("event_processing_params",
StructType([StructField("set_default_zone_to",
StringType(), True),
StructField("set_default_geolocation_to",
StringType(), True),
StructField("set_default_region_to",
StringType(), True),
]), True)
schema = StructType([event_processing_params, event_type,
metric_id_list, required_raw_fields_list])
return schema
def construct_struct_schema(schema_tuples_list):
    """Build a Spark ``StructType`` from a list of column descriptions.

    Args:
        schema_tuples_list: iterable of ``(col_name, col_type, nullable)``
            tuples, where ``col_type`` is one of the strings ``'int'``,
            ``'long'`` or ``'string'``.

    Returns:
        A ``StructType`` with one ``StructField`` per tuple, in input order.

    Raises:
        KeyError: if a tuple names a ``col_type`` other than the three
            supported ones.
    """
    # Only these atomic types are supported; the lookup below fails loudly
    # (KeyError) for anything else.
    spark_type_by_name = {
        'int': IntegerType(),
        'long': LongType(),
        'string': StringType()
    }
    return StructType([
        StructField(name=field_name,
                    dataType=spark_type_by_name[type_name],
                    nullable=is_nullable)
        for field_name, type_name, is_nullable in schema_tuples_list
    ])
StructField('urischeme', StringType(), True),
StructField('uriport', StringType(), True),
StructField('uripath', StringType(), True),
StructField('uriquery', StringType(), True),
StructField('uriextension', StringType(), True),
StructField('serverip', StringType(), True),
StructField('scbytes', IntegerType(), True),
StructField('csbytes', IntegerType(), True),
StructField('virusid', StringType(), True),
StructField('bcappname', StringType(), True),
StructField('bcappoper', StringType(), True),
StructField('fulluri', StringType(), True),
StructField('y', StringType(), True),
StructField('m', StringType(), True),
StructField('d', StringType(), True),
StructField('h', StringType(), True)
]
"event_date", "event_hour", "event_minute",
"event_second", "metric_group", "metric_id"]
columns_struct_fields = [StructField(field_name, StringType(), True)
for field_name in columns]
# Add a column for a non-string fields
columns_struct_fields.insert(0,
StructField("event_timestamp_unix",
DoubleType(), True))
columns_struct_fields.insert(0,
StructField("event_quantity",
DoubleType(), True))
# map to metric meta
columns_struct_fields.append(StructField("meta",
MapType(StringType(),
StringType(),
True),
True))
# map to dimensions
columns_struct_fields.append(StructField("dimensions",
MapType(StringType(),
StringType(),
True),
True))
# map to value_meta
columns_struct_fields.append(StructField("value_meta",
MapType(StringType(),
StringType(),
True),
True))
from pyspark import RDD, SparkContext
from pyspark.sql import SparkSession, Row, DataFrame
from pyspark.sql.types import StructField, StructType, StringType, IntegerType
from qanta import logging
from qanta.util.constants import BUZZ_FOLDS, FEATURE_NAMES
from qanta.datasets.quiz_bowl import QuestionDatabase
from qanta.preprocess import format_guess
log = logging.get(__name__)

# Row schema for feature records. Every column is declared non-nullable,
# so the nullable flag is fixed to False for all entries in the table below.
SCHEMA = StructType([
    StructField(column_name, spark_type(), False)
    for column_name, spark_type in (
        ('fold', StringType),
        ('qnum', IntegerType),
        ('sentence', IntegerType),
        ('token', IntegerType),
        ('guess', StringType),
        ('feature_name', StringType),
        ('feature_value', StringType),
    )
])
def create_output(path: str):
df = read_dfs(path).cache()
question_db = QuestionDatabase()
answers = question_db.all_answers()
for qnum in answers:
answers[qnum] = format_guess(answers[qnum])
sc = SparkContext.getOrCreate() # type: SparkContext
b_answers = sc.broadcast(answers)