How to use the avro.schema.parse function in avro

To help you get started, we’ve selected a few avro examples based on popular ways avro.schema.parse is used in public projects.
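
A minimal, self-contained sketch of the basic pattern before the project examples: parse a JSON schema string with avro.schema.parse, then hand the resulting schema object to a DatumWriter and DatumReader. The schema and record here are hypothetical, and note that the older avro-python3 distribution spells the function Parse.

import io
import json

import avro.io
import avro.schema

# Hypothetical schema, defined inline for illustration.
SCHEMA_JSON = json.dumps({
    "type": "record",
    "name": "User",
    "fields": [
        {"name": "name", "type": "string"},
        {"name": "favorite_number", "type": ["int", "null"]},
    ],
})

schema = avro.schema.parse(SCHEMA_JSON)

# Serialize one record to bytes with the parsed schema...
buffer = io.BytesIO()
avro.io.DatumWriter(schema).write(
    {"name": "Alyssa", "favorite_number": 256},
    avro.io.BinaryEncoder(buffer),
)

# ...and read it back.
buffer.seek(0)
record = avro.io.DatumReader(schema).read(avro.io.BinaryDecoder(buffer))
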


github Byhiras / pyavroc / tests / test_vs_avro_python.py (View on GitHub)
def _python_create_file(filename):
    # avro-python3 spells the function Parse; the Python 2 package (and recent
    # unified avro releases) spell it parse.
    if sys.version_info >= (3,):
        schema = avro.schema.Parse(json_schema)
    else:
        schema = avro.schema.parse(json_schema)

    fp = open(filename, 'wb')

    writer = avro.datafile.DataFileWriter(fp, avro.io.DatumWriter(), schema)

    for i in range(1):
        writer.append({"name": "Alyssa", "favorite_number": 256})
        writer.append({"name": "Ben", "favorite_number": 7, "favorite_color": "red"})

    writer.close()

    fp.close()
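
The sys.version_info branch above exists because the Python 2 avro package spells the function parse while avro-python3 renamed it Parse (recent unified avro releases expose parse again). If a module parses several schemas, one option, sketched here as an assumption rather than as pyavroc's approach, is to resolve the name once:

import avro.schema

# Pick whichever spelling this avro installation provides.
parse_schema = getattr(avro.schema, "parse", None) or avro.schema.Parse

schema = parse_schema(json_schema)  # json_schema as defined earlier in the test module
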
github GoogleCloudPlatform / professional-services / examples / dataflow-data-generator / data-generator-pipeline / data_generator_pipeline.py (View on GitHub)
    for key in data_args.primary_key_cols.split(','):
        rows |= 'Enforcing primary key: {}'.format(key) >> EnforcePrimaryKeys(key)

    if data_args.csv_schema_order:
        (rows
            | 'Order fields for CSV writing.' >> beam.FlatMap(lambda d: 
                   [dict_to_csv(d, data_args.csv_schema_order.split(','))])

            | 'Write to GCS' >> beam.io.textio.WriteToText(
                   file_path_prefix=data_args.output_prefix,
                   file_name_suffix='.csv')
        )

    if data_args.avro_schema_file:
        avsc = avro.schema.parse(open(data_args.avro_schema_file, 'rb').read())

        (rows
            # Need to convert time stamps from strings to timestamp-micros
            | 'Fix date and time Types for Avro.' >> beam.FlatMap(lambda row: 
                fix_record_for_avro(row, avsc))
            | 'Write to Avro.' >> beam.io.avroio.WriteToAvro(
                    file_path_prefix=data_args.output_prefix,
                    codec='null',
                    file_name_suffix='.avro',
                    use_fastavro=True,
                    schema=avsc
                )
        )

    if data_args.output_bq_table:
        (rows
github adamnovak / sequence-graphs / importVCF / importVCF.py (View on GitHub)
    # Load the Avro schemas
    
    try:
        # This holds the schema for AlleleGroups
        allele_group_schema = avro.schema.parse(
            open(options.allele_group_schema).read())
    except IOError:
        logging.critical("Could not load Avro AlleleGroup schema {}".format(
            options.allele_group_schema))
        logging.critical("Check your --allele_group_schema option")
        sys.exit(1)
        
    try:
        # This holds the schema for Adjacencies
        adjacency_schema = avro.schema.parse(
            open(options.adjacency_schema).read())
    except IOError:
        logging.critical("Could not load Avro Adjacency schema {}".format(
            options.adjacency_schema))
        logging.critical("Check your --adjacency_schema option")
        sys.exit(1)
        
    # Make Avro-format output file writers. This one is for allele groups.
    allele_group_writer = avro.datafile.DataFileWriter(
        options.allele_group_file, avro.io.DatumWriter(), allele_group_schema)
    # This one is for adjacencies
    adjacency_writer = avro.datafile.DataFileWriter(options.adjacency_file,
        avro.io.DatumWriter(), adjacency_schema)
    
    # Make a VCF reader to read the input VCF
    vcf_reader = vcf.Reader(options.vcf)
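
A hedged follow-up to the snippet above (not shown in importVCF.py itself): DataFileWriter buffers records into blocks, so each writer needs an explicit close() after all records have been appended in order to flush the final block to disk.

allele_group_writer.close()
adjacency_writer.close()
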
github apache / avro / lang / py / avro / datafile.py (View on GitHub)
      self.set_meta('avro.codec', codec)
      self.set_meta('avro.schema', str(writers_schema))
      self.datum_writer.writers_schema = writers_schema
    else:
      # open writer for reading to collect metadata
      dfr = DataFileReader(writer, avro.io.DatumReader())

      # TODO(hammer): collect arbitrary metadata
      # collect metadata
      self._sync_marker = dfr.sync_marker
      self.set_meta('avro.codec', dfr.get_meta('avro.codec'))

      # get schema used to write existing file
      schema_from_file = dfr.get_meta('avro.schema')
      self.set_meta('avro.schema', schema_from_file)
      self.datum_writer.writers_schema = avro.schema.parse(schema_from_file)

      # seek to the end of the file and prepare for writing
      writer.seek(0, 2)
      self._header_written = True
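
The else branch above is what runs when a DataFileWriter is constructed without a writers_schema: it reads the existing container file's metadata and recovers the schema with avro.schema.parse. A small usage sketch of that append mode, assuming a file named users.avro already exists with a compatible schema:

import avro.datafile
import avro.io

# Open the existing container file for reading and appending; omitting the
# writer's schema makes DataFileWriter reuse the schema stored in the file.
fp = open('users.avro', 'a+b')
writer = avro.datafile.DataFileWriter(fp, avro.io.DatumWriter())
writer.append({"name": "Carol", "favorite_number": 42})  # hypothetical record
writer.close()
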
github ga4gh / ga4gh-server / ga4gh / _protocol_definitions.py (View on GitHub)
"""


class ListReferenceBasesRequest(ProtocolElement):
    """
    The query parameters for a request to GET /references/{id}/bases,
    for example:  GET /references/{id}/bases?start=100&end=200
    """
    _schemaSource = """
{"namespace": "org.ga4gh.methods", "type": "record", "name":
"ListReferenceBasesRequest", "fields": [{"default": 0, "doc": "",
"type": "long", "name": "start"}, {"default": null, "doc": "", "type":
["null", "long"], "name": "end"}, {"default": null, "doc": "", "type":
["null", "string"], "name": "pageToken"}], "doc": ""}
"""
    schema = avro.schema.parse(_schemaSource)
    requiredFields = set([])

    @classmethod
    def isEmbeddedType(cls, fieldName):
        embeddedTypes = {}
        return fieldName in embeddedTypes

    @classmethod
    def getEmbeddedType(cls, fieldName):
        embeddedTypes = {}

        return embeddedTypes[fieldName]

    __slots__ = [
        'end', 'pageToken', 'start'
    ]
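
A hypothetical usage sketch (not part of the generated ga4gh module): the schema parsed at class-definition time can be handed straight to a DatumWriter to encode a request dict.

import io

import avro.io

request = {"start": 100, "end": 200, "pageToken": None}

buffer = io.BytesIO()
avro.io.DatumWriter(ListReferenceBasesRequest.schema).write(
    request, avro.io.BinaryEncoder(buffer))
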
github Yelp / data_pipeline / data_pipeline / envelope.py (View on GitHub)
def _schema(self):
        # Keeping this as an instance method because of issues with sharing
        # this data across processes.
        schema_path = os.path.join(
            os.path.dirname(__file__),
            'schemas/envelope_v1.avsc'
        )
        return avro.schema.parse(open(schema_path).read())
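
A hedged variant of the same idea (not the data_pipeline implementation): reading the .avsc inside a with-block releases the file handle promptly, and a small cache avoids re-parsing the schema on every call.

import os
from functools import lru_cache

import avro.schema

@lru_cache(maxsize=None)
def load_schema(schema_path):
    # Parse the schema once per path and reuse the result.
    with open(schema_path) as schema_file:
        return avro.schema.parse(schema_file.read())

envelope_schema = load_schema(
    os.path.join(os.path.dirname(__file__), 'schemas/envelope_v1.avsc'))
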
github ga4gh / ga4gh-server / ga4gh / _protocol_definitions.py (View on GitHub)
References are designed to be immutable.
    """
    _schemaSource = """
{"namespace": "org.ga4gh.models", "type": "record", "name":
"Reference", "fields": [{"doc": "", "type": "string", "name": "id"},
{"doc": "", "type": "long", "name": "length"}, {"doc": "", "type":
"string", "name": "md5checksum"}, {"doc": "", "type": "string",
"name": "name"}, {"default": null, "doc": "", "type": ["null",
"string"], "name": "sourceURI"}, {"doc": "", "type": {"items":
"string", "type": "array"}, "name": "sourceAccessions"}, {"default":
false, "doc": "", "type": "boolean", "name": "isDerived"}, {"default":
null, "doc": "", "type": ["null", "float"], "name":
"sourceDivergence"}, {"default": null, "doc": "", "type": ["null",
"int"], "name": "ncbiTaxonId"}], "doc": ""}
"""
    schema = avro.schema.parse(_schemaSource)
    requiredFields = set([
        "id",
        "length",
        "md5checksum",
        "name",
        "sourceAccessions",
    ])

    @classmethod
    def isEmbeddedType(cls, fieldName):
        embeddedTypes = {}
        return fieldName in embeddedTypes

    @classmethod
    def getEmbeddedType(cls, fieldName):
        embeddedTypes = {}
github alexhanna / hadoop / utils / JSONtoAvro.py (View on GitHub)
def main():
	if len(sys.argv) < 2:
		print "Usage: cat input.json | python2.7 JSONtoAvro.py output"
		return

	s = schema.parse(open("tweet.avsc").read())
	f = open(sys.argv[1], "wb")

	writer = datafile.DataFileWriter(f, io.DatumWriter(), s, codec = 'deflate')

	failed = 0

	for line in sys.stdin:
		line = line.strip()

		try:
			data = json.loads(line)
		except ValueError as detail:
			continue

		try:
			writer.append(data)
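
The script above targets Python 2.7 (print statement, python2.7 in the usage string). A hedged Python 3 rendering of the same flow, using the same avro.schema.parse and DataFileWriter calls, might look like this:

import json
import sys

import avro.datafile
import avro.io
import avro.schema


def main():
    if len(sys.argv) < 2:
        print("Usage: cat input.json | python3 JSONtoAvro.py output")
        return

    with open("tweet.avsc") as schema_file:
        s = avro.schema.parse(schema_file.read())

    with open(sys.argv[1], "wb") as f:
        writer = avro.datafile.DataFileWriter(f, avro.io.DatumWriter(), s, codec='deflate')
        for line in sys.stdin:
            try:
                writer.append(json.loads(line.strip()))
            except ValueError:
                # Skip lines that are not valid JSON, as the original does.
                continue
        writer.close()
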
github ga4gh / ga4gh-server / ga4gh / _protocol_definitions.py (View on GitHub)
possibly work' for grouping data (e.g. Dataset X has all the
    reads, variants, and expression levels for a particular research
    project; Dataset Y has all the work product from a particular
    grant).  For data accessors, they're a simple way to scope
    exploration and analysis (e.g. are there any supporting examples
    in 1000genomes? what's the distribution of that result in the data
    from our project?)
    """
    _schemaSource = """
{"namespace": "org.ga4gh.models", "type": "record", "name": "Dataset",
"fields": [{"doc": "", "type": "string", "name": "id"}, {"default":
null, "doc": "", "type": ["null", "string"], "name": "name"},
{"default": null, "doc": "", "type": ["null", "string"], "name":
"description"}], "doc": ""}
"""
    schema = avro.schema.parse(_schemaSource)
    requiredFields = set([
        "id",
    ])

    @classmethod
    def isEmbeddedType(cls, fieldName):
        embeddedTypes = {}
        return fieldName in embeddedTypes

    @classmethod
    def getEmbeddedType(cls, fieldName):
        embeddedTypes = {}

        return embeddedTypes[fieldName]

    __slots__ = [
github common-workflow-language / cwltool / cwltool / tool_new.py (View on GitHub)
def schema_for_item(value, array_schema, tool):
    # Pick the item schema (possibly one branch of a union) that matches value.
    if not array_schema:
        return None
    opts = array_schema.get('items', [])
    if not opts:
        return None
    if not isinstance(opts, list):
        opts = [opts]
    opts = [schema_by_name(opt, tool) for opt in opts]
    if len(opts) == 1:
        return opts[0]
    for opt in opts:
        sch = avro.schema.parse(json.dumps(opt))
        if avro.io.validate(sch, value):
            return opt
    return None
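
The helper above leans on avro.io.validate to pick which union branch a concrete value conforms to. A minimal sketch of that call in isolation (schema and values are hypothetical):

import json

import avro.io
import avro.schema

sch = avro.schema.parse(json.dumps({"type": "array", "items": "string"}))

avro.io.validate(sch, ["a", "b"])  # True: every item is a string
avro.io.validate(sch, [1, 2])      # False: items are not strings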