How to use the avro.datafile module in avro

To help you get started, we’ve selected a few avro.datafile examples based on popular ways it is used in public projects.


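As a quick orientation before the project snippets below, here is a minimal end-to-end sketch of avro.datafile: writing a container file with DataFileWriter and reading it back with DataFileReader. It assumes the plain `avro` package; the schema, file name, and record values are purely illustrative.

import avro.schema
from avro import datafile, io

# Illustrative schema for the records written below.
SCHEMA = avro.schema.parse("""
{"type": "record", "name": "Message",
 "fields": [{"name": "message_id", "type": "int"},
            {"name": "topic",      "type": "string"},
            {"name": "user_id",    "type": "int"}]}
""")

# Write a few records to an Avro container file (binary mode is required).
writer = datafile.DataFileWriter(open("messages.avro", "wb"), io.DatumWriter(), SCHEMA)
writer.append({"message_id": 11, "topic": "Hello galaxy", "user_id": 1})
writer.append({"message_id": 12, "topic": "Hello again", "user_id": 1})
writer.close()

# Read the records back; DataFileReader is iterable and yields plain dicts.
reader = datafile.DataFileReader(open("messages.avro", "rb"), io.DatumReader())
for record in reader:
    print(record)
reader.close()
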
github rjurney / Agile_Data_Code / ch03 / python / test_avro.py View on Github external
# Assumed from earlier in the original script: `from avro import datafile, io`,
# `import pprint`, plus OUTFILE_NAME, SCHEMA and rec_writer (an io.DatumWriter).
df_writer = datafile.DataFileWriter(
  open(OUTFILE_NAME, 'wb'),
  rec_writer,
  writers_schema=SCHEMA
)

df_writer.append( {"message_id": 11, "topic": "Hello galaxy", "user_id": 1} )
df_writer.append( {"message_id": 12, "topic": "Jim is silly!", "user_id": 1} )
df_writer.append( {"message_id": 23, "topic": "I like apples.", "user_id": 2} )
df_writer.close()

# Test reading avros
rec_reader = io.DatumReader()

# Create a 'data file' (avro file) reader
df_reader = datafile.DataFileReader(
  open(OUTFILE_NAME, 'rb'),  # Avro container files are binary; text mode breaks on Python 3
  rec_reader
)

# Read all records stored inside
pp = pprint.PrettyPrinter()
for record in df_reader:
  pp.pprint(record)
github rbystrit / avro_gen / tests / generator_tests.py View on Github external
        tweet.metadata.isRetweet.data = True

        tweet.metadata.venueID.known = False
        tweet.metadata.venueID.data = None

        tweet.metadata.venuePoint.known = False
        tweet.metadata.venuePoint.data = None

        tmp_file = tempfile.mktemp()
        with open(tmp_file, "w+b") as f:
            df = datafile.DataFileWriter(f, io.DatumWriter(), schema.parse(schema_json))
            df.append(tweet)
            df.close()

        with open(tmp_file, "rb") as f:
            df = datafile.DataFileReader(f, SpecificDatumReader())
            tweet1 = next(df)
            df.close()

        self.assertEqual(tweet.ID, tweet1.ID)
        self.assertEqual(tweet.text, tweet1.text)
        self.assertEqual(tweet.authorScreenName, tweet1.authorScreenName)
        self.assertEqual(tweet.authorProfileImageURL, tweet1.authorProfileImageURL)
        self.assertEqual(tweet.authorUserID, tweet1.authorUserID)

        self.assertTrue(isinstance(tweet1.location, AvroPoint))
        self.assertEqual(tweet.location.latitude, tweet1.location.latitude)
        self.assertEqual(tweet.location.longitude, tweet1.location.longitude)
        self.assertEqual(tweet.placeID, tweet1.placeID)
        self.assertTrue(isinstance(tweet1.createdAt, AvroDateTime))
        self.assertEqual(tweet.createdAt.dateTimeString, tweet1.createdAt.dateTimeString)
github Byhiras / pyavroc / tests / test_vs_avro_python.py View on Github external
def _python_read(filename):
    fp = avro.datafile.DataFileReader(open(filename, 'rb'), avro.io.DatumReader())

    return list(fp)
github adamnovak / sequence-graphs / importVCF / importVCF.py View on Github external
options.allele_group_schema))
        logging.critical("Check your --allele_group_schema option")
        sys.exit(1)
        
    try:
        # This holds the schema for Adjacencies
        adjacency_schema = avro.schema.parse(
            open(options.adjacency_schema).read())
    except IOError:
        logging.critical("Could not load Avro Adjacency schema {}".format(
            options.adjacency_schema))
        logging.critical("Check your --adjacency_schema option")
        sys.exit(1)
        
    # Make Avro-format output file writers. This one is for allele groups.
    allele_group_writer = avro.datafile.DataFileWriter(
        options.allele_group_file, avro.io.DatumWriter(), allele_group_schema)
    # This one is for adjacencies
    adjacency_writer = avro.datafile.DataFileWriter(options.adjacency_file,
        avro.io.DatumWriter(), adjacency_schema)
    
    # Make a VCF reader to read the input VCF
    vcf_reader = vcf.Reader(options.vcf)
    
    # Load all the VCF records into memory. TODO: implement streaming with state
    # for each sample.
    records = list(vcf_reader)
    
    for sample in vcf_reader.samples:
        # Process each sample one at a time
        import_sample(records, sample, allele_group_writer, adjacency_writer)
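The fragment stops before the writers are finalized. DataFileWriter buffers records into blocks, so the script presumably flushes and closes both writers after the sample loop, along these lines (same writer names as above):

    # Close both container files once every sample has been imported;
    # DataFileWriter.close() flushes any buffered block first.
    allele_group_writer.close()
    adjacency_writer.close()
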
github cloudera / hue / apps / hbase / src / hbase / views.py View on Github external
json.dumps(data)
      return data
    except:
      LOG.debug('Failed to dump data as JSON, falling back to raw data.')
      cleaned = {}
      lim = [0]
      if isinstance(data, str): # Not JSON dumpable, meaning some sort of bytestring or byte data
        #detect if avro file
        if(data[:3] == '\x4F\x62\x6A'):
          #write data to file in memory
          #(note: `io` is also used for avro's DatumReader below, so the in-memory
          # buffer presumably comes from a separately imported StringIO/BytesIO)
          output = io.StringIO()
          output.write(data)

          #read and parse avro
          rec_reader = io.DatumReader()
          df_reader = datafile.DataFileReader(output, rec_reader)
          return json.dumps(clean([record for record in df_reader]))
        return base64.b64encode(data)

      if hasattr(data, "__iter__"):
        if type(data) is dict:
          for i in data:
            cleaned[i] = clean(data[i])
        elif type(data) is list:
          cleaned = []
          for i, item in enumerate(data):
            cleaned += [clean(item)]
        else:
          for i, item in enumerate(data):
            cleaned[i] = clean(item)
      else:
        for key in dir(data):
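The check above hard-codes the first three bytes of the Avro magic ('Obj'). avro.datafile exports the full marker as datafile.MAGIC (the three ASCII bytes plus a format-version byte), so a detection helper can avoid the literal. A small sketch; the function name is illustrative, and `raw` is assumed to have the same str/bytes type as datafile.MAGIC for the avro version in use:

from avro import datafile

def looks_like_avro(raw):
    # Compare the start of the raw value with the container-file magic.
    return raw[:len(datafile.MAGIC)] == datafile.MAGIC
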
github rjurney / Agile_Data_Code / ch09 / tune_weights.py View on Github external
from nltk.tokenize import word_tokenize

import dateutil.parser

pp = pprint.PrettyPrinter()

conn = pymongo.Connection() # defaults to localhost
db = conn.agile_data
from_to_reply_ratios = db['from_to_reply_ratios']
hourly_from_reply_probs = db['hourly_from_reply_probs']
token_reply_rates = db['token_reply_rates']

# Test reading avros
rec_reader = io.DatumReader()
# Create a 'data file' (avro file) reader
df_reader = datafile.DataFileReader(
  open("/me/Data/test_mbox/part-1.avro", 'rb'),  # binary mode for Avro container files
  rec_reader
)

# Go through all the avro emails...
for record in df_reader:
  # Get the message_id, from, first to, and message body
  message_id = record['message_id']
  froms = record['from']['address']
  if record['tos']:
    if record['tos'][0]:
      to = record['tos'][0]['address']
  
  # For each token in the body, if there's a match in MongoDB, 
  # append it and average all of them at the end
  word_probs = []
github apache / beam / sdks / python / apache_beam / io / avroio.py View on Github external
the specification.
    """
    if f.tell() > 0:
      f.seek(0)
    decoder = avroio.BinaryDecoder(f)
    header = avroio.DatumReader().read_data(datafile.META_SCHEMA,
                                            datafile.META_SCHEMA, decoder)
    if header.get('magic') != datafile.MAGIC:
      raise ValueError('Not an Avro file. File header should start with %s but '
                       'started with %s instead.'
                       % (datafile.MAGIC, header.get('magic')))

    meta = header['meta']

    if datafile.CODEC_KEY in meta:
      codec = meta[datafile.CODEC_KEY]
    else:
      codec = 'null'

    schema_string = meta[datafile.SCHEMA_KEY]
    sync_marker = header['sync']

    return codec, schema_string, sync_marker
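Outside Beam's internals, the same header fields can usually be pulled through avro.datafile's own reader instead of decoding META_SCHEMA by hand. A rough sketch; the file path is illustrative, and get_meta returns the raw bytes stored in the header (None if the key is absent):

from avro import datafile, io

reader = datafile.DataFileReader(open("example.avro", "rb"), io.DatumReader())
codec = reader.get_meta(datafile.CODEC_KEY) or b'null'   # writer may omit the codec entry
schema_string = reader.get_meta(datafile.SCHEMA_KEY)
sync_marker = reader.sync_marker
reader.close()
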
github apache / beam / sdks / python / apache_beam / io / avroio.py View on Github external
"""Reads metadata from a given Avro file.

    Args:
      f: Avro file to read.
    Returns:
      a tuple containing the codec, schema, and the sync marker of the Avro
      file.

    Raises:
      ValueError: if the file does not start with the byte sequence defined in
                  the specification.
    """
    if f.tell() > 0:
      f.seek(0)
    decoder = avroio.BinaryDecoder(f)
    header = avroio.DatumReader().read_data(datafile.META_SCHEMA,
                                            datafile.META_SCHEMA, decoder)
    if header.get('magic') != datafile.MAGIC:
      raise ValueError('Not an Avro file. File header should start with %s but '
                       'started with %s instead.'
                       % (datafile.MAGIC, header.get('magic')))

    meta = header['meta']

    if datafile.CODEC_KEY in meta:
      codec = meta[datafile.CODEC_KEY]
    else:
      codec = 'null'

    schema_string = meta[datafile.SCHEMA_KEY]
    sync_marker = header['sync']
github diana-hep / oamap / makeplanets.py View on Github external
    if schema.nullable:
        return ["null", out]
    else:
        return out

avroschema = avro.schema.make_avsc_object(convert2avro(schema.content, {}))

import avro.datafile
import avro.io

writer = avro.datafile.DataFileWriter(open("planets_uncompressed.avro", "wb"), avro.io.DatumWriter(), avroschema)
for star in before:
    writer.append(star)
writer.close()

writer = avro.datafile.DataFileWriter(open("planets.avro", "wb"), avro.io.DatumWriter(), avroschema, codec="deflate")
for star in before:
    writer.append(star)
writer.close()

import bson

writer = open("planets.bson", "wb")
for star in before:
    writer.write(bson.BSON.encode(star))
writer.close()
os.system("gzip -k planets.bson")
github axbaretto / beam / sdks / python / apache_beam / io / avroio.py View on Github external
"""Reads metadata from a given Avro file.

    Args:
      f: Avro file to read.
    Returns:
      a tuple containing the codec, schema, and the sync marker of the Avro
      file.

    Raises:
      ValueError: if the file does not start with the byte sequence defined in
                  the specification.
    """
    if f.tell() > 0:
      f.seek(0)
    decoder = avroio.BinaryDecoder(f)
    header = avroio.DatumReader().read_data(datafile.META_SCHEMA,
                                            datafile.META_SCHEMA, decoder)
    if header.get('magic') != datafile.MAGIC:
      raise ValueError('Not an Avro file. File header should start with %s but '
                       'started with %s instead.'
                       % (datafile.MAGIC, header.get('magic')))

    meta = header['meta']

    if datafile.CODEC_KEY in meta:
      codec = meta[datafile.CODEC_KEY]
    else:
      codec = b'null'

    schema_string = meta[datafile.SCHEMA_KEY].decode('utf-8')
    sync_marker = header['sync']