df_writer = datafile.DataFileWriter(
    open(OUTFILE_NAME, 'wb'),
    rec_writer,
    writers_schema=SCHEMA
)
df_writer.append({"message_id": 11, "topic": "Hello galaxy", "user_id": 1})
df_writer.append({"message_id": 12, "topic": "Jim is silly!", "user_id": 1})
df_writer.append({"message_id": 23, "topic": "I like apples.", "user_id": 2})
df_writer.close()
# Test reading avros
rec_reader = io.DatumReader()
# Create a 'data file' (avro file) reader
df_reader = datafile.DataFileReader(
    open(OUTFILE_NAME, 'rb'),
    rec_reader
)
# Read all records stored inside
pp = pprint.PrettyPrinter()
for record in df_reader:
    pp.pprint(record)
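# The snippet above assumes OUTFILE_NAME, SCHEMA, and rec_writer were created
# earlier. A minimal sketch of that setup (the schema definition and file name
# here are illustrative guesses, not taken from the original source):
import pprint
from avro import schema, datafile, io

OUTFILE_NAME = 'messages.avro'  # hypothetical output path
SCHEMA_STR = """{
    "type": "record",
    "name": "Message",
    "fields": [
        {"name": "message_id", "type": "int"},
        {"name": "topic", "type": "string"},
        {"name": "user_id", "type": "int"}
    ]
}"""
SCHEMA = schema.parse(SCHEMA_STR)  # parse the JSON schema into a Schema object
rec_writer = io.DatumWriter()      # serializes each appended record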
tweet.metadata.isRetweet.data = True
tweet.metadata.venueID.known = False
tweet.metadata.venueID.data = None
tweet.metadata.venuePoint.known = False
tweet.metadata.venuePoint.data = None
tmp_file = tempfile.mktemp()
with open(tmp_file, "w+b") as f:
df = datafile.DataFileWriter(f, io.DatumWriter(), schema.parse(schema_json))
df.append(tweet)
df.close()
with open(tmp_file, "rb") as f:
df = datafile.DataFileReader(f, SpecificDatumReader())
tweet1 = next(df)
df.close()
self.assertEqual(tweet.ID, tweet1.ID)
self.assertEqual(tweet.text, tweet1.text)
self.assertEqual(tweet.authorScreenName, tweet1.authorScreenName)
self.assertEqual(tweet.authorProfileImageURL, tweet1.authorProfileImageURL)
self.assertEqual(tweet.authorUserID, tweet1.authorUserID)
self.assertTrue(isinstance(tweet1.location, AvroPoint))
self.assertEqual(tweet.location.latitude, tweet1.location.latitude)
self.assertEqual(tweet.location.longitude, tweet1.location.longitude)
self.assertEqual(tweet.placeID, tweet1.placeID)
self.assertTrue(isinstance(tweet1.createdAt, AvroDateTime))
self.assertEqual(tweet.createdAt.dateTimeString, tweet1.createdAt.dateTimeString)
def _python_read(filename):
    fp = avro.datafile.DataFileReader(open(filename, 'rb'), avro.io.DatumReader())
    return list(fp)
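# A write-side counterpart to _python_read, sketched on the same avro.datafile
# API; this helper is an illustration, not part of the original snippet.
def _python_write(filename, schema_object, records):
    writer = avro.datafile.DataFileWriter(
        open(filename, 'wb'), avro.io.DatumWriter(), schema_object)
    for record in records:
        writer.append(record)
    writer.close()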
        options.allele_group_schema))
    logging.critical("Check your --allele_group_schema option")
    sys.exit(1)
try:
    # This holds the schema for Adjacencies
    adjacency_schema = avro.schema.parse(
        open(options.adjacency_schema).read())
except IOError:
    logging.critical("Could not load Avro Adjacency schema {}".format(
        options.adjacency_schema))
    logging.critical("Check your --adjacency_schema option")
    sys.exit(1)
# Make Avro-format output file writers. This one is for allele groups.
allele_group_writer = avro.datafile.DataFileWriter(
    options.allele_group_file, avro.io.DatumWriter(), allele_group_schema)
# This one is for adjacencies
adjacency_writer = avro.datafile.DataFileWriter(
    options.adjacency_file, avro.io.DatumWriter(), adjacency_schema)
# Make a VCF reader to read the input VCF
vcf_reader = vcf.Reader(options.vcf)
# Load all the VCF records into memory. TODO: implement streaming with state
# for each sample.
records = list(vcf_reader)
for sample in vcf_reader.samples:
    # Process each sample one at a time
    import_sample(records, sample, allele_group_writer, adjacency_writer)
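# The snippet ends after the per-sample loop; presumably the two writers are
# flushed and closed once all samples are imported, along these lines:
allele_group_writer.close()
adjacency_writer.close()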
    json.dumps(data)
    return data
except:
    LOG.debug('Failed to dump data as JSON, falling back to raw data.')
cleaned = {}
lim = [0]
if isinstance(data, str):  # Not JSON dumpable, meaning some sort of bytestring or byte data
    # detect if avro file
    if data[:3] == '\x4F\x62\x6A':
        # write data to file in memory
        output = io.StringIO()
        output.write(data)
        # read and parse avro
        rec_reader = io.DatumReader()
        df_reader = datafile.DataFileReader(output, rec_reader)
        return json.dumps(clean([record for record in df_reader]))
    return base64.b64encode(data)
if hasattr(data, "__iter__"):
    if type(data) is dict:
        for i in data:
            cleaned[i] = clean(data[i])
    elif type(data) is list:
        cleaned = []
        for i, item in enumerate(data):
            cleaned += [clean(item)]
    else:
        for i, item in enumerate(data):
            cleaned[i] = clean(item)
else:
    for key in dir(data):
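# The '\x4F\x62\x6A' literal above is just the ASCII bytes for 'Obj'. A sketch
# of the same detection written against the constant avro.datafile already
# exposes (assuming `data` is a byte string) could look like this:
def looks_like_avro(data):
    # Avro object container files begin with the magic bytes in datafile.MAGIC.
    return data[:len(datafile.MAGIC)] == datafile.MAGIC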
from nltk.tokenize import word_tokenize
import dateutil.parser
pp = pprint.PrettyPrinter()
conn = pymongo.Connection() # defaults to localhost
db = conn.agile_data
from_to_reply_ratios = db['from_to_reply_ratios']
hourly_from_reply_probs = db['hourly_from_reply_probs']
token_reply_rates = db['token_reply_rates']
# Test reading avros
rec_reader = io.DatumReader()
# Create a 'data file' (avro file) reader
df_reader = datafile.DataFileReader(
    open("/me/Data/test_mbox/part-1.avro", 'rb'),
    rec_reader
)
# Go through all the avro emails...
for record in df_reader:
    # Get the message_id, from, first to, and message body
    message_id = record['message_id']
    froms = record['from']['address']
    if record['tos']:
        if record['tos'][0]:
            to = record['tos'][0]['address']
    # For each token in the body, if there's a match in MongoDB,
    # append it and average all of them at the end
    word_probs = []
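    # A sketch of the lookup described by the comment above; the 'body' field
    # and the token_reply_rates document layout ({'from', 'token', 'ratio'})
    # are assumptions, not taken from the original snippet.
    for token in word_tokenize(record['body']):
        match = token_reply_rates.find_one({'from': froms, 'token': token})
        if match:
            word_probs.append(match['ratio'])
    if word_probs:
        reply_prob = sum(word_probs) / len(word_probs)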
the specification.
"""
if f.tell() > 0:
    f.seek(0)
decoder = avroio.BinaryDecoder(f)
header = avroio.DatumReader().read_data(datafile.META_SCHEMA,
                                        datafile.META_SCHEMA, decoder)
if header.get('magic') != datafile.MAGIC:
    raise ValueError('Not an Avro file. File header should start with %s but '
                     'started with %s instead.'
                     % (datafile.MAGIC, header.get('magic')))
meta = header['meta']
if datafile.CODEC_KEY in meta:
    codec = meta[datafile.CODEC_KEY]
else:
    codec = 'null'
schema_string = meta[datafile.SCHEMA_KEY]
sync_marker = header['sync']
return codec, schema_string, sync_marker
"""Reads metadata from a given Avro file.
Args:
f: Avro file to read.
Returns:
a tuple containing the codec, schema, and the sync marker of the Avro
file.
Raises:
ValueError: if the file does not start with the byte sequence defined in
the specification.
"""
if f.tell() > 0:
    f.seek(0)
decoder = avroio.BinaryDecoder(f)
header = avroio.DatumReader().read_data(datafile.META_SCHEMA,
                                        datafile.META_SCHEMA, decoder)
if header.get('magic') != datafile.MAGIC:
    raise ValueError('Not an Avro file. File header should start with %s but '
                     'started with %s instead.'
                     % (datafile.MAGIC, header.get('magic')))
meta = header['meta']
if datafile.CODEC_KEY in meta:
    codec = meta[datafile.CODEC_KEY]
else:
    codec = 'null'
schema_string = meta[datafile.SCHEMA_KEY]
sync_marker = header['sync']
if schema.nullable:
    return ["null", out]
else:
    return out
avroschema = avro.schema.make_avsc_object(convert2avro(schema.content, {}))
import avro.datafile
import avro.io
writer = avro.datafile.DataFileWriter(open("planets_uncompressed.avro", "wb"), avro.io.DatumWriter(), avroschema)
for star in before:
    writer.append(star)
writer.close()
writer = avro.datafile.DataFileWriter(open("planets.avro", "wb"), avro.io.DatumWriter(), avroschema, codec="deflate")
for star in before:
    writer.append(star)
writer.close()
import bson
writer = open("planets.bson", "wb")
for star in before:
    writer.write(bson.BSON.encode(star))
writer.close()
os.system("gzip -k planets.bson")
"""Reads metadata from a given Avro file.
Args:
f: Avro file to read.
Returns:
a tuple containing the codec, schema, and the sync marker of the Avro
file.
Raises:
ValueError: if the file does not start with the byte sequence defined in
the specification.
"""
if f.tell() > 0:
    f.seek(0)
decoder = avroio.BinaryDecoder(f)
header = avroio.DatumReader().read_data(datafile.META_SCHEMA,
                                        datafile.META_SCHEMA, decoder)
if header.get('magic') != datafile.MAGIC:
    raise ValueError('Not an Avro file. File header should start with %s but '
                     'started with %s instead.'
                     % (datafile.MAGIC, header.get('magic')))
meta = header['meta']
if datafile.CODEC_KEY in meta:
    codec = meta[datafile.CODEC_KEY]
else:
    codec = b'null'
schema_string = meta[datafile.SCHEMA_KEY].decode('utf-8')
sync_marker = header['sync']
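# A usage sketch, assuming the body above is wrapped in a helper named
# read_avro_file_metadata(f); that name is hypothetical, not from the source.
with open("events.avro", "rb") as f:  # Avro container files are binary
    codec, schema_string, sync_marker = read_avro_file_metadata(f)
    print("codec:", codec)
    print("schema:", schema_string[:80])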