import pytest

pyspark = pytest.importorskip("pyspark", minversion="2.4.1")
import pandas as pd
import pickle as pkl
import lz4.frame as lz4f
from coffea.util import numpy as np
from coffea.processor.spark.spark_executor import agg_histos_raw, reduce_histos_raw
from coffea.processor.test_items import NanoTestProcessor
proc = NanoTestProcessor()
one = proc.accumulator.identity()
two = proc.accumulator.identity()
hlist1 = [lz4f.compress(pkl.dumps(one))]
hlist2 = [lz4f.compress(pkl.dumps(one)), lz4f.compress(pkl.dumps(two))]
harray1 = np.array(hlist1, dtype='O')
harray2 = np.array(hlist2, dtype='O')
series1 = pd.Series(harray1)
series2 = pd.Series(harray2)
df = pd.DataFrame({'histos': harray2})
# correctness of these functions is checked in test_spark_executor
agg1 = agg_histos_raw(series1, proc, 1)
agg2 = agg_histos_raw(series2, proc, 1)
red = reduce_histos_raw(df, proc, 1)
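# round-trip sketch (the `result` name below is illustrative, not from the test):
# each aggregated blob decodes back to an accumulator with the same pkl/lz4f
# pairing used to build it
result = pkl.loads(lz4f.decompress(agg2))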
import gc
import lz4.frame
import pytest

# upper bound on tolerated memory growth between snapshots (bytes); value assumed
MEM_INCREASE_LIMIT = 1024 * 25

def test_frame_decompress_mem_usage(data):
    tracemalloc = pytest.importorskip('tracemalloc')
    tracemalloc.start()
    compressed = lz4.frame.compress(data)
    prev_snapshot = None
    for i in range(1000):
        decompressed = lz4.frame.decompress(compressed)  # noqa: F841
        if i % 100 == 0:
            # compare heap snapshots every 100 iterations; repeated
            # decompression must not steadily leak memory
            gc.collect()
            snapshot = tracemalloc.take_snapshot()
            if prev_snapshot:
                stats = snapshot.compare_to(prev_snapshot, 'lineno')
                assert stats[0].size_diff < MEM_INCREASE_LIMIT
            prev_snapshot = snapshot
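# invocation sketch: pytest normally injects `data` via a fixture, but the test
# can be driven directly with any bytes payload (the sample below is illustrative)
test_frame_decompress_mem_usage(b'sample payload ' * 4096)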
import struct
import lz4.frame as lz4frame
import pytest

def test_content_checksum_failure(data):
    compressed = lz4frame.compress(data, content_checksum=True)
    message = r'^LZ4F_decompress failed with code: ERROR_contentChecksum_invalid$'
    # flip bits in the last byte so the stored content checksum no longer matches;
    # pytest.raises takes `match=`, not the long-removed `message=` keyword
    with pytest.raises(RuntimeError, match=message):
        last = struct.unpack('B', compressed[-1:])[0]
        lz4frame.decompress(compressed[:-1] + struct.pack('B', last ^ 0x42))
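# happy-path counterpart (sketch, illustrative payload): an unmodified frame
# with a content checksum decompresses cleanly
payload = b'some data worth checking'
assert lz4frame.decompress(lz4frame.compress(payload, content_checksum=True)) == payload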
chunks.append(chunk)
nchunks[filemeta.dataset] += 1
if nchunks[filemeta.dataset] >= maxchunks:
    break
# pop all _work_function args here
savemetrics = executor_args.pop('savemetrics', False)
flatten = executor_args.pop('flatten', False)
mmap = executor_args.pop('mmap', False)
nano = executor_args.pop('nano', False)
cachestrategy = executor_args.pop('cachestrategy', None)
pi_compression = executor_args.pop('processor_compression', 1)
if pi_compression is None:
    pi_to_send = processor_instance
else:
    pi_to_send = lz4f.compress(cloudpickle.dumps(processor_instance),
                               compression_level=pi_compression)
closure = partial(
    _work_function,
    flatten=flatten,
    savemetrics=savemetrics,
    mmap=mmap,
    nano=nano,
    cachestrategy=cachestrategy,
    skipbadfiles=skipbadfiles,
    retries=retries,
    xrootdtimeout=xrootdtimeout,
)
# hack around dask/dask#5503 which is really a silly request but here we are
if executor is dask_executor:
    executor_args['heavy_input'] = pi_to_send
    closure = partial(closure, processor_instance='heavy')
else:
    closure = partial(closure, processor_instance=pi_to_send)
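# worker-side sketch (hypothetical helper, not coffea's verbatim code): invert
# the compression above before using the processor
def rehydrate_processor(pi_to_send):
    if isinstance(pi_to_send, bytes):
        return cloudpickle.loads(lz4f.decompress(pi_to_send))
    return pi_to_send  # already a live processor instance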
import lz4.frame

LZ4 = 'lz4'  # codec tag returned with the payload; exact value assumed

def apply_lz4_compression(decompressed_input_bin) -> tuple:
    """Apply LZ4 compression to the input.

    :param decompressed_input_bin: the binary to be compressed
    :return: a tuple (compressed_result, LZ4)
    """
    return lz4.frame.compress(decompressed_input_bin), LZ4
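# inverse helper sketch (hypothetical name, mirroring the function above)
def apply_lz4_decompression(compressed_input_bin) -> bytes:
    return lz4.frame.decompress(compressed_input_bin)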
if confidence is None:
    LOG.error('%s - indicator without confidence', self.name)
    sindicator.confidence = "Unknown"  # we shouldn't be here
elif confidence < 50:
    sindicator.confidence = "Low"
elif confidence < 75:
    sindicator.confidence = "Medium"
else:
    sindicator.confidence = "High"

sindicator.add_indicator_type(type_mapper['indicator_type'])
sindicator.add_observable(o)
sp.add_indicator(sindicator)
# tag the compressed payload so readers can detect the codec; encode to bytes
# first, since lz4.frame.compress requires bytes on Python 3
spackage = b'lz4' + lz4.frame.compress(
    sp.to_json().encode('utf-8'),
    compression_level=lz4.frame.COMPRESSIONLEVEL_MINHC
)
with self.SR.pipeline() as p:
    p.multi()
    # redis-py >= 3.0 takes a {member: score} mapping for zadd
    p.zadd(self.redis_skey, {spid: score})
    p.hset(self.redis_skey_value, spid, spackage)
    result = p.execute()[0]
self.statistics['added'] += result
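# read-side sketch (assumptions: `sr` is a redis-py client; `spid` a stored id):
# strip the b'lz4' tag written above, then decompress the STIX package
raw = sr.hget(redis_skey_value, spid)
if raw is not None and raw.startswith(b'lz4'):
    package_json = lz4.frame.decompress(raw[3:]).decode('utf-8')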
import pickle as pkl
import lz4.frame as lz4f

def agg_histos_raw(series, processor_instance, lz4_clevel):
    # keep only non-empty blobs
    goodlines = series[series.str.len() > 0]
    if goodlines.size == 1:  # short-circuit trivial aggregations
        return goodlines[0]
    outhist = processor_instance.accumulator.identity()
    for line in goodlines:
        outhist.add(pkl.loads(lz4f.decompress(line)))
    return lz4f.compress(pkl.dumps(outhist), compression_level=lz4_clevel)
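# usage sketch (reusing proc, pd, np, pkl and lz4f from the first snippet above):
# a series with a single non-empty blob short-circuits and is returned as-is
single = pd.Series(np.array([lz4f.compress(pkl.dumps(proc.accumulator.identity()))], dtype='O'))
assert agg_histos_raw(single, proc, 1) == single[0]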
import msgpack
from lz4.frame import compress  # assumed source of `compress`

def dumps(o):
    # pack with msgpack (encode_ext: the module's ext-type encoder), then compress
    return compress(msgpack.packb(o, default=encode_ext, use_bin_type=True))
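# matching inverse sketch; `decompress` assumed to be lz4.frame.decompress and
# `decode_ext` a hypothetical hook mirroring encode_ext
def loads(b):
    return msgpack.unpackb(decompress(b), ext_hook=decode_ext, raw=False)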