How to use the blosc.decompress function in blosc

To help you get started, we’ve selected a few blosc.decompress examples, based on popular ways it is used in public projects.

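If you just need the basic pattern, here is a minimal round-trip sketch before diving into the project snippets: blosc.compress takes a bytes-like object plus the element typesize, and blosc.decompress returns the original bytes, which can be rebuilt into an array with numpy.frombuffer. The array size, dtype, and codec below are arbitrary illustration values.

import blosc
import numpy as np

# Sample data; the size and dtype are placeholders for illustration.
arr = np.arange(1000, dtype=np.float64)

# Compress the raw bytes; typesize should match the element size so shuffling works well.
packed = blosc.compress(arr.tobytes(), typesize=arr.itemsize, cname='blosclz')

# Decompress back to bytes and rebuild the array; the caller must know the dtype and shape.
raw = blosc.decompress(packed)
restored = np.frombuffer(raw, dtype=arr.dtype).reshape(arr.shape)

assert np.array_equal(arr, restored)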

github daeyun / object-shapes-cvpr18 / python / mvshape / eval_out_reader.py
def read_split_metadata(split_file, subset_tags=None):
    """

    :param split_file:
    :param subset_tags: For example, ['NOVELCLASS', 'NOVELMODEL', 'NOVELVIEW']
    :return:
    """

    # Make sure cpp implementation is used for parsing protocol buffers. Otherwise it will be too slow.
    from google.protobuf.internal import api_implementation
    protobuf_impl_name = api_implementation._default_implementation_type
    assert 'cpp' == protobuf_impl_name, protobuf_impl_name

    with open(split_file, mode='rb') as f:
        compressed = f.read()
    # Decompress the blosc-packed bytes, then parse them as an Examples protobuf message.
    decompressed = blosc.decompress(compressed)
    examples = dataset_pb2.Examples()
    examples.ParseFromString(decompressed)

    if subset_tags is not None:
        assert isinstance(subset_tags, (list, tuple))
        experiment_names = subset_tags

        ret = collections.defaultdict(list)

        for example in examples.examples:
            tags = tags_from_example(example)
            for name in experiment_names:
                if name in tags:
                    ret[name].append(example)
    else:
        ret = list(examples.examples)

    return ret

github analysiscenter / batchflow / batchflow / batch.py
def _load_blosc(self, ix, src=None, dst=None):
        """ Load data from a blosc packed file """
        file_name = self._get_file_name(ix, src)
        with open(file_name, 'rb') as f:
            data = dill.loads(blosc.decompress(f.read()))
            components = tuple(dst or self.components)
            try:
                item = tuple(data[i] for i in components)
            except Exception as e:
                raise KeyError('Cannot find components in corresponding file', e)
        return item
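
The loader above assumes the file was written with the inverse pipeline, i.e. dill.dumps followed by blosc.compress. A rough sketch of that write side (the _dump_blosc name, file name, and component dict are hypothetical, not part of batchflow) could look like:

import blosc
import dill

def _dump_blosc(data, file_name):
    # Serialize the components dict with dill, then blosc-compress the resulting bytes.
    packed = blosc.compress(dill.dumps(data), typesize=1)
    with open(file_name, 'wb') as f:
        f.write(packed)

# Hypothetical components matching what _load_blosc expects to look up by name.
_dump_blosc({'images': [], 'labels': []}, 'item_0.blosc')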

github pydata / pandas-msgpack / pandas_msgpack / packers.py
    elif is_object_dtype(dtype):
        return np.array(values, dtype=object)

    dtype = pandas_dtype(dtype).base

    if not as_is_ext:
        values = values.encode('latin1')

    if compress:
        if compress == u'zlib':
            _check_zlib()
            decompress = zlib.decompress
        elif compress == u'blosc':
            _check_blosc()
            decompress = blosc.decompress
        else:
            raise ValueError("compress must be one of 'zlib' or 'blosc'")

        try:
            return np.frombuffer(
                _move_into_mutable_buffer(decompress(values)),
                dtype=dtype,
            )
        except _BadMove as e:
            # Pull the decompressed data off of the `_BadMove` exception.
            # We don't just store this in the locals because we want to
            # minimize the risk of giving users access to a `bytes` object
            # whose data is also given to a mutable buffer.
            values = e.args[0]
            if len(values) > 1:
                # The empty string and single characters are memoized in many

github spyking-circus / spyking-circus / circus / shared / mpi.py
    displacements = [numpy.int64(sum(sizes[:i])) for i in range(len(sizes))]

    np_type       = get_np_dtype(dtype)
    mpi_type      = get_mpi_type(dtype)    
    data_shape    = data.shape

    if not compress:
        gdata = numpy.empty(numpy.int64(sum(sizes)), dtype=np_type)
        mpi_comm.Gatherv([data.flatten(), size, mpi_type], [gdata, (sizes, displacements), mpi_type], root=root)
    else:
        data = blosc.compress(data, typesize=mpi_type.size, cname='blosclz')
        data = mpi_comm.gather(data, root=0)
        gdata = numpy.empty(0, dtype=np_type)
        if comm.rank == 0:
            for blosc_data in data:
                gdata = numpy.concatenate((gdata, numpy.frombuffer(blosc.decompress(blosc_data), dtype=np_type)))

    if len(data_shape) == 1:
        return gdata
    else:
        if shape == 0:
            num_lines = data_shape[0]
            if num_lines > 0:
                return gdata.reshape((num_lines, gdata.size//num_lines))
            else:
                return gdata.reshape((0, gdata.shape[1]))
        if shape == 1:
            num_columns = data_shape[1]
            if num_columns > 0:
                return gdata.reshape((gdata.size//num_columns, num_columns))
            else:
                return gdata.reshape((gdata.shape[0], 0))

github jhuapl-boss / boss-tools / lambda / downsample_volume.py
    volume_empty = True # abort if the volume doesn't exist in S3
    for offset in xyz_range(step):
        if args.get('test'):
            # Enable Test Mode
            # This is where the cubes downsamples are all taken from 0/0/0
            # so that the entire frame doesn't have to be populated to test
            # the code paths that downsample cubes
            cube = offset # use target 0/0/0
        else:
            cube = target + offset

        try:
            obj_key = HashedKey(parent_iso, col_id, exp_id, chan_id, resolution, t, cube.morton, version=version)
            data = s3.get(obj_key)
            data = blosc.decompress(data)

            # DP ???: Check to see if the buffer is all zeros?
            data = Buffer.frombuffer(data, dtype=np_types[data_type])
            data.resize(dim)

            #log.debug("Downloaded cube {}".format(cube))
            volume[offset * dim: (offset + 1) * dim] = data
            volume_empty = False
        except Exception as e: # TODO: Create custom exception for S3 download
            #log.exception("Problem downloading cubes {}".format(cube))
            #log.debug("No cube at {}".format(cube))

            # Eat the error, we don't care if the cube doesn't exist
            # If the cube doesn't exist blank data will be used for downsampling
            # If all the cubes don't exist, then the downsample is finished
            pass

github jhuapl-boss / boss / django / bossspatialdb / parsers.py
        # Get bit depth
        try:
            bit_depth = resource.get_bit_depth()
        except ValueError:
            return BossParserError("Unsupported data type provided to parser: {}".format(resource.get_data_type()),
                                   ErrorCodes.TYPE_ERROR)

        # Make sure cutout request is under 500MB UNCOMPRESSED
        if is_too_large(req, bit_depth):
            return BossParserError("Cutout request is over 500MB when uncompressed. Reduce cutout dimensions.",
                                   ErrorCodes.REQUEST_TOO_LARGE)

        try:
            # Decompress
            raw_data = blosc.decompress(stream.read())
            data_mat = np.fromstring(raw_data, dtype=resource.get_numpy_data_type())
        except MemoryError:
            return BossParserError("Ran out of memory decompressing data.",
                                    ErrorCodes.BOSS_SYSTEM_ERROR)
        except:
            return BossParserError("Failed to decompress data. Verify the datatype/bitdepth of your data "
                                   "matches the channel.", ErrorCodes.DATATYPE_DOES_NOT_MATCH)

        # Reshape and return
        try:
            if req.time_request:
                # Time series request (even if single time point) - Get 4D matrix
                parsed_data = np.reshape(data_mat,
                                         (len(req.get_time()),
                                          req.get_z_span(),
                                          req.get_y_span(),

github tensorwerk / hangar-py / src / hangar / remote / server.py
def FetchFindMissingHashRecords(self, request_iterator, context):
        """Determine data tensor hash records existing on the server and not on the client.
        """
        for idx, request in enumerate(request_iterator):
            if idx == 0:
                commit = request.commit
                hBytes, offset = bytearray(request.total_byte_size), 0
            size = len(request.hashs)
            hBytes[offset: offset + size] = request.hashs
            offset += size

        uncompBytes = blosc.decompress(hBytes)
        c_hashs_raw = chunks.deserialize_record_pack(uncompBytes)
        c_hashset = set([chunks.deserialize_ident(raw).digest for raw in c_hashs_raw])

        with tempfile.TemporaryDirectory() as tempD:
            tmpDF = os.path.join(tempD, 'test.lmdb')
            tmpDB = lmdb.open(path=tmpDF, **c.LMDB_SETTINGS)
            commiting.unpack_commit_ref(self.env.refenv, tmpDB, commit)
            s_hashes_schemas = queries.RecordQuery(tmpDB).data_hash_to_schema_hash()
            s_hashes = set(s_hashes_schemas.keys())
            tmpDB.close()

        c_missing = list(s_hashes.difference(c_hashset))
        c_hash_schemas_raw = [chunks.serialize_ident(c_mis, s_hashes_schemas[c_mis]) for c_mis in c_missing]
        raw_pack = chunks.serialize_record_pack(c_hash_schemas_raw)
        err = hangar_service_pb2.ErrorProto(code=0, message='OK')
        response_pb = hangar_service_pb2.FindMissingHashRecordsReply
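
This handler, like the PushData handler shown further below, follows the same pattern: accumulate the streamed byte chunks into one preallocated bytearray, then decompress once. A condensed sketch of just that pattern (the function name and chunk iterator are illustrative, not part of hangar):

import blosc

def reassemble_and_decompress(chunks, total_byte_size):
    # Copy each streamed chunk into a preallocated buffer at its running offset.
    buf, offset = bytearray(total_byte_size), 0
    for chunk in chunks:
        buf[offset: offset + len(chunk)] = chunk
        offset += len(chunk)
    # Decompress the reassembled payload in a single call.
    return blosc.decompress(bytes(buf))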

github pyacq / pyacq / pyacq / core / streamconverter / plaindata_to_sharedmem.py
                self.start(first_start = False)
                continue
            m0,m1 = self.recv_socket.recv_multipart()
            self.last_packet_time = time.time()
            
            abs_pos = msgpack.loads(m0)
            if self.last_pos>abs_pos:
                print 'restart because last not good'
                self.start(first_start = False)
                
                continue
            
            if self.compress is None:
                buf = buffer(m1)
            elif self.compress == 'blosc':
                buf = blosc.decompress(m1)
            
            chunk = np.frombuffer(buf, dtype = np_array.dtype, ).reshape(-1, n).transpose()
            #~ print 'recv', abs_pos, chunk.shape
            
            new = chunk.shape[1]
            head = abs_pos%half_size+half_size
            tail = head - new
            np_array[:,  tail:head] = chunk

            head2 = abs_pos%half_size
            tail2 = max(head2 - new, 0)
            new2 = head2-tail2
            if new2!=0:

github tensorwerk / hangar-py / src / hangar / remote / server.py
        In order to prevent errors or malicious behavior, the cryptographic hash
        of every tensor is calculated and compared to what the client "said" it
        is. If an error is detected, no sample in the entire stream will be
        saved to disk.
        """
        for idx, request in enumerate(request_iterator):
            if idx == 0:
                uncomp_nbytes = request.uncomp_nbytes
                comp_nbytes = request.comp_nbytes
                dBytes, offset = bytearray(comp_nbytes), 0
            size = len(request.raw_data)
            dBytes[offset: offset + size] = request.raw_data
            offset += size

        uncompBytes = blosc.decompress(dBytes)
        if uncomp_nbytes != len(uncompBytes):
            msg = f'ERROR: uncomp_nbytes sent: {uncomp_nbytes} != received {comp_nbytes}'
            context.set_details(msg)
            context.set_code(grpc.StatusCode.DATA_LOSS)
            err = hangar_service_pb2.ErrorProto(code=15, message=msg)
            reply = hangar_service_pb2.PushDataReply(error=err)
            return reply

        unpacked_records = chunks.deserialize_record_pack(uncompBytes)
        received_data = []
        for record in unpacked_records:
            data = chunks.deserialize_record(record)
            schema_hash = data.schema
            received_hash = array_hash_digest(data.array)
            if received_hash != data.digest:
                msg = f'HASH MANGLED, received: {received_hash} != expected digest: {data.digest}'

github Blosc / bloscpack / bloscpack / append.py
    if blosc_args['shuffle'] is None:
        blosc_args['shuffle'] = DEFAULT_SHUFFLE
    if blosc_args['cname'] is None:
        blosc_args['cname'] = DEFAULT_CNAME
    _check_blosc_args(blosc_args)
    offsets_pos = (BLOSCPACK_HEADER_LENGTH +
                  (METADATA_HEADER_LENGTH + metadata_header['max_meta_size'] +
                      CHECKSUMS_LOOKUP[metadata_header['meta_checksum']].size
                   if metadata is not None else 0))
    # seek to the final offset
    original_fp.seek(offsets[-1], 0)
    # decompress the last chunk
    compressed, blosc_header, digest = _read_compressed_chunk_fp(original_fp,
                                                         checksum_impl)
    # TODO check digest
    decompressed = blosc.decompress(compressed)
    # figure out how many bytes we need to read to rebuild the last chunk
    ultimo_length = len(decompressed)
    bytes_to_read = bloscpack_header.chunk_size - ultimo_length
    if new_size <= bytes_to_read:
        # special case
        # must squeeze data into last chunk
        fill_up = new_content_fp.read(new_size)
        # seek back to the position of the original last chunk
        original_fp.seek(offsets[-1], 0)
        # write the chunk that has been filled up
        compressed = _compress_chunk_str(decompressed + fill_up, blosc_args)
        digest = checksum_impl(compressed)
        _write_compressed_chunk(original_fp, compressed, digest)
        # return 0 to indicate that no new chunks have been written
        # build the new header
        bloscpack_header.last_chunk += new_size
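
At its core, the append path above boils down to: decompress the final chunk, extend it with new bytes, and recompress. A simplified sketch of that step alone, ignoring headers, offsets, and checksums (the function name is illustrative, not part of bloscpack):

import blosc

def refill_last_chunk(compressed_chunk, new_bytes, clevel=9, cname='blosclz'):
    # Decompress the existing last chunk, append the new data, and recompress.
    decompressed = blosc.decompress(compressed_chunk)
    return blosc.compress(decompressed + new_bytes, typesize=1,
                          clevel=clevel, cname=cname)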