How to use the petastorm.predicates.PredicateBase class in petastorm

To help you get started, we’ve selected a few petastorm examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github uber / petastorm / petastorm / predicates.py View on Github external
raise ValueError('Predicate is nor derived from PredicateBase')
        self._predicate_list = predicate_list
        self._reduce_func = reduce_func

    def get_fields(self):
        """ Return the union of the field sets reported by all aggregated predicates. """
        collected = set()
        for predicate in self._predicate_list:
            # Each predicate reports its own set of fields; merge them one by one.
            collected = collected | predicate.get_fields()
        return collected

    def do_include(self, values):
        """ Evaluate every aggregated predicate on values and combine the results with the reduce function.

        :param values: passed unchanged to each predicate's do_include
        :return: whatever the reduce function returns for the list of individual results
        """
        verdicts = []
        for predicate in self._predicate_list:
            verdicts.append(predicate.do_include(values))
        return self._reduce_func(verdicts)


class in_pseudorandom_split(PredicateBase):
    """ Split dataset according to a split list based on volume_guid.
        The split is pseudorandom (can not supply the seed yet), i.e. the split outcome is always the same.
        Split is performed by hashing volume_guid uniformly to 0:1 range and returning part of full dataset
        which was hashed in given sub-range

        Example:
            'split_list = [0.5, 0.2, 0.3]' - dataset will be split on three subsets in proportion
            subset 1: 0.5 of log data
            subset 2: 0.2 of log data
            subset 3: 0.3 of log data
            Note, split is not exact, so avoid small fraction (e.g. 0.001) to avoid empty sets
    """

    def __init__(self, fraction_list, subset_index, predicate_field):
        """ split_list: a list of log fractions (real numbers in range [0:1])
            subset_index: define which subset will be used by the Reader
github uber / petastorm / petastorm / predicates.py View on Github external
""" Test if predicate_field list contain at least one value from inclusion_values set """

    def __init__(self, inclusion_values, _predicate_field):
        """
        :param inclusion_values: iterable of values to look for; it is copied into a list
        :param _predicate_field: name of the field whose value is tested
        """
        accepted = list(inclusion_values)
        self._inclusion_values = accepted
        self._predicate_field = _predicate_field

    def get_fields(self):
        """ Return a one-element set holding the name of the predicate field. """
        field_name = self._predicate_field
        return set((field_name,))

    def do_include(self, values):
        """ Tell whether the predicate field shares at least one element with the inclusion values.

        :param values: mapping from field name to field value; must contain the predicate field
        :return: True if any element of the field value is one of the inclusion values
        :raises ValueError: if the field value is not iterable
        """
        # The ABC aliases in ``collections`` were removed in Python 3.10, so take Iterable from
        # ``collections.abc`` and fall back to the old location only where that module is missing.
        try:
            from collections.abc import Iterable
        except ImportError:
            from collections import Iterable

        field_value = values[self._predicate_field]
        if not isinstance(field_value, Iterable):
            raise ValueError('Predicate field should have iterable type')
        # np.isin supersedes the deprecated np.in1d; the explicit ravel keeps in1d's flattening
        # so a multi-dimensional field value still reduces to a single answer.
        return bool(np.isin(np.asarray(field_value).ravel(), self._inclusion_values).any())


class in_lambda(PredicateBase):
    """ Wrap up custom function to be used as a predicate
        example: in_lambda(['labels_object_roles'], lambda labels_object_roles : len(labels_object_roles) > 3)
    """

    def __init__(self, predicate_fields, predicate_func, state_arg=None):
        """
        :param predicate_fields: list of fields to be used in predicate
        :param predicate_func: predicate function
               example: lambda labels_object_roles : len(labels_object_roles) > 3
        :param state_arg: additional object to keep function state. it will be passed to
               predicate_func after fields arguments ONLY if it is not None
        """
        if not isinstance(predicate_fields, list):
            raise ValueError('Predicate fields should be a list')
        self._predicate_fields = predicate_fields
        self._predicate_func = predicate_func
github uber / petastorm / petastorm / reader.py View on Github external
def _apply_predicate_to_row_groups(self, dataset, row_groups, predicate):
        """Filters the list of row group indexes using rowgroup selector object. Returns a modified list of rowgroup
        indexes and a list of worker_predicate: predicates that could not be applied at this level
        (parquet partitioning)."""

        if predicate:
            if not isinstance(predicate, PredicateBase):
                raise ValueError('predicate parameter is expected to be derived from PredicateBase')
            predicate_fields = predicate.get_fields()

            if set(predicate_fields) == dataset.partitions.partition_names:
                assert len(dataset.partitions.partition_names) == 1, \
                    'Datasets with only a single partition level supported at the moment'

                filtered_row_group_indexes = []
                for piece_index, piece in enumerate(row_groups):
                    partition_name, partition_index = piece.partition_keys[0]
                    partition_value = dataset.partitions[0].keys[partition_index]

                    # Convert partition value to correct type per the schema
                    partition_value = self.schema.fields[partition_name].numpy_dtype(partition_value)
                    if predicate.do_include({partition_name: partition_value}):
                        filtered_row_group_indexes.append(piece_index)
github uber / petastorm / petastorm / predicates.py View on Github external
""" A predicate used to negate another predicate. """

    def __init__(self, predicate):
        """
        :param predicate: the predicate whose result is inverted; must be derived from PredicateBase
        :raises ValueError: if predicate is not a PredicateBase instance
        """
        if not isinstance(predicate, PredicateBase):
            raise ValueError('Predicate is not derived from PredicateBase')

        self._predicate = predicate

    def get_fields(self):
        """ Return the fields required by the wrapped predicate. """
        wrapped = self._predicate
        return wrapped.get_fields()

    def do_include(self, values):
        """ Return the logical negation of the wrapped predicate's decision for values. """
        if self._predicate.do_include(values):
            return False
        return True


class in_reduce(PredicateBase):
    """ A predicate used to aggregate other predicates using any reduce logical operation."""

    def __init__(self, predicate_list, reduce_func):
        """
        :param predicate_list: iterable of predicates, each derived from PredicateBase
        :param reduce_func: function aggregating the list of individual predicate results,
               e.g. all() implements logical 'And', any() implements logical 'Or'
        :raises ValueError: if any element of predicate_list is not a PredicateBase instance
        """
        # Materialize once: a one-shot iterable (e.g. a generator) would otherwise be consumed
        # by the validation below and leave the predicate with nothing to evaluate.
        predicates = list(predicate_list)
        if not all(isinstance(p, PredicateBase) for p in predicates):
            raise ValueError('Predicate is not derived from PredicateBase')
        self._predicate_list = predicates
        self._reduce_func = reduce_func

    def get_fields(self):
        fields = set()
        for p in self._predicate_list:
github uber / petastorm / petastorm / reader_impl / reader_v2.py View on Github external
def _apply_predicate_to_row_groups(self, dataset, row_groups, predicate):
        """Filters the list of row group indexes using rowgroup selector object. Returns a modified list of rowgroup
        indexes and a list of worker_predicate: predicates that could not be applied at this level
        (parquet partitioning)."""

        if predicate:
            if not isinstance(predicate, PredicateBase):
                raise ValueError('predicate parameter is expected to be derived from PredicateBase')
            predicate_fields = predicate.get_fields()

            if set(predicate_fields) == dataset.partitions.partition_names:
                assert len(dataset.partitions.partition_names) == 1, \
                    'Datasets with only a single partition level supported at the moment'

                filtered_row_group_indexes = []
                for piece_index, piece in enumerate(row_groups):
                    partition_name, partition_index = piece.partition_keys[0]
                    partition_value = dataset.partitions[0].keys[partition_index]
                    if predicate.do_include({partition_name: partition_value}):
                        filtered_row_group_indexes.append(piece_index)

                worker_predicate = None
            else:
github uber / petastorm / petastorm / predicates.py View on Github external
    @abc.abstractmethod
    def get_fields(self):
        """ Return the set of field names the predicate needs in order to be evaluated. """

    @abc.abstractmethod
    def do_include(self, values):
        """ Decide whether the item described by values (a mapping from field name to value) is included. """


def _string_to_bucket(string, bucket_num):
    """ Map a string to a bucket index in range(bucket_num) using its MD5 digest.

    The string is encoded as UTF-8, hashed, and the digest read as one big integer is taken
    modulo bucket_num, so the same string always lands in the same bucket.
    """
    hasher = hashlib.md5()
    hasher.update(string.encode('utf-8'))
    digest_as_int = int(hasher.hexdigest(), 16)
    return digest_as_int % bucket_num


class in_set(PredicateBase):
    """ Predicate accepting an item when the value of one field belongs to a fixed set of values """

    def __init__(self, inclusion_values, predicate_field):
        """
        :param inclusion_values: iterable of accepted values; it is copied into a set
        :param predicate_field: name of the field whose value is tested
        """
        accepted = set(inclusion_values)
        self._inclusion_values = accepted
        self._predicate_field = predicate_field

    def get_fields(self):
        """ Return a one-element set holding the name of the predicate field. """
        return set((self._predicate_field,))

    def do_include(self, values):
        """ Return True when the predicate field's value in values is one of the accepted values. """
        candidate = values[self._predicate_field]
        return candidate in self._inclusion_values


class in_intersection(PredicateBase):
    """ Test if predicate_field list contain at least one value from inclusion_values set """
github uber / petastorm / petastorm / predicates.py View on Github external
raise ValueError('Predicate fields should be a list')
        self._predicate_fields = predicate_fields
        self._predicate_func = predicate_func
        self._state_arg = state_arg

    def get_fields(self):
        """ Return the predicate fields as a set. """
        field_names = set()
        field_names.update(self._predicate_fields)
        return field_names

    def do_include(self, values):
        """ Call the predicate function with the values of the predicate fields, in declared order.

        The state argument is appended as the last positional argument only when it is not None.

        :param values: mapping from field name to field value
        :return: whatever the predicate function returns
        """
        call_args = []
        for field_name in self._predicate_fields:
            call_args.append(values[field_name])
        trailing = [] if self._state_arg is None else [self._state_arg]
        return self._predicate_func(*(call_args + trailing))


class in_negate(PredicateBase):
    """ A predicate used to negate another predicate. """

    def __init__(self, predicate):
        """
        :param predicate: the predicate whose result is inverted; must be derived from PredicateBase
        :raises ValueError: if predicate is not a PredicateBase instance
        """
        if not isinstance(predicate, PredicateBase):
            raise ValueError('Predicate is not derived from PredicateBase')

        self._predicate = predicate

    def get_fields(self):
        """ Return the fields required by the wrapped predicate. """
        return self._predicate.get_fields()

    def do_include(self, values):
        """ Return the logical negation of the wrapped predicate's decision for values. """
        return not self._predicate.do_include(values)


class in_reduce(PredicateBase):
github uber / petastorm / petastorm / predicates.py View on Github external
def __init__(self, predicate_list, reduce_func):
        """
        :param predicate_list: iterable of predicates, each derived from PredicateBase
        :param reduce_func: function aggregating the list of individual predicate results,
               e.g. all() implements logical 'And', any() implements logical 'Or'
        :raises ValueError: if any element of predicate_list is not a PredicateBase instance
        """
        # Materialize once: a one-shot iterable (e.g. a generator) would otherwise be consumed
        # by the validation below and leave the predicate with nothing to evaluate.
        predicates = list(predicate_list)
        if not all(isinstance(p, PredicateBase) for p in predicates):
            raise ValueError('Predicate is not derived from PredicateBase')
        self._predicate_list = predicates
        self._reduce_func = reduce_func
github uber / petastorm / petastorm / predicates.py View on Github external
class in_set(PredicateBase):
    """ Predicate accepting an item when the value of one field belongs to a fixed set of values """

    def __init__(self, inclusion_values, predicate_field):
        """
        :param inclusion_values: iterable of accepted values; it is copied into a set
        :param predicate_field: name of the field whose value is tested
        """
        accepted = set(inclusion_values)
        self._inclusion_values = accepted
        self._predicate_field = predicate_field

    def get_fields(self):
        """ Return a one-element set holding the name of the predicate field. """
        return set((self._predicate_field,))

    def do_include(self, values):
        """ Return True when the predicate field's value in values is one of the accepted values. """
        candidate = values[self._predicate_field]
        return candidate in self._inclusion_values


class in_intersection(PredicateBase):
    """ Test if the predicate_field list contains at least one value from the inclusion_values set """

    def __init__(self, inclusion_values, _predicate_field):
        """
        :param inclusion_values: iterable of values to look for; it is copied into a list
        :param _predicate_field: name of the field whose (iterable) value is tested
        """
        self._inclusion_values = list(inclusion_values)
        self._predicate_field = _predicate_field

    def get_fields(self):
        """ Return a one-element set holding the name of the predicate field. """
        return {self._predicate_field}

    def do_include(self, values):
        """ Tell whether the predicate field shares at least one element with the inclusion values.

        :param values: mapping from field name to field value; must contain the predicate field
        :return: True if any element of the field value is one of the inclusion values
        :raises ValueError: if the field value is not iterable
        """
        # The ABC aliases in ``collections`` were removed in Python 3.10, so take Iterable from
        # ``collections.abc`` and fall back to the old location only where that module is missing.
        try:
            from collections.abc import Iterable
        except ImportError:
            from collections import Iterable

        field_value = values[self._predicate_field]
        if not isinstance(field_value, Iterable):
            raise ValueError('Predicate field should have iterable type')
        # np.isin supersedes the deprecated np.in1d; the explicit ravel keeps in1d's flattening
        # so a multi-dimensional field value still reduces to a single answer.
        return bool(np.isin(np.asarray(field_value).ravel(), self._inclusion_values).any())


class in_lambda(PredicateBase):