How to use the dataflows.helpers.resource_matcher.ResourceMatcher class in dataflows

To help you get started, we've selected a few dataflows examples based on popular ways ResourceMatcher is used in public projects.
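Every example below follows the same pattern: build a ResourceMatcher from a resources spec (a single resource name, a list of names, or None, which the examples suggest matches all resources) and the datapackage, then call match(name) per resource to decide whether to transform it or pass it through. Here is a minimal sketch of that pattern; my_processor and transform_rows are hypothetical names, not part of dataflows:

from dataflows.helpers.resource_matcher import ResourceMatcher

def transform_rows(rows):
    # Hypothetical row-level transform - replace with real logic.
    for row in rows:
        yield row

def my_processor(resources=None):
    def func(package):
        # `resources` is the spec ResourceMatcher accepts: a name, a list of names, or None.
        matcher = ResourceMatcher(resources, package.pkg)
        yield package.pkg  # re-emit the datapackage descriptor first
        for rows in package:
            if matcher.match(rows.res.name):
                yield transform_rows(rows)  # transform matching resources
            else:
                yield rows  # pass non-matching resources through unchanged
    return func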


github datahq/dataflows: dataflows/processors/printer.py
def func(rows):
    spec = rows.res

    if not ResourceMatcher(resources, spec.descriptor).match(spec.name):
        yield from rows
        return

    header_print(spec.name, kwargs)

    schema_fields = spec.schema.fields
    if fields:
        schema_fields = [f for f in schema_fields if f.name in fields]

    field_names = [f.name for f in schema_fields]
    headers = ['#'] + [
        '{}\n({})'.format(f.name, f.type) for f in schema_fields
    ]
    toprint = []
    last = []
    x = 1
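printer.py shows the early-exit form: rows from a non-matching resource are yielded straight through and the function returns, so the table rendering below only runs for matching resources. Note that the matcher here is built from the resource's own descriptor (spec.descriptor) rather than the whole package.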
github datahq/dataflows: dataflows/processors/filter_rows.py
def func(package):
    matcher = ResourceMatcher(resources, package.pkg)
    yield package.pkg
    for r in package:
        if matcher.match(r.res.name):
            yield process_resource(r, condition)
        else:
            yield r
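filter_rows.py is the canonical package-level form: build the matcher once from package.pkg, yield the datapackage, then wrap matching resources (here with process_resource and the filter condition) while passing the rest through unchanged.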
github datahq/dataflows: dataflows/processors/unpivot.py
def func(package):

    matcher = ResourceMatcher(resources, package.pkg)
    all_res_config = {}
    for resource in package.pkg.descriptor['resources']:
        config = all_res_config.setdefault(resource['name'], {})
        name = resource['name']
        if not matcher.match(name):
            continue
        schema = resource.get('schema')
        if schema is None:
            continue

        fields = schema.get('fields', [])

        for u_field in unpivot_fields:
            field_name_re = re.compile(u_field['name'])
            fields_to_pivot = list(
                filter(match_fields(field_name_re, True), fields)
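unpivot.py applies the same gate at the descriptor level: it walks package.pkg.descriptor['resources'], skips resources whose names don't match as well as resources without a schema, and only builds unpivot configuration for the rest.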
github frictionlessdata/datapackage-pipelines: datapackage_pipelines/lib/stream_remote_resources.py
    close()
    del stream

    return itertools\
        .islice(
            _reader(
                get_opener(_url, _resource, columns),
                _url,
                max_row=limit_rows),
            1, None)


parameters, datapackage, resource_iterator = ingest()

resources = ResourceMatcher(parameters.get('resources'), datapackage)
ignore_missing = parameters.get('ignore-missing', False)
limit_rows = parameters.get('limit-rows', -1)

new_resource_iterator = []
for resource in datapackage['resources']:

    if streamable(resource):
        url = resource[PROP_STREAMED_FROM]

        name = resource['name']
        if not resources.match(name):
            continue

        path = get_path(resource)
        if path is None or path == PATH_PLACEHOLDER:
            path = os.path.join('data', name + '.csv')
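In datapackage-pipelines the class is used outside of dataflows processors: the matcher is built from the pipeline's 'resources' parameter and the ingested datapackage, and streamable resources whose names don't match are skipped before streaming.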
github datahq/dataflows: dataflows/processors/sort_rows.py
def func(package):
    matcher = ResourceMatcher(resources, package.pkg)
    yield package.pkg
    for rows in package:
        if matcher.match(rows.res.name):
            yield _sorter(rows, key_calc, reverse, batch_size)
        else:
            yield rows
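sort_rows.py repeats the canonical pattern, sorting only the matching resources and forwarding the rest as-is.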
github frictionlessdata/datapackage-pipelines: datapackage_pipelines/lib/load_resource.py
        if url.startswith(dep_prefix):
            dependency = url[len(dep_prefix):].strip()
            url = get_dependency_datapackage_url(dependency)
            assert url is not None, "Failed to fetch output datapackage for dependency '%s'" % dependency
        stream = self.parameters.get('stream', True)
        required = self.parameters.get('required', True)
        resource = self.parameters.get('resource')
        resources = self.parameters.get('resources')
        if resource is not None:
            assert not resources
            resource_index = resource if isinstance(resource, int) else None
        else:
            assert resources
            resource_index = None
            resource = list(resources.keys())
        name_matcher = ResourceMatcher(resource, self.dp) \
                       if isinstance(resource, (str, list)) \
                       else None

        selected_resources = []
        found = False
        try:
            dp = datapackage.DataPackage(url)
        except Exception:
            if required:
                raise
            else:
                dp = None
        if dp:
            dp = self.process_datapackage(dp)
            for i, orig_res in enumerate(dp.resources):
                if resource_index == i or \
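load_resource.py constructs a ResourceMatcher only when the 'resource' parameter is a name or a list of names; an integer parameter is treated as a positional resource_index instead, in which case name_matcher stays None.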
github datahq/dataflows: dataflows/processors/delete_fields.py
def func(package):
    matcher = ResourceMatcher(resources, package.pkg)
    dp_resources = package.pkg.descriptor.get('resources', [])
    field_res = [
        re.compile('^{}$'.format(f if regex else re.escape(f))) for f in fields
    ]
    matched = set()
    new_field_names = {}
    for resource in dp_resources:
        if matcher.match(resource['name']):
            schema_fields = resource['schema'].get('fields', [])
            new_fields = []
            for sf in schema_fields:
                skip = False
                for f in field_res:
                    if f.match(sf['name']):
                        skip = True
                        matched.add(f.pattern)
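delete_fields.py layers field-name regexes on top of resource matching: the matcher picks which resources' schemas to edit, and each schema field is then tested against the compiled patterns (escaped literals unless regex mode is on).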
github datahq/dataflows: dataflows/processors/load.py
            self.iterators = (resource for resource, descriptor in zip(resource_iterator, resources)
                              if resource_matcher.match(descriptor['name']))

        # If load_source is string:
        else:
            # Handle Environment vars if necessary:
            if self.load_source.startswith('env://'):
                env_var = self.load_source[6:]
                self.load_source = os.environ.get(env_var)
                if self.load_source is None:
                    raise ValueError(f"Couldn't find value for env var '{env_var}'")

            # Loading from datapackage:
            if os.path.basename(self.load_source) == 'datapackage.json' or self.options.get('format') == 'datapackage':
                self.load_dp = Package(self.load_source)
                resource_matcher = ResourceMatcher(self.resources, self.load_dp)
                for resource in self.load_dp.resources:
                    if resource_matcher.match(resource.name):
                        self.resource_descriptors.append(resource.descriptor)
                        self.iterators.append(resource.iter(keyed=True, cast=True))

            # Loading for any other source
            else:
                path = os.path.basename(self.load_source)
                path = os.path.splitext(path)[0]
                descriptor = dict(path=self.name or path,
                                  profile='tabular-data-resource')
                self.resource_descriptors.append(descriptor)
                descriptor['name'] = self.name or path
                if 'encoding' in self.options:
                    descriptor['encoding'] = self.options['encoding']
                self.options.setdefault('custom_parsers', {}).setdefault('xml', XMLParser)
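load.py builds a fresh matcher per source kind: once to filter an already-open resource iterator against its descriptors, and once to select resources out of a loaded datapackage.json before collecting their descriptors and keyed iterators.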
github datahq/dataflows: dataflows/processors/update_resource.py
def func(package: PackageWrapper):
    matcher = ResourceMatcher(resources, package.pkg)
    for resource in package.pkg.descriptor['resources']:
        if matcher.match(resource['name']):
            resource.update(props)
    yield package.pkg

    res_iter = iter(package)
    for r in res_iter:
        if matcher.match(r.res.name):
            yield r.it
        else:
            yield r
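update_resource.py reuses one matcher instance across both phases: first to patch the descriptors of matching resources before yielding the package, then to decide which row streams to unwrap (r.it) and which to forward untouched.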
github datahq/dataflows: dataflows/processors/find_replace.py
def func(package):
    matcher = ResourceMatcher(resources, package.pkg)
    yield package.pkg
    for rows in package:
        if matcher.match(rows.res.name):
            yield _find_replace(rows, fields)
        else:
            yield rows
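find_replace.py is one more instance of the sketch above: yield the package, run _find_replace over matching resources, and pass everything else through.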