How to use the goodtables.datatable.DataTable class in goodtables

To help you get started, we’ve selected a few goodtables examples, based on popular ways the library is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github frictionlessdata / goodtables-py / tests / test_datatable.py View on Github external
def test_wrong_encoding_replaces(self):
        """A DataTable built with a wrong encoding and 'replace' is truthy."""
        csv_path = os.path.join(
            self.data_dir, 'hmt', 'BIS_spending_over__25_000_July_2014.csv')

        # 'UTF-8' is deliberately wrong here; it should be 'ISO-8859-2'.
        table = datatable.DataTable(
            csv_path, encoding='UTF-8', decode_strategy='replace')

        self.assertTrue(table)
github frictionlessdata / goodtables-py / tests / test_datatable.py View on Github external
def test_excel_from_url(self):
        """A DataTable built from an Excel URL exposes truthy headers."""
        xls_url = 'https://github.com/okfn/goodtables/raw/master/examples/hmt/BIS_monthly_spend_December_2012.xls'

        table = datatable.DataTable(xls_url, format='excel')

        self.assertTrue(table.headers)
github frictionlessdata / goodtables-py / tests / test_datatable.py View on Github external
def test_set_decoding_on_self_when_passed(self):
        """When an encoding is passed, `encoding` equals `passed_encoding`."""
        csv_path = os.path.join(
            self.data_dir, 'jungle', 'VilleMTP_MTP_BudgetPri_2015.csv')

        table = datatable.DataTable(csv_path, encoding='windows-1252')

        self.assertEqual(table.encoding, table.passed_encoding)
github frictionlessdata / goodtables-py / goodtables / processors / base.py View on Github external
return False
            return run_valid

        valid = True
        openfiles = []

        if is_table:
            data = data_source
        else:
            try:
                data = datatable.DataTable(data_source, headers=headers,
                                           format=format, encoding=encoding,
                                           decode_strategy=decode_strategy,
                                           header_index=self.header_index)
                openfiles.extend(data.openfiles)
            except datatable.DataTable.RAISES as e:
                valid = False
                data = None
                if isinstance(e, exceptions.DataSourceHTTPError):
                    error_type = 'http_{0}_error'.format(e.status)
                elif isinstance(e, exceptions.DataSourceDecodeError):
                    error_type = 'data_decode_error'
                elif isinstance(e, exceptions.DataSourceFormatUnsupportedError):
                    error_type = 'data_{0}_error'.format(e.file_format)
                elif isinstance(e, exceptions.DataSourceMalformatedError):
                    error_type = 'invalid_{0}_error'.format(format)
                    
                entry = self.make_entry(
                    processor='base',
                    result_category=self.RESULT_CATEGORY_FILE,
                    result_level=self.RESULT_LEVEL_ERROR,
                    result_message=e.msg,
github frictionlessdata / goodtables-py / goodtables / processors / base.py View on Github external
"""

        def _run_valid(process_valid, run_valid):
            """Fold one processor's validity into the validity of the run.

            Returns `run_valid` unchanged unless the run was still valid and
            this processor was not, in which case the result is False.
            """
            if process_valid or not run_valid:
                # Nothing to downgrade: either the processor passed or the
                # run was already invalid.
                return run_valid
            return False

        valid = True
        openfiles = []

        if is_table:
            data = data_source
        else:
            try:
                data = datatable.DataTable(data_source, headers=headers,
                                           format=format, encoding=encoding,
                                           decode_strategy=decode_strategy,
                                           header_index=self.header_index)
                openfiles.extend(data.openfiles)
            except datatable.DataTable.RAISES as e:
                valid = False
                data = None
                if isinstance(e, exceptions.DataSourceHTTPError):
                    error_type = 'http_{0}_error'.format(e.status)
                elif isinstance(e, exceptions.DataSourceDecodeError):
                    error_type = 'data_decode_error'
                elif isinstance(e, exceptions.DataSourceFormatUnsupportedError):
                    error_type = 'data_{0}_error'.format(e.file_format)
                elif isinstance(e, exceptions.DataSourceMalformatedError):
                    error_type = 'invalid_{0}_error'.format(format)
github frictionlessdata / goodtables-py / goodtables / pipeline / pipeline.py View on Github external
report_backend = 'yaml'

        report_options = {
            'schema': helpers.report_schema,
            'backend': report_backend,
            'client_stream': self.report_stream,
            'limit': self.report_limit,
            'post_task': self.report_post_task
        }

        self.report = tellme.Report('Pipeline', **report_options)

        self.pipeline = self.get_pipeline()

        try:
            self.data = datatable.DataTable(self.data_source, format=self.format,
                                            encoding=encoding,
                                            decode_strategy=decode_strategy,
                                            header_index=self.header_index)
            self.openfiles.extend(self.data.openfiles)
            
        except datatable.DataTable.RAISES:
            self.data = self.data_source
github frictionlessdata / goodtables-py / goodtables / pipeline / batch.py View on Github external
def get_dataset_csv(self):
        """Get the dataset from a CSV file for this batch process."""

        dataset = []
        resources = datatable.DataTable(self.source, encoding='utf-8')

        data_index = resources.headers.index(self.data_key)
        keys_header_index = {}
        
        for key in [self.schema_key, self.format_key, self.encoding_key]:
            if key in resources.headers:
                keys_header_index[key] = resources.headers.index(key)

        for entry in resources.values:

            rv = {'data': entry[data_index], 'schema': None, 'encoding': None,
                  'format': None}
            
            for key, index in keys_header_index.items():
                if index is not None:
                    rv[key] = entry[index]
github frictionlessdata / goodtables-py / goodtables / pipeline / pipeline.py View on Github external
def run(self):
        """Run the pipeline."""

        def _run_valid(process_valid, run_valid):
            """Fold one processor's validity into the validity of the run.

            Returns `run_valid` unchanged unless the run was still valid and
            this processor was not, in which case the result is False.
            """
            if process_valid or not run_valid:
                # Nothing to downgrade: either the processor passed or the
                # run was already invalid.
                return run_valid
            return False

        valid = True

        for processor in self.pipeline:
            
            if isinstance(self.data, datatable.DataTable):
                _valid, _, self.data = processor.run(self.data, is_table=True,
                    encoding=self.encoding, decode_strategy=self.decode_strategy)
            else:
                _valid, _, self.data = processor.run(self.data_source, is_table=False,
                                            decode_strategy=self.decode_strategy,
                                            encoding=self.encoding, format=self.format)
            valid = _run_valid(_valid, valid)
            
            # if a validator returns invalid, we stop the pipeline,
            # unless break_on_invalid_processor is False
            if not valid and self.break_on_invalid_processor:
                break

            if self.data:
                self.data.replay()