How to use the dataset.AuxTables function in dataset

To help you get started, we've selected a few dataset.AuxTables examples based on popular ways it is used in public projects.
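Across these examples, AuxTables behaves as an enum whose members identify HoloClean's auxiliary Postgres tables (cell_domain, pos_values, dk_cells, inf_values_dom), and whose .name attribute is interpolated into SQL strings. The sketch below only illustrates that pattern; it is an approximation, not HoloClean's actual definition, which defines more members.

from enum import Enum

# Minimal stand-in for the AuxTables pattern used in the snippets below
# (hypothetical values; only the members referenced in the examples are listed).
class AuxTables(Enum):
    dk_cells = 1
    cell_domain = 2
    pos_values = 3
    inf_values_dom = 4

# Typical usage: a member's .name supplies the table name for a SQL query.
query = 'SELECT _vid_, val_id-1, rv_val FROM {}'.format(AuxTables.pos_values.name)
print(query)  # SELECT _vid_, val_id-1, rv_val FROM pos_values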

github HoloClean / holoclean / repair / repair.py
def get_infer_dataframes(self, infer_idx, Y_pred):
        distr = []
        infer_val = []
        Y_assign = Y_pred.data.numpy().argmax(axis=1)
        domain_size = self.feat_dataset.var_to_domsize

        # Need to map the inferred value index of the random variable to the actual value
        # val_idx = val_id - 1 since val_id was numbered starting from 1 whereas
        # val_idx starts at 0.
        query = 'SELECT _vid_, val_id-1, rv_val FROM {pos_values}'.format(pos_values=AuxTables.pos_values.name)
        pos_values = self.ds.engine.execute_query(query)
        # dict mapping _vid_ --> val_idx --> value
        vid_to_val = {}
        for vid, val_idx, val in pos_values:
            vid_to_val[vid] = vid_to_val.get(vid, {})
            vid_to_val[vid][val_idx] = val

        for idx in range(Y_pred.shape[0]):
            vid = int(infer_idx[idx])
            rv_distr = list(Y_pred[idx].data.numpy())
            rv_val_idx = int(Y_assign[idx])
            rv_val = vid_to_val[vid][rv_val_idx]
            rv_prob = Y_pred[idx].data.numpy().max()
            d_size = domain_size[vid]
            distr.append({'_vid_': vid, 'distribution':[str(p) for p in rv_distr[:d_size]]})
            infer_val.append({'_vid_': vid, 'inferred_val_idx': rv_val_idx, 'inferred_val': rv_val, 'prob':rv_prob})
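The snippet is cut off before the function returns; presumably the two accumulated lists are wrapped into pandas DataFrames for the caller. A minimal standalone sketch of that step, under that assumption (record layout mirrors the dicts built in the loop above, it is not taken from the source):

import pandas as pd

# Hypothetical records shaped like those appended above.
distr = [{'_vid_': 0, 'distribution': ['0.1', '0.9']}]
infer_val = [{'_vid_': 0, 'inferred_val_idx': 1, 'inferred_val': 'Boston', 'prob': 0.9}]

distr_df = pd.DataFrame(distr)
infer_val_df = pd.DataFrame(infer_val)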
github HoloClean / holoclean / evaluate / eval.py
We also distinguish between repairs on correct cells and repairs on
        incorrect cells (correct cells are cells where init == ground truth).
        """
        query = """
        SELECT
            (t1.init_value = t3._value_) AS is_correct,
            count(*)
        FROM   {} as t1, {} as t2, {} as t3
        WHERE  t1._tid_ = t2._tid_
          AND  t1.attribute = t2.attribute
          AND  t1.init_value != t2.rv_value
          AND  t1._tid_ = t3._tid_
          AND  t1.attribute = t3._attribute_
        GROUP BY is_correct
          """.format(AuxTables.cell_domain.name,
                  AuxTables.inf_values_dom.name,
                  self.clean_data.name)
        res = self.ds.engine.execute_query(query)

        # Memoize the number of repairs on correct cells and incorrect cells.
        # Since we do a GROUP BY we need to check which row of the result
        # corresponds to the correct/incorrect counts.
        self.total_repairs_grdt_correct, self.total_repairs_grdt_incorrect = 0, 0
        self.total_repairs_grdt = 0
        if not res:
            return

        if res[0][0]:
            correct_idx, incorrect_idx = 0, 1
        else:
            correct_idx, incorrect_idx = 1, 0
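The snippet ends after choosing which result row holds the correct/incorrect group. A plausible continuation (an assumption, not shown in the snippet) reads the two counts while guarding against a result with only one group; written standalone:

# res is the grouped query result, e.g. [(True, 12), (False, 3)].
res = [(True, 12), (False, 3)]
correct_idx, incorrect_idx = (0, 1) if res[0][0] else (1, 0)

total_correct = float(res[correct_idx][1]) if correct_idx < len(res) else 0.0
total_incorrect = float(res[incorrect_idx][1]) if incorrect_idx < len(res) else 0.0
total_repairs_grdt = total_correct + total_incorrect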
github HoloClean / holoclean / evaluate / eval.py
def compute_correct_repairs(self):
        """
        compute_correct_repairs memoizes the number of error cells
        that were correctly inferred.

        This value is always equal or less than total errors (see
        compute_total_errors).
        """
        queries = []
        correct_repairs = 0.0
        for attr in self.ds.get_attributes():
            query = correct_repairs_template.substitute(init_table=self.ds.raw_data.name, grdt_table=self.clean_data.name,
                                                        attr=attr, inf_dom=AuxTables.inf_values_dom.name)
            queries.append(query)
        results = self.ds.engine.execute_queries(queries)
        for res in results:
            correct_repairs += float(res[0][0])
        self.correct_repairs = correct_repairs
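correct_repairs_template is not shown in this snippet; judging from the .substitute(...) call, it is a string.Template parameterized by init_table, grdt_table, attr, and inf_dom. The query below is only a hedged illustration of that shape, not the template HoloClean actually uses:

from string import Template

# Hypothetical stand-in: count inferred cells whose repaired value matches
# the ground truth for one attribute (column names follow the other snippets).
correct_repairs_template = Template("""
SELECT count(*)
FROM   $init_table AS t1, $grdt_table AS t2, $inf_dom AS t3
WHERE  t1._tid_ = t2._tid_
  AND  t1._tid_ = t3._tid_
  AND  t2._attribute_ = '$attr'
  AND  t3.attribute = '$attr'
  AND  t2._value_ = t3.rv_value
""")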
github HoloClean / holoclean / evaluate / eval.py
def compute_total_repairs(self):
        """
        compute_total_repairs memoizes the number of repairs:
        the # of cells that were inferred and where the inferred value
        is not equal to the initial value.
        """

        query = "SELECT count(*) FROM " \
                "  (SELECT _vid_ " \
                "     FROM {} as t1, {} as t2 " \
                "    WHERE t1._tid_ = t2._tid_ " \
                "      AND t1.attribute = t2.attribute " \
                "      AND t1.init_value != t2.rv_value) AS t".format(AuxTables.cell_domain.name,
                                                                      AuxTables.inf_values_dom.name)
        res = self.ds.engine.execute_query(query)
        self.total_repairs = float(res[0][0])
github HoloClean / holoclean / evaluate / eval.py
from
            {cell_domain} as t1,
            {clean_data} as t2
            left join {dk_cells} as t3 on t2._tid_ = t3._tid_ and t2._attribute_ = t3.attribute
            left join {inf_values_dom} as t4 on t2._tid_ = t4._tid_ and t2._attribute_ = t4.attribute
        where t1._tid_ = t2._tid_ and t1.attribute = t2._attribute_
        group by
            clean,
            status,
            inferred,
            init_eq_grdth,
            init_eq_infer,
            wl_eq_init,
            wl_eq_grdth,
            wl_eq_infer,
            infer_eq_grdth
        """.format(cell_domain=AuxTables.cell_domain.name,
                clean_data=self.clean_data.name,
                dk_cells=AuxTables.dk_cells.name,
                inf_values_dom=AuxTables.inf_values_dom.name)

        res = self.ds.engine.execute_query(query)

        df_stats = pd.DataFrame(res,
                columns=["is_clean", "cell_status", "is_inferred",
                    "init = grdth", "init = inferred",
                    "w. label = init", "w. label = grdth", "w. label = inferred",
                    "infer = grdth", "count"])
        df_stats = df_stats.sort_values(list(df_stats.columns)).reset_index(drop=True)
        logging.debug("weak label statistics:")
        pd.set_option('display.max_columns', None)
        pd.set_option('display.max_rows', len(df_stats))
        pd.set_option('display.max_colwidth', -1)
        logging.debug("%s", df_stats)
        pd.reset_option('display.max_columns')
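One portability note: newer pandas releases (1.0 and later) deprecate -1 as a value for display.max_colwidth; on those versions, None expresses "no limit":

import pandas as pd
pd.set_option('display.max_colwidth', None)  # equivalent of the deprecated -1 on newer pandas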
github HoloClean / holoclean / repair / featurize / occurfeat.py
def create_tensor(self):
        """
        For each unique VID (cell) returns the co-occurrence probability between
        each possible domain value for this VID and the initial/raw values for the
        corresponding entity/tuple of this cell.

        :return: Torch.Tensor of shape (# of VIDs) X (max domain) X (# of attributes)
            where tensor[i][j][k] contains the co-occur probability between the j-th domain value
            of the i-th random variable (VID) and the initial/raw value of the k-th
            attribute for the corresponding entity.
        """
        # Iterate over tuples in domain
        tensors = []
        # Set tuple_id index on raw_data
        t = self.ds.aux_table[AuxTables.cell_domain]
        sorted_domain = t.df.reset_index().sort_values(by=['_vid_'])[['_tid_','attribute','_vid_','domain']]
        records = sorted_domain.to_records()
        for row in tqdm(list(records)):
            #Get tuple from raw_dataset
            tid = row['_tid_']
            tuple = self.raw_data_dict[tid]
            feat_tensor = self.gen_feat_tensor(row, tuple)
            tensors.append(feat_tensor)
        combined = torch.cat(tensors)
        return combined
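The docstring describes a 3-D layout: one slice per random variable (VID), one row per candidate domain value, one column per attribute. The standalone sketch below, with made-up sizes, shows how per-VID tensors of shape (1, max domain, # attributes) stack up under torch.cat into the return shape described above:

import torch

# Hypothetical sizes, purely illustrative of the layout.
n_vids, max_domain, n_attrs = 4, 3, 5
per_vid = [torch.zeros(1, max_domain, n_attrs) for _ in range(n_vids)]
combined = torch.cat(per_vid)  # concatenates along dim 0, as create_tensor does
assert combined.shape == (n_vids, max_domain, n_attrs)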
github HoloClean / holoclean / domain / domain.py
"""
        store_domains stores the 'domain' DataFrame as the 'cell_domain'
        auxiliary table as well as generates the 'pos_values' auxiliary table,
        a long-format of the domain values, in Postgres.

        pos_values schema:
            _tid_: entity/tuple ID
            _cid_: cell ID
            _vid_: random variable ID (all cells with more than 1 domain value)
            _

        """
        if domain.empty:
            raise Exception("ERROR: Generated domain is empty.")
        else:
            self.ds.generate_aux_table(AuxTables.cell_domain, domain, store=True, index_attrs=['_vid_'])
            self.ds.aux_table[AuxTables.cell_domain].create_db_index(self.ds.engine, ['_tid_'])
            self.ds.aux_table[AuxTables.cell_domain].create_db_index(self.ds.engine, ['_cid_'])
            query = "SELECT _vid_, _cid_, _tid_, attribute, a.rv_val, a.val_id from %s , unnest(string_to_array(regexp_replace(domain,\'[{\"\"}]\',\'\',\'gi\'),\'|||\')) WITH ORDINALITY a(rv_val,val_id)" % AuxTables.cell_domain.name
            self.ds.generate_aux_table_sql(AuxTables.pos_values, query, index_attrs=['_tid_', 'attribute'])
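The SQL that builds pos_values strips braces and quotes from the packed 'domain' string, splits it on the '|||' separator, and numbers the resulting values from 1 (WITH ORDINALITY). A minimal Python sketch of that transformation on a hypothetical domain value:

import re

domain = '{"New York"|||"Boston"|||"Chicago"}'  # hypothetical cell_domain 'domain' value
values = re.sub(r'[{"}]', '', domain).split('|||')
pos_values = list(enumerate(values, start=1))   # (val_id, rv_val) pairs, val_id starting at 1
print(pos_values)  # [(1, 'New York'), (2, 'Boston'), (3, 'Chicago')]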
github HoloClean / holoclean / repair / featurize / initattfeat.py
def create_tensor(self):
        query = 'SELECT _vid_, attribute, init_index FROM %s ORDER BY _vid_'%AuxTables.cell_domain.name
        results = self.ds.engine.execute_query(query)
        map_input = []
        for res in results:
            map_input.append((res[0], self.attr_to_idx[res[1]], res[2]))
        tensors = self.pool.map(partial(gen_feat_tensor, classes=self.classes, total_attrs=self.total_attrs), map_input)
        combined = torch.cat(tensors)
        return combined