How to use the datatable.isna function in datatable

To help you get started, we've selected a few examples that show how datatable's isna function is used in popular public projects.


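All of the excerpts below follow the same two basic patterns: use dt.isna(...) as a row filter to select the rows where a column is missing, or assign into that selection to fill the missing values. Here is a minimal, self-contained sketch of both patterns (the frame and column names are made up for illustration):

import datatable as dt
from datatable import f

DT = dt.Frame(A=[1, None, 3, None, 5], B=["a", "b", None, "d", "e"])

# dt.isna(f.A) evaluates to a boolean column; used as a row filter it
# keeps only the rows where A is missing.
missing_a = DT[dt.isna(f.A), :]

# Negating the condition with ~ keeps the rows where A is present.
present_a = DT[~dt.isna(f.A), :]

# Assigning to the filtered rows fills the missing values in place.
DT[dt.isna(f.A), "A"] = 0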
h2oai / driverlessai-recipes / data / airlines_joined_data_flights_in_out.py (view on GitHub)
            ("out", "CRSDepTime_mod", "CRSDepTime", "Origin"),
            ("in", "CRSArrTime_mod", "CRSArrTime", "Dest")
        ]:
            X[:, new_col] = X[:, dt.f[col] // timeslice_mins]
            group_cols = [date_col, group, new_col]
            new_name = 'flights_%s_per_%d_min' % (name, timeslice_mins)
            flights = X[:, {new_name: dt.count()}, dt.by(*group_cols)]
            flights.key = group_cols
            cols_to_keep.append(new_name)
            X = X[:, :, dt.join(flights)]

        # select flights leaving from SFO only
        X = X[dt.f['Origin'] == 'SFO', :]

        # Fill NaNs in DepDelay column
        X[dt.isna(dt.f['DepDelay']), 'DepDelay'] = 0

        # create binary target column
        depdelay_threshold_mins = 15
        target = 'DepDelay%dm' % depdelay_threshold_mins
        X[:, target] = dt.f['DepDelay'] > depdelay_threshold_mins
        cols_to_keep.extend([
            target,
            'Year',
            'Month',
            'DayofMonth',
            'DayOfWeek',
            'CRSDepTime',
            'UniqueCarrier',
            'FlightNum',
            'TailNum',
            'CRSElapsedTime',
h2oai / datatable / tests / ijby / test-assign.py (view on GitHub)
def test_assign_string_columns():
    DT = dt.Frame(A=["One", "two", "three", None, "five"])
    DT[dt.isna(f.A), f.A] = dt.Frame(["FOUR"])
    assert_equals(DT, dt.Frame(A=["One", "two", "three", "FOUR", "five"]))
h2oai / datatable / tests / munging / test_dt_rows.py (view on GitHub)
def test_rows_isna(df1):
    from datatable import isna
    dt1 = df1[isna(f.A), :]
    frame_integrity_check(dt1)
    assert dt1.names == df1.names
    assert dt1.to_list() == [[None, None], [None, 8]]
h2oai / driverlessai-recipes / models / linearsvm.py (view on GitHub)
        if self.num_classes >= 2:
            model = SVC(kernel='linear', probability=True, random_state=self.random_state)
            lb = LabelEncoder()
            lb.fit(self.labels)
            y = lb.transform(y)
        else:
            model = SVR(kernel='linear')
        self.means = dict()
        for col in X.names:
            XX = X[:, col]
            self.means[col] = XX.mean1()
            if np.isnan(self.means[col]):
                self.means[col] = 0
            XX.replace(None, self.means[col])
            X[:, col] = XX
            assert X[dt.isna(dt.f[col]), col].nrows == 0
        X = X.to_numpy()
        model.fit(X, y, sample_weight=sample_weight)
        self.set_model_properties(model=model,
                                  features=orig_cols,
                                  importances=abs(model.coef_[0]),
                                  iterations=0)
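
The loop above is a recurring imputation pattern in these recipes: compute each column's mean, fall back to 0 when the column is entirely missing, replace the NAs, and assert with dt.isna that none remain. A minimal, standalone sketch of the same idea (the frame and column names here are hypothetical):

import numpy as np
import datatable as dt

DT = dt.Frame(x1=[1.0, None, 3.0], x2=[None, 2.0, None])

for col in DT.names:
    # mean1() returns a scalar mean for a single-column frame.
    mean = DT[:, col].mean1()
    if mean is None or np.isnan(mean):
        mean = 0
    # Fill the missing entries of this column with its mean.
    DT[dt.isna(dt.f[col]), col] = mean
    assert DT[dt.isna(dt.f[col]), col].nrows == 0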
h2oai / driverlessai-recipes / models / algorithms / libfm_fastfm.py (view on GitHub)
        else:
            model = als.FMRegression(n_iter=self.params["n_iter"], init_stdev=self.params["init_stdev"],
                                     rank=self.params["rank"], l2_reg_w=self.params["l2_reg_w"],
                                     l2_reg_V=self.params["l2_reg_V"], random_state=self.random_state)

        self.means = dict()
        self.standard_scaler = StandardScaler()
        for col in X.names:
            XX = X[:, col]
            self.means[col] = XX.mean1()
            if np.isnan(self.means[col]):
                self.means[col] = 0
            XX.replace(None, self.means[col])
            X[:, col] = XX
            assert X[dt.isna(dt.f[col]), col].nrows == 0
        X = X.to_numpy()
        X = self.standard_scaler.fit_transform(X)
        X = csr_matrix(X)  # requires sparse matrix
        model.fit(X, y)
        importances = np.array(abs(model.w_))

        self.set_model_properties(model=model,
                                  features=orig_cols,
                                  importances=importances.tolist(),  # abs(model.coef_[0])
                                  iterations=0)
h2oai / driverlessai-recipes / models / algorithms / nusvm.py (view on GitHub)
            y = lb.transform(y)
        else:
            feature_model = NuSVR(kernel='linear', nu=self.params['nu'])
            model = NuSVR(nu=self.params['nu'], kernel=self.params['kernel'],
                          degree=self.params['degree'])

        self.means = dict()

        for col in X.names:
            XX = X[:, col]
            self.means[col] = XX.mean1()
            if self.means[col] is None:
                self.means[col] = 0
            XX.replace(None, self.means[col])
            X[:, col] = XX
            assert X[dt.isna(dt.f[col]), col].nrows == 0

        X = X.to_numpy()

        # nu is infeasible sometimes
        # doing quaternary search on both sides of selected nu
        valid_nu = None
        while valid_nu is None:
            try:
                model.fit(X, y)
                valid_nu = self.params['nu']
            except:
                if self.params['nu'] > 0.5:
                    self.params['nu'] = 1.0 - self.params['nu']
                else:
                    self.params['nu'] = (4.0 - 3.0 * self.params['nu']) / 4.0
                if self.num_classes >= 2:
h2oai / driverlessai-recipes / algorithms / svm.py (view on GitHub)
            model = SVC(C=self.params["C"], kernel=self.params["kernel"], probability=True, random_state=self.random_state)
            lb = LabelEncoder()
            lb.fit(self.labels)
            y = lb.transform(y)
        else:
            model = SVR(C=self.params["C"], kernel=self.params["kernel"], epsilon=self.params["epsilon"])
        self.means = dict()
        self.scaler = StandardScaler()
        for col in X.names:
            XX = X[:, col]
            self.means[col] = XX.mean1()
            if np.isnan(self.means[col]):
                self.means[col] = 0
            XX.replace(None, self.means[col])
            X[:, col] = XX
            assert X[dt.isna(dt.f[col]), col].nrows == 0
        X = X.to_numpy()
        X = self.scaler.fit_transform(X)
        if self.num_classes >= 2:
            feature_model.fit(X, y, sample_weight=sample_weight)
            model.fit(X, y, sample_weight=sample_weight)
        else:
            feature_model.fit(X, y)
            model.fit(X, y)

        importances = np.array(abs(feature_model.coef_))

        self.set_model_properties(model=model,
                                  features=orig_cols,
                                  importances=importances.tolist(),  # abs(model.coef_[0])
                                  iterations=0)
h2oai / driverlessai-recipes / transformers / generic / count_missing_values_transformer.py (view on GitHub)
    def transform(self, X: dt.Frame):
        if X.ncols == 0:
            return np.zeros(X.nrows)
        return X[:, dt.sum([dt.isna(dt.f[x]) for x in range(X.ncols)])]
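
The transformer above builds one dt.isna flag per column and sums them inside a single datatable expression. A related, minimal sketch (toy data, hypothetical column names) that sums isna flags to count the missing values in each column:

import datatable as dt
from datatable import f

DT = dt.Frame(A=[1, None, 3, None], B=["x", None, "y", "z"])

# dt.isna(...) yields boolean flags; dt.sum() over them counts the NAs.
na_counts = DT[:, [dt.sum(dt.isna(f[name])) for name in DT.names]]
print(na_counts)  # a single row with the NA count of each column: 2 and 1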
h2oai / driverlessai-recipes / transformers / augmentation / uszipcode_features_light.py (view on GitHub)
    def transform(self, X: dt.Frame):
        try:
            X = dt.Frame(X)
            X.names = ['zip_key']
            X = X[:, str('zip_key')]
            zip_list = dt.unique(X[~dt.isna(dt.f.zip_key), 0]).to_list()[0]
            zip_features = [self.get_zipcode_property(self.parse_zipcode(x)) for x in zip_list]
            X_g = dt.Frame({"zip_key": zip_list, self.get_property_name(): zip_features})
            X_g.key = 'zip_key'
            X_result = X[:, :, dt.join(X_g)]
            return X_result[:, 1:]
        except:
            return np.zeros(X.shape[0])
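
Here ~dt.isna(...) filters out the missing zip codes before dt.unique() collects the distinct values to look up. A stripped-down sketch of that step (the values are made up):

import datatable as dt
from datatable import f

DT = dt.Frame(zip_key=["94105", None, "10001", "94105", None])

# Drop the missing rows, then take the unique remaining values as a list.
zip_list = dt.unique(DT[~dt.isna(f.zip_key), "zip_key"]).to_list()[0]
print(zip_list)  # e.g. ['10001', '94105'] (order not guaranteed)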