How to use the h2o.H2OFrame function in h2o

To help you get started, we’ve selected a few h2o.H2OFrame examples, based on popular ways the function is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github h2oai / h2o-3 / h2o-docs / src / api / data-science-example-1 / example-h2o-scikit-learn.py View on Github external
#!/usr/bin/env python

from h2o import H2OFrame, H2OModel
import h2o as h2o

# Example workflow: start H2O, load a CSV into a frame, split it into
# train / validation / test parts, fit a GBM and print its predictions.
# NOTE(review): H2OFrame.from_csv, H2OFrame.train_valid_test and H2OModel.GBM
# are not defined anywhere in this excerpt -- confirm they exist in the h2o
# version this example targets.
localH2O = h2o.init()
# The value returned by h2o.init() is passed to from_csv as its first argument.
air = H2OFrame.from_csv(localH2O, "allyears_tiny.csv", index_col = False)
air.head().print()

# Predictor columns (X_air) and the response column (y_air).
X_air = air['Origin', 'Dest', 'Distance', 'UniqueCarrier', 'Month', 'DayofMonth', 'DayOfWeek']
y_air = air['IsDepDelayed']

# Split predictors and response into six pieces; valid_size / test_size are
# both 0.1 (presumably fractions of the rows -- TODO confirm).
X_air_train, X_air_valid, X_air_test, y_air_train, y_air_valid, y_air_test = \
  H2OFrame.train_valid_test(X_air, y_air, valid_size = 0.1, test_size = 0.1)

# Configure the GBM, then fit it on the training split while also passing the
# validation split.
my_gbm = H2OModel.GBM(distribution = "multinomial", n_trees = 10,
                      interaction_depth = 3, shrinkage = 0.01,
                      importance = True)
air_gbm = my_gbm.fit(x=X_air_train, y=y_air_train, x_valid=X_air_valid, y_valid=y_air_valid)
air_gbm.print()

# Score the held-out test predictors and print the head of the result.
pred = air_gbm.predict(X_air_test)
pred.head().print()
github h2oai / h2o-3 / h2o-docs / src / booklets / v2_2015 / source / python / ipython_dataprep_input.py View on Github external
import h2o
# Walk-through of the ways an H2OFrame can be built from plain Python data.
# The bare `df` / `df2` expressions only produce output in an interactive
# session (the file name suggests IPython input).
h2o.init()  # Will set up H2O cluster using all available cores

# Second init call, this time with an explicit ip and port.
h2o.init(ip="123.45.67.89", port=54321)
# To create an H2OFrame object from a python tuple:
# zip(*...) transposes the three tuples, yielding (1, 'a', 0.1), (2, 'b', 0.2), ...
# NOTE(review): under Python 3 zip() returns an iterator rather than a list --
# confirm H2OFrame accepts it, or wrap the call in list().
df = h2o.H2OFrame(zip(*((1, 2, 3),
                   ('a', 'b', 'c'),
                   (0.1, 0.2, 0.3))))
df
# To create an H2OFrame object from a python list:
# Same transposition as above, starting from lists instead of tuples.
df = h2o.H2OFrame(zip(*[[1, 2, 3],
                   ['a', 'b', 'c'],
                   [0.1, 0.2, 0.3]]))
df
# To create an H2OFrame object from a python dict (or collections.OrderedDict):
# Each key maps to a list of three values.
df = h2o.H2OFrame({'A': [1, 2, 3],
                   'B': ['a', 'b', 'c'],
                   'C': [0.1, 0.2, 0.3]})
df

# To create an H2OFrame object from a dict with specified column types:
# column_types lists one type name per key: numeric, enum, string, time.
df2 = h2o.H2OFrame.from_python({'A': [1, 2, 3],
                                'B': ['a', 'a', 'b'],
                                'C': ['hello', 'all', 'world'],
                                'D': ['12MAR2015:11:00:00', '13MAR2015:12:00:00', '14MAR2015:13:00:00']},
                                column_types=['numeric', 'enum', 'string', 'time'])

df2

# Inspect the column types of the resulting frame.
df2.types

import numpy as np
github roshanmadhushanka / PythonML / Article 05.py View on Github external
import numpy as np
from h2o.estimators import H2OAutoEncoderEstimator
from h2o.estimators import H2ODeepLearningEstimator
from dataprocessor import ProcessData, Filter
from featureeng import Measures
from parser import DataFrameParser

# Script fragment: start H2O, load the processed train/test data, convert both
# to H2OFrames and configure an autoencoder for anomaly removal.

# Initialize server
h2o.init()

# AutoEncoder anomaly removal process
# Load train and test data with the listed feature-engineering flags enabled.
# Note the two calls differ: train uses probability_distribution, test uses
# probability_from_file.
p_train = ProcessData.trainData(moving_k_closest_average=True, standard_deviation=True, probability_distribution=True, bin_classification=True)
p_test = ProcessData.testData(moving_k_closest_average=True, standard_deviation=True, probability_from_file=True, bin_classification=True)

# Converting to h2o frame
# p_test / p_train expose `.columns` (presumably pandas DataFrames -- TODO
# confirm); their column names are copied onto the H2O frames explicitly.
h_test = h2o.H2OFrame(p_test)
h_test.set_names(list(p_test.columns))

h_train = h2o.H2OFrame(p_train)
h_train.set_names(list(p_train.columns))

# Define autoencoder
# Only the configuration is set here; no training call appears in this excerpt.
anomaly_model = H2OAutoEncoderEstimator(
        activation="Rectifier",
        hidden=[25, 12, 25],
        sparse=True,
        l1=1e-4,
        epochs=100
    )

# Select relevant features
# Starts from every column of p_train.
anomaly_train_columns = list(p_train.columns)
github roshanmadhushanka / PythonML / backup / autoencoder9[DL][IQR Rule].py View on Github external
# Script fragment: split the data, train a deep-learning model on the filtered
# training rows and report its performance on the test frame.
# NOTE: the print statements below use Python 2 syntax.

# Split hData into two frames using the ratio held in _validation_ratio_2.
hTrain, hValidate = hData.split_frame(ratios=[_validation_ratio_2])

# Wrap the test data in an H2OFrame and copy its column names across.
hTest = h2o.H2OFrame(pTest)
hTest.set_names(list(pTest.columns))

# Training model
print "\nTraining Model"
print "----------------------------------------------------------------------------------------------------------------"
# Predictors: every column of pData except the response, 'UnitNumber' and
# 'Time'. list.remove raises ValueError if a name is not present.
training_columns = list(pData.columns)
training_columns.remove(response_column)
training_columns.remove('UnitNumber')
training_columns.remove('Time')

# Create h2o frame using filtered pandas frame
filtered = h2o.H2OFrame(filtered_train)
filtered.set_names(list(filtered_train.columns))

# Train on the filtered frame, validating against hValidate. hTrain from the
# split above is not used in this excerpt.
model = H2ODeepLearningEstimator(hidden=[64, 64, 64], score_each_iteration=True, variable_importances=True, epochs=100, activation='Tanh')
model.train(x=training_columns, y=response_column, training_frame=filtered, validation_frame=hValidate)

print "\nModel Performance"
print "----------------------------------------------------------------------------------------------------------------"
# Evaluate model
print model.model_performance(test_data=hTest)
github roshanmadhushanka / PythonML / autoencoder6.py View on Github external
df1 = pTrain.iloc[i, :]
        filtered_train = filtered_train.append(df1, ignore_index=True)
        count += 1
    Progress.printProgress(iteration=(i+1), total=hTrain.nrow, decimals=1, prefix="Progress", suffix="Complete")

# Script fragment: report how many rows the filtering step removed, rebuild
# the engineered features and convert everything to H2OFrames.
# NOTE: the print statements below use Python 2 syntax.
print filtered_train
print "Original Size :", hTrain.nrow
print "Filtered Size :", len(filtered_train)
print "Removed Rows  :", (hTrain.nrow-len(filtered_train))

# Feature Engineering
# pTrain is rebuilt from the filtered rows; pTest is loaded separately.
pTrain = ProcessData.trainDataToFrame(filtered_train, moving_k_closest_average=True, standard_deviation=True, probability_distribution=True)
pTest = ProcessData.testData(moving_k_closest_average=True, standard_deviation=True, probability_from_file=True)

# Convert pandas to h2o frame - for model training
# Each frame's column names are copied across from its source frame.
# pValidate is defined outside this excerpt.
hValidate = h2o.H2OFrame(pValidate)
hValidate.set_names(list(pValidate.columns))

# hTrain is rebound here, after its row count was printed above.
hTrain = h2o.H2OFrame(pTrain)
hTrain.set_names(list(pTrain.columns))

hTest = h2o.H2OFrame(pTest)
hTest.set_names(list(pTest.columns))

# Training model
print "\nTraining Model"
print "----------------------------------------------------------------------------------------------------------------"
# Predictors: every column of pData except the response, 'UnitNumber' and
# 'Time'. list.remove raises ValueError if a name is not present.
training_columns = list(pData.columns)
training_columns.remove(response_column)
training_columns.remove('UnitNumber')
training_columns.remove('Time')
github roshanmadhushanka / PythonML / GridSearch[GradientBoosting].py View on Github external
# Script fragment: drop the detected anomaly rows, rebuild features, convert
# to H2OFrames, split, export the frames to CSV and pick the model columns.
# NOTE: the print statements below use Python 2 syntax.

# De-duplicate the anomaly indices (going through a set does not preserve order).
anomaly_series = list(set(anomaly_series))
print anomaly_series
print len(anomaly_series)

# Remove anomalies
# anomaly_series is used to index pData.index, and the selected labels are dropped.
df = pData.drop(pData.index[anomaly_series])

# Feature engineering
data_frame = ProcessData.trainDataToFrame(df, moving_k_closest_average=True, standard_deviation=True)
testing_frame = ProcessData.testData(moving_k_closest_average=True, standard_deviation=True)

# Create h2o frame
# Column names are copied across from the source frames.
hData = h2o.H2OFrame(data_frame)
hData.set_names(list(data_frame.columns))

hTesting = h2o.H2OFrame(testing_frame)
hTesting.set_names(list(testing_frame.columns))

# Split data into training and validation
hTrain, hValidate = hData.split_frame(ratios=[0.8])

# Write the three frames to CSV files (force=True presumably overwrites
# existing files -- TODO confirm).
h2o.export_file(hTrain, "hTrainMy.csv", force=True)
h2o.export_file(hValidate, "hValidateMy.csv", force=True)
h2o.export_file(hTesting, "hTestingMy.csv", force=True)

# Predictors: every column of pData except 'UnitNumber', 'Time' and 'RUL'.
# NOTE(review): the list is built from pData.columns, not data_frame.columns --
# confirm that any columns added by trainDataToFrame are meant to be left out.
training_columns = list(pData.columns)
training_columns.remove('UnitNumber')
training_columns.remove('Time')
training_columns.remove('RUL')

# 'RUL' is the response column.
response_column = 'RUL'
github h2oai / driverlessai-recipes / models / algorithms / h2o-3-models.py View on Github external
column_types=['numeric'])
            train_frame = train_frame.cbind(train_w)
        valid_frame = None
        valid_X = None
        valid_y = None
        model = None
        if eval_set is not None:
            valid_X = h2o.H2OFrame(eval_set[0][0].to_pandas(), column_types=self.col_types)
            valid_y = h2o.H2OFrame(eval_set[0][1],
                                   column_names=[self.target],
                                   column_types=['categorical' if self.num_classes >= 2 else 'numeric'])
            valid_frame = valid_X.cbind(valid_y)
            if sample_weight is not None:
                if sample_weight_eval_set is None:
                    sample_weight_eval_set = [np.ones(len(eval_set[0][1]))]
                valid_w = h2o.H2OFrame(sample_weight_eval_set[0],
                                       column_names=[self.weight],
                                       column_types=['numeric'])
                valid_frame = valid_frame.cbind(valid_w)

        try:
            train_kwargs = dict()
            params = copy.deepcopy(self.params)
            if not isinstance(self, H2OAutoMLModel):
                # AutoML needs max_runtime_secs in initializer, all others in train() method
                max_runtime_secs = params.pop('max_runtime_secs')
                train_kwargs = dict(max_runtime_secs=max_runtime_secs)
            if valid_frame is not None:
                train_kwargs['validation_frame'] = valid_frame
            if sample_weight is not None:
                train_kwargs['weights_column'] = self.weight
            model = self.make_instance(**params)
github antklen / sdsj2018_h2o_baseline / predict.py View on Github external
# df = pd.read_csv(args.test_csv)
    df = pd.read_csv(args.test_csv, dtype=model_config['dtypes'],
                     parse_dates=model_config['datetime_cols'])
    print('Dataset read, shape {}'.format(df.shape))
    print('time elapsed: {}'.format(time.time()-start_time))

    # preprocessing
    df, df_pred = preprocess(df, model_config, type='test')
    print('time elapsed: {}'.format(time.time()-start_time))

    # final data shape
    print('final df shape {}'.format(df.shape))

    # convert data to h2o format
    print('convert data to h2o format..')
    test = h2o.H2OFrame(df)
    print('time elapsed: {}'.format(time.time()-start_time))

    # make prediction
    aml = h2o.load_model(model_config['model_path'])
    if model_config['mode'] == 'regression':
        df_pred['prediction'] = aml.predict(test).as_data_frame().squeeze()
    if model_config['mode'] == 'classification':
        df_pred['prediction'] = aml.predict(test)['p1'].as_data_frame().squeeze()

    df_pred[['line_id', 'prediction']].to_csv(args.prediction_csv, index=False)

    print('Prediction time: {}'.format(time.time() - start_time))
github h2oai / h2o-3 / h2o-py / h2o / model / model_base.py View on Github external
"""
        Create partial dependence plot which gives a graphical depiction of the marginal effect of a variable on the
        response. The effect of a variable is measured in change in the mean response.

        :param H2OFrame data: An H2OFrame object used for scoring and constructing the plot.
        :param cols: Feature(s) for which partial dependence will be calculated.
        :param destination_key: An key reference to the created partial dependence tables in H2O.
        :param nbins: Number of bins used. For categorical columns make sure the number of bins exceed the level count.
        :param plot: A boolean specifying whether to plot partial dependence table.
        :param plot_stddev: A boolean specifying whether to add std err to partial dependence plot.
        :param figsize: Dimension/size of the returning plots, adjust to fit your output cells.
        :param server: ?
        :returns: Plot and list of calculated mean response tables for each feature requested.
        """

        if not isinstance(data, h2o.H2OFrame): raise ValueError("data must be an instance of H2OFrame")
        assert_is_type(cols, [str])
        assert_is_type(destination_key, None, str)
        assert_is_type(nbins, int)
        assert_is_type(plot, bool)
        assert_is_type(figsize, (int, int))

        # Check cols specified exist in frame data
        for xi in cols:
            if xi not in data.names:
                raise H2OValueError("Column %s does not exist in the training frame" % xi)

        kwargs = {}
        kwargs["cols"] = cols
        kwargs["model_id"] = self.model_id
        kwargs["frame_id"] = data.frame_id
        kwargs["nbins"] = nbins
github h2oai / driverlessai-recipes / models / algorithms / h2o-3-models.py View on Github external
if isinstance(self, H2ONBModel):
            # NB can only handle weights of 0 / 1
            if sample_weight is not None:
                sample_weight = (sample_weight != 0).astype(int)
            if sample_weight_eval_set is not None:
                sample_weight_eval_set = [(sample_weight_eval_set[0] != 0).astype(int)]

        train_X = h2o.H2OFrame(X.to_pandas())
        self.col_types = train_X.types
        train_y = h2o.H2OFrame(y,
                               column_names=[self.target],
                               column_types=['categorical' if self.num_classes >= 2 else 'numeric'])
        train_frame = train_X.cbind(train_y)
        if sample_weight is not None:
            train_w = h2o.H2OFrame(sample_weight,
                                   column_names=[self.weight],
                                   column_types=['numeric'])
            train_frame = train_frame.cbind(train_w)
        valid_frame = None
        valid_X = None
        valid_y = None
        model = None
        if eval_set is not None:
            valid_X = h2o.H2OFrame(eval_set[0][0].to_pandas(), column_types=self.col_types)
            valid_y = h2o.H2OFrame(eval_set[0][1],
                                   column_names=[self.target],
                                   column_types=['categorical' if self.num_classes >= 2 else 'numeric'])
            valid_frame = valid_X.cbind(valid_y)
            if sample_weight is not None:
                if sample_weight_eval_set is None:
                    sample_weight_eval_set = [np.ones(len(eval_set[0][1]))]