How to use the h2o.make_syn_dir function in h2o

To help you get started, we’ve selected a few examples of h2o.make_syn_dir, based on popular ways the function is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github h2oai / h2o-2 / py / testdir_single_jvm / test_rf1_fvec.py View on Github external
def test_GenParity1(self):
        # Excerpt of a test: generates synthetic parity datasets by running the
        # parity.pl script, writing into the directory from h2o.make_syn_dir().
        SYNDATASETS_DIR = h2o.make_syn_dir()
        parityPl = h2o.find_file('syn_scripts/parity.pl')

# A two-row dataset triggers the exception below, so it is avoided for now:
# java.lang.ArrayIndexOutOfBoundsException: 1
# at hex.rf.Data.sample_fair(Data.java:149)

        # always match the run below!
        print "\nAssuming two row dataset is illegal. avoiding"

        # x takes the values 10, 20, ..., 90 (the stop value 100 is excluded).
        for x in xrange (10,100,10):
            # The command is built as one string and then split on whitespace into an
            # argument list, so a script path or directory containing spaces would be
            # split into several arguments.
            shCmdString = "perl " + parityPl + " 128 4 "+ str(x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split())
            # algorithm for creating the path and filename is hardwired in parity.pl.
            # This name is rebuilt here from the same "128 4 <x> quad" arguments; it is
            # overwritten on every iteration, so only the last one (x=90) survives the loop.
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        # NOTE(review): presumably the tree count for a random-forest run later in
        # the test; the rest of the method is not visible in this excerpt.
        trees = 6
github h2oai / h2o-2 / py / testdir_multi_jvm / test_rf_parity_500trees.py View on Github external
def test_rf_1ktrees_fvec(self):
        # Excerpt of a test: generates one parity dataset with parity.pl, then runs
        # several trials against that same generated file.
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [500]:
            # Command string is split on whitespace into an argument list.
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad " + SYNDATASETS_DIR
            # NOTE(review): the trailing 4 is presumably a timeout in seconds -- confirm
            # against h2o.spawn_cmd_and_wait.
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        # always match the gen above!
        # range(1,5) yields trials 1 through 4.
        for trial in range (1,5):
            # Emit one dot per trial and flush so progress shows up immediately.
            sys.stdout.write('.')
            sys.stdout.flush()

            # The 500 is hard-coded and must stay in sync with the list in the
            # generator loop above.
            csvFilename = "parity_128_4_" + str(500) + "_quad.data"
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
github h2oai / h2o-2 / py / testdir_single_jvm / test_rf_predict_fvec.py View on Github external
def test_rf_predict_fvec(self):
        # Excerpt of a test: parses iris2.csv, builds a random-forest model on it,
        # generates predictions for the same data key, and downloads them as CSV
        # into the synthetic-data directory.
        h2b.browseTheCloud()
        SYNDATASETS_DIR = h2o.make_syn_dir()

        trees = 6
        timeoutSecs = 20
        hex_key = 'iris2.csv.hex'
        # Import and parse the file under the key hex_key.
        parseResult = h2i.import_parse(bucket='smalldata', path='iris/iris2.csv', schema='put', hex_key=hex_key)
        # Build the model under the key "iris_rf_model"; the same key is passed to
        # generate_predictions below.
        h2o_cmd.runRF(parseResult=parseResult, ntrees=trees, destination_key="iris_rf_model", timeoutSecs=timeoutSecs)

        print "Use H2O GeneratePredictionsPage with a H2O generated model and the same data key. Inspect/Summary result"

        # Time the prediction call; predictions are requested under the key 'predict.hex'.
        start = time.time()
        predict = h2o.nodes[0].generate_predictions(model_key="iris_rf_model", data_key=hex_key, 
            prediction='predict.hex')
        print "generate_predictions end on ", hex_key, " took", time.time() - start, 'seconds'
        print "predict:", h2o.dump_json(predict)
        # Download the 'predict.hex' key to a CSV file in the synthetic-data directory.
        csvPredictPathname = SYNDATASETS_DIR + "/" + "iris2.predict.csv"
        h2o.nodes[0].csv_download(src_key='predict.hex', csvPathname=csvPredictPathname)
github h2oai / h2o-2 / py / testdir_single_jvm / test_GLM2_syn_corr.py View on Github external
def test_GLM2_mnist(self):
        if not SCIPY_INSTALLED:
            pass

        else:    
            SYNDATASETS_DIR = h2o.make_syn_dir()

            csvFilelist = [
                (10000, 500, 'cA', 60),
            ]

            trial = 0
            for (rowCount, colCount, hex_key, timeoutSecs) in csvFilelist:
                trialStart = time.time()

                # PARSE test****************************************
                csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
                csvPathname = SYNDATASETS_DIR + "/" + csvFilename
                write_syn_dataset(csvPathname, rowCount, colCount)

                start = time.time()
                parseResult = h2i.import_parse(path=csvPathname, schema='put',
github h2oai / h2o-2 / py / testdir_single_jvm / test_exec2_row_range.py View on Github external
def test_exec2_row_range(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (1000000, 5, 'cA', 200),
            ]

        # h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            start = time.time()
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
github h2oai / h2o-2 / py / testdir_single_jvm / test_speedrf_predict3.py View on Github external
def test_rf_predict3_fvec(self):
        # Excerpt of a test: sets up key names, file names and parameters for a
        # predict run on iris2.csv. The code that uses them is not visible here.
        SYNDATASETS_DIR = h2o.make_syn_dir()

        timeoutSecs = 600
        predictHexKey = 'predict_0.hex'
        predictCsv = 'predict_0.csv'
        actualCsv = 'actual_0.csv'

        # NOTE(review): this condition is always true. It looks like a manual switch
        # for selecting a dataset configuration -- confirm against the full test.
        if 1==1:
            y = 4 # last col
            response = 'response'
            # NOTE(review): presumably the number of header rows to skip when reading
            # the source and prediction CSVs -- confirm at the point of use.
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 40
            bucket = 'smalldata'
            csvPathname = 'iris/iris2.csv'
            hexKey = 'iris2.csv.hex'
            # translate = {'setosa': 0.0, 'versicolor': 1.0, 'virginica': 2.0}
github h2oai / h2o-2 / py / testdir_single_jvm / test_PCA_many_cols.py View on Github external
def test_PCA_many_cols(self):
        # Excerpt of a test: iterates over several dataset sizes and computes the
        # path of a synthetic CSV for each one in the synthetic-data directory.
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # Each entry is (rowCount, colCount, hex_key, timeoutSecs), as unpacked by
        # the loop below. The larger column counts are commented out.
        tryList = [
            (10000, 10, 'cA', 300), 
            (10000, 50, 'cB', 300), 
            (10000, 100, 'cC', 300), 
            # (10000, 500, 'cH', 300), 
            # (10000, 1000, 'cI', 300), 
            ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            print (rowCount, colCount, hex_key, timeoutSecs)
            # A fresh random seed per file; sys.maxint exists only in Python 2.
            SEEDPERFILE = random.randint(0, sys.maxint)
            # The seed-based filename is disabled; the active name uses the literal
            # "binary", so it depends only on the row and column counts.
            # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
github h2oai / h2o-2 / py / testdir_single_jvm / test_summary2_uniform_int_w_NA.py View on Github external
def test_summary2_uniform_int_w_NA(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        M = 100
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (ROWS, 1, 'B.hex', 1, 1000*M,            ('C1',  1.0*M, 250.0*M, 500.0*M, 750.0*M, 1000.0*M)),
            (ROWS, 1, 'B.hex', 1, 1000,            ('C1',  1.0, 250.0, 500.0, 750.0, 1000.0)),
            (ROWS, 1, 'x.hex', 1, 20000,           ('C1',  1.0, 5000.0, 10000.0, 15000.0, 20000.0)),
            (ROWS, 1, 'x.hex', -5000, 0,           ('C1', -5000.00, -3750.0, -2500.0, -1250.0, 0)),
            (ROWS, 1, 'x.hex', -100000, 100000,    ('C1',  -100000.0, -50000.0, 0, 50000.0, 100000.0)),

            # (ROWS, 1, 'A.hex', 1, 101,             ('C1',   1.0, 26.00, 51.00, 76.00, 101.0)),
            # (ROWS, 1, 'A.hex', -99, 99,            ('C1',  -99, -49.0, 0, 49.00, 99)),

            (ROWS, 1, 'B.hex', 1, 10000,           ('C1',   1.0, 2501.0, 5001.0, 7501.0, 10000.0)),
            (ROWS, 1, 'B.hex', -100, 100,          ('C1',  -100.0, -50.0, 0.0, 50.0, 100.0)),

            (ROWS, 1, 'C.hex', 1, 100000,          ('C1',   1.0, 25001.0, 50001.0, 75001.0, 100000.0)),
github h2oai / h2o-2 / py / testdir_single_jvm / test_summary2_unifiles2.py View on Github external
def test_summary2_unifiles2(self):
        # Excerpt of a test: declares the files to summarize with their expected
        # values, then initializes counters and node handles. The loop that
        # consumes tryList is not visible here.
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # new with 1000 bins. copy expected from R
        # NOTE(review): each entry looks like (csv filename, hex key, boolean flag,
        # list of expected per-column tuples, bucket, optional extra string) --
        # confirm field meanings against the unpacking loop. None in an expected
        # tuple presumably means "do not check that value".
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            ('breadth.csv', 'b.hex', False, [ ('C1', None, None, None, None, None)], 'smalldata', 'quantiles'),
            # ('wonkysummary.csv', 'b.hex', False, [ ('X1', 7, 22, 876713, 100008, 1000046)], 'smalldata', None),
            ('wonkysummary.csv', 'b.hex', True, [ ('X1', 7.00, None, None, None, 1000046.0)], 'smalldata', None),
            ('covtype.data', 'c.hex', False, [ ('C1', None, None, None, None, None)], 'home-0xdiag-datasets', 'standard'),

        ]

        timeoutSecs = 10
        trial = 1
        # Handle on the first node, and the total number of nodes.
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
github h2oai / h2o-2 / py / testdir_multi_jvm / test_many_fp_formats_libsvm_fvec.py View on Github external
def test_many_fp_formats_libsvm_fvec(self):
        # Excerpt of a test: for each dataset configuration, picks one random
        # floating-point format case and builds a seed-tagged synthetic CSV filename.
        # h2b.browseTheCloud()
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # Each entry is (rowCount, colCount, hex_key, timeoutSecs, distribution),
        # as unpacked by the loop below.
        tryList = [
            (10, 10, 'cA', 30, 'sparse50'),
            (100, 10, 'cB', 30, 'sparse'),
            (100000, 100, 'cC', 30, 'sparse'),
            (1000, 10, 'cD', 30, 'sparse50'),
            (100, 100, 'cE', 30,'sparse'),
            (100, 100, 'cF', 30,'sparse50'),
            ]

        # h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs, distribution) in tryList:
            # NOTE(review): presumably fp_format() with no arguments returns the
            # number of available format cases -- confirm in h2o_util.
            NUM_CASES = h2o_util.fp_format()
            # One-element list: a single case index in [0, NUM_CASES-1] is tried
            # per configuration.
            for sel in [random.randint(0,NUM_CASES-1)]: # len(caseList)

                # A fresh random seed per file; sys.maxint exists only in Python 2.
                SEEDPERFILE = random.randint(0, sys.maxint)
                # The filename records the seed, the selected case, and the dimensions.
                csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)