How to use the modin.pandas module in modin

To help you get started, we’ve selected a few modin examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github modin-project / modin / ci / benchmarks / df_op_benchmark.py View on Github external
import os
import modin.pandas as pd

from utils import time_logger
import numpy as np

# Command-line interface: --path names the CSV file to benchmark and
# --logfile names the file that receives the log output.
# NOTE(review): `argparse`, `logging` and `ray` are used below but are not
# imported in the visible part of this file — confirm they are imported
# elsewhere.
parser = argparse.ArgumentParser(description="arithmetic benchmark")
parser.add_argument("--path", dest="path", help="path to the csv data file")
parser.add_argument("--logfile", dest="logfile", help="path to the log file")
args = parser.parse_args()
file = args.path
# Size of the input file in bytes.
file_size = os.path.getsize(file)

# Route log records of level INFO and above to the --logfile destination.
logging.basicConfig(filename=args.logfile, level=logging.INFO)

# `pd` is modin.pandas, so this builds a modin DataFrame from the CSV.
df = pd.read_csv(file)
# Flatten the frame's private partition grid into a plain list and wait on
# every entry.
# NOTE(review): presumably this forces the read to finish before any timed
# operation runs — confirm against the ray.wait semantics in use.
blocks = df._block_partitions.flatten().tolist()
ray.wait(blocks, len(blocks))

num_rows, num_cols = df.shape
# Replacement data: one random integer in [0, 100) per column (a row's worth)
# and one per row (a column's worth).
new_row = np.random.randint(0, 100, size=num_cols)
new_col = np.random.randint(0, 100, size=num_rows)


def rand_row_loc():
    """Pick a random row position.

    Returns:
        An integer drawn by ``np.random.randint`` from ``[0, num_rows)``,
        where ``num_rows`` is the module-level row count.
    """
    row_position = np.random.randint(low=0, high=num_rows)
    return row_position


def rand_col_loc():
    """Pick a random column position.

    Returns:
        An integer drawn by ``np.random.randint`` from ``[0, num_cols)``,
        where ``num_cols`` is the module-level column count.
    """
    col_position = np.random.randint(low=0, high=num_cols)
    return col_position
github modin-project / modin / ci / benchmarks / generate_data.py View on Github external
from __future__ import print_function

import modin.pandas as pd
import numpy as np
import os

# Each entry pairs an output directory with the row counts to generate there.
# Every generated frame has 1000 columns of random integers in [0, 100).
for path_to_data, num_rows in (
    ("benchmarks/data/", [100, 10000, 100000, 150000, 200000, 350000, 500000]),
    # Files for multi df tests
    ("benchmarks/data/multi/", [100, 1000, 100000, 1000000]),
):
    num_cols = [1000]

    # Create the target directory on first use.
    if not os.path.exists(path_to_data):
        os.makedirs(path_to_data)

    for r in num_rows:
        for c in num_cols:
            # File name records the frame's shape: test-data-<rows>-<cols>.csv
            csv_path = path_to_data + "test-data-{}-{}.csv".format(r, c)
            df = pd.DataFrame(np.random.randint(0, 100, size=(r, c)))
            df.to_csv(csv_path)
github WinVector / data_algebra / build / lib / data_algebra / modin_model.py View on Github external
def __init__(self, modin_engine=None):
        """Set up the model on top of the process-wide ``modin.pandas`` module.

        The module-level globals ``MODIN_ENGINE`` and ``MODIN_PANDAS`` record
        which engine was chosen and the imported module. On the first
        construction (``MODIN_PANDAS`` is still ``None``) an engine must be
        supplied; it is written to the ``MODIN_ENGINE`` environment variable
        before ``modin.pandas`` is imported. Later constructions reuse the
        stored module.

        :param modin_engine: engine name to use; may be omitted once an
            engine has already been recorded.
        :raises ValueError: if no engine is given on the first construction,
            or if a later construction names a different engine than the one
            already recorded.
        """
        # can't change engine, so track it as a global
        # https://github.com/modin-project/modin
        global MODIN_ENGINE
        global MODIN_PANDAS
        if MODIN_PANDAS is not None:
            # modin is already loaded: only an identical (or omitted) engine
            # request is acceptable.
            engine_conflict = (modin_engine is not None) and (modin_engine != MODIN_ENGINE)
            if engine_conflict:
                raise ValueError("MODIN_ENGINE already set to "
                                 + MODIN_ENGINE
                                 + ", and called with modin_engine=="
                                 + modin_engine)
        elif modin_engine is None:
            raise ValueError("modin_engine not set")
        else:
            MODIN_ENGINE = modin_engine
            # The environment variable is set before the import below.
            # https://github.com/modin-project/modin
            os.environ["MODIN_ENGINE"] = MODIN_ENGINE
            import modin.pandas as loaded_modin_pandas
            MODIN_PANDAS = loaded_modin_pandas
        data_algebra.eval_model.EvalModel.__init__(self)
        self.impl = PandasModelBase(
            pd=MODIN_PANDAS,
            presentation_model_name='modin',
        )
github modin-project / modin / ci / benchmarks / df_op_benchmark.py View on Github external
# Overwrite one randomly chosen column with `new_col`.
# NOTE(review): this statement looks like the body of a timing context whose
# `with` header is not visible in this excerpt — confirm against the full file.
df.iloc[:, rand_col_loc()] = new_col

# Each section below runs a single DataFrame operation inside `time_logger`,
# labelled with the input file path and its size in bytes.
# NOTE(review): `time_logger` comes from `utils`; presumably it logs how long
# the enclosed statement took under the given label — confirm.
with time_logger("write a row: {}; Size: {} bytes".format(file, file_size)):
    df.iloc[rand_row_loc(), :] = new_row

# element r/w

# The value read here is discarded; only the access itself is exercised.
with time_logger("read an element: {}; Size: {} bytes".format(file, file_size)):
    df.iloc[rand_row_loc(), rand_col_loc()]

# Note that the random position and the random value are drawn inside the
# timed region.
with time_logger("write an element: {}; Size: {} bytes".format(file, file_size)):
    df.iloc[rand_row_loc(), rand_col_loc()] = np.random.randint(0, 100)

# appending
# The result of `append` is not assigned, so `df` itself is not rebound here.
# NOTE(review): pandas removed `DataFrame.append` in 2.0; check whether the
# modin version targeted by this benchmark still provides it.
with time_logger("append a row: {}; Size: {} bytes".format(file, file_size)):
    df.append(pd.Series(new_row), ignore_index=True)

# Adds (or overwrites) a column labelled "new" holding `new_col`.
with time_logger("append a column: {}; Size: {} bytes".format(file, file_size)):
    df["new"] = new_col
github laactech / foxcross / foxcross / pandas_serving.py View on Github external
def _format_input(
        self, data: Dict
    ) -> Union[pandas.DataFrame, Dict[str, pandas.DataFrame]]:
        """Convert a decoded JSON payload into pandas DataFrame(s).

        If ``data`` carries ``"multi_dataframe": True``, every remaining
        key/value pair is turned into its own DataFrame and a dict of frames
        is returned. Otherwise the whole payload becomes a single DataFrame.

        Note that the ``"multi_dataframe"`` key is popped from ``data``, so
        the caller's dict is modified in place.

        :param data: decoded JSON body.
        :returns: one DataFrame, or a dict mapping each key to a DataFrame.
        :raises HTTPException: with status 400 when the payload cannot be
            turned into a DataFrame.
        """
        try:
            if data.pop("multi_dataframe", None) is True:
                logger.debug("Formatting pandas multi_dataframe input")
                return {key: pandas.DataFrame(value) for key, value in data.items()}
            return pandas.DataFrame(data)
        # ValueError is what pandas raises for the most common malformed
        # payloads (columns of unequal length, all-scalar values without an
        # index). Treat it as a client error like the other two.
        except (TypeError, KeyError, ValueError) as exc:
            err_msg = f"Error reading in json: {exc}"
            logger.warning(err_msg)
            raise HTTPException(status_code=400, detail=err_msg) from exc
github h2oai / db-benchmark / modin / groupby-modin.py View on Github external
# For every (id2, id4) group, take the v1/v2 entry of the group's correlation
# matrix and square it, giving a single 'r2' value per group.
ans = (
    x[['id2','id4','v1','v2']]
    .groupby(['id2','id4'], observed=True)
    .apply(lambda grp: pd.Series({'r2': grp.corr()['v1']['v2']**2}))
)
# Turn the index back into ordinary columns, modifying `ans` in place.
ans.reset_index(inplace=True)