How to use the vaex.file module in vaex

To help you get started, we’ve selected a few vaex examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github vaexio / vaex / packages / vaex-core / vaex / export.py View on Github external
with file(args.input) as lines:
                line = next(lines).strip()
                # print line
                names = line.strip().split(",")
                line = next(lines).strip()
                values = line.strip().split(",")
                numerics = []
                for value in values:
                    try:
                        float(value)
                        numerics.append(True)
                    except:
                        numerics.append(False)
                names_numeric = [name for name, numeric in zip(names, numerics) if numeric]
                print(names_numeric)
                output = vaex.file.other.Hdf5MemoryMapped.create(args.output, row_count, names_numeric)
                Ncols = len(names)
                cols = [output.columns[name] if numeric else None for name, numeric in zip(names, numerics)]

                def copy(line, row_index):
                    # Parse one CSV line and write its numeric fields into the
                    # output columns at row_index; non-numeric columns are skipped.
                    fields = line.strip().split(",")
                    for i in range(Ncols):
                        if not numerics[i]:
                            continue
                        cols[i][row_index] = float(fields[i])
                row = 0
                copy(line, row)
                row += 1
                progressbar = vaex.utils.progressbar(title="exporting") if args.progress else None
                for line in lines:
                    # print line
                    copy(line, row)
github vaexio / vaex / packages / vaex-core / vaex / export.py View on Github external
column_names.append(random_index_name)
        data_types.append(np.int64().dtype)
        data_shapes.append((N,))
        ucds.append(None)
        units.append(None)
    else:
        random_index_name = None

    # TODO: all expressions can have missing values.. how to support that?
    null_values = {key: dataset.columns[key].fill_value for key in dataset.get_column_names() if dataset.is_masked(key) and dataset.dtype(key).kind != "f"}
    vaex.file.colfits.empty(path, N, column_names, data_types, data_shapes, ucds, units, null_values=null_values)
    if shuffle:
        del column_names[-1]
        del data_types[-1]
        del data_shapes[-1]
    dataset_output = vaex.file.other.FitsBinTable(path, write=True)
    _export(dataset_input=dataset, dataset_output=dataset_output, path=path, random_index_column=random_index_name,
            column_names=column_names, selection=selection, shuffle=shuffle,
            progress=progress, sort=sort, ascending=ascending)
    dataset_output.close_files()
github vaexio / vaex / packages / vaex-core / vaex / __init__.py View on Github external
# sort to get predictable behaviour (useful for testing)
                    filenames.extend(list(sorted(glob.glob(path))))
            ds = None
            if len(filenames) == 0:
                raise IOError('Could not open file: {}, it does not exist'.format(path))
            filename_hdf5 = _convert_name(filenames, shuffle=shuffle)
            filename_hdf5_noshuffle = _convert_name(filenames, shuffle=False)
            if len(filenames) == 1:
                path = filenames[0]
                naked_path = path
                if '?' in naked_path:
                    naked_path = naked_path[:naked_path.index('?')]
                ext = os.path.splitext(naked_path)[1]
                if os.path.exists(filename_hdf5) and convert:  # also check mtime?
                    if convert:
                        ds = vaex.file.open(filename_hdf5)
                    else:
                        ds = vaex.file.open(filename_hdf5, *args, **kwargs)
                else:
                    if ext == '.csv' or naked_path.endswith(".csv.bz2"):  # special support for csv.. should probably approach it a different way
                        ds = from_csv(path, copy_index=copy_index, **kwargs)
                    else:
                        ds = vaex.file.open(path, *args, **kwargs)
                    if convert and ds:
                        ds.export_hdf5(filename_hdf5, shuffle=shuffle)
                        ds = vaex.file.open(filename_hdf5) # argument were meant for pandas?
                if ds is None:
                    if os.path.exists(path):
                        raise IOError('Could not open file: {}, did you install vaex-hdf5? Is the format supported?'.format(path))
                    if os.path.exists(path):
                        raise IOError('Could not open file: {}, it does not exist?'.format(path))
            elif len(filenames) > 1:
github vaexio / vaex / packages / vaex-ui / vaex / ui / main.py View on Github external
def samp_table_load_votable(self, url=None, table_id=None, name=None):
        filenames = []
        if table_id is not None:
            filename = table_id
            if filename.startswith("file:/"):
                filename = filename[5:]

            basename, ext = os.path.splitext(filename)
            if os.path.exists(filename):
                filenames.append(filename)
            for other_ext in [".hdf5", ".fits"]:
                filename = basename + other_ext
                print(filename)
                if os.path.exists(filename) and filename not in filenames:
                    filenames.append(filename)
            filenames = list(filter(vaex.file.can_open, filenames))
        options = []
        for filename in filenames:
            options.append(filename + " | read directly from file (faster)")
        options.append(url + " | load as VOTable (slower)")
        # options.append("link to existing opened dataset")
        for dataset in self.dataset_selector.datasets:
            options.append("link to existing open dataset: " + dataset.name)
        index = choose(self, "SAMP: load table", "Choose how to load table", options)
        if index is not None:
            if index < len(filenames):
                print("open file", filenames[index])
                self.load_file(filenames[index], table_id)
            elif index == len(filenames):
                self.load_votable(url, table_id)
                print("load votable", url)
            else:
github vaexio / vaex / packages / vaex-core / vaex / export.py View on Github external
parser_file = subparsers.add_parser('csv', help='use a csv file as source (e.g. .hdf5, .fits, .vot (VO table), .asc (ascii)')
    parser_file.add_argument("input", help="input source or file, when prefixed with @ it is assumed to be a text file with a file list (one file per line)")
    parser_file.add_argument("output", help="output file (ends in .hdf5)")
    parser_file.add_argument("columns", help="list of columns to export (or all when empty)", nargs="*")

    args = parser.parse_args(argv[1:])

    verbosity = ["ERROR", "WARNING", "INFO", "DEBUG"]
    logging.getLogger("vaex").setLevel(verbosity[min(3, args.verbose)])
    dataset = None
    if args.task == "soneira":
        if vaex.utils.check_memory_usage(4 * 8 * 2**args.max_level, vaex.utils.confirm_on_console):
            if not args.quiet:
                print("generating soneira peebles dataset...")
            dataset = vaex.file.other.SoneiraPeebles(args.dimension, 2, args.max_level, args.lambdas)
        else:
            return 1
    if args.task == "tap":
        dataset = vaex.dataset.DatasetTap(args.tap_url, args.table_name)
        if not args.quiet:
            print("exporting from {tap_url} table name {table_name} to {output}".format(tap_url=args.tap_url, table_name=args.table_name, output=args.output))
    if args.task == "csv":
        # dataset = vaex.dataset.DatasetTap(args.tap_url, args.table_name)
        if not args.quiet:
            print("exporting from {input} to {output}".format(input=args.input, output=args.output))
    if args.task == "file":
        if args.input[0] == "@":
            inputs = open(args.input[1:]).readlines()
            dataset = vaex.open_many(inputs)
        else:
            dataset = vaex.open(args.input)
github vaexio / vaex / packages / vaex-core / vaex / file / column.py View on Github external
def __init__(self, file, byte_offset, length, dtype, write=False, path=None):
        self.path = path or file.name
        self.file = file
        self.tls = threading.local()
        # keep a record of all duplicate file handles to we can close them
        self.file_handles = []
        self.tls.file = vaex.file.dup(file)
        self.file_handles.append(file)
        self.native = False
        # if hasattr(self.file, 'fileno') and osname
        #     fcntl.fcntl(self.file.fileno(), F_NOCACHE, 1)
            # self.native = True
        #libc.fcntl(self.file.fileno(), fcntl.F_NOCACHE, 1)
        #libc.fcntl(c_int(self.file.fileno()), c_int(fcntl.F_NOCACHE), c_int(1))
        self.byte_offset = byte_offset
        self.length = length
        self.dtype = np.dtype(dtype)
        self.shape = (length,)
        self.write = write
github vaexio / vaex / packages / vaex-core / vaex / file / column.py View on Github external
offset_optimal = offset & ~page_mask
                padding = offset - offset_optimal
                bytes_read = libc.pread(ctypes.c_int32(self.file.fileno()), ar_ptr, ctypes.c_uint64(N * itemsize + padding), ctypes.c_uint64(offset_optimal))
                if (bytes_read-padding) != N * itemsize:
                    raise IOError('read error: expected %d bytes, read %d, padding: %d' % (N * itemsize, bytes_read, padding))
                ar = np.frombuffer(ar_bytes, self.dtype, offset=padding, count=N)
            else:
                byte_length = items*itemsize
                offset = self.byte_offset + start * itemsize
                # Quick and safe way to get the thread local file handle:
                file = getattr(self.tls, 'file', None)
                if file is None:
                    with cache_lock:
                        file = getattr(self.tls, 'file', None)
                        if file is None:
                            file = self.tls.file = vaex.file.dup(self.file)
                            self.file_handles.append(file)
                # this is the fast path, that avoids a memory copy but gets a view on the underlying data
                # cache.py:CachedFile supports this
                if hasattr(file, '_as_numpy'):
                    ar = file._as_numpy(offset, byte_length, self.dtype)
                else:
                    # Traditional file objects go this slower route
                    # and they need per thread file object since the location (seek)
                    # is in the state of the file object

                    file.seek(offset)
                    data = file.read(byte_length)
                    ar = np.frombuffer(data, self.dtype, count=N)
            if USE_CACHE:
                with cache_lock:
                    cache[key] = ar
github vaexio / vaex / packages / vaex-core / vaex / file / cache.py View on Github external
def dup(self):
        if callable(self.file):
            file = self.file
        else:
            file = vaex.file.dup(self.file)
        return CachedFile(file, self.path, self.cache_dir, self.block_size, data_file=self.data_file, mask_file=self.mask_file)
github vaexio / vaex / packages / vaex-core / vaex / file / s3.py View on Github external
if '?' in naked_path:
        naked_path = naked_path[:naked_path.index('?')]
    # only use the first item
    options = {key: values[0] for key, values in parse_qs(o.query).items()}
    options.update(kwargs)
    use_cache = options.get('cache', 'true') in ['true', 'True', '1']
    if 'cache' in options:
        del options['cache']
    anon = options.get('anon', 'false') in ['true', 'True', '1']
    if 'anon' in options:
        del options['anon']
    s3 = s3fs.S3FileSystem(anon=anon, default_block_size=1,
                           default_fill_cache=False, **options)
    if use_cache:
        fp = lambda: s3.open(naked_path, mode)
        fp = vaex.file.cache.CachedFile(fp, naked_path)
    else:
        fp = s3.open(naked_path, mode)
    return fp
github vaexio / vaex / packages / vaex-arrow / vaex_arrow / opener.py View on Github external
def register_opener():
    """Register the Arrow and Parquet openers with vaex's file registry."""
    # Registration order is preserved: Arrow first, then Parquet.
    for opener in (ArrowOpener, ParquetOpener):
        vaex.file.register(opener)