How to use the pyarrow.Table class in pyarrow

To help you get started, we’ve selected a few pyarrow.Table examples based on popular ways it is used in public projects.

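Before looking at the project snippets below, here is a minimal, self-contained sketch of the two most common ways to build a pyarrow.Table: from a pandas DataFrame and directly from Arrow arrays. The column names and values are made up for illustration.

import pandas as pd
import pyarrow as pa

# Build a Table from a pandas DataFrame. preserve_index=False drops the
# DataFrame index instead of storing it as an extra column.
df = pd.DataFrame({'a': [1, 2, 3], 'b': [1.1, 2.2, 3.3], 'c': ['x', 'y', 'z']})
table = pa.Table.from_pandas(df, preserve_index=False)

# Build a Table directly from Arrow arrays.
table2 = pa.Table.from_arrays(
    [pa.array([1, 2, 3]), pa.array(['x', 'y', 'z'])],
    names=['a', 'c'])

print(table.schema)     # column names and the Arrow types inferred for them
print(table2.num_rows)  # 3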

omnisci / pymapd / tests / test_integration.py (View on Github)
def test_load_empty_table_arrow(self, con):
    con.execute("drop table if exists baz;")
    con.execute("create table baz (a int, b float, c text);")

    data = [(1, 1.1, 'a'),
            (2, 2.2, '2'),
            (3, 3.3, '3')]

    df = pd.DataFrame(data, columns=list('abc')).astype({
        'a': 'int32',
        'b': 'float32'
    })

    table = pa.Table.from_pandas(df, preserve_index=False)
    con.load_table("baz", table, method='arrow')
    result = sorted(con.execute("select * from baz"))
    self.check_empty_insert(result, data)
    con.execute("drop table if exists baz;")

apache / arrow / python / testing / parquet_interop.py (View on Github)
def test_fastparquet_read_with_hdfs():
    fs = hdfs_test_client()

    df = tm.makeDataFrame()
    table = pa.Table.from_pandas(df)

    path = '/tmp/testing.parquet'
    with fs.open(path, 'wb') as f:
        pq.write_table(table, f)

    parquet_file = fastparquet.ParquetFile(path, open_with=fs.open)

    result = parquet_file.to_pandas()
    tm.assert_frame_equal(result, df)
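The example above depends on an HDFS test cluster and fastparquet. A minimal local sketch of the same round trip (pandas DataFrame to pyarrow Table to Parquet and back) could look like this; the file path is arbitrary.

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

df = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})
table = pa.Table.from_pandas(df)

# Write the Table to a local Parquet file and read it back.
pq.write_table(table, '/tmp/testing.parquet')
result = pq.read_table('/tmp/testing.parquet').to_pandas()

pd.testing.assert_frame_equal(result, df)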

SuperCowPowers / zat / bat / log_to_parquet.py (View on Github)
            print('Writing {:d} rows...'.format(num_rows))
            if writer is None:
                arrow_table = pa.Table.from_pandas(_make_df(current_row_set))
                writer = pq.ParquetWriter(parquet_file, arrow_table.schema, compression=compression, use_deprecated_int96_timestamps=True)
                writer.write_table(arrow_table)
            else:
                arrow_table = pa.Table.from_pandas(_make_df(current_row_set))
                writer.write_table(arrow_table)

            # Empty the current row set
            current_row_set = []

    # Add any left over rows and close the Parquet file
    if num_rows:
        print('Writing {:d} rows...'.format(num_rows))
        arrow_table = pa.Table.from_pandas(_make_df(current_row_set))
        writer.write_table(arrow_table)
        writer.close()
        print('Parquet File Complete')
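The snippet above is an excerpt from a larger loop; the underlying pattern is to create a pq.ParquetWriter lazily from the first batch's schema, call write_table for each batch, and close the writer at the end. A minimal sketch of that pattern, with made-up batch data:

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

batches = [pd.DataFrame({'a': [1, 2]}), pd.DataFrame({'a': [3, 4]})]

writer = None
for df in batches:
    arrow_table = pa.Table.from_pandas(df, preserve_index=False)
    if writer is None:
        # Create the writer once the first batch's schema is known.
        writer = pq.ParquetWriter('/tmp/rows.parquet', arrow_table.schema)
    writer.write_table(arrow_table)
if writer is not None:
    writer.close()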

droher / boxball / transform / src / parquet.py (View on Github)
def chunked_write(df_iterator: TextFileReader, parquet_writer: pq.ParquetWriter, date_cols: List[str]):
    """
    Writes  Parquet version of the chunked dataframe input.

    Arrow table creation and Parquet-writes take up around 25% of the time on this function.
    The CSV read takes around 75%.
    """
    rows_processed = 0
    for df in df_iterator:
        rows_processed += min(BUFFER_SIZE_ROWS, len(df))
        for col_name in date_cols:
            df[col_name] = pd.to_datetime(df[col_name], unit="ms")
        pa_table = pa.Table.from_pandas(df=df, schema=parquet_writer.schema)
        parquet_writer.write_table(pa_table)

        print("Rows processed: {}".format(rows_processed), end="\r", flush=True)
    print()

scikit-hep / awkward-array / awkward / arrow.py (View on Github)
def convert(obj, message):
    if isinstance(obj, (awkward.array.base.AwkwardArray, numpy.ndarray)):
        out = toarrow(obj)
        if isinstance(out, pyarrow.Table):
            return out
        else:
            return pyarrow.Table.from_batches([pyarrow.RecordBatch.from_arrays([out], [""])])
    else:
        raise TypeError(message)
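For context, pa.Table.from_batches assembles a Table from one or more RecordBatch objects that share a schema, which is what the fallback branch above relies on. A minimal sketch with made-up data:

import pyarrow as pa

batch = pa.RecordBatch.from_arrays([pa.array([1, 2, 3])], ['values'])
table = pa.Table.from_batches([batch])
print(table.column('values'))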

int-brain-lab / ibllib / brainbox / io / parquet.py (View on Github)
def save(file, table):
    """
    Save pandas dataframe to parquet
    :param file:
    :param table:
    :return:
    """
    pq.write_table(pa.Table.from_pandas(table), file)

vaexio / vaex / packages / vaex-arrow / vaex_arrow / dataset.py (View on Github)
def _load(self):
    source = pa.memory_map(self.path)
    try:
        # first, try to open the source as an IPC stream
        reader = pa.ipc.open_stream(source)
    except pa.lib.ArrowInvalid:
        # if that fails, open it as an IPC file
        reader = pa.ipc.open_file(source)
        # this reader is not iterable, so collect its batches explicitly
        batches = [reader.get_batch(i) for i in range(reader.num_record_batches)]
    else:
        # a stream reader is iterable, so it can be consumed directly
        batches = reader
    table = pa.Table.from_batches(batches)
    self._load_table(table)
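If all you need is the whole file as a single Table, both IPC readers also expose read_all(), which collects every record batch for you. A minimal sketch, assuming an Arrow IPC file at a made-up path:

import pyarrow as pa

source = pa.memory_map('/tmp/data.arrow')
try:
    # Try the streaming format first, then fall back to the random-access file format.
    table = pa.ipc.open_stream(source).read_all()
except pa.lib.ArrowInvalid:
    table = pa.ipc.open_file(source).read_all()
print(table.num_rows)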

SuperCowPowers / zat / bat / dataframe_to_parquet.py (View on Github)
def df_to_parquet(df, filename, compression='SNAPPY'):
    """df_to_parquet: Converts a Pandas DataFrame into a Parquet file
        Args:
            df (pandas dataframe): The Pandas DataFrame to be saved as a Parquet file
            filename (string): The full path to the filename for the Parquet file
    """

    # Nullable integer arrays are currently not handled by Arrow
    # See: https://issues.apache.org/jira/browse/ARROW-5379
    # Cast Nullable integer arrays to float32 before serializing
    null_int_types = [pd.UInt16Dtype, pd.UInt32Dtype, pd.UInt64Dtype, pd.Int64Dtype]
    for col in df:
        if type(df[col].dtype) in null_int_types:
            df[col] = df[col].astype(np.float32)

    arrow_table = pa.Table.from_pandas(df)
    if compression == 'UNCOMPRESSED':
        compression = None
    pq.write_table(arrow_table, filename, compression=compression, flavor='spark')
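Note that the float32 workaround above targets older Arrow releases (see ARROW-5379); with a recent pandas/pyarrow pairing, nullable integer columns generally convert directly. A quick check, assuming reasonably current versions:

import pandas as pd
import pyarrow as pa

df = pd.DataFrame({'a': pd.array([1, None, 3], dtype='Int64')})
table = pa.Table.from_pandas(df)
print(table.schema)  # 'a' should arrive as a nullable int64 column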

tensorflow / data-validation / tensorflow_data_validation / api / validation_api.py (View on Github)
    return statistics.datasets[0]
  # If there are multiple datasets, attempt to find the dataset for the
  # default slice (i.e., slice for all examples) from among the datasets.
  for dataset in statistics.datasets:
    if dataset.name == constants.DEFAULT_SLICE_KEY:
      logging.warning('Multiple datasets found in statistics. Using the '
                      'default slice dataset.')
      return dataset
  # If there are multiple datasets, but the default slice is not found, raise an
  # error.
  raise ValueError('Only statistics proto with one dataset or the default '
                   'slice (i.e., "All Examples" slice) is currently supported.')


@beam.typehints.with_input_types(
    beam.typehints.Tuple[pa.Table, anomalies_pb2.Anomalies])
@beam.typehints.with_output_types(types.BeamSlicedTable)
class _GenerateAnomalyReasonSliceKeys(beam.DoFn):
  """Yields a slice key for each anomaly reason in the Anomalies proto."""

  def process(self, element):
    table, anomalies_proto = element
    for slice_key in slicing_util.generate_slices(
        table, [anomalies_util.anomalies_slicer], anomalies=anomalies_proto):
      yield slice_key, table


@beam.typehints.with_input_types(pa.Table)
@beam.typehints.with_output_types(types.BeamSlicedTable)
class IdentifyAnomalousExamples(beam.PTransform):
  """API for identifying anomalous examples.