How to use the pyarrow.int64 function in pyarrow

To help you get started, we’ve selected a few pyarrow.int64 examples based on popular ways the function is used in public projects.

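Before the examples, a quick orientation: pyarrow.int64 is a factory function that returns the Arrow DataType describing 64-bit signed integers. The minimal sketch below shows the two ways it typically appears, as a type for building arrays and as a value to compare against; the variable names are illustrative only.

import pyarrow as pa

ty = pa.int64()                     # DataType for 64-bit signed integers
arr = pa.array([1, 2, 3], type=ty)  # build an Arrow array with that type
print(arr.type)                     # int64
print(ty == pa.int64())             # True: DataType instances compare by value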

github JDASoftwareGroup/kartothek: tests/core/test_common_metadata.py (view on GitHub)
def test_make_meta_column_normalization_pyarrow_schema():
    # GH228
    df = pd.DataFrame(
        [{"part": 1, "id": 1, "col1": "abc"}, {"part": 2, "id": 2, "col1": np.nan}],
        # Kartothek normalizes field order so that partition keys come first
        # and the rest is sorted alphabetically; this column order is deliberately reversed.
        columns=["col1", "id", "part"],
    )
    schema = make_meta(
        pa.Schema.from_pandas(df), origin="gh228", partition_keys=["part"]
    )
    fields = [
        pa.field("part", pa.int64()),
        pa.field("col1", pa.string()),
        pa.field("id", pa.int64()),
    ]
    expected_schema = pa.schema(fields)

    assert schema.internal().equals(expected_schema, check_metadata=False)
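The test exercises kartothek's make_meta, but the pyarrow half of the pattern stands alone. A minimal sketch, without kartothek, of deriving a schema from a pandas DataFrame and checking that integer columns come back as int64:

import pandas as pd
import pyarrow as pa

df = pd.DataFrame({"part": [1, 2], "id": [1, 2], "col1": ["abc", None]})
schema = pa.Schema.from_pandas(df, preserve_index=False)
assert schema.field("part").type == pa.int64()
assert schema.field("id").type == pa.int64()
assert schema.field("col1").type == pa.string()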
github tensorflow/io: tests/test_arrow_eager.py (view on GitHub)
def get_arrow_type(self, dt, is_list):
    """get_arrow_type"""
    if dt == dtypes.bool:
      arrow_type = pa.bool_()
    elif dt == dtypes.int8:
      arrow_type = pa.int8()
    elif dt == dtypes.int16:
      arrow_type = pa.int16()
    elif dt == dtypes.int32:
      arrow_type = pa.int32()
    elif dt == dtypes.int64:
      arrow_type = pa.int64()
    elif dt == dtypes.uint8:
      arrow_type = pa.uint8()
    elif dt == dtypes.uint16:
      arrow_type = pa.uint16()
    elif dt == dtypes.uint32:
      arrow_type = pa.uint32()
    elif dt == dtypes.uint64:
      arrow_type = pa.uint64()
    elif dt == dtypes.float16:
      arrow_type = pa.float16()
    elif dt == dtypes.float32:
      arrow_type = pa.float32()
    elif dt == dtypes.float64:
      arrow_type = pa.float64()
    else:
      raise TypeError("Unsupported dtype for Arrow: " + str(dt))
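The explicit chain is needed because TensorFlow dtypes are not NumPy dtypes. If you are starting from NumPy instead, pyarrow's from_numpy_dtype helper covers the same mapping; a minimal sketch:

import numpy as np
import pyarrow as pa

assert pa.from_numpy_dtype(np.int64) == pa.int64()
assert pa.from_numpy_dtype(np.uint16) == pa.uint16()
assert pa.from_numpy_dtype(np.float32) == pa.float32()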
github JDASoftwareGroup/kartothek: tests/core/test_index.py (view on GitHub)
    new_index1 = original_index.copy().remove_values([1, 3], inplace=inplace)
    expected_index1 = ExplicitSecondaryIndex(
        column="col", dtype=pa.int64(), index_dct={2: ["d"]}
    )
    assert new_index1 == expected_index1

    new_index2 = original_index.copy().remove_values([1.0, 3.0], inplace=inplace)
    expected_index2 = ExplicitSecondaryIndex(
        column="col", dtype=pa.int64(), index_dct={2: ["d"]}
    )
    assert new_index2 == expected_index2

    new_index3 = original_index.copy().remove_values(["1", "3"], inplace=inplace)
    expected_index3 = ExplicitSecondaryIndex(
        column="col", dtype=pa.int64(), index_dct={2: ["d"]}
    )
    assert new_index3 == expected_index3
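All three calls remove the same key because the index coerces lookup values (int, float, or str) to its pa.int64() dtype before comparing. The coercion itself can be sketched at the pyarrow level with casts:

import pyarrow as pa

dtype = pa.int64()
from_floats = pa.array([1.0, 3.0]).cast(dtype)
from_strings = pa.array(["1", "3"]).cast(dtype)
assert from_floats.equals(from_strings)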
github JDASoftwareGroup/kartothek: tests/core/test_common_metadata.py (view on GitHub)
def test_store_schema_metadata(store, df_all_types):
    store_schema_metadata(
        schema=make_meta(df_all_types, origin="df_all_types"),
        dataset_uuid="some_uuid",
        store=store,
        table="some_table",
    )

    key = "some_uuid/some_table/_common_metadata"
    assert key in store.keys()
    pq_file = pq.ParquetFile(store.open(key))
    actual_schema = pq_file.schema.to_arrow_schema()
    fields = [
        pa.field("array_float32", pa.list_(pa.float64())),
        pa.field("array_float64", pa.list_(pa.float64())),
        pa.field("array_int16", pa.list_(pa.int64())),
        pa.field("array_int32", pa.list_(pa.int64())),
        pa.field("array_int64", pa.list_(pa.int64())),
        pa.field("array_int8", pa.list_(pa.int64())),
        pa.field("array_uint16", pa.list_(pa.uint64())),
        pa.field("array_uint32", pa.list_(pa.uint64())),
        pa.field("array_uint64", pa.list_(pa.uint64())),
        pa.field("array_uint8", pa.list_(pa.uint64())),
        pa.field("array_unicode", pa.list_(pa.string())),
        pa.field("bool", pa.bool_()),
        pa.field("byte", pa.binary()),
        pa.field("date", pa.date32()),
        pa.field("datetime64", pa.timestamp("us")),
        pa.field("float32", pa.float64()),
        pa.field("float64", pa.float64()),
        pa.field("int16", pa.int64()),
        pa.field("int32", pa.int64()),
github snowflakedb/snowflake-connector-python: test/pandas/test_unit_arrow_chunk_iterator.py (view on GitHub)
def test_iterate_over_timestamp_ntz_chunk():
    random.seed(datetime.datetime.now())
    scale = random.randint(0, 9)
    column_meta = [
        {"logicalType": "TIMESTAMP_NTZ", "scale": str(scale)},
        {"logicalType": "TIMESTAMP_NTZ", "scale": str(scale)}
    ]
    data_type = pyarrow.struct([pyarrow.field('epoch', pyarrow.int64()),
                                pyarrow.field('fraction', pyarrow.int32())]) if scale > 7 else pyarrow.int64()

    def timestamp_ntz_generator(scale):
        epoch = random.randint(-621355968, 2534023007)
        frac = random.randint(0, 10**scale - 1) * (10**(9 - scale)) if scale > 7 else random.randint(0, 10**scale - 1)
        if scale > 7:
            return {'epoch': epoch, 'fraction': frac}
        else:
            epoch = str(epoch)
            frac = str(frac)
            ZEROFILL = '000000000'
            frac = ZEROFILL[:scale - len(frac)] + frac
            return int(epoch + frac) if scale else int(epoch)

    def expected_data_transform_ntz(_scale):
        def expected_data_transform_ntz_impl(data, scale=_scale):
            if scale > 7:
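For scales above 7 the connector represents each timestamp as a struct of an int64 epoch and an int32 fraction rather than a single packed int64. Building such a struct-typed array needs only pyarrow; a minimal sketch with made-up values:

import pyarrow

data_type = pyarrow.struct([pyarrow.field('epoch', pyarrow.int64()),
                            pyarrow.field('fraction', pyarrow.int32())])
arr = pyarrow.array([{'epoch': 1622505600, 'fraction': 123456789}], type=data_type)
assert arr.field('epoch').type == pyarrow.int64()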
github apache/arrow: python/benchmarks/common.py (view on GitHub)
        raise ValueError("unrecognized type %r" % (type_name,))

        if kind in ('int', 'float'):
            ty = getattr(pa, type_name)()
        elif kind == 'bool':
            ty = pa.bool_()
        elif kind == 'decimal':
            ty = pa.decimal128(9, 9)
        elif kind == 'fixed binary':
            ty = pa.binary(size)
        elif kind == 'varying binary':
            ty = pa.binary()
        elif kind in ('ascii', 'unicode'):
            ty = pa.string()
        elif kind == 'int64 list':
            ty = pa.list_(pa.int64())
        elif kind == 'struct':
            ty = pa.struct([pa.field('u', pa.int64()),
                            pa.field('v', pa.float64()),
                            pa.field('w', pa.bool_())])

        factories = {
            'int': self.generate_int_list,
            'float': self.generate_float_list,
            'bool': self.generate_bool_list,
            'decimal': self.generate_decimal_list,
            'fixed binary': partial(self.generate_fixed_binary_list,
                                    size=size),
            'varying binary': partial(self.generate_varying_binary_list,
                                      min_size=3, max_size=40),
            'ascii': partial(self.generate_ascii_string_list,
                             min_size=3, max_size=40),
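Since pyarrow exposes its primitive type factories as module-level functions with predictable names, the benchmark resolves them dynamically via getattr instead of spelling out a long if/elif chain. A minimal sketch of the trick:

import pyarrow as pa

for type_name in ('int64', 'uint32', 'float64'):
    ty = getattr(pa, type_name)()   # same as pa.int64(), pa.uint32(), ...
    print(type_name, '->', ty)
assert getattr(pa, 'int64')() == pa.int64()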
github allwefantasy/pyjava: python/pyjava/datatype/types.py (view on GitHub)
def to_arrow_type(dt):
    """ Convert Spark data type to pyarrow type
    """
    import pyarrow as pa
    if type(dt) == BooleanType:
        arrow_type = pa.bool_()
    elif type(dt) == ByteType:
        arrow_type = pa.int8()
    elif type(dt) == ShortType:
        arrow_type = pa.int16()
    elif type(dt) == IntegerType:
        arrow_type = pa.int32()
    elif type(dt) == LongType:
        arrow_type = pa.int64()
    elif type(dt) == FloatType:
        arrow_type = pa.float32()
    elif type(dt) == DoubleType:
        arrow_type = pa.float64()
    elif type(dt) == DecimalType:
        arrow_type = pa.decimal128(dt.precision, dt.scale)
    elif type(dt) == StringType:
        arrow_type = pa.string()
    elif type(dt) == BinaryType:
        arrow_type = pa.binary()
    elif type(dt) == DateType:
        arrow_type = pa.date32()
    elif type(dt) == TimestampType:
        # Timestamps should be in UTC; JVM Arrow timestamps require a timezone to be read
        arrow_type = pa.timestamp('us', tz='UTC')
    elif type(dt) == ArrayType:
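Container types compose from the same primitives: an array of longs, for instance, is naturally represented as a list of int64. A minimal, Spark-free sketch of the nested type:

import pyarrow as pa

list_of_longs = pa.list_(pa.int64())
arr = pa.array([[1, 2], None, [3]], type=list_of_longs)
assert arr.type.value_type == pa.int64()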
github IntelPython/sdc: sdc/io/parquet_pio.py (view on GitHub)
def _get_numba_typ_from_pa_typ(pa_typ):
    import pyarrow as pa
    _typ_map = {
        # boolean
        pa.bool_(): types.bool_,
        # signed int types
        pa.int8(): types.int8,
        pa.int16(): types.int16,
        pa.int32(): types.int32,
        pa.int64(): types.int64,
        # unsigned int types
        pa.uint8(): types.uint8,
        pa.uint16(): types.uint16,
        pa.uint32(): types.uint32,
        pa.uint64(): types.uint64,
        # float types (TODO: float16?)
        pa.float32(): types.float32,
        pa.float64(): types.float64,
        # String
        pa.string(): string_type,
        # date
        pa.date32(): types.NPDatetime('ns'),
        pa.date64(): types.NPDatetime('ns'),
        # time (TODO: time32, time64, ...)
        pa.timestamp('ns'): types.NPDatetime('ns'),
        pa.timestamp('us'): types.NPDatetime('ns'),
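Using pa.int64() and friends as dictionary keys works because pyarrow DataType instances are hashable and compare by value: a fresh pa.int64() built at lookup time finds the entry keyed by the pa.int64() above. A minimal sketch with illustrative values:

import pyarrow as pa

_typ_map = {pa.int64(): 'int64', pa.float64(): 'float64'}
assert _typ_map[pa.int64()] == 'int64'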
github graphistry/pygraphistry: graphistry/util/graph/rectify.py (view on GitHub)
import pyarrow as arrow


int32 = arrow.int32()
int64 = arrow.int64()


def rectify(
    edges,
    nodes,
    edge,
    node,
    edge_src,
    edge_dst,
    safe=True
):
    return _rectify_node_ids(
        edges=_rectify_edge_ids(
            edges=edges,
            edge=edge,
            safe=safe
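Here arrow.int64() is bound once at module level, presumably as the canonical ID type for the rectification helpers. Incoming ID columns with a narrower integer type can be widened to match; a minimal sketch assuming a plain pyarrow array rather than pygraphistry's edge inputs:

import pyarrow as arrow

int64 = arrow.int64()

ids = arrow.array([1, 2, 3], type=arrow.int32())
assert ids.cast(int64).type == int64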