How to use the pyarrow.string function in pyarrow

To help you get started, we’ve selected a few pyarrow.string examples based on popular ways it is used in public projects.
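
If you need a quick orientation first: pyarrow.string() returns Arrow's variable-length UTF-8 string type, and the object it returns is what you pass to pa.field, pa.schema, or pa.array. A minimal sketch (the column name and values below are placeholders):

import pyarrow as pa

dtype = pa.string()                                 # the UTF-8 string type object
arr = pa.array(["foo", "bar", None], type=dtype)    # string array with one null
field = pa.field("name", dtype, nullable=True)      # a nullable string field
schema = pa.schema([field])

print(dtype)            # string
print(arr.null_count)   # 1
print(schema)           # name: string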

github snowflakedb / snowflake-connector-python / test / pandas / test_unit_arrow_chunk_iterator.py
def test_iterate_over_string_chunk():
    random.seed(datetime.datetime.now())
    column_meta = [
            {"logicalType": "TEXT"},
            {"logicalType": "TEXT"}
    ]
    field_foo = pyarrow.field("column_foo", pyarrow.string(), True, column_meta[0])
    field_bar = pyarrow.field("column_bar", pyarrow.string(), True, column_meta[1])
    pyarrow.schema([field_foo, field_bar])

    def str_generator():
        return str(random.randint(-100, 100))

    iterate_over_test_chunk([pyarrow.string(), pyarrow.string()],
                            column_meta, str_generator)
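
The helper iterate_over_test_chunk is specific to the Snowflake test suite, so the snippet above does not run on its own. A self-contained sketch of the same idea, nullable string fields carrying per-column metadata and collected into a schema, might look like this (the table contents are placeholders):

import pyarrow

fields = [
    pyarrow.field("column_foo", pyarrow.string(), True, {"logicalType": "TEXT"}),
    pyarrow.field("column_bar", pyarrow.string(), True, {"logicalType": "TEXT"}),
]
schema = pyarrow.schema(fields)

# Build a tiny table that conforms to the schema
table = pyarrow.Table.from_pydict(
    {"column_foo": ["1", "-5", None], "column_bar": ["42", None, "7"]},
    schema=schema,
)
print(table.schema)
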
github JDASoftwareGroup / kartothek / tests / core / test_common_metadata.py
pa.field("bool", pa.bool_()),
        pa.field("byte", pa.binary()),
        pa.field("date", pa.date32()),
        pa.field("datetime64", pa.timestamp("us")),
        pa.field("float32", pa.float64()),
        pa.field("float64", pa.float64()),
        pa.field("int16", pa.int64()),
        pa.field("int32", pa.int64()),
        pa.field("int64", pa.int64()),
        pa.field("int8", pa.int64()),
        pa.field("null", pa.null()),
        pa.field("uint16", pa.uint64()),
        pa.field("uint32", pa.uint64()),
        pa.field("uint64", pa.uint64()),
        pa.field("uint8", pa.uint64()),
        pa.field("unicode", pa.string()),
    ]
    expected_schema = pa.schema(fields)

    assert actual_schema.remove_metadata() == expected_schema
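
Here actual_schema comes from kartothek's own metadata handling; the pyarrow-relevant point is that remove_metadata() strips schema-level metadata before the comparison. A minimal standalone sketch, assuming a reasonably recent pyarrow that provides with_metadata:

import pyarrow as pa

expected_schema = pa.schema([
    pa.field("int64", pa.int64()),
    pa.field("unicode", pa.string()),
])

# Pretend the schema was read back from a file and picked up extra metadata
actual_schema = expected_schema.with_metadata({b"origin": b"written by pandas"})

assert actual_schema.remove_metadata() == expected_schema
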
github xhochy / fletcher / tests / test_pandas_extension.py
)
xfail_list_setitem_not_implemented = pytest.mark.xfail_by_type_filter(
    [pa.types.is_list], "__setitem__ is not implemented for lists"
)
xfail_missing_list_dict_encode = pytest.mark.xfail_by_type_filter(
    [pa.types.is_list],
    "ArrowNotImplementedError: dictionary-encode not implemented for list",
)
xfail_bool_too_few_uniques = pytest.mark.xfail_by_type_filter(
    [pa.types.is_boolean], "Test requires at least 3 unique values"
)


test_types = [
    FletcherTestType(
        pa.string(),
        ["🙈", "Ö", "Č", "a", "B"] * 20,
        [None, "A"],
        ["B", "B", None, None, "A", "A", "B", "C"],
        ["B", "C", "A"],
        ["B", None, "A"],
        lambda: choices(list(string.ascii_letters), k=10),
    ),
    FletcherTestType(
        pa.bool_(),
        [True, False, True, True, False] * 20,
        [None, False],
        [True, True, None, None, False, False, True, False],
        [True, False, False],
        [True, None, False],
        lambda: choices([True, False], k=10),
    ),
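
FletcherTestType and the xfail markers belong to fletcher's test harness; the string test data itself only relies on how pyarrow represents missing and non-ASCII values in a pa.string() array. A small sketch of that behaviour:

import pyarrow as pa

data_missing = pa.array([None, "A"], type=pa.string())
print(data_missing.null_count)   # 1
print(data_missing.to_pylist())  # [None, 'A']

# Non-ASCII values round-trip unchanged through the UTF-8 string type
data = pa.array(["🙈", "Ö", "Č", "a", "B"], type=pa.string())
print(data.to_pylist())
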
github GoogleCloudPlatform / professional-services / examples / dataflow-data-generator / data-generator-pipeline / data_generator / ParquetUtil.py
def get_pyarrow_translated_schema(string_schema):
    """
    Converts string schema dict to pyarrow schema for writing to parquet.
    :param string_schema:
    :return: pyarrow schema
    """
    type_conversions = {
        'STRING': pa.string(),
        'NUMERIC': pa.int64(),
        'BYTE': None,
        'INTEGER': pa.int64(),
        'FLOAT': pa.float64(),
        'BOOLEAN': pa.bool_(),
        'TIMESTAMP': pa.timestamp('us'),
        'DATE': pa.date32(),
        'TIME': pa.time64('us'),
        'DATETIME': pa.timestamp('us'),
        'GEOGRAPHY': None,
        'RECORD': None
    }
    pa_schema_list = []
    for field in string_schema:
        field_type = field['type']
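
The excerpt stops mid-loop, so here is a simplified, self-contained sketch of the same conversion idea. The {'name': ..., 'type': ...} field layout mirrors the BigQuery-style schema the original function expects and is an assumption here, as is the reduced type map:

import pyarrow as pa

type_conversions = {
    'STRING': pa.string(),
    'INTEGER': pa.int64(),
    'FLOAT': pa.float64(),
    'BOOLEAN': pa.bool_(),
    'TIMESTAMP': pa.timestamp('us'),
}

def to_pyarrow_schema(string_schema):
    # Map each {'name': ..., 'type': ...} entry onto a pyarrow field
    return pa.schema(
        [pa.field(f['name'], type_conversions[f['type']]) for f in string_schema]
    )

print(to_pyarrow_schema([
    {'name': 'id', 'type': 'INTEGER'},
    {'name': 'label', 'type': 'STRING'},
]))
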
github JDASoftwareGroup / kartothek / kartothek / core / dataset.py
"IndexCol": {
                    IndexValue: ["partition_label"]
                },
                "SecondIndexCol": {
                    Value: ["partition_label"]
                }
            }

        """
        if self.primary_indices_loaded:
            return self

        indices = _construct_dynamic_index_from_partitions(
            partitions=self.partitions,
            table_meta=self.table_meta,
            default_dtype=pa.string() if self.metadata_version == 3 else None,
            partition_keys=self.partition_keys,
        )
        combined_indices = self.indices.copy()
        combined_indices.update(indices)
        return self.copy(indices=combined_indices)
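
_construct_dynamic_index_from_partitions is internal to kartothek; the pyarrow detail to take away is that pa.string() serves as the fallback dtype when nothing better is known about an index column. A minimal sketch of that fallback (the helper name is hypothetical):

import pyarrow as pa

def empty_index_column(dtype=None):
    # Fall back to the string type when no explicit dtype is known,
    # mirroring default_dtype=pa.string() in the snippet above.
    return pa.array([], type=dtype if dtype is not None else pa.string())

print(empty_index_column().type)            # string
print(empty_index_column(pa.int64()).type)  # int64
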
github dask / dask / dask / dataframe / io / parquet.py
        if min_maxs:
            # We have min/max pairs
            divisions = [mn for mn, mx in min_maxs] + [min_maxs[-1][1]]

            if pa.__version__ < LooseVersion("0.13.1"):

                # Handle conversion to pandas timestamp divisions
                index_field = pa_schema.field_by_name(divisions_name)
                if pa.types.is_timestamp(index_field.type):
                    time_unit = index_field.type.unit
                    divisions_ns = [_to_ns(d, time_unit) for d in divisions]
                    divisions = [pd.Timestamp(ns) for ns in divisions_ns]

                # Handle encoding of bytes string
                if index_field.type == pa.string():
                    # Parquet strings are always encoded as utf-8
                    encoding = "utf-8"
                    divisions = [d.decode(encoding).strip() for d in divisions]

        else:  # pragma: no cover
            if infer_divisions is True:
                raise ValueError(
                    (
                        "Unable to infer divisions for index of '{index_name}' "
                        "because it is not known to be "
                        "sorted across partitions"
                    ).format(index_name=divisions_name_in_schema)
                )

            divisions = (None,) * (len(pa_pieces) + 1)
    elif pa_pieces:
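
The pyarrow-specific part of this excerpt is the equality check against pa.string() that gates the utf-8 decoding. A standalone sketch, with placeholder byte values standing in for the undecoded parquet statistics:

import pyarrow as pa

index_field = pa.field("idx", pa.string())

if index_field.type == pa.string():          # direct comparison, as in the snippet
    raw_divisions = [b"aardvark", b"zebra"]  # placeholder undecoded statistics
    divisions = [d.decode("utf-8") for d in raw_divisions]
    print(divisions)

# Predicate-style alternative to the equality check
print(pa.types.is_string(index_field.type))  # True
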
github allwefantasy / pyjava / python / pyjava / serializers.py
print("----pyarrow version---")
            print(pa.__version__)
        writer = None
        try:
            for batch in iterator:
                if is_dev:
                    print(batch.to_pandas())
                if writer is None:
                    writer = pa.RecordBatchStreamWriter(stream, batch.schema)
                writer.write_batch(batch)

            # if iterator is empty, we should write default schema
            if writer is None:
                if is_dev:
                    print("----dump empty arrow---")
                rb = pa.RecordBatch.from_arrays([[]], schema=pa.schema([('value', pa.string())]))
                writer = pa.RecordBatchStreamWriter(stream, rb.schema)
                writer.write_batch(rb)

        finally:
            if writer is not None:
                writer.close()
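
The fallback for an empty iterator writes a zero-row batch so that readers still receive a schema. A self-contained sketch of that round trip, using an in-memory sink instead of the original stream:

import pyarrow as pa

sink = pa.BufferOutputStream()
rb = pa.RecordBatch.from_arrays(
    [pa.array([], type=pa.string())],
    schema=pa.schema([('value', pa.string())]),
)
writer = pa.RecordBatchStreamWriter(sink, rb.schema)
writer.write_batch(rb)
writer.close()

reader = pa.ipc.open_stream(sink.getvalue())
print(reader.schema)               # value: string
print(reader.read_all().num_rows)  # 0
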
github xhochy / fletcher / fletcher / algorithms / string.py
        result_offsets = np.empty(len(a) + 1, dtype=np.int32)
        result_offsets[0] = 0
        total_size = (offsets_a[-1] - offsets_a[0]) + (offsets_b[-1] - offsets_b[0])
        result_data = np.empty(total_size, dtype=np.uint8)
        _merge_string_data(
            len(a),
            valid,
            offsets_a,
            data_a,
            offsets_b,
            data_b,
            result_offsets,
            result_data,
        )
        buffers = [pa.py_buffer(x) for x in [valid, result_offsets, result_data]]
        return pa.Array.from_buffers(pa.string(), len(a), buffers)
    return a
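
_merge_string_data is a fletcher-internal kernel; what pa.Array.from_buffers needs for a string array is a validity bitmap, an int32 offsets buffer, and the UTF-8 character data. Taking an existing array apart and rebuilding it shows the layout; a minimal sketch:

import pyarrow as pa

arr = pa.array(["foo", None, "barbaz"], type=pa.string())

# A string array is backed by three buffers:
# [validity bitmap, int32 offsets, utf-8 character data]
validity, offsets, data = arr.buffers()

rebuilt = pa.Array.from_buffers(pa.string(), len(arr), [validity, offsets, data])
assert rebuilt.equals(arr)
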
github xhochy / fletcher / fletcher / base.py
"""
        if not isinstance(string, str):
            raise TypeError(
                "'construct_from_string' expects a string, got "
            )

        # Remove fletcher specific naming from the arrow type string.
        if string.startswith("fletcher_chunked["):
            string = string[len("fletcher_chunked[") : -1]
        else:
            raise TypeError(
                f"Cannot construct a 'FletcherChunkedDtype' from '{string}'"
            )

        if string == "list":
            return cls(pa.list_(pa.string()))

        try:
            type_for_alias = pa.type_for_alias(string)
        except (ValueError, KeyError):
            # pandas API expects a TypeError
            raise TypeError(string)

        return cls(type_for_alias)
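
pa.type_for_alias resolves a type name such as "string" back to the corresponding type object, which is how the code above turns the remainder of the dtype string into pa.string() and friends. A short sketch of that lookup:

import pyarrow as pa

print(pa.type_for_alias("string") == pa.string())  # True
print(pa.type_for_alias("utf8"))                   # string (alias)

try:
    pa.type_for_alias("not-a-type")
except ValueError as exc:
    print(exc)   # unknown aliases raise ValueError
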
github mozilla / OpenWPM / automation / DataAggregator / parquet_schema.py
    pa.field('operation', pa.string()),
    pa.field('value', pa.string()),
    pa.field('arguments', pa.string()),
    pa.field('time_stamp', pa.string(), nullable=False)
]
PQ_SCHEMAS['javascript'] = pa.schema(fields)

# javascript_cookies
fields = [
    pa.field('crawl_id', pa.uint32()),
    pa.field('visit_id', pa.int64()),
    pa.field('instance_id', pa.uint32(), nullable=False),
    pa.field('extension_session_uuid', pa.string()),
    pa.field('event_ordinal', pa.int64()),
    pa.field('record_type', pa.string()),
    pa.field('change_cause', pa.string()),
    pa.field('expiry', pa.string()),
    pa.field('is_http_only', pa.bool_()),
    pa.field('is_host_only', pa.bool_()),
    pa.field('is_session', pa.bool_()),
    pa.field('host', pa.string()),
    pa.field('is_secure', pa.bool_()),
    pa.field('name', pa.string()),
    pa.field('path', pa.string()),
    pa.field('value', pa.string()),
    pa.field('same_site', pa.string()),
    pa.field('first_party_domain', pa.string()),
    pa.field('store_id', pa.string()),
    pa.field('time_stamp', pa.string())
]
PQ_SCHEMAS['javascript_cookies'] = pa.schema(fields)
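
To close the loop, here is a minimal sketch of how a schema like the ones above gets used: building a table whose string columns conform to it and writing it out with pyarrow.parquet. The field subset, file name, and row values are placeholders:

import pyarrow as pa
import pyarrow.parquet as pq

fields = [
    pa.field('name', pa.string()),
    pa.field('value', pa.string()),
    pa.field('time_stamp', pa.string(), nullable=False),
]
schema = pa.schema(fields)

table = pa.Table.from_pydict(
    {
        'name': ['session_cookie', None],
        'value': ['abc123', 'def456'],
        'time_stamp': ['2020-01-01T00:00:00', '2020-01-01T00:00:01'],
    },
    schema=schema,
)
pq.write_table(table, 'javascript_cookies.parquet')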