Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
arrow_type = arrow_array.type
buffers = arrow_array.buffers()
if len(buffers) == 2:
return numpy_array_from_arrow_array(arrow_array)
elif len(buffers) == 3 and isinstance(arrow_array.type, type(pyarrow.string())):
bitmap_buffer, offsets, string_bytes = arrow_array.buffers()
if arrow_array.null_count == 0:
null_bitmap = None # we drop any null_bitmap when there are no null counts
else:
null_bitmap = np.frombuffer(bitmap_buffer, 'uint8', len(bitmap_buffer))
offsets = np.frombuffer(offsets, np.int32, len(offsets)//4)
if string_bytes is None:
string_bytes = np.array([], dtype='S1')
else:
string_bytes = np.frombuffer(string_bytes, 'S1', len(string_bytes))
column = ColumnStringArrow(offsets, string_bytes, len(arrow_array), null_bitmap=null_bitmap)
return column
else:
raise TypeError('type unsupported: %r' % arrow_type)
2 is coming
3 our
4 way.
>>> df.text.str.lower()
Expression = str_lower(text)
Length: 5 dtype: str (expression)
---------------------------------
0 something
1 very pretty
2 is coming
3 our
4 way.
"""
sl = _to_string_sequence(x).lower()
return column.ColumnStringArrow(sl.bytes, sl.indices, sl.length, sl.offset, string_sequence=sl)
2 is coming
3 our
4 way.
>>> df.text.str.replace(pat='et', repl='__')
Expression = str_replace(text, pat='et', repl='__')
Length: 5 dtype: str (expression)
---------------------------------
0 Som__hing
1 very pr__ty
2 is coming
3 our
4 way.
"""
sl = _to_string_sequence(x).replace(pat, repl, n, flags, regex)
return column.ColumnStringArrow(sl.bytes, sl.indices, sl.length, sl.offset, string_sequence=sl)
3 our
4 way.
>>> df.text.str.rstrip(to_strip='ing')
Expression = str_rstrip(text, to_strip='ing')
Length: 5 dtype: str (expression)
---------------------------------
0 Someth
1 very pretty
2 is com
3 our
4 way.
"""
# the C++ implementation treats an empty string the same as None
sl = _to_string_sequence(x).rstrip('' if to_strip is None else to_strip) if to_strip != '' else x
return column.ColumnStringArrow(sl.bytes, sl.indices, sl.length, sl.offset, string_sequence=sl)
3 our
4 way.
>>> df.text.str.lstrip(to_strip='very ')
Expression = str_lstrip(text, to_strip='very ')
Length: 5 dtype: str (expression)
---------------------------------
0 Something
1 pretty
2 is coming
3 our
4 way.
"""
# the C++ implementation treats an empty string the same as None
sl = _to_string_sequence(x).lstrip('' if to_strip is None else to_strip) if to_strip != '' else x
return column.ColumnStringArrow(sl.bytes, sl.indices, sl.length, sl.offset, string_sequence=sl)
4 way.
>>> df.text.str.upper()
Expression = str_upper(text)
Length: 5 dtype: str (expression)
---------------------------------
0 SOMETHING
1 VERY PRETTY
2 IS COMING
3 OUR
4 WAY.
"""
sl = _to_string_sequence(x).upper()
return column.ColumnStringArrow(sl.bytes, sl.indices, sl.length, sl.offset, string_sequence=sl)
to_offset = 0 # we need this for selections
to_offset_unselected = 0 # we need this for filtering
count = len(dataset_input)# if not selection else dataset_input.length_unfiltered()
is_string = dtype == str_type
# TODO: if no filter, selection or mask, we can choose the quick path for str
string_byte_offset = 0
for i1, i2 in vaex.utils.subdivide(count, max_length=max_length):
logger.debug("from %d to %d (total length: %d, output length: %d)", i1, i2, len(dataset_input), N)
values = dataset_input.evaluate(column_name, i1=i1, i2=i2, filtered=True, parallel=False, selection=selection)
no_values = len(values)
if no_values:
if is_string:
# for strings, we don't take sorting/shuffling into account when building the structure
to_column = to_array
assert isinstance(to_column, ColumnStringArrow)
from_sequence = _to_string_sequence(values)
to_sequence = to_column.string_sequence.slice(to_offset, to_offset+no_values, string_byte_offset)
string_byte_offset += to_sequence.fill_from(from_sequence)
to_offset += no_values
else:
fill_value = np.nan if dtype.kind == "f" else None
# assert np.ma.isMaskedArray(to_array) == np.ma.isMaskedArray(values), "to (%s) and from (%s) array are not of both masked or unmasked (%s)" %\
# (np.ma.isMaskedArray(to_array), np.ma.isMaskedArray(values), column_name)
if shuffle or sort:
target_set_item = order_array[i1:i2]
else:
target_set_item = slice(to_offset, to_offset + no_values)
if dtype.type == np.datetime64:
values = values.view(np.int64)
if np.ma.isMaskedArray(to_array) and np.ma.isMaskedArray(values):
to_array.data[target_set_item] = values.filled(fill_value)
2 is coming
3 our
4 way.
>>> df.text.str.ljust(width=10, fillchar='!')
Expression = str_ljust(text, width=10, fillchar='!')
Length: 5 dtype: str (expression)
---------------------------------
0 Something!
1 very pretty
2 is coming!
3 our!!!!!!!
4 way.!!!!!!
"""
sl = _to_string_sequence(x).pad(width, fillchar, False, True)
return column.ColumnStringArrow(sl.bytes, sl.indices, sl.length, sl.offset, string_sequence=sl)
2 is coming
3 our
4 way.
>>> df.text.str.pad(width=10, side='left', fillchar='!')
Expression = str_pad(text, width=10, side='left', fillchar='!')
Length: 5 dtype: str (expression)
---------------------------------
0 !Something
1 very pretty
2 !is coming
3 !!!!!!!our
4 !!!!!!way.
"""
sl = _to_string_sequence(x).pad(width, fillchar, side in ['left', 'both'], side in ['right', 'both'])
return column.ColumnStringArrow(sl.bytes, sl.indices, sl.length, sl.offset, string_sequence=sl)
2 is coming
3 our
4 way.
>>> df.text.str.repeat(3)
Expression = str_repeat(text, 3)
Length: 5 dtype: str (expression)
---------------------------------
0 SomethingSomethingSomething
1 very prettyvery prettyvery pretty
2 is comingis comingis coming
3 ourourour
4 way.way.way.
"""
sl = _to_string_sequence(x).repeat(repeats)
return column.ColumnStringArrow(sl.bytes, sl.indices, sl.length, sl.offset, string_sequence=sl)