1from __future__ import annotations
2
3from typing import (
4 TYPE_CHECKING,
5 Literal,
6)
7
8import numpy as np
9
10from pandas._config import get_option
11
12from pandas._libs import (
13 lib,
14 missing as libmissing,
15)
16from pandas._libs.arrays import NDArrayBacked
17from pandas._typing import (
18 AxisInt,
19 Dtype,
20 Scalar,
21 npt,
22 type_t,
23)
24from pandas.compat import pa_version_under7p0
25from pandas.compat.numpy import function as nv
26from pandas.util._decorators import doc
27
28from pandas.core.dtypes.base import (
29 ExtensionDtype,
30 StorageExtensionDtype,
31 register_extension_dtype,
32)
33from pandas.core.dtypes.common import (
34 is_array_like,
35 is_bool_dtype,
36 is_dtype_equal,
37 is_integer_dtype,
38 is_object_dtype,
39 is_string_dtype,
40 pandas_dtype,
41)
42
43from pandas.core import ops
44from pandas.core.array_algos import masked_reductions
45from pandas.core.arrays import (
46 ExtensionArray,
47 FloatingArray,
48 IntegerArray,
49)
50from pandas.core.arrays.floating import FloatingDtype
51from pandas.core.arrays.integer import IntegerDtype
52from pandas.core.arrays.numpy_ import PandasArray
53from pandas.core.construction import extract_array
54from pandas.core.indexers import check_array_indexer
55from pandas.core.missing import isna
56
57if TYPE_CHECKING:
58 import pyarrow
59
60 from pandas._typing import (
61 NumpySorter,
62 NumpyValueArrayLike,
63 )
64
65 from pandas import Series
66
67
@register_extension_dtype
class StringDtype(StorageExtensionDtype):
    """
    Extension dtype describing string data.

    .. warning::

       StringDtype is considered experimental. The implementation and
       parts of the API may change without warning.

    Parameters
    ----------
    storage : {"python", "pyarrow"}, optional
        Backend that holds the strings. When omitted, the value of
        ``pd.options.mode.string_storage`` is used.

    Attributes
    ----------
    None

    Methods
    -------
    None

    Examples
    --------
    >>> pd.StringDtype()
    string[python]

    >>> pd.StringDtype(storage="pyarrow")
    string[pyarrow]
    """

    name = "string"

    #: StringDtype().na_value uses pandas.NA
    @property
    def na_value(self) -> libmissing.NAType:
        """Missing-value marker for this dtype (``pandas.NA``)."""
        return libmissing.NA

    _metadata = ("storage",)

    def __init__(self, storage=None) -> None:
        # No explicit backend requested: defer to the global option.
        backend = get_option("mode.string_storage") if storage is None else storage
        if backend not in {"python", "pyarrow"}:
            raise ValueError(
                f"Storage must be 'python' or 'pyarrow'. Got {backend} instead."
            )
        # The pyarrow backend is refused when the version flag reports pyarrow<7.
        if backend == "pyarrow" and pa_version_under7p0:
            raise ImportError(
                "pyarrow>=7.0.0 is required for PyArrow backed StringArray."
            )
        self.storage = backend

    @property
    def type(self) -> type[str]:
        """Scalar type of the elements, always :class:`str`."""
        return str

    @classmethod
    def construct_from_string(cls, string):
        """
        Build a StringDtype from its string alias.

        Parameters
        ----------
        string : str
            The dtype alias. The storage backend is taken from the alias
            itself. Accepted aliases and the storage they select are

            ==========================  ==============================================
            string                      result storage
            ==========================  ==============================================
            ``'string'``                pd.options.mode.string_storage, default python
            ``'string[python]'``        python
            ``'string[pyarrow]'``       pyarrow
            ==========================  ==============================================

        Returns
        -------
        StringDtype

        Raises
        ------
        TypeError
            If ``string`` is not a ``str`` or is not one of the accepted
            aliases.
        """
        if not isinstance(string, str):
            raise TypeError(
                f"'construct_from_string' expects a string, got {type(string)}"
            )
        # Bare alias: let __init__ pick the backend from the global option.
        if string == "string":
            return cls()
        if string == "string[python]":
            return cls(storage="python")
        if string == "string[pyarrow]":
            return cls(storage="pyarrow")
        raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'")

    # https://github.com/pandas-dev/pandas/issues/36126
    # error: Signature of "construct_array_type" incompatible with supertype
    # "ExtensionDtype"
    def construct_array_type(  # type: ignore[override]
        self,
    ) -> type_t[BaseStringArray]:
        """
        Return the array class that pairs with this dtype's storage.

        Returns
        -------
        type
            ``StringArray`` for ``"python"`` storage, ``ArrowStringArray``
            otherwise.
        """
        from pandas.core.arrays.string_arrow import ArrowStringArray

        return StringArray if self.storage == "python" else ArrowStringArray

    def __from_arrow__(
        self, array: pyarrow.Array | pyarrow.ChunkedArray
    ) -> BaseStringArray:
        """
        Build a string array from a pyarrow Array or ChunkedArray.

        With ``"pyarrow"`` storage the input is wrapped directly in an
        ``ArrowStringArray``; otherwise every chunk is converted to a
        ``StringArray`` and the pieces are concatenated.
        """
        if self.storage == "pyarrow":
            from pandas.core.arrays.string_arrow import ArrowStringArray

            return ArrowStringArray(array)

        import pyarrow

        # A plain Array is handled as a single chunk; anything else is
        # treated as a pyarrow.ChunkedArray.
        pieces = [array] if isinstance(array, pyarrow.Array) else array.chunks

        # using _from_sequence to ensure None is converted to NA
        converted = [StringArray._from_sequence(np.array(piece)) for piece in pieces]

        if not converted:
            # No chunks at all: hand back an empty array.
            return StringArray(np.array([], dtype="object"))
        return StringArray._concat_same_type(converted)
216
217
class BaseStringArray(ExtensionArray):
    """
    Common base (mixin) shared by StringArray and ArrowStringArray.
    """

    @doc(ExtensionArray.tolist)
    def tolist(self):
        # One-dimensional data: materialize through numpy and listify.
        if self.ndim <= 1:
            return list(self.to_numpy())
        # Higher-dimensional data: recurse into each element along axis 0.
        return [row.tolist() for row in self]
228
229
class StringArray(BaseStringArray, PandasArray):
    """
    Extension array for string data.

    .. warning::

       StringArray is considered experimental. The implementation and
       parts of the API may change without warning.

    Parameters
    ----------
    values : array-like
        The array of data.

        .. warning::

           Currently, this expects an object-dtype ndarray
           where the elements are Python strings
           or nan-likes (``None``, ``np.nan``, ``NA``).
           This may change without warning in the future. Use
           :meth:`pandas.array` with ``dtype="string"`` for a stable way of
           creating a `StringArray` from any sequence.

        .. versionchanged:: 1.5.0

           StringArray now accepts array-likes containing
           nan-likes (``None``, ``np.nan``) for the ``values`` parameter
           in addition to strings and :attr:`pandas.NA`

    copy : bool, default False
        Whether to copy the array of data.

    Attributes
    ----------
    None

    Methods
    -------
    None

    See Also
    --------
    :func:`pandas.array`
        The recommended function for creating a StringArray.
    Series.str
        The string methods are available on Series backed by
        a StringArray.

    Notes
    -----
    StringArray returns a BooleanArray for comparison methods.

    Examples
    --------
    >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string")
    <StringArray>
    ['This is', 'some text', <NA>, 'data.']
    Length: 4, dtype: string

    Unlike arrays instantiated with ``dtype="object"``, ``StringArray``
    will convert the values to strings.

    >>> pd.array(['1', 1], dtype="object")
    <PandasArray>
    ['1', 1]
    Length: 2, dtype: object
    >>> pd.array(['1', 1], dtype="string")
    <StringArray>
    ['1', '1']
    Length: 2, dtype: string

    However, instantiating StringArrays directly with non-strings will raise an error.

    For comparison methods, `StringArray` returns a :class:`pandas.BooleanArray`:

    >>> pd.array(["a", None, "c"], dtype="string") == "a"
    <BooleanArray>
    [True, <NA>, False]
    Length: 3, dtype: boolean
    """

    # undo the PandasArray hack
    _typ = "extension"

    def __init__(self, values, copy: bool = False) -> None:
        values = extract_array(values)

        super().__init__(values, copy=copy)
        # Input that is already an instance of this class is not re-validated.
        if not isinstance(values, type(self)):
            self._validate()
        # Re-initialize the backing store so the dtype is StringDtype("python").
        NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python"))

    def _validate(self) -> None:
        """
        Validate that we only store NA or strings.

        Nan-likes that are not ``pandas.NA`` are then normalized through
        ``lib.convert_nans_to_NA``; its return value is discarded, so the
        helper is relied upon to update the buffer it is given.

        Raises
        ------
        ValueError
            If a non-empty backing array is not a string array (NA values
            skipped), or if the backing array is not object dtype.
        """
        # NOTE: the order of these two checks decides which message is raised.
        if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True):
            raise ValueError("StringArray requires a sequence of strings or pandas.NA")
        if self._ndarray.dtype != "object":
            raise ValueError(
                "StringArray requires a sequence of strings or pandas.NA. Got "
                f"'{self._ndarray.dtype}' dtype instead."
            )
        # Check to see if need to convert Na values to pd.NA
        if self._ndarray.ndim > 2:
            # Ravel if ndims > 2 b/c no cythonized version available
            lib.convert_nans_to_NA(self._ndarray.ravel("K"))
        else:
            lib.convert_nans_to_NA(self._ndarray)

    @classmethod
    def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False):
        """
        Build a StringArray from a sequence of scalars.

        Non-missing values are converted to ``str`` and nan-likes become
        ``pandas.NA``. The instance is assembled without going through
        ``__init__``, so no validation pass is run on the result.

        Parameters
        ----------
        scalars : sequence
            Values to convert. Masked arrays are converted from their data
            and mask directly.
        dtype : Dtype, optional
            If given and not the literal ``"string"``, it must resolve to a
            python-storage ``StringDtype``.
        copy : bool, default False
            Forwarded to ``lib.ensure_string_array``.
        """
        if dtype and not (isinstance(dtype, str) and dtype == "string"):
            dtype = pandas_dtype(dtype)
            assert isinstance(dtype, StringDtype) and dtype.storage == "python"

        from pandas.core.arrays.masked import BaseMaskedArray

        if isinstance(scalars, BaseMaskedArray):
            # avoid costly conversion to object dtype
            na_values = scalars._mask
            result = scalars._data
            result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
            # Re-apply missingness from the mask after stringifying the data.
            result[na_values] = libmissing.NA

        else:
            if hasattr(scalars, "type"):
                # pyarrow array
                scalars = np.array(scalars)
            # convert non-na-likes to str, and nan-likes to StringDtype().na_value
            result = lib.ensure_string_array(scalars, na_value=libmissing.NA, copy=copy)

        # Manually creating new array avoids the validation step in the __init__, so is
        # faster. Refactor need for validation?
        new_string_array = cls.__new__(cls)
        NDArrayBacked.__init__(new_string_array, result, StringDtype(storage="python"))

        return new_string_array

    @classmethod
    def _from_sequence_of_strings(
        cls, strings, *, dtype: Dtype | None = None, copy: bool = False
    ):
        """Build a StringArray from strings; delegates to ``_from_sequence``."""
        return cls._from_sequence(strings, dtype=dtype, copy=copy)

    @classmethod
    def _empty(cls, shape, dtype) -> StringArray:
        """Return an array of the given shape filled with ``pandas.NA``."""
        values = np.empty(shape, dtype=object)
        values[:] = libmissing.NA
        return cls(values).astype(dtype, copy=False)

    def __arrow_array__(self, type=None):
        """
        Convert myself into a pyarrow Array.

        Parameters
        ----------
        type : pyarrow.DataType, optional
            Target arrow type; ``pyarrow.string()`` when omitted.
        """
        import pyarrow as pa

        if type is None:
            type = pa.string()

        # Work on a copy so the missing slots can be replaced with None
        # without touching our own buffer.
        values = self._ndarray.copy()
        values[self.isna()] = None
        return pa.array(values, type=type, from_pandas=True)

    def _values_for_factorize(self) -> tuple[np.ndarray, None]:
        """Return a copy of the values with missing slots set to ``None``."""
        arr = self._ndarray.copy()
        mask = self.isna()
        arr[mask] = None
        return arr, None

    def __setitem__(self, key, value):
        """
        Set ``self[key] = value`` after validating that ``value`` holds only
        strings or missing values; missing values are stored as ``pandas.NA``.

        Raises
        ------
        ValueError
            If ``key`` is a scalar but ``value`` is not.
        TypeError
            If ``value`` contains anything other than strings or nan-likes.
        """
        value = extract_array(value, extract_numpy=True)
        if isinstance(value, type(self)):
            # extract_array doesn't extract PandasArray subclasses
            value = value._ndarray

        key = check_array_indexer(self, key)
        scalar_key = lib.is_scalar(key)
        scalar_value = lib.is_scalar(value)
        if scalar_key and not scalar_value:
            raise ValueError("setting an array element with a sequence.")

        # validate new items
        if scalar_value:
            if isna(value):
                value = libmissing.NA
            elif not isinstance(value, str):
                raise TypeError(
                    f"Cannot set non-string value '{value}' into a StringArray."
                )
        else:
            if not is_array_like(value):
                value = np.asarray(value, dtype=object)
            if len(value) and not lib.is_string_array(value, skipna=True):
                raise TypeError("Must provide strings.")

            mask = isna(value)
            if mask.any():
                # Copy before normalizing so the caller's object is untouched;
                # the mask computed above is reused instead of rescanning.
                value = value.copy()
                value[mask] = libmissing.NA

        super().__setitem__(key, value)

    def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None:
        # the super() method NDArrayBackedExtensionArray._putmask uses
        # np.putmask which doesn't properly handle None/pd.NA, so using the
        # base class implementation that uses __setitem__
        ExtensionArray._putmask(self, mask, value)

    def astype(self, dtype, copy: bool = True):
        """
        Cast to ``dtype``.

        Casting to our own dtype returns ``self`` (or a copy when ``copy``
        is true). Masked integer/float targets and numpy floating targets
        are handled here so that missing values survive the cast; all other
        targets are delegated.
        """
        dtype = pandas_dtype(dtype)

        if is_dtype_equal(dtype, self.dtype):
            if copy:
                return self.copy()
            return self

        elif isinstance(dtype, IntegerDtype):
            # Missing slots get a placeholder 0 for the numpy cast; the mask
            # handed to IntegerArray records which slots are missing.
            arr = self._ndarray.copy()
            mask = self.isna()
            arr[mask] = 0
            values = arr.astype(dtype.numpy_dtype)
            return IntegerArray(values, mask, copy=False)
        elif isinstance(dtype, FloatingDtype):
            # Here the copy is a StringArray (not the raw ndarray), so the
            # placeholder has to be a string to pass __setitem__ validation.
            arr = self.copy()
            mask = self.isna()
            arr[mask] = "0"
            values = arr.astype(dtype.numpy_dtype)
            return FloatingArray(values, mask, copy=False)
        elif isinstance(dtype, ExtensionDtype):
            # Skip the PandasArray.astype method
            return ExtensionArray.astype(self, dtype, copy)
        elif np.issubdtype(dtype, np.floating):
            # Cast with a placeholder, then restore missing slots as NaN.
            arr = self._ndarray.copy()
            mask = self.isna()
            arr[mask] = 0
            values = arr.astype(dtype)
            values[mask] = np.nan
            return values

        return super().astype(dtype, copy)

    def _reduce(
        self, name: str, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs
    ):
        """
        Dispatch a named reduction; only ``"min"`` and ``"max"`` are supported.

        Raises
        ------
        TypeError
            For any other reduction name.
        """
        if name in ["min", "max"]:
            return getattr(self, name)(skipna=skipna, axis=axis)

        raise TypeError(f"Cannot perform reduction '{name}' with string dtype")

    def min(self, axis=None, skipna: bool = True, **kwargs) -> Scalar:
        """Return the minimum; ``axis`` is only used to wrap the result."""
        nv.validate_min((), kwargs)
        result = masked_reductions.min(
            values=self.to_numpy(), mask=self.isna(), skipna=skipna
        )
        return self._wrap_reduction_result(axis, result)

    def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar:
        """Return the maximum; ``axis`` is only used to wrap the result."""
        nv.validate_max((), kwargs)
        result = masked_reductions.max(
            values=self.to_numpy(), mask=self.isna(), skipna=skipna
        )
        return self._wrap_reduction_result(axis, result)

    def value_counts(self, dropna: bool = True) -> Series:
        """
        Count occurrences of each value.

        The counts are cast to ``"Int64"`` and the index to this array's
        dtype.
        """
        from pandas import value_counts

        result = value_counts(self._ndarray, dropna=dropna).astype("Int64")
        result.index = result.index.astype(self.dtype)
        return result

    def memory_usage(self, deep: bool = False) -> int:
        """
        Return the bytes used by the backing ndarray.

        With ``deep=True`` the size reported by
        ``lib.memory_usage_of_objects`` for the stored objects is added.
        """
        result = self._ndarray.nbytes
        if deep:
            return result + lib.memory_usage_of_objects(self._ndarray)
        return result

    @doc(ExtensionArray.searchsorted)
    def searchsorted(
        self,
        value: NumpyValueArrayLike | ExtensionArray,
        side: Literal["left", "right"] = "left",
        sorter: NumpySorter = None,
    ) -> npt.NDArray[np.intp] | np.intp:
        # Missing values are rejected up front; see the error message.
        if self._hasna:
            raise ValueError(
                "searchsorted requires array to be sorted, which is impossible "
                "with NAs present."
            )
        return super().searchsorted(value=value, side=side, sorter=sorter)

    def _cmp_method(self, other, op):
        """
        Apply a binary operator elementwise, propagating missing values.

        Also serves as ``_arith_method``. Operators whose name is in
        ``ops.ARITHMETIC_BINOPS`` produce a ``StringArray``; every other
        operator produces a ``BooleanArray``. A position is missing in the
        result when it is missing in either operand.

        Raises
        ------
        ValueError
            If ``other`` is not a scalar and its length differs from ours.
        """
        from pandas.arrays import BooleanArray

        if isinstance(other, StringArray):
            other = other._ndarray

        mask = isna(self) | isna(other)
        valid = ~mask

        if not lib.is_scalar(other):
            if len(other) != len(self):
                # prevent improper broadcasting when other is 2D
                raise ValueError(
                    f"Lengths of operands do not match: {len(self)} != {len(other)}"
                )

            # Only positions valid on both sides take part in the operation.
            other = np.asarray(other)
            other = other[valid]

        if op.__name__ in ops.ARITHMETIC_BINOPS:
            result = np.empty_like(self._ndarray, dtype="object")
            result[mask] = libmissing.NA
            result[valid] = op(self._ndarray[valid], other)
            return StringArray(result)
        else:
            # logical
            result = np.zeros(len(self._ndarray), dtype="bool")
            result[valid] = op(self._ndarray[valid], other)
            return BooleanArray(result, mask)

    _arith_method = _cmp_method

    # ------------------------------------------------------------------------
    # String methods interface
    # error: Incompatible types in assignment (expression has type "NAType",
    # base class "PandasArray" defined the type as "float")
    _str_na_value = libmissing.NA  # type: ignore[assignment]

    def _str_map(
        self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
    ):
        """
        Map ``f`` over the non-missing elements.

        Parameters
        ----------
        f : callable
            Function applied to each non-missing element.
        na_value : scalar, optional
            Value used for missing slots; defaults to this dtype's
            ``na_value``.
        dtype : Dtype, optional
            Expected result dtype; defaults to python-storage
            ``StringDtype``. Integer and boolean dtypes yield a masked
            ``IntegerArray`` / ``BooleanArray``, a non-object string dtype
            yields a ``StringArray``, and anything else yields whatever
            ``lib.map_infer_mask`` returns.
        convert : bool, default True
            Not referenced by this implementation.
        """
        from pandas.arrays import BooleanArray

        if dtype is None:
            dtype = StringDtype(storage="python")
        if na_value is None:
            na_value = self.dtype.na_value

        mask = isna(self)
        arr = np.asarray(self)

        if is_integer_dtype(dtype) or is_bool_dtype(dtype):
            constructor: type[IntegerArray] | type[BooleanArray]
            if is_integer_dtype(dtype):
                constructor = IntegerArray
            else:
                constructor = BooleanArray

            na_value_is_na = isna(na_value)
            if na_value_is_na:
                # Placeholder for masked slots; they remain masked in the
                # result because ``mask`` is passed on unchanged.
                na_value = 1
            result = lib.map_infer_mask(
                arr,
                f,
                mask.view("uint8"),
                convert=False,
                na_value=na_value,
                # error: Argument 1 to "dtype" has incompatible type
                # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected
                # "Type[object]"
                dtype=np.dtype(dtype),  # type: ignore[arg-type]
            )

            if not na_value_is_na:
                # A concrete fill value was supplied, so nothing is missing
                # in the result.
                mask[:] = False

            return constructor(result, mask)

        elif is_string_dtype(dtype) and not is_object_dtype(dtype):
            # i.e. StringDtype
            result = lib.map_infer_mask(
                arr, f, mask.view("uint8"), convert=False, na_value=na_value
            )
            return StringArray(result)
        else:
            # This is when the result type is object. We reach this when
            # -> We know the result type is truly object (e.g. .encode returns bytes
            #    or .findall returns a list).
            # -> We don't know the result type. E.g. `.get` can return anything.
            return lib.map_infer_mask(arr, f, mask.view("uint8"))