from __future__ import annotations

from typing import (
    TYPE_CHECKING,
    ClassVar,
    Literal,
)

import numpy as np

from pandas._config import get_option

from pandas._libs import (
    lib,
    missing as libmissing,
)
from pandas._libs.arrays import NDArrayBacked
from pandas._libs.lib import ensure_string_array
from pandas.compat import pa_version_under10p1
from pandas.compat.numpy import function as nv
from pandas.util._decorators import doc

from pandas.core.dtypes.base import (
    ExtensionDtype,
    StorageExtensionDtype,
    register_extension_dtype,
)
from pandas.core.dtypes.common import (
    is_array_like,
    is_bool_dtype,
    is_integer_dtype,
    is_object_dtype,
    is_string_dtype,
    pandas_dtype,
)
from pandas.core.dtypes.missing import isna

from pandas.core import ops
from pandas.core.array_algos import masked_reductions
from pandas.core.arrays.base import ExtensionArray
from pandas.core.arrays.floating import (
    FloatingArray,
    FloatingDtype,
)
from pandas.core.arrays.integer import (
    IntegerArray,
    IntegerDtype,
)
from pandas.core.arrays.numpy_ import NumpyExtensionArray
from pandas.core.construction import extract_array
from pandas.core.indexers import check_array_indexer

if TYPE_CHECKING:
    import pyarrow

    from pandas._typing import (
        AxisInt,
        Dtype,
        DtypeObj,
        NumpySorter,
        NumpyValueArrayLike,
        Scalar,
        Self,
        npt,
        type_t,
    )

    from pandas import Series


@register_extension_dtype
class StringDtype(StorageExtensionDtype):
    """
    Extension dtype for string data.

    .. warning::

       StringDtype is considered experimental. The implementation and
       parts of the API may change without warning.

    Parameters
    ----------
    storage : {"python", "pyarrow", "pyarrow_numpy"}, optional
        If not given, the value of ``pd.options.mode.string_storage``.

    Attributes
    ----------
    None

    Methods
    -------
    None

    Examples
    --------
    >>> pd.StringDtype()
    string[python]

    >>> pd.StringDtype(storage="pyarrow")
    string[pyarrow]
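
    The PyArrow-backed variant with NumPy semantics is requested the same way:

    >>> pd.StringDtype(storage="pyarrow_numpy")
    string[pyarrow_numpy]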
    """

    # error: Cannot override instance variable (previously declared on
    # base class "StorageExtensionDtype") with class variable
    name: ClassVar[str] = "string"  # type: ignore[misc]

    #: StringDtype().na_value uses pandas.NA except the implementation that
    # follows NumPy semantics, which uses nan.
    @property
    def na_value(self) -> libmissing.NAType | float:  # type: ignore[override]
        if self.storage == "pyarrow_numpy":
            return np.nan
        else:
            return libmissing.NA

    _metadata = ("storage",)

    def __init__(self, storage=None) -> None:
        if storage is None:
            infer_string = get_option("future.infer_string")
            if infer_string:
                storage = "pyarrow_numpy"
            else:
                storage = get_option("mode.string_storage")
        if storage not in {"python", "pyarrow", "pyarrow_numpy"}:
            raise ValueError(
                f"Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'. "
                f"Got {storage} instead."
            )
        if storage in ("pyarrow", "pyarrow_numpy") and pa_version_under10p1:
            raise ImportError(
                "pyarrow>=10.0.1 is required for PyArrow backed StringArray."
            )
        self.storage = storage

    @property
    def type(self) -> type[str]:
        return str

    @classmethod
    def construct_from_string(cls, string) -> Self:
        """
        Construct a StringDtype from a string.

        Parameters
        ----------
        string : str
            The type of the name. The storage type will be taken from ``string``.
            Valid options and their storage types are

            ============================== ==============================================
            string                         result storage
            ============================== ==============================================
            ``'string'``                   pd.options.mode.string_storage, default python
            ``'string[python]'``           python
            ``'string[pyarrow]'``          pyarrow
            ``'string[pyarrow_numpy]'``    pyarrow_numpy
            ============================== ==============================================

        Returns
        -------
        StringDtype

        Raises
        ------
        TypeError
            If the string is not a valid option.
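
        Examples
        --------
        >>> pd.StringDtype.construct_from_string("string[pyarrow]")
        string[pyarrow]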
        """
        if not isinstance(string, str):
            raise TypeError(
                f"'construct_from_string' expects a string, got {type(string)}"
            )
        if string == "string":
            return cls()
        elif string == "string[python]":
            return cls(storage="python")
        elif string == "string[pyarrow]":
            return cls(storage="pyarrow")
        elif string == "string[pyarrow_numpy]":
            return cls(storage="pyarrow_numpy")
        else:
            raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'")

    # https://github.com/pandas-dev/pandas/issues/36126
    # error: Signature of "construct_array_type" incompatible with supertype
    # "ExtensionDtype"
    def construct_array_type(  # type: ignore[override]
        self,
    ) -> type_t[BaseStringArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        from pandas.core.arrays.string_arrow import (
            ArrowStringArray,
            ArrowStringArrayNumpySemantics,
        )

        if self.storage == "python":
            return StringArray
        elif self.storage == "pyarrow":
            return ArrowStringArray
        else:
            return ArrowStringArrayNumpySemantics

    def __from_arrow__(
        self, array: pyarrow.Array | pyarrow.ChunkedArray
    ) -> BaseStringArray:
        """
        Construct StringArray from pyarrow Array/ChunkedArray.
        """
        if self.storage == "pyarrow":
            from pandas.core.arrays.string_arrow import ArrowStringArray

            return ArrowStringArray(array)
        elif self.storage == "pyarrow_numpy":
            from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics

            return ArrowStringArrayNumpySemantics(array)
        else:
            import pyarrow

            if isinstance(array, pyarrow.Array):
                chunks = [array]
            else:
                # pyarrow.ChunkedArray
                chunks = array.chunks

            results = []
            for arr in chunks:
                # convert chunk by chunk to numpy and then concatenate, to avoid
                # overflow for large string data when concatenating the pyarrow arrays
                arr = arr.to_numpy(zero_copy_only=False)
                arr = ensure_string_array(arr, na_value=libmissing.NA)
                results.append(arr)

            if len(chunks) == 0:
                arr = np.array([], dtype=object)
            else:
                arr = np.concatenate(results)

            # Bypass validation inside StringArray constructor, see GH#47781
            new_string_array = StringArray.__new__(StringArray)
            NDArrayBacked.__init__(
                new_string_array,
                arr,
                StringDtype(storage="python"),
            )
            return new_string_array


class BaseStringArray(ExtensionArray):
    """
    Mixin class for StringArray, ArrowStringArray.
    """

    @doc(ExtensionArray.tolist)
    def tolist(self):
        if self.ndim > 1:
            return [x.tolist() for x in self]
        return list(self.to_numpy())

    @classmethod
    def _from_scalars(cls, scalars, dtype: DtypeObj) -> Self:
        if lib.infer_dtype(scalars, skipna=True) not in ["string", "empty"]:
            # TODO: require any NAs be valid-for-string
            raise ValueError
        return cls._from_sequence(scalars, dtype=dtype)


# error: Definition of "_concat_same_type" in base class "NDArrayBacked" is
# incompatible with definition in base class "ExtensionArray"
class StringArray(BaseStringArray, NumpyExtensionArray):  # type: ignore[misc]
    """
    Extension array for string data.

    .. warning::

       StringArray is considered experimental. The implementation and
       parts of the API may change without warning.

    Parameters
    ----------
    values : array-like
        The array of data.

        .. warning::

           Currently, this expects an object-dtype ndarray
           where the elements are Python strings
           or nan-likes (``None``, ``np.nan``, ``NA``).
           This may change without warning in the future. Use
           :meth:`pandas.array` with ``dtype="string"`` for a stable way of
           creating a `StringArray` from any sequence.

    .. versionchanged:: 1.5.0

       StringArray now accepts array-likes containing
       nan-likes (``None``, ``np.nan``) for the ``values`` parameter
       in addition to strings and :attr:`pandas.NA`.

    copy : bool, default False
        Whether to copy the array of data.

    Attributes
    ----------
    None

    Methods
    -------
    None

    See Also
    --------
    :func:`pandas.array`
        The recommended function for creating a StringArray.
    Series.str
        The string methods are available on Series backed by
        a StringArray.

    Notes
    -----
    StringArray returns a BooleanArray for comparison methods.

    Examples
    --------
    >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string")
    <StringArray>
    ['This is', 'some text', <NA>, 'data.']
    Length: 4, dtype: string

    Unlike arrays instantiated with ``dtype="object"``, ``StringArray``
    will convert the values to strings.

    >>> pd.array(['1', 1], dtype="object")
    <NumpyExtensionArray>
    ['1', 1]
    Length: 2, dtype: object
    >>> pd.array(['1', 1], dtype="string")
    <StringArray>
    ['1', '1']
    Length: 2, dtype: string

    However, instantiating StringArrays directly with non-strings will raise an error.

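    >>> pd.arrays.StringArray(np.array(['1', 1], dtype="object"))
    Traceback (most recent call last):
    ...
    ValueError: StringArray requires a sequence of strings or pandas.NA
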
    For comparison methods, `StringArray` returns a :class:`pandas.BooleanArray`:

    >>> pd.array(["a", None, "c"], dtype="string") == "a"
    <BooleanArray>
    [True, <NA>, False]
    Length: 3, dtype: boolean
    """

    # undo the NumpyExtensionArray hack
    _typ = "extension"

    def __init__(self, values, copy: bool = False) -> None:
        values = extract_array(values)

        super().__init__(values, copy=copy)
        if not isinstance(values, type(self)):
            self._validate()
        NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python"))

    def _validate(self):
        """Validate that we only store NA or strings."""
        if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True):
            raise ValueError("StringArray requires a sequence of strings or pandas.NA")
        if self._ndarray.dtype != "object":
            raise ValueError(
                "StringArray requires a sequence of strings or pandas.NA. Got "
                f"'{self._ndarray.dtype}' dtype instead."
            )
        # Check to see if we need to convert NA values to pd.NA
        if self._ndarray.ndim > 2:
            # Ravel if ndims > 2 b/c no cythonized version available
            lib.convert_nans_to_NA(self._ndarray.ravel("K"))
        else:
            lib.convert_nans_to_NA(self._ndarray)

    @classmethod
    def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False):
        if dtype and not (isinstance(dtype, str) and dtype == "string"):
            dtype = pandas_dtype(dtype)
            assert isinstance(dtype, StringDtype) and dtype.storage == "python"

        from pandas.core.arrays.masked import BaseMaskedArray

        if isinstance(scalars, BaseMaskedArray):
            # avoid costly conversion to object dtype
            na_values = scalars._mask
            result = scalars._data
            result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
            result[na_values] = libmissing.NA

        else:
            if lib.is_pyarrow_array(scalars):
                # pyarrow array; we cannot rely on the "to_numpy" check in
                # ensure_string_array because calling scalars.to_numpy would set
                # zero_copy_only to True which caused problems see GH#52076
                scalars = np.array(scalars)
            # convert non-na-likes to str, and nan-likes to StringDtype().na_value
            result = lib.ensure_string_array(scalars, na_value=libmissing.NA, copy=copy)

        # Manually creating the new array avoids the validation step in __init__, so
        # it is faster. Refactor to remove the need for validation?
        new_string_array = cls.__new__(cls)
        NDArrayBacked.__init__(new_string_array, result, StringDtype(storage="python"))

        return new_string_array

    @classmethod
    def _from_sequence_of_strings(
        cls, strings, *, dtype: Dtype | None = None, copy: bool = False
    ):
        return cls._from_sequence(strings, dtype=dtype, copy=copy)

    @classmethod
    def _empty(cls, shape, dtype) -> StringArray:
        values = np.empty(shape, dtype=object)
        values[:] = libmissing.NA
        return cls(values).astype(dtype, copy=False)

    def __arrow_array__(self, type=None):
        """
        Convert myself into a pyarrow Array.
        """
        import pyarrow as pa

        if type is None:
            type = pa.string()

        values = self._ndarray.copy()
        values[self.isna()] = None
        return pa.array(values, type=type, from_pandas=True)

    def _values_for_factorize(self):
        arr = self._ndarray.copy()
        mask = self.isna()
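        # use None (not pd.NA) as the sentinel in the object array; it is also
        # returned as the na_value that factorize should treat as missing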
        arr[mask] = None
        return arr, None

    def __setitem__(self, key, value) -> None:
        value = extract_array(value, extract_numpy=True)
        if isinstance(value, type(self)):
            # extract_array doesn't extract NumpyExtensionArray subclasses
            value = value._ndarray

        key = check_array_indexer(self, key)
        scalar_key = lib.is_scalar(key)
        scalar_value = lib.is_scalar(value)
        if scalar_key and not scalar_value:
            raise ValueError("setting an array element with a sequence.")

        # validate new items
        if scalar_value:
            if isna(value):
                value = libmissing.NA
            elif not isinstance(value, str):
                raise TypeError(
                    f"Cannot set non-string value '{value}' into a StringArray."
                )
        else:
            if not is_array_like(value):
                value = np.asarray(value, dtype=object)
            if len(value) and not lib.is_string_array(value, skipna=True):
                raise TypeError("Must provide strings.")

            mask = isna(value)
            if mask.any():
                value = value.copy()
                value[isna(value)] = libmissing.NA

        super().__setitem__(key, value)

    def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None:
        # the super() method NDArrayBackedExtensionArray._putmask uses
        # np.putmask which doesn't properly handle None/pd.NA, so using the
        # base class implementation that uses __setitem__
        ExtensionArray._putmask(self, mask, value)

    def astype(self, dtype, copy: bool = True):
        dtype = pandas_dtype(dtype)

        if dtype == self.dtype:
            if copy:
                return self.copy()
            return self

        elif isinstance(dtype, IntegerDtype):
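            # fill NA positions with a throwaway 0 so the cast to the numpy
            # integer dtype succeeds; ``mask`` records which entries are missing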
            arr = self._ndarray.copy()
            mask = self.isna()
            arr[mask] = 0
            values = arr.astype(dtype.numpy_dtype)
            return IntegerArray(values, mask, copy=False)
        elif isinstance(dtype, FloatingDtype):
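            # same idea as the integer path, but working on a StringArray copy,
            # so the placeholder has to be the string "0"; ``mask`` marks the
            # missing entries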
            arr = self.copy()
            mask = self.isna()
            arr[mask] = "0"
            values = arr.astype(dtype.numpy_dtype)
            return FloatingArray(values, mask, copy=False)
        elif isinstance(dtype, ExtensionDtype):
            # Skip the NumpyExtensionArray.astype method
            return ExtensionArray.astype(self, dtype, copy)
        elif np.issubdtype(dtype, np.floating):
            arr = self._ndarray.copy()
            mask = self.isna()
            arr[mask] = 0
            values = arr.astype(dtype)
            values[mask] = np.nan
            return values

        return super().astype(dtype, copy)

    def _reduce(
        self, name: str, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs
    ):
        if name in ["min", "max"]:
            return getattr(self, name)(skipna=skipna, axis=axis)

        raise TypeError(f"Cannot perform reduction '{name}' with string dtype")

    def min(self, axis=None, skipna: bool = True, **kwargs) -> Scalar:
        nv.validate_min((), kwargs)
        result = masked_reductions.min(
            values=self.to_numpy(), mask=self.isna(), skipna=skipna
        )
        return self._wrap_reduction_result(axis, result)

    def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar:
        nv.validate_max((), kwargs)
        result = masked_reductions.max(
            values=self.to_numpy(), mask=self.isna(), skipna=skipna
        )
        return self._wrap_reduction_result(axis, result)

    def value_counts(self, dropna: bool = True) -> Series:
        from pandas.core.algorithms import value_counts_internal as value_counts

        result = value_counts(self._ndarray, dropna=dropna).astype("Int64")
        result.index = result.index.astype(self.dtype)
        return result

    def memory_usage(self, deep: bool = False) -> int:
        result = self._ndarray.nbytes
        if deep:
            return result + lib.memory_usage_of_objects(self._ndarray)
        return result

    @doc(ExtensionArray.searchsorted)
    def searchsorted(
        self,
        value: NumpyValueArrayLike | ExtensionArray,
        side: Literal["left", "right"] = "left",
        sorter: NumpySorter | None = None,
    ) -> npt.NDArray[np.intp] | np.intp:
        if self._hasna:
            raise ValueError(
                "searchsorted requires array to be sorted, which is impossible "
                "with NAs present."
            )
        return super().searchsorted(value=value, side=side, sorter=sorter)

    def _cmp_method(self, other, op):
        from pandas.arrays import BooleanArray

        if isinstance(other, StringArray):
            other = other._ndarray

        mask = isna(self) | isna(other)
        valid = ~mask

        if not lib.is_scalar(other):
            if len(other) != len(self):
                # prevent improper broadcasting when other is 2D
                raise ValueError(
                    f"Lengths of operands do not match: {len(self)} != {len(other)}"
                )

            other = np.asarray(other)
            other = other[valid]

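        # arithmetic binops (e.g. "+" for concatenation) produce a new StringArray,
        # while comparison ops return a BooleanArray that carries the NA mask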
        if op.__name__ in ops.ARITHMETIC_BINOPS:
            result = np.empty_like(self._ndarray, dtype="object")
            result[mask] = libmissing.NA
            result[valid] = op(self._ndarray[valid], other)
            return StringArray(result)
        else:
            # logical
            result = np.zeros(len(self._ndarray), dtype="bool")
            result[valid] = op(self._ndarray[valid], other)
            return BooleanArray(result, mask)

    _arith_method = _cmp_method

    # ------------------------------------------------------------------------
    # String methods interface
    # error: Incompatible types in assignment (expression has type "NAType",
    # base class "NumpyExtensionArray" defined the type as "float")
    _str_na_value = libmissing.NA  # type: ignore[assignment]

    def _str_map(
        self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
    ):
        from pandas.arrays import BooleanArray

        if dtype is None:
            dtype = StringDtype(storage="python")
        if na_value is None:
            na_value = self.dtype.na_value

        mask = isna(self)
        arr = np.asarray(self)

        if is_integer_dtype(dtype) or is_bool_dtype(dtype):
            constructor: type[IntegerArray | BooleanArray]
            if is_integer_dtype(dtype):
                constructor = IntegerArray
            else:
                constructor = BooleanArray

            na_value_is_na = isna(na_value)
            if na_value_is_na:
                na_value = 1
            elif dtype == np.dtype("bool"):
                na_value = bool(na_value)
            result = lib.map_infer_mask(
                arr,
                f,
                mask.view("uint8"),
                convert=False,
                na_value=na_value,
                # error: Argument 1 to "dtype" has incompatible type
                # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected
                # "Type[object]"
                dtype=np.dtype(dtype),  # type: ignore[arg-type]
            )

            if not na_value_is_na:
                mask[:] = False

            return constructor(result, mask)

        elif is_string_dtype(dtype) and not is_object_dtype(dtype):
            # i.e. StringDtype
            result = lib.map_infer_mask(
                arr, f, mask.view("uint8"), convert=False, na_value=na_value
            )
            return StringArray(result)
        else:
            # This is when the result type is object. We reach this when
            # -> We know the result type is truly object (e.g. .encode returns bytes
            #    or .findall returns a list).
            # -> We don't know the result type. E.g. `.get` can return anything.
            return lib.map_infer_mask(arr, f, mask.view("uint8"))