# Coverage report for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/core/arrays/string_.py

from __future__ import annotations

from typing import (
    TYPE_CHECKING,
    Literal,
)

import numpy as np

from pandas._config import get_option

from pandas._libs import (
    lib,
    missing as libmissing,
)
from pandas._libs.arrays import NDArrayBacked
from pandas._typing import (
    AxisInt,
    Dtype,
    Scalar,
    npt,
    type_t,
)
from pandas.compat import pa_version_under7p0
from pandas.compat.numpy import function as nv
from pandas.util._decorators import doc

from pandas.core.dtypes.base import (
    ExtensionDtype,
    StorageExtensionDtype,
    register_extension_dtype,
)
from pandas.core.dtypes.common import (
    is_array_like,
    is_bool_dtype,
    is_dtype_equal,
    is_integer_dtype,
    is_object_dtype,
    is_string_dtype,
    pandas_dtype,
)

from pandas.core import ops
from pandas.core.array_algos import masked_reductions
from pandas.core.arrays import (
    ExtensionArray,
    FloatingArray,
    IntegerArray,
)
from pandas.core.arrays.floating import FloatingDtype
from pandas.core.arrays.integer import IntegerDtype
from pandas.core.arrays.numpy_ import PandasArray
from pandas.core.construction import extract_array
from pandas.core.indexers import check_array_indexer
from pandas.core.missing import isna

if TYPE_CHECKING:
    import pyarrow

    from pandas._typing import (
        NumpySorter,
        NumpyValueArrayLike,
    )

    from pandas import Series

@register_extension_dtype
class StringDtype(StorageExtensionDtype):
    """
    Extension dtype for string data.

    .. warning::

       StringDtype is considered experimental. The implementation and
       parts of the API may change without warning.

    Parameters
    ----------
    storage : {"python", "pyarrow"}, optional
        If not given, the value of ``pd.options.mode.string_storage``.

    Attributes
    ----------
    None

    Methods
    -------
    None

    Examples
    --------
    >>> pd.StringDtype()
    string[python]

    >>> pd.StringDtype(storage="pyarrow")
    string[pyarrow]
    """

    name = "string"

    #: StringDtype().na_value uses pandas.NA
    @property
    def na_value(self) -> libmissing.NAType:
        """Return the missing-value sentinel for this dtype (``pandas.NA``)."""
        return libmissing.NA

    _metadata = ("storage",)

    def __init__(self, storage=None) -> None:
        """
        Resolve and validate the storage backend.

        Parameters
        ----------
        storage : {"python", "pyarrow"}, optional
            When None, the ``mode.string_storage`` option is used.

        Raises
        ------
        ValueError
            If the resolved storage is neither "python" nor "pyarrow".
        ImportError
            If "pyarrow" storage is requested while ``pa_version_under7p0``
            is set.
        """
        if storage is None:
            storage = get_option("mode.string_storage")
        if storage not in {"python", "pyarrow"}:
            raise ValueError(
                f"Storage must be 'python' or 'pyarrow'. Got {storage} instead."
            )
        if storage == "pyarrow" and pa_version_under7p0:
            raise ImportError(
                "pyarrow>=7.0.0 is required for PyArrow backed StringArray."
            )
        self.storage = storage

    @property
    def type(self) -> type[str]:
        """Return the scalar type of the elements, always ``str``."""
        return str

    @classmethod
    def construct_from_string(cls, string):
        """
        Construct a StringDtype from a string.

        Parameters
        ----------
        string : str
            The type of the name. The storage type will be taken from `string`.
            Valid options and their storage types are

            ========================== ==============================================
            string                     result storage
            ========================== ==============================================
            ``'string'``               pd.options.mode.string_storage, default python
            ``'string[python]'``       python
            ``'string[pyarrow]'``      pyarrow
            ========================== ==============================================

        Returns
        -------
        StringDtype

        Raises
        ------
        TypeError
            If the string is not a valid option.
        """
        if not isinstance(string, str):
            raise TypeError(
                f"'construct_from_string' expects a string, got {type(string)}"
            )
        if string == "string":
            return cls()
        elif string == "string[python]":
            return cls(storage="python")
        elif string == "string[pyarrow]":
            return cls(storage="pyarrow")
        else:
            raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'")

    # https://github.com/pandas-dev/pandas/issues/36126
    # error: Signature of "construct_array_type" incompatible with supertype
    # "ExtensionDtype"
    def construct_array_type(  # type: ignore[override]
        self,
    ) -> type_t[BaseStringArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
            ``StringArray`` for "python" storage, ``ArrowStringArray``
            otherwise.
        """
        # imported here rather than at module level
        from pandas.core.arrays.string_arrow import ArrowStringArray

        if self.storage == "python":
            return StringArray
        else:
            return ArrowStringArray

    def __from_arrow__(
        self, array: pyarrow.Array | pyarrow.ChunkedArray
    ) -> BaseStringArray:
        """
        Construct StringArray from pyarrow Array/ChunkedArray.
        """
        if self.storage == "pyarrow":
            from pandas.core.arrays.string_arrow import ArrowStringArray

            return ArrowStringArray(array)
        else:
            import pyarrow

            if isinstance(array, pyarrow.Array):
                chunks = [array]
            else:
                # pyarrow.ChunkedArray
                chunks = array.chunks

            # using _from_sequence to ensure None is converted to NA
            results = [StringArray._from_sequence(np.array(arr)) for arr in chunks]

        if results:
            return StringArray._concat_same_type(results)
        else:
            # no chunks at all: return an empty object-dtype backed array
            return StringArray(np.array([], dtype="object"))

216

217

class BaseStringArray(ExtensionArray):
    """
    Mixin class for StringArray, ArrowStringArray.
    """

    @doc(ExtensionArray.tolist)
    def tolist(self):
        # for more than one dimension, convert each element yielded by
        # iteration separately so the result is a nested list
        if self.ndim > 1:
            return [x.tolist() for x in self]
        return list(self.to_numpy())

228

229

class StringArray(BaseStringArray, PandasArray):
    """
    Extension array for string data.

    .. warning::

       StringArray is considered experimental. The implementation and
       parts of the API may change without warning.

    Parameters
    ----------
    values : array-like
        The array of data.

        .. warning::

           Currently, this expects an object-dtype ndarray
           where the elements are Python strings
           or nan-likes (``None``, ``np.nan``, ``NA``).
           This may change without warning in the future. Use
           :meth:`pandas.array` with ``dtype="string"`` for a stable way of
           creating a `StringArray` from any sequence.

        .. versionchanged:: 1.5.0

           StringArray now accepts array-likes containing
           nan-likes(``None``, ``np.nan``) for the ``values`` parameter
           in addition to strings and :attr:`pandas.NA`

    copy : bool, default False
        Whether to copy the array of data.

    Attributes
    ----------
    None

    Methods
    -------
    None

    See Also
    --------
    :func:`pandas.array`
        The recommended function for creating a StringArray.
    Series.str
        The string methods are available on Series backed by
        a StringArray.

    Notes
    -----
    StringArray returns a BooleanArray for comparison methods.

    Examples
    --------
    >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string")
    <StringArray>
    ['This is', 'some text', <NA>, 'data.']
    Length: 4, dtype: string

    Unlike arrays instantiated with ``dtype="object"``, ``StringArray``
    will convert the values to strings.

    >>> pd.array(['1', 1], dtype="object")
    <PandasArray>
    ['1', 1]
    Length: 2, dtype: object
    >>> pd.array(['1', 1], dtype="string")
    <StringArray>
    ['1', '1']
    Length: 2, dtype: string

    However, instantiating StringArrays directly with non-strings will raise an error.

    For comparison methods, `StringArray` returns a :class:`pandas.BooleanArray`:

    >>> pd.array(["a", None, "c"], dtype="string") == "a"
    <BooleanArray>
    [True, <NA>, False]
    Length: 3, dtype: boolean
    """

    # undo the PandasArray hack
    _typ = "extension"

    def __init__(self, values, copy: bool = False) -> None:
        values = extract_array(values)

        super().__init__(values, copy=copy)
        # validation is skipped when the input is already an instance of
        # this class
        if not isinstance(values, type(self)):
            self._validate()
        NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python"))

    def _validate(self):
        """Validate that we only store NA or strings."""
        if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True):
            raise ValueError("StringArray requires a sequence of strings or pandas.NA")
        if self._ndarray.dtype != "object":
            raise ValueError(
                "StringArray requires a sequence of strings or pandas.NA. Got "
                f"'{self._ndarray.dtype}' dtype instead."
            )
        # Check to see if need to convert Na values to pd.NA
        if self._ndarray.ndim > 2:
            # Ravel if ndims > 2 b/c no cythonized version available
            lib.convert_nans_to_NA(self._ndarray.ravel("K"))
        else:
            lib.convert_nans_to_NA(self._ndarray)

    @classmethod
    def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False):
        """
        Build a StringArray from ``scalars``, converting non-missing
        elements to ``str`` and missing ones to ``pandas.NA``.
        """
        if dtype and not (isinstance(dtype, str) and dtype == "string"):
            dtype = pandas_dtype(dtype)
            assert isinstance(dtype, StringDtype) and dtype.storage == "python"

        from pandas.core.arrays.masked import BaseMaskedArray

        if isinstance(scalars, BaseMaskedArray):
            # avoid costly conversion to object dtype
            na_values = scalars._mask
            result = scalars._data
            result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
            result[na_values] = libmissing.NA

        else:
            if hasattr(scalars, "type"):
                # pyarrow array
                scalars = np.array(scalars)
            # convert non-na-likes to str, and nan-likes to StringDtype().na_value
            result = lib.ensure_string_array(scalars, na_value=libmissing.NA, copy=copy)

        # Manually creating new array avoids the validation step in the __init__, so is
        # faster. Refactor need for validation?
        new_string_array = cls.__new__(cls)
        NDArrayBacked.__init__(new_string_array, result, StringDtype(storage="python"))

        return new_string_array

    @classmethod
    def _from_sequence_of_strings(
        cls, strings, *, dtype: Dtype | None = None, copy: bool = False
    ):
        """Delegate to ``_from_sequence`` with the same arguments."""
        return cls._from_sequence(strings, dtype=dtype, copy=copy)

    @classmethod
    def _empty(cls, shape, dtype) -> StringArray:
        """Return an array of the given shape filled with ``pandas.NA``."""
        values = np.empty(shape, dtype=object)
        values[:] = libmissing.NA
        return cls(values).astype(dtype, copy=False)

    def __arrow_array__(self, type=None):
        """
        Convert myself into a pyarrow Array.
        """
        import pyarrow as pa

        if type is None:
            type = pa.string()

        # work on a copy so the backing ndarray keeps its pandas.NA entries
        values = self._ndarray.copy()
        values[self.isna()] = None
        return pa.array(values, type=type, from_pandas=True)

    def _values_for_factorize(self):
        """Return a copy of the data with missing entries set to None, and None."""
        arr = self._ndarray.copy()
        mask = self.isna()
        arr[mask] = None
        return arr, None

    def __setitem__(self, key, value):
        """
        Set ``self[key] = value`` after checking that ``value`` holds only
        strings or missing values; missing values are stored as ``pandas.NA``.

        Raises
        ------
        ValueError
            If ``key`` is a scalar but ``value`` is not.
        TypeError
            If ``value`` contains anything other than strings or NA-likes.
        """
        value = extract_array(value, extract_numpy=True)
        if isinstance(value, type(self)):
            # extract_array doesn't extract PandasArray subclasses
            value = value._ndarray

        key = check_array_indexer(self, key)
        scalar_key = lib.is_scalar(key)
        scalar_value = lib.is_scalar(value)
        if scalar_key and not scalar_value:
            raise ValueError("setting an array element with a sequence.")

        # validate new items
        if scalar_value:
            if isna(value):
                value = libmissing.NA
            elif not isinstance(value, str):
                raise TypeError(
                    f"Cannot set non-string value '{value}' into a StringArray."
                )
        else:
            if not is_array_like(value):
                value = np.asarray(value, dtype=object)
            if len(value) and not lib.is_string_array(value, skipna=True):
                raise TypeError("Must provide strings.")

            mask = isna(value)
            if mask.any():
                # copy before normalising so the caller's data is not mutated;
                # the mask computed above is still valid for the copy
                value = value.copy()
                value[mask] = libmissing.NA

        super().__setitem__(key, value)

    def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None:
        # the super() method NDArrayBackedExtensionArray._putmask uses
        # np.putmask which doesn't properly handle None/pd.NA, so using the
        # base class implementation that uses __setitem__
        ExtensionArray._putmask(self, mask, value)

    def astype(self, dtype, copy: bool = True):
        """
        Cast to ``dtype``.

        Nullable integer and floating dtypes get a placeholder in the
        missing positions before casting, with the NA mask passed alongside
        the values. NumPy floating dtypes get ``np.nan`` in those positions.
        """
        dtype = pandas_dtype(dtype)

        if is_dtype_equal(dtype, self.dtype):
            if copy:
                return self.copy()
            return self

        elif isinstance(dtype, IntegerDtype):
            arr = self._ndarray.copy()
            mask = self.isna()
            # placeholder so the numpy cast does not see pandas.NA
            arr[mask] = 0
            values = arr.astype(dtype.numpy_dtype)
            return IntegerArray(values, mask, copy=False)
        elif isinstance(dtype, FloatingDtype):
            arr = self.copy()
            mask = self.isna()
            # arr is still a StringArray here, so the placeholder must be a str
            arr[mask] = "0"
            values = arr.astype(dtype.numpy_dtype)
            return FloatingArray(values, mask, copy=False)
        elif isinstance(dtype, ExtensionDtype):
            # Skip the PandasArray.astype method
            return ExtensionArray.astype(self, dtype, copy)
        elif np.issubdtype(dtype, np.floating):
            arr = self._ndarray.copy()
            mask = self.isna()
            arr[mask] = 0
            values = arr.astype(dtype)
            values[mask] = np.nan
            return values

        return super().astype(dtype, copy)

    def _reduce(
        self, name: str, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs
    ):
        """Dispatch "min"/"max" to the matching method; reject anything else."""
        if name in ["min", "max"]:
            return getattr(self, name)(skipna=skipna, axis=axis)

        raise TypeError(f"Cannot perform reduction '{name}' with string dtype")

    def min(self, axis=None, skipna: bool = True, **kwargs) -> Scalar:
        """Return the minimum via ``masked_reductions.min`` using the NA mask."""
        nv.validate_min((), kwargs)
        result = masked_reductions.min(
            values=self.to_numpy(), mask=self.isna(), skipna=skipna
        )
        return self._wrap_reduction_result(axis, result)

    def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar:
        """Return the maximum via ``masked_reductions.max`` using the NA mask."""
        nv.validate_max((), kwargs)
        result = masked_reductions.max(
            values=self.to_numpy(), mask=self.isna(), skipna=skipna
        )
        return self._wrap_reduction_result(axis, result)

    def value_counts(self, dropna: bool = True) -> Series:
        """Count values; counts are cast to "Int64" and the index to this dtype."""
        from pandas import value_counts

        result = value_counts(self._ndarray, dropna=dropna).astype("Int64")
        result.index = result.index.astype(self.dtype)
        return result

    def memory_usage(self, deep: bool = False) -> int:
        """
        Return ``nbytes`` of the backing ndarray; with ``deep=True`` also add
        ``lib.memory_usage_of_objects`` for the stored objects.
        """
        result = self._ndarray.nbytes
        if deep:
            return result + lib.memory_usage_of_objects(self._ndarray)
        return result

    @doc(ExtensionArray.searchsorted)
    def searchsorted(
        self,
        value: NumpyValueArrayLike | ExtensionArray,
        side: Literal["left", "right"] = "left",
        sorter: NumpySorter = None,
    ) -> npt.NDArray[np.intp] | np.intp:
        if self._hasna:
            raise ValueError(
                "searchsorted requires array to be sorted, which is impossible "
                "with NAs present."
            )
        return super().searchsorted(value=value, side=side, sorter=sorter)

    def _cmp_method(self, other, op):
        """
        Apply ``op`` element-wise on the positions where neither operand is
        missing.

        Arithmetic ops return a StringArray with NA in the masked positions;
        all other ops return a BooleanArray carrying the mask.
        """
        from pandas.arrays import BooleanArray

        if isinstance(other, StringArray):
            other = other._ndarray

        mask = isna(self) | isna(other)
        valid = ~mask

        if not lib.is_scalar(other):
            if len(other) != len(self):
                # prevent improper broadcasting when other is 2D
                raise ValueError(
                    f"Lengths of operands do not match: {len(self)} != {len(other)}"
                )

            other = np.asarray(other)
            other = other[valid]

        if op.__name__ in ops.ARITHMETIC_BINOPS:
            result = np.empty_like(self._ndarray, dtype="object")
            result[mask] = libmissing.NA
            result[valid] = op(self._ndarray[valid], other)
            return StringArray(result)
        else:
            # logical
            result = np.zeros(len(self._ndarray), dtype="bool")
            result[valid] = op(self._ndarray[valid], other)
            return BooleanArray(result, mask)

    _arith_method = _cmp_method

    # ------------------------------------------------------------------------
    # String methods interface
    # error: Incompatible types in assignment (expression has type "NAType",
    # base class "PandasArray" defined the type as "float")
    _str_na_value = libmissing.NA  # type: ignore[assignment]

    def _str_map(
        self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
    ):
        """
        Map ``f`` over the non-missing elements via ``lib.map_infer_mask``.

        The result container depends on ``dtype``: IntegerArray or
        BooleanArray for integer/bool dtypes, StringArray for string dtypes,
        and the raw ``lib.map_infer_mask`` result otherwise. ``dtype``
        defaults to ``StringDtype(storage="python")`` and ``na_value`` to
        this dtype's NA value.
        """
        from pandas.arrays import BooleanArray

        if dtype is None:
            dtype = StringDtype(storage="python")
        if na_value is None:
            na_value = self.dtype.na_value

        mask = isna(self)
        arr = np.asarray(self)

        if is_integer_dtype(dtype) or is_bool_dtype(dtype):
            constructor: type[IntegerArray] | type[BooleanArray]
            if is_integer_dtype(dtype):
                constructor = IntegerArray
            else:
                constructor = BooleanArray

            na_value_is_na = isna(na_value)
            if na_value_is_na:
                # placeholder for the masked slots; the mask passed to the
                # constructor below keeps them missing
                na_value = 1
            result = lib.map_infer_mask(
                arr,
                f,
                mask.view("uint8"),
                convert=False,
                na_value=na_value,
                # error: Argument 1 to "dtype" has incompatible type
                # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected
                # "Type[object]"
                dtype=np.dtype(dtype),  # type: ignore[arg-type]
            )

            if not na_value_is_na:
                # a concrete na_value was filled in, so nothing stays masked
                mask[:] = False

            return constructor(result, mask)

        elif is_string_dtype(dtype) and not is_object_dtype(dtype):
            # i.e. StringDtype
            result = lib.map_infer_mask(
                arr, f, mask.view("uint8"), convert=False, na_value=na_value
            )
            return StringArray(result)
        else:
            # This is when the result type is object. We reach this when
            # -> We know the result type is truly object (e.g. .encode returns bytes
            #    or .findall returns a list).
            # -> We don't know the result type. E.g. `.get` can return anything.
            return lib.map_infer_mask(arr, f, mask.view("uint8"))

# Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/core/arrays/string_.py: 25% (257 statements)