from __future__ import annotations

from collections import defaultdict
from copy import copy
import csv
import datetime
from enum import Enum
import itertools
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    cast,
    final,
    overload,
)
import warnings

import numpy as np

from pandas._libs import (
    lib,
    parsers,
)
import pandas._libs.ops as libops
from pandas._libs.parsers import STR_NA_VALUES
from pandas._libs.tslibs import parsing
from pandas.compat._optional import import_optional_dependency
from pandas.errors import (
    ParserError,
    ParserWarning,
)
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.astype import astype_array
from pandas.core.dtypes.common import (
    ensure_object,
    is_bool_dtype,
    is_dict_like,
    is_extension_array_dtype,
    is_float_dtype,
    is_integer,
    is_integer_dtype,
    is_list_like,
    is_object_dtype,
    is_scalar,
    is_string_dtype,
    pandas_dtype,
)
from pandas.core.dtypes.dtypes import (
    CategoricalDtype,
    ExtensionDtype,
)
from pandas.core.dtypes.missing import isna

from pandas import (
    ArrowDtype,
    DataFrame,
    DatetimeIndex,
    StringDtype,
    concat,
)
from pandas.core import algorithms
from pandas.core.arrays import (
    ArrowExtensionArray,
    BaseMaskedArray,
    BooleanArray,
    Categorical,
    ExtensionArray,
    FloatingArray,
    IntegerArray,
)
from pandas.core.arrays.boolean import BooleanDtype
from pandas.core.indexes.api import (
    Index,
    MultiIndex,
    default_index,
    ensure_index_from_sequences,
)
from pandas.core.series import Series
from pandas.core.tools import datetimes as tools

from pandas.io.common import is_potential_multi_index

if TYPE_CHECKING:
    from collections.abc import (
        Hashable,
        Iterable,
        Mapping,
        Sequence,
    )

    from pandas._typing import (
        ArrayLike,
        DtypeArg,
        DtypeObj,
        Scalar,
    )


class ParserBase:
    class BadLineHandleMethod(Enum):
        ERROR = 0
        WARN = 1
        SKIP = 2

    _implicit_index: bool
    _first_chunk: bool
    keep_default_na: bool
    dayfirst: bool
    cache_dates: bool
    keep_date_col: bool
    usecols_dtype: str | None

    def __init__(self, kwds) -> None:
        self._implicit_index = False

        self.names = kwds.get("names")
        self.orig_names: Sequence[Hashable] | None = None

        self.index_col = kwds.get("index_col", None)
        self.unnamed_cols: set = set()
        self.index_names: Sequence[Hashable] | None = None
        self.col_names: Sequence[Hashable] | None = None

        self.parse_dates = _validate_parse_dates_arg(kwds.pop("parse_dates", False))
        self._parse_date_cols: Iterable = []
        self.date_parser = kwds.pop("date_parser", lib.no_default)
        self.date_format = kwds.pop("date_format", None)
        self.dayfirst = kwds.pop("dayfirst", False)
        self.keep_date_col = kwds.pop("keep_date_col", False)

        self.na_values = kwds.get("na_values")
        self.na_fvalues = kwds.get("na_fvalues")
        self.na_filter = kwds.get("na_filter", False)
        self.keep_default_na = kwds.get("keep_default_na", True)

        self.dtype = copy(kwds.get("dtype", None))
        self.converters = kwds.get("converters")
        self.dtype_backend = kwds.get("dtype_backend")

        self.true_values = kwds.get("true_values")
        self.false_values = kwds.get("false_values")
        self.cache_dates = kwds.pop("cache_dates", True)

        self._date_conv = _make_date_converter(
            date_parser=self.date_parser,
            date_format=self.date_format,
            dayfirst=self.dayfirst,
            cache_dates=self.cache_dates,
        )

        # validate header options for mi
        self.header = kwds.get("header")
        if is_list_like(self.header, allow_sets=False):
            if kwds.get("usecols"):
                raise ValueError(
                    "cannot specify usecols when specifying a multi-index header"
                )
            if kwds.get("names"):
                raise ValueError(
                    "cannot specify names when specifying a multi-index header"
                )

        # validate index_col that only contains integers
        if self.index_col is not None:
            # In this case we can pin down index_col as list[int]
            if is_integer(self.index_col):
                self.index_col = [self.index_col]
            elif not (
                is_list_like(self.index_col, allow_sets=False)
                and all(map(is_integer, self.index_col))
            ):
                raise ValueError(
                    "index_col must only contain row numbers "
                    "when specifying a multi-index header"
                )
            else:
                self.index_col = list(self.index_col)

        self._name_processed = False

        self._first_chunk = True

        self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"])

        # Fallback to error to pass a sketchy test (test_override_set_noconvert_columns)
        # Normally, this arg would get pre-processed earlier on
        self.on_bad_lines = kwds.get("on_bad_lines", self.BadLineHandleMethod.ERROR)

    def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> Iterable:
        """
        Check if parse_dates are in columns.

        If user has provided names for parse_dates, check if those columns
        are available.

        Parameters
        ----------
        columns : list
            List of names of the dataframe.

        Returns
        -------
        The names of the columns which will get parsed later if a dict or list
        is given as specification.

        Raises
        ------
        ValueError
            If a column named in parse_dates is not in the dataframe.

        """
        cols_needed: Iterable
        if is_dict_like(self.parse_dates):
            cols_needed = itertools.chain(*self.parse_dates.values())
        elif is_list_like(self.parse_dates):
            # a column in parse_dates could be represented
            # ColReference = Union[int, str]
            # DateGroups = List[ColReference]
            # ParseDates = Union[DateGroups, List[DateGroups],
            #     Dict[ColReference, DateGroups]]
            cols_needed = itertools.chain.from_iterable(
                col if is_list_like(col) and not isinstance(col, tuple) else [col]
                for col in self.parse_dates
            )
        else:
            cols_needed = []

        cols_needed = list(cols_needed)

        # get only columns that are references using names (str), not by index
        missing_cols = ", ".join(
            sorted(
                {
                    col
                    for col in cols_needed
                    if isinstance(col, str) and col not in columns
                }
            )
        )
        if missing_cols:
            raise ValueError(
                f"Missing column provided to 'parse_dates': '{missing_cols}'"
            )
        # Convert positions to actual column names
        return [
            col if (isinstance(col, str) or col in columns) else columns[col]
            for col in cols_needed
        ]
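
    # Illustrative sketch (not part of the pandas API; column names are
    # hypothetical): the three accepted shapes of `parse_dates` that the
    # method above walks over. Column references may be positions (int)
    # or names (str).
    #
    #     parse_dates=["date", 3]                # parse each listed column
    #     parse_dates=[["date", "time"]]         # combine two columns into one
    #     parse_dates={"ts": ["date", "time"]}   # combine and name the result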

    def close(self) -> None:
        pass

    @final
    @property
    def _has_complex_date_col(self) -> bool:
        return isinstance(self.parse_dates, dict) or (
            isinstance(self.parse_dates, list)
            and len(self.parse_dates) > 0
            and isinstance(self.parse_dates[0], list)
        )

    @final
    def _should_parse_dates(self, i: int) -> bool:
        if lib.is_bool(self.parse_dates):
            return bool(self.parse_dates)
        else:
            if self.index_names is not None:
                name = self.index_names[i]
            else:
                name = None
            j = i if self.index_col is None else self.index_col[i]

            return (j in self.parse_dates) or (
                name is not None and name in self.parse_dates
            )

    @final
    def _extract_multi_indexer_columns(
        self,
        header,
        index_names: Sequence[Hashable] | None,
        passed_names: bool = False,
    ) -> tuple[
        Sequence[Hashable], Sequence[Hashable] | None, Sequence[Hashable] | None, bool
    ]:
        """
        Extract and return the names, index_names, col_names if the column
        names are a MultiIndex.

        Parameters
        ----------
        header: list of lists
            The header rows
        index_names: list, optional
            The names of the future index
        passed_names: bool, default False
            A flag specifying if names were passed

        """
        if len(header) < 2:
            return header[0], index_names, None, passed_names

        # the names are the tuples of the header that are not the index cols
        # 0 is the name of the index, assuming index_col is a list of column
        # numbers
        ic = self.index_col
        if ic is None:
            ic = []

        if not isinstance(ic, (list, tuple, np.ndarray)):
            ic = [ic]
        sic = set(ic)

        # clean the index_names
        index_names = header.pop(-1)
        index_names, _, _ = self._clean_index_names(index_names, self.index_col)

        # extract the columns
        field_count = len(header[0])

        # check if header lengths are equal
        if not all(len(header_iter) == field_count for header_iter in header[1:]):
            raise ParserError("Header rows must have an equal number of columns.")

        def extract(r):
            return tuple(r[i] for i in range(field_count) if i not in sic)

        columns = list(zip(*(extract(r) for r in header)))
        names = columns.copy()
        for single_ic in sorted(ic):
            names.insert(single_ic, single_ic)

        # Clean the column names (if we have an index_col).
        if len(ic):
            col_names = [
                r[ic[0]]
                if ((r[ic[0]] is not None) and r[ic[0]] not in self.unnamed_cols)
                else None
                for r in header
            ]
        else:
            col_names = [None] * len(header)

        passed_names = True

        return names, index_names, col_names, passed_names

    @final
    def _maybe_make_multi_index_columns(
        self,
        columns: Sequence[Hashable],
        col_names: Sequence[Hashable] | None = None,
    ) -> Sequence[Hashable] | MultiIndex:
        # possibly create a column mi here
        if is_potential_multi_index(columns):
            list_columns = cast(list[tuple], columns)
            return MultiIndex.from_tuples(list_columns, names=col_names)
        return columns
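
    # Illustrative sketch (hypothetical labels): when every column label is a
    # tuple, e.g. after reading two header rows, the labels are promoted to a
    # MultiIndex:
    #
    #     columns = [("a", "one"), ("a", "two"), ("b", "one")]
    #     parser._maybe_make_multi_index_columns(columns, col_names=["l0", "l1"])
    #     # -> MultiIndex with levels named "l0" and "l1"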

    @final
    def _make_index(
        self, data, alldata, columns, indexnamerow: list[Scalar] | None = None
    ) -> tuple[Index | None, Sequence[Hashable] | MultiIndex]:
        index: Index | None
        if not is_index_col(self.index_col) or not self.index_col:
            index = None

        elif not self._has_complex_date_col:
            simple_index = self._get_simple_index(alldata, columns)
            index = self._agg_index(simple_index)
        elif self._has_complex_date_col:
            if not self._name_processed:
                (self.index_names, _, self.index_col) = self._clean_index_names(
                    list(columns), self.index_col
                )
                self._name_processed = True
            date_index = self._get_complex_date_index(data, columns)
            index = self._agg_index(date_index, try_parse_dates=False)

        # add names for the index
        if indexnamerow:
            coffset = len(indexnamerow) - len(columns)
            assert index is not None
            index = index.set_names(indexnamerow[:coffset])

        # maybe create a mi on the columns
        columns = self._maybe_make_multi_index_columns(columns, self.col_names)

        return index, columns

    @final
    def _get_simple_index(self, data, columns):
        def ix(col):
            if not isinstance(col, str):
                return col
            raise ValueError(f"Index {col} invalid")

        to_remove = []
        index = []
        for idx in self.index_col:
            i = ix(idx)
            to_remove.append(i)
            index.append(data[i])

        # remove index items from content and columns, don't pop in
        # loop
        for i in sorted(to_remove, reverse=True):
            data.pop(i)
            if not self._implicit_index:
                columns.pop(i)

        return index

    @final
    def _get_complex_date_index(self, data, col_names):
        def _get_name(icol):
            if isinstance(icol, str):
                return icol

            if col_names is None:
                raise ValueError(f"Must supply column order to use {icol!s} as index")

            for i, c in enumerate(col_names):
                if i == icol:
                    return c

        to_remove = []
        index = []
        for idx in self.index_col:
            name = _get_name(idx)
            to_remove.append(name)
            index.append(data[name])

        # remove index items from content and columns, don't pop in
        # loop
        for c in sorted(to_remove, reverse=True):
            data.pop(c)
            col_names.remove(c)

        return index

    @final
    def _clean_mapping(self, mapping):
        """converts col numbers to names"""
        if not isinstance(mapping, dict):
            return mapping
        clean = {}
        # for mypy
        assert self.orig_names is not None

        for col, v in mapping.items():
            if isinstance(col, int) and col not in self.orig_names:
                col = self.orig_names[col]
            clean[col] = v
        if isinstance(mapping, defaultdict):
            remaining_cols = set(self.orig_names) - set(clean.keys())
            clean.update({col: mapping[col] for col in remaining_cols})
        return clean
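
    # Illustrative sketch (hypothetical names): a per-column mapping keyed by
    # position is resolved to column names, so later lookups can be done by
    # name only:
    #
    #     parser.orig_names = ["a", "b", "c"]
    #     parser._clean_mapping({0: int, "c": float})
    #     # -> {"a": int, "c": float}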

    @final
    def _agg_index(self, index, try_parse_dates: bool = True) -> Index:
        arrays = []
        converters = self._clean_mapping(self.converters)

        for i, arr in enumerate(index):
            if try_parse_dates and self._should_parse_dates(i):
                arr = self._date_conv(
                    arr,
                    col=self.index_names[i] if self.index_names is not None else None,
                )

            if self.na_filter:
                col_na_values = self.na_values
                col_na_fvalues = self.na_fvalues
            else:
                col_na_values = set()
                col_na_fvalues = set()

            if isinstance(self.na_values, dict):
                assert self.index_names is not None
                col_name = self.index_names[i]
                if col_name is not None:
                    col_na_values, col_na_fvalues = _get_na_values(
                        col_name, self.na_values, self.na_fvalues, self.keep_default_na
                    )

            clean_dtypes = self._clean_mapping(self.dtype)

            cast_type = None
            index_converter = False
            if self.index_names is not None:
                if isinstance(clean_dtypes, dict):
                    cast_type = clean_dtypes.get(self.index_names[i], None)

                if isinstance(converters, dict):
                    index_converter = converters.get(self.index_names[i]) is not None

            try_num_bool = not (
                cast_type and is_string_dtype(cast_type) or index_converter
            )

            arr, _ = self._infer_types(
                arr, col_na_values | col_na_fvalues, cast_type is None, try_num_bool
            )
            arrays.append(arr)

        names = self.index_names
        index = ensure_index_from_sequences(arrays, names)

        return index

    @final
    def _convert_to_ndarrays(
        self,
        dct: Mapping,
        na_values,
        na_fvalues,
        verbose: bool = False,
        converters=None,
        dtypes=None,
    ):
        result = {}
        for c, values in dct.items():
            conv_f = None if converters is None else converters.get(c, None)
            if isinstance(dtypes, dict):
                cast_type = dtypes.get(c, None)
            else:
                # single dtype or None
                cast_type = dtypes

            if self.na_filter:
                col_na_values, col_na_fvalues = _get_na_values(
                    c, na_values, na_fvalues, self.keep_default_na
                )
            else:
                col_na_values, col_na_fvalues = set(), set()

            if c in self._parse_date_cols:
                # GH#26203 Do not convert columns which get converted to dates
                # but replace nans to ensure to_datetime works
                mask = algorithms.isin(values, set(col_na_values) | col_na_fvalues)
                np.putmask(values, mask, np.nan)
                result[c] = values
                continue

            if conv_f is not None:
                # conv_f applied to data before inference
                if cast_type is not None:
                    warnings.warn(
                        (
                            "Both a converter and dtype were specified "
                            f"for column {c} - only the converter will be used."
                        ),
                        ParserWarning,
                        stacklevel=find_stack_level(),
                    )

                try:
                    values = lib.map_infer(values, conv_f)
                except ValueError:
                    mask = algorithms.isin(values, list(na_values)).view(np.uint8)
                    values = lib.map_infer_mask(values, conv_f, mask)

                cvals, na_count = self._infer_types(
                    values,
                    set(col_na_values) | col_na_fvalues,
                    cast_type is None,
                    try_num_bool=False,
                )
            else:
                is_ea = is_extension_array_dtype(cast_type)
                is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type)
                # skip inference if specified dtype is object
                # or casting to an EA
                try_num_bool = not (cast_type and is_str_or_ea_dtype)

                # general type inference and conversion
                cvals, na_count = self._infer_types(
                    values,
                    set(col_na_values) | col_na_fvalues,
                    cast_type is None,
                    try_num_bool,
                )

                # type specified in dtype param or cast_type is an EA
                if cast_type is not None:
                    cast_type = pandas_dtype(cast_type)
                    if cast_type and (cvals.dtype != cast_type or is_ea):
                        if not is_ea and na_count > 0:
                            if is_bool_dtype(cast_type):
                                raise ValueError(f"Bool column has NA values in column {c}")
                        cvals = self._cast_types(cvals, cast_type, c)

            result[c] = cvals
            if verbose and na_count:
                print(f"Filled {na_count} NA values in column {c!s}")
        return result
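
    # Illustrative sketch (hypothetical column name): when both a converter
    # and a dtype are given for the same column, the converter wins and a
    # ParserWarning is emitted, mirroring the branch above:
    #
    #     pd.read_csv(buf, converters={"a": str.strip}, dtype={"a": "int64"})
    #     # ParserWarning: Both a converter and dtype were specified
    #     # for column a - only the converter will be used.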

    @final
    def _set_noconvert_dtype_columns(
        self, col_indices: list[int], names: Sequence[Hashable]
    ) -> set[int]:
        """
        Set the columns that should not undergo dtype conversions.

        Currently, any column that is involved with date parsing will not
        undergo such conversions. If usecols is specified, the positions of
        the columns not to cast are relative to usecols, not to all columns.

        Parameters
        ----------
        col_indices: The indices specifying order and positions of the columns
        names: The column names, in an order corresponding to col_indices

        Returns
        -------
        A set of integers containing the positions of the columns not to convert.
        """
        usecols: list[int] | list[str] | None
        noconvert_columns = set()
        if self.usecols_dtype == "integer":
            # A set of integers will be converted to a list in
            # the correct order every single time.
            usecols = sorted(self.usecols)
        elif callable(self.usecols) or self.usecols_dtype not in ("empty", None):
            # The names attribute should have the correct columns
            # in the proper order for indexing with parse_dates.
            usecols = col_indices
        else:
            # Usecols is empty.
            usecols = None

        def _set(x) -> int:
            if usecols is not None and is_integer(x):
                x = usecols[x]

            if not is_integer(x):
                x = col_indices[names.index(x)]

            return x

        if isinstance(self.parse_dates, list):
            for val in self.parse_dates:
                if isinstance(val, list):
                    for k in val:
                        noconvert_columns.add(_set(k))
                else:
                    noconvert_columns.add(_set(val))

        elif isinstance(self.parse_dates, dict):
            for val in self.parse_dates.values():
                if isinstance(val, list):
                    for k in val:
                        noconvert_columns.add(_set(k))
                else:
                    noconvert_columns.add(_set(val))

        elif self.parse_dates:
            if isinstance(self.index_col, list):
                for k in self.index_col:
                    noconvert_columns.add(_set(k))
            elif self.index_col is not None:
                noconvert_columns.add(_set(self.index_col))

        return noconvert_columns
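
    # Illustrative sketch (hypothetical inputs): with integer usecols, a
    # positional parse_dates reference is translated through usecols first;
    # a name is resolved to its position via col_indices:
    #
    #     usecols=[1, 3], parse_dates=[0]       # 0 -> usecols[0] -> column 1
    #     names=["b", "d"], parse_dates=["d"]   # "d" -> col_indices[names.index("d")]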

    @final
    def _infer_types(
        self, values, na_values, no_dtype_specified, try_num_bool: bool = True
    ) -> tuple[ArrayLike, int]:
        """
        Infer types of values, possibly casting

        Parameters
        ----------
        values : ndarray
        na_values : set
        no_dtype_specified: True if no dtype was given explicitly for this column
        try_num_bool : bool, default True
            try to cast values to numeric (first preference) or boolean

        Returns
        -------
        converted : ndarray or ExtensionArray
        na_count : int
        """
        na_count = 0
        if issubclass(values.dtype.type, (np.number, np.bool_)):
            # If our array has numeric dtype, we don't have to check for strings in isin
            na_values = np.array([val for val in na_values if not isinstance(val, str)])
            mask = algorithms.isin(values, na_values)
            na_count = mask.astype("uint8", copy=False).sum()
            if na_count > 0:
                if is_integer_dtype(values):
                    values = values.astype(np.float64)
                np.putmask(values, mask, np.nan)
            return values, na_count

        dtype_backend = self.dtype_backend
        non_default_dtype_backend = (
            no_dtype_specified and dtype_backend is not lib.no_default
        )
        result: ArrayLike

        if try_num_bool and is_object_dtype(values.dtype):
            # exclude e.g. DatetimeIndex here
            try:
                result, result_mask = lib.maybe_convert_numeric(
                    values,
                    na_values,
                    False,
                    convert_to_masked_nullable=non_default_dtype_backend,  # type: ignore[arg-type]
                )
            except (ValueError, TypeError):
                # e.g. encountering datetime string gets ValueError
                # TypeError can be raised in floatify
                na_count = parsers.sanitize_objects(values, na_values)
                result = values
            else:
                if non_default_dtype_backend:
                    if result_mask is None:
                        result_mask = np.zeros(result.shape, dtype=np.bool_)

                    if result_mask.all():
                        result = IntegerArray(
                            np.ones(result_mask.shape, dtype=np.int64), result_mask
                        )
                    elif is_integer_dtype(result):
                        result = IntegerArray(result, result_mask)
                    elif is_bool_dtype(result):
                        result = BooleanArray(result, result_mask)
                    elif is_float_dtype(result):
                        result = FloatingArray(result, result_mask)

                    na_count = result_mask.sum()
                else:
                    na_count = isna(result).sum()
        else:
            result = values
            if values.dtype == np.object_:
                na_count = parsers.sanitize_objects(values, na_values)

        if result.dtype == np.object_ and try_num_bool:
            result, bool_mask = libops.maybe_convert_bool(
                np.asarray(values),
                true_values=self.true_values,
                false_values=self.false_values,
                convert_to_masked_nullable=non_default_dtype_backend,  # type: ignore[arg-type]
            )
            if result.dtype == np.bool_ and non_default_dtype_backend:
                if bool_mask is None:
                    bool_mask = np.zeros(result.shape, dtype=np.bool_)
                result = BooleanArray(result, bool_mask)
            elif result.dtype == np.object_ and non_default_dtype_backend:
                # read_excel sends array of datetime objects
                if not lib.is_datetime_array(result, skipna=True):
                    dtype = StringDtype()
                    cls = dtype.construct_array_type()
                    result = cls._from_sequence(values, dtype=dtype)

        if dtype_backend == "pyarrow":
            pa = import_optional_dependency("pyarrow")
            if isinstance(result, np.ndarray):
                result = ArrowExtensionArray(pa.array(result, from_pandas=True))
            elif isinstance(result, BaseMaskedArray):
                if result._mask.all():
                    # We want an arrow null array here
                    result = ArrowExtensionArray(pa.array([None] * len(result)))
                else:
                    result = ArrowExtensionArray(
                        pa.array(result._data, mask=result._mask)
                    )
            else:
                result = ArrowExtensionArray(
                    pa.array(result.to_numpy(), from_pandas=True)
                )

        return result, na_count
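
    # Illustrative sketch of the inference cascade above (hypothetical data):
    # numeric first, then boolean, then fall back to object/string:
    #
    #     ["1", "2", "NA"]   -> float64 [1.0, 2.0, nan], na_count=1
    #     ["True", "False"]  -> bool array (via maybe_convert_bool)
    #     ["x", "y"]         -> object (or StringDtype/Arrow-backed with a
    #                           non-default dtype_backend)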

    @final
    def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLike:
        """
        Cast values to specified type

        Parameters
        ----------
        values : ndarray or ExtensionArray
        cast_type : np.dtype or ExtensionDtype
            dtype to cast values to
        column : string
            column name - used only for error reporting

        Returns
        -------
        converted : ndarray or ExtensionArray
        """
        if isinstance(cast_type, CategoricalDtype):
            known_cats = cast_type.categories is not None

            if not is_object_dtype(values.dtype) and not known_cats:
                # TODO: this is for consistency with
                # c-parser which parses all categories
                # as strings
                values = lib.ensure_string_array(
                    values, skipna=False, convert_na_value=False
                )

            cats = Index(values).unique().dropna()
            values = Categorical._from_inferred_categories(
                cats, cats.get_indexer(values), cast_type, true_values=self.true_values
            )

        # use the EA's implementation of casting
        elif isinstance(cast_type, ExtensionDtype):
            array_type = cast_type.construct_array_type()
            try:
                if isinstance(cast_type, BooleanDtype):
                    # error: Unexpected keyword argument "true_values" for
                    # "_from_sequence_of_strings" of "ExtensionArray"
                    return array_type._from_sequence_of_strings(  # type: ignore[call-arg]
                        values,
                        dtype=cast_type,
                        true_values=self.true_values,
                        false_values=self.false_values,
                    )
                else:
                    return array_type._from_sequence_of_strings(values, dtype=cast_type)
            except NotImplementedError as err:
                raise NotImplementedError(
                    f"Extension Array: {array_type} must implement "
                    "_from_sequence_of_strings in order to be used in parser methods"
                ) from err

        elif isinstance(values, ExtensionArray):
            values = values.astype(cast_type, copy=False)
        elif issubclass(cast_type.type, str):
            # TODO: why skipna=True here and False above? some tests depend
            # on it here, but nothing fails if we change it above
            # (as no tests get there as of 2022-12-06)
            values = lib.ensure_string_array(
                values, skipna=True, convert_na_value=False
            )
        else:
            try:
                values = astype_array(values, cast_type, copy=True)
            except ValueError as err:
                raise ValueError(
                    f"Unable to convert column {column} to type {cast_type}"
                ) from err
        return values
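
    # Illustrative sketch (hypothetical data): with dtype="category" the parsed
    # strings themselves become the categories; with an explicit
    # CategoricalDtype the given categories are used instead:
    #
    #     pd.read_csv(buf, dtype={"a": "category"})
    #     pd.read_csv(buf, dtype={"a": pd.CategoricalDtype(["x", "y"])})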

    @overload
    def _do_date_conversions(
        self,
        names: Index,
        data: DataFrame,
    ) -> tuple[Sequence[Hashable] | Index, DataFrame]:
        ...

    @overload
    def _do_date_conversions(
        self,
        names: Sequence[Hashable],
        data: Mapping[Hashable, ArrayLike],
    ) -> tuple[Sequence[Hashable], Mapping[Hashable, ArrayLike]]:
        ...

    @final
    def _do_date_conversions(
        self,
        names: Sequence[Hashable] | Index,
        data: Mapping[Hashable, ArrayLike] | DataFrame,
    ) -> tuple[Sequence[Hashable] | Index, Mapping[Hashable, ArrayLike] | DataFrame]:
        # returns data, columns

        if self.parse_dates is not None:
            data, names = _process_date_conversion(
                data,
                self._date_conv,
                self.parse_dates,
                self.index_col,
                self.index_names,
                names,
                keep_date_col=self.keep_date_col,
                dtype_backend=self.dtype_backend,
            )

        return names, data

    @final
    def _check_data_length(
        self,
        columns: Sequence[Hashable],
        data: Sequence[ArrayLike],
    ) -> None:
        """Checks if length of data is equal to length of column names.

        One set of trailing commas is allowed. If self.index_col is not False,
        a ParserError is raised earlier when the lengths do not match.

        Parameters
        ----------
        columns: list of column names
        data: list of array-likes containing the data column-wise.
        """
        if not self.index_col and len(columns) != len(data) and columns:
            empty_str = is_object_dtype(data[-1]) and data[-1] == ""
            # error: No overload variant of "__ror__" of "ndarray" matches
            # argument type "ExtensionArray"
            empty_str_or_na = empty_str | isna(data[-1])  # type: ignore[operator]
            if len(columns) == len(data) - 1 and np.all(empty_str_or_na):
                return
            warnings.warn(
                "Length of header or names does not match length of data. This leads "
                "to a loss of data with index_col=False.",
                ParserWarning,
                stacklevel=find_stack_level(),
            )

    @overload
    def _evaluate_usecols(
        self,
        usecols: set[int] | Callable[[Hashable], object],
        names: Sequence[Hashable],
    ) -> set[int]:
        ...

    @overload
    def _evaluate_usecols(
        self, usecols: set[str], names: Sequence[Hashable]
    ) -> set[str]:
        ...

    @final
    def _evaluate_usecols(
        self,
        usecols: Callable[[Hashable], object] | set[str] | set[int],
        names: Sequence[Hashable],
    ) -> set[str] | set[int]:
        """
        Check whether or not the 'usecols' parameter
        is a callable. If so, enumerate the 'names'
        parameter and return a set of indices for
        each entry in 'names' that evaluates to True.
        If not a callable, return 'usecols'.
        """
        if callable(usecols):
            return {i for i, name in enumerate(names) if usecols(name)}
        return usecols
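
    # Illustrative sketch (hypothetical names): a callable usecols is applied
    # to each column name and the matching positions are kept:
    #
    #     parser._evaluate_usecols(lambda n: n.startswith("a"), ["ab", "b", "ac"])
    #     # -> {0, 2}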

    @final
    def _validate_usecols_names(self, usecols, names: Sequence):
        """
        Validates that all usecols are present in a given
        list of names. If not, raise a ValueError that
        shows what usecols are missing.

        Parameters
        ----------
        usecols : iterable of usecols
            The columns to validate are present in names.
        names : iterable of names
            The column names to check against.

        Returns
        -------
        usecols : iterable of usecols
            The `usecols` parameter if the validation succeeds.

        Raises
        ------
        ValueError : Columns were missing. Error message will list them.
        """
        missing = [c for c in usecols if c not in names]
        if len(missing) > 0:
            raise ValueError(
                f"Usecols do not match columns, columns expected but not found: "
                f"{missing}"
            )

        return usecols

    @final
    def _validate_usecols_arg(self, usecols):
        """
        Validate the 'usecols' parameter.

        Checks whether or not the 'usecols' parameter contains all integers
        (column selection by index), strings (column by name) or is a callable.
        Raises a ValueError if that is not the case.

        Parameters
        ----------
        usecols : list-like, callable, or None
            List of columns to use when parsing or a callable that can be used
            to filter a list of table columns.

        Returns
        -------
        usecols_tuple : tuple
            A tuple of (verified_usecols, usecols_dtype).

            'verified_usecols' is either a set if an array-like is passed in or
            'usecols' if a callable or None is passed in.

            'usecols_dtype' is the inferred dtype of 'usecols' if an array-like
            is passed in or None if a callable or None is passed in.
        """
        msg = (
            "'usecols' must either be list-like of all strings, all unicode, "
            "all integers or a callable."
        )
        if usecols is not None:
            if callable(usecols):
                return usecols, None

            if not is_list_like(usecols):
                # see gh-20529
                #
                # Ensure it is iterable container but not string.
                raise ValueError(msg)

            usecols_dtype = lib.infer_dtype(usecols, skipna=False)

            if usecols_dtype not in ("empty", "integer", "string"):
                raise ValueError(msg)

            usecols = set(usecols)

            return usecols, usecols_dtype
        return usecols, None
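
    # Illustrative sketch: the validated forms and their inferred dtypes
    # (values here are hypothetical):
    #
    #     parser._validate_usecols_arg(["a", "b"])  # -> ({"a", "b"}, "string")
    #     parser._validate_usecols_arg([0, 2])      # -> ({0, 2}, "integer")
    #     parser._validate_usecols_arg(len)         # -> (len, None)
    #     parser._validate_usecols_arg([0, "a"])    # -> ValueError (mixed types)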

    @final
    def _clean_index_names(self, columns, index_col) -> tuple[list | None, list, list]:
        if not is_index_col(index_col):
            return None, columns, index_col

        columns = list(columns)

        # In case of no rows and multiindex columns we have to set index_names to
        # list of Nones GH#38292
        if not columns:
            return [None] * len(index_col), columns, index_col

        cp_cols = list(columns)
        index_names: list[str | int | None] = []

        # don't mutate
        index_col = list(index_col)

        for i, c in enumerate(index_col):
            if isinstance(c, str):
                index_names.append(c)
                for j, name in enumerate(cp_cols):
                    if name == c:
                        index_col[i] = j
                        columns.remove(name)
                        break
            else:
                name = cp_cols[c]
                columns.remove(name)
                index_names.append(name)

        # Only clean index names that were placeholders.
        for i, name in enumerate(index_names):
            if isinstance(name, str) and name in self.unnamed_cols:
                index_names[i] = None

        return index_names, columns, index_col

    @final
    def _get_empty_meta(self, columns, dtype: DtypeArg | None = None):
        columns = list(columns)

        index_col = self.index_col
        index_names = self.index_names

        # Convert `dtype` to a defaultdict of some kind.
        # This will enable us to write `dtype[col_name]`
        # without worrying about KeyError issues later on.
        dtype_dict: defaultdict[Hashable, Any]
        if not is_dict_like(dtype):
            # if dtype == None, default will be object.
            default_dtype = dtype or object
            dtype_dict = defaultdict(lambda: default_dtype)
        else:
            dtype = cast(dict, dtype)
            dtype_dict = defaultdict(
                lambda: object,
                {columns[k] if is_integer(k) else k: v for k, v in dtype.items()},
            )

        # Even though we have no data, the "index" of the empty DataFrame
        # could for example still be an empty MultiIndex. Thus, we need to
        # check whether we have any index columns specified, via either:
        #
        # 1) index_col (column indices)
        # 2) index_names (column names)
        #
        # Both must be non-null to ensure a successful construction. Otherwise,
        # we have to create a generic empty Index.
        index: Index
        if (index_col is None or index_col is False) or index_names is None:
            index = default_index(0)
        else:
            data = [Series([], dtype=dtype_dict[name]) for name in index_names]
            index = ensure_index_from_sequences(data, names=index_names)
            index_col.sort()

            for i, n in enumerate(index_col):
                columns.pop(n - i)

        col_dict = {
            col_name: Series([], dtype=dtype_dict[col_name]) for col_name in columns
        }

        return index, columns, col_dict
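
    # Illustrative sketch (hypothetical columns, assuming no index_col is set):
    # for an empty file the per-column dtype mapping still drives the empty
    # Series that are built; positions in `dtype` are resolved through
    # `columns`:
    #
    #     parser._get_empty_meta(["a", "b"], dtype={0: "int64"})
    #     # -> empty index, columns ["a", "b"], "a" as int64 and "b" as object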


def _make_date_converter(
    date_parser=lib.no_default,
    dayfirst: bool = False,
    cache_dates: bool = True,
    date_format: dict[Hashable, str] | str | None = None,
):
    if date_parser is not lib.no_default:
        warnings.warn(
            "The argument 'date_parser' is deprecated and will "
            "be removed in a future version. "
            "Please use 'date_format' instead, or read your data in as 'object' dtype "
            "and then call 'to_datetime'.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    if date_parser is not lib.no_default and date_format is not None:
        raise TypeError("Cannot use both 'date_parser' and 'date_format'")

    def unpack_if_single_element(arg):
        # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615
        if isinstance(arg, np.ndarray) and arg.ndim == 1 and len(arg) == 1:
            return arg[0]
        return arg

    def converter(*date_cols, col: Hashable):
        if len(date_cols) == 1 and date_cols[0].dtype.kind in "Mm":
            return date_cols[0]

        if date_parser is lib.no_default:
            strs = parsing.concat_date_cols(date_cols)
            date_fmt = (
                date_format.get(col) if isinstance(date_format, dict) else date_format
            )

            with warnings.catch_warnings():
                warnings.filterwarnings(
                    "ignore",
                    ".*parsing datetimes with mixed time zones will raise an error",
                    category=FutureWarning,
                )
                str_objs = ensure_object(strs)
                try:
                    result = tools.to_datetime(
                        str_objs,
                        format=date_fmt,
                        utc=False,
                        dayfirst=dayfirst,
                        cache=cache_dates,
                    )
                except (ValueError, TypeError):
                    # test_usecols_with_parse_dates4
                    return str_objs

                if isinstance(result, DatetimeIndex):
                    arr = result.to_numpy()
                    arr.flags.writeable = True
                    return arr
                return result._values
        else:
            try:
                with warnings.catch_warnings():
                    warnings.filterwarnings(
                        "ignore",
                        ".*parsing datetimes with mixed time zones "
                        "will raise an error",
                        category=FutureWarning,
                    )
                    pre_parsed = date_parser(
                        *(unpack_if_single_element(arg) for arg in date_cols)
                    )
                    try:
                        result = tools.to_datetime(
                            pre_parsed,
                            cache=cache_dates,
                        )
                    except (ValueError, TypeError):
                        # test_read_csv_with_custom_date_parser
                        result = pre_parsed
                    if isinstance(result, datetime.datetime):
                        raise Exception("scalar parser")
                    return result
            except Exception:
                # e.g. test_datetime_fractional_seconds
                with warnings.catch_warnings():
                    warnings.filterwarnings(
                        "ignore",
                        ".*parsing datetimes with mixed time zones "
                        "will raise an error",
                        category=FutureWarning,
                    )
                    pre_parsed = parsing.try_parse_dates(
                        parsing.concat_date_cols(date_cols),
                        parser=date_parser,
                    )
                    try:
                        return tools.to_datetime(pre_parsed)
                    except (ValueError, TypeError):
                        # TODO: not reached in tests 2023-10-27; needed?
                        return pre_parsed

    return converter
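
# Illustrative sketch (hypothetical data): the default converter concatenates
# the given column(s) into strings and routes them through to_datetime;
# unparseable input falls back to the original strings:
#
#     conv = _make_date_converter(date_format="%Y-%m-%d")
#     conv(np.array(["2020-01-01", "2020-01-02"], dtype=object), col="date")
#     # -> datetime64[ns] values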


parser_defaults = {
    "delimiter": None,
    "escapechar": None,
    "quotechar": '"',
    "quoting": csv.QUOTE_MINIMAL,
    "doublequote": True,
    "skipinitialspace": False,
    "lineterminator": None,
    "header": "infer",
    "index_col": None,
    "names": None,
    "skiprows": None,
    "skipfooter": 0,
    "nrows": None,
    "na_values": None,
    "keep_default_na": True,
    "true_values": None,
    "false_values": None,
    "converters": None,
    "dtype": None,
    "cache_dates": True,
    "thousands": None,
    "comment": None,
    "decimal": ".",
    # 'engine': 'c',
    "parse_dates": False,
    "keep_date_col": False,
    "dayfirst": False,
    "date_parser": lib.no_default,
    "date_format": None,
    "usecols": None,
    # 'iterator': False,
    "chunksize": None,
    "verbose": False,
    "encoding": None,
    "compression": None,
    "skip_blank_lines": True,
    "encoding_errors": "strict",
    "on_bad_lines": ParserBase.BadLineHandleMethod.ERROR,
    "dtype_backend": lib.no_default,
}


def _process_date_conversion(
    data_dict,
    converter: Callable,
    parse_spec,
    index_col,
    index_names,
    columns,
    keep_date_col: bool = False,
    dtype_backend=lib.no_default,
):
    def _isindex(colspec):
        return (isinstance(index_col, list) and colspec in index_col) or (
            isinstance(index_names, list) and colspec in index_names
        )

    new_cols = []
    new_data = {}

    orig_names = columns
    columns = list(columns)

    date_cols = set()

    if parse_spec is None or isinstance(parse_spec, bool):
        return data_dict, columns

    if isinstance(parse_spec, list):
        # list of column lists
        for colspec in parse_spec:
            if is_scalar(colspec) or isinstance(colspec, tuple):
                if isinstance(colspec, int) and colspec not in data_dict:
                    colspec = orig_names[colspec]
                if _isindex(colspec):
                    continue
                elif dtype_backend == "pyarrow":
                    import pyarrow as pa

                    dtype = data_dict[colspec].dtype
                    if isinstance(dtype, ArrowDtype) and (
                        pa.types.is_timestamp(dtype.pyarrow_dtype)
                        or pa.types.is_date(dtype.pyarrow_dtype)
                    ):
                        continue

                # Pyarrow engine returns Series which we need to convert to
                # numpy array before converter; it's a no-op for other parsers
                data_dict[colspec] = converter(
                    np.asarray(data_dict[colspec]), col=colspec
                )
            else:
                new_name, col, old_names = _try_convert_dates(
                    converter, colspec, data_dict, orig_names
                )
                if new_name in data_dict:
                    raise ValueError(f"New date column already in dict {new_name}")
                new_data[new_name] = col
                new_cols.append(new_name)
                date_cols.update(old_names)

    elif isinstance(parse_spec, dict):
        # dict of new name to column list
        for new_name, colspec in parse_spec.items():
            if new_name in data_dict:
                raise ValueError(f"Date column {new_name} already in dict")

            _, col, old_names = _try_convert_dates(
                converter,
                colspec,
                data_dict,
                orig_names,
                target_name=new_name,
            )

            new_data[new_name] = col

            # If original column can be converted to date we keep the converted values
            # This can only happen if values are from single column
            if len(colspec) == 1:
                new_data[colspec[0]] = col

            new_cols.append(new_name)
            date_cols.update(old_names)

    if isinstance(data_dict, DataFrame):
        data_dict = concat([DataFrame(new_data), data_dict], axis=1, copy=False)
    else:
        data_dict.update(new_data)
    new_cols.extend(columns)

    if not keep_date_col:
        for c in list(date_cols):
            data_dict.pop(c)
            new_cols.remove(c)

    return data_dict, new_cols
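
# Illustrative sketch (hypothetical columns): a nested list combines columns
# into a new one named by joining the parts; a dict names the result itself:
#
#     parse_dates=[["date", "time"]]         # -> new column "date_time"
#     parse_dates={"ts": ["date", "time"]}   # -> new column "ts"
#
# With keep_date_col=False (the default) the source columns are dropped.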


def _try_convert_dates(
    parser: Callable, colspec, data_dict, columns, target_name: str | None = None
):
    colset = set(columns)
    colnames = []

    for c in colspec:
        if c in colset:
            colnames.append(c)
        elif isinstance(c, int) and c not in columns:
            colnames.append(columns[c])
        else:
            colnames.append(c)

    new_name: tuple | str
    if all(isinstance(x, tuple) for x in colnames):
        new_name = tuple(map("_".join, zip(*colnames)))
    else:
        new_name = "_".join([str(x) for x in colnames])
    to_parse = [np.asarray(data_dict[c]) for c in colnames if c in data_dict]

    new_col = parser(*to_parse, col=new_name if target_name is None else target_name)
    return new_name, new_col, colnames


def _get_na_values(col, na_values, na_fvalues, keep_default_na: bool):
    """
    Get the NaN values for a given column.

    Parameters
    ----------
    col : str
        The name of the column.
    na_values : array-like, dict
        The object listing the NaN values as strings.
    na_fvalues : array-like, dict
        The object listing the NaN values as floats.
    keep_default_na : bool
        If `na_values` is a dict, and the column is not mapped in the
        dictionary, whether to return the default NaN values or the empty set.

    Returns
    -------
    nan_tuple : A length-two tuple composed of

        1) na_values : the string NaN values for that column.
        2) na_fvalues : the float NaN values for that column.
    """
    if isinstance(na_values, dict):
        if col in na_values:
            return na_values[col], na_fvalues[col]
        else:
            if keep_default_na:
                return STR_NA_VALUES, set()

            return set(), set()
    else:
        return na_values, na_fvalues
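
# Illustrative sketch (hypothetical column names): with a dict na_values, an
# unmapped column falls back to the defaults only when keep_default_na=True:
#
#     _get_na_values("a", {"a": ["-"]}, {"a": set()}, True)   # -> (["-"], set())
#     _get_na_values("b", {"a": ["-"]}, {"a": set()}, True)   # -> (STR_NA_VALUES, set())
#     _get_na_values("b", {"a": ["-"]}, {"a": set()}, False)  # -> (set(), set())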


def _validate_parse_dates_arg(parse_dates):
    """
    Check whether or not the 'parse_dates' parameter
    is a non-boolean scalar. Raises a TypeError if
    that is the case.
    """
    msg = (
        "Only booleans, lists, and dictionaries are accepted "
        "for the 'parse_dates' parameter"
    )

    if not (
        parse_dates is None
        or lib.is_bool(parse_dates)
        or isinstance(parse_dates, (list, dict))
    ):
        raise TypeError(msg)

    return parse_dates


def is_index_col(col) -> bool:
    return col is not None and col is not False