from __future__ import annotations

from collections import defaultdict
from copy import copy
import csv
import datetime
from enum import Enum
import itertools
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Hashable,
    Iterable,
    List,
    Mapping,
    Sequence,
    Tuple,
    cast,
    final,
    overload,
)
import warnings

import numpy as np

from pandas._libs import (
    lib,
    parsers,
)
import pandas._libs.ops as libops
from pandas._libs.parsers import STR_NA_VALUES
from pandas._libs.tslibs import parsing
from pandas._typing import (
    ArrayLike,
    DtypeArg,
    DtypeObj,
    Scalar,
)
from pandas.compat._optional import import_optional_dependency
from pandas.errors import (
    ParserError,
    ParserWarning,
)
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.astype import astype_array
from pandas.core.dtypes.common import (
    ensure_object,
    is_bool_dtype,
    is_dict_like,
    is_dtype_equal,
    is_extension_array_dtype,
    is_float_dtype,
    is_integer,
    is_integer_dtype,
    is_list_like,
    is_object_dtype,
    is_scalar,
    is_string_dtype,
    pandas_dtype,
)
from pandas.core.dtypes.dtypes import (
    CategoricalDtype,
    ExtensionDtype,
)
from pandas.core.dtypes.missing import isna

from pandas import (
    ArrowDtype,
    DatetimeIndex,
    StringDtype,
)
from pandas.core import algorithms
from pandas.core.arrays import (
    ArrowExtensionArray,
    BooleanArray,
    Categorical,
    ExtensionArray,
    FloatingArray,
    IntegerArray,
)
from pandas.core.arrays.boolean import BooleanDtype
from pandas.core.indexes.api import (
    Index,
    MultiIndex,
    default_index,
    ensure_index_from_sequences,
)
from pandas.core.series import Series
from pandas.core.tools import datetimes as tools

from pandas.io.common import is_potential_multi_index

if TYPE_CHECKING:
    from pandas import DataFrame


class ParserBase:
    class BadLineHandleMethod(Enum):
        ERROR = 0
        WARN = 1
        SKIP = 2

    _implicit_index: bool = False
    _first_chunk: bool

    def __init__(self, kwds) -> None:
        self.names = kwds.get("names")
        self.orig_names: Sequence[Hashable] | None = None

        self.index_col = kwds.get("index_col", None)
        self.unnamed_cols: set = set()
        self.index_names: Sequence[Hashable] | None = None
        self.col_names: Sequence[Hashable] | None = None

        self.parse_dates = _validate_parse_dates_arg(kwds.pop("parse_dates", False))
        self._parse_date_cols: Iterable = []
        self.date_parser = kwds.pop("date_parser", lib.no_default)
        self.date_format = kwds.pop("date_format", None)
        self.dayfirst = kwds.pop("dayfirst", False)
        self.keep_date_col = kwds.pop("keep_date_col", False)

        self.na_values = kwds.get("na_values")
        self.na_fvalues = kwds.get("na_fvalues")
        self.na_filter = kwds.get("na_filter", False)
        self.keep_default_na = kwds.get("keep_default_na", True)

        self.dtype = copy(kwds.get("dtype", None))
        self.converters = kwds.get("converters")
        self.dtype_backend = kwds.get("dtype_backend")

        self.true_values = kwds.get("true_values")
        self.false_values = kwds.get("false_values")
        self.cache_dates = kwds.pop("cache_dates", True)

        self._date_conv = _make_date_converter(
            date_parser=self.date_parser,
            date_format=self.date_format,
            dayfirst=self.dayfirst,
            cache_dates=self.cache_dates,
        )

        # validate header options for mi
        self.header = kwds.get("header")
        if is_list_like(self.header, allow_sets=False):
            if kwds.get("usecols"):
                raise ValueError(
                    "cannot specify usecols when specifying a multi-index header"
                )
            if kwds.get("names"):
                raise ValueError(
                    "cannot specify names when specifying a multi-index header"
                )

            # validate index_col that only contains integers
            if self.index_col is not None:
                if not (
                    is_list_like(self.index_col, allow_sets=False)
                    and all(map(is_integer, self.index_col))
                    or is_integer(self.index_col)
                ):
                    raise ValueError(
                        "index_col must only contain row numbers "
                        "when specifying a multi-index header"
                    )

        self._name_processed = False

        self._first_chunk = True

        self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"])

        # Fallback to error to pass a sketchy test (test_override_set_noconvert_columns)
        # Normally, this arg would get pre-processed earlier on
        self.on_bad_lines = kwds.get("on_bad_lines", self.BadLineHandleMethod.ERROR)

    def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> Iterable:
        """
        Check if parse_dates are in columns.

        If user has provided names for parse_dates, check if those columns
        are available.

        Parameters
        ----------
        columns : list
            List of names of the dataframe.

        Returns
        -------
        The names of the columns which will get parsed later if a dict or list
        is given as specification.

        Raises
        ------
        ValueError
            If a column to parse as dates is not in the dataframe.

        """
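        # Illustrative sketch (not in the original source): with columns
        # ["a", "b", "c"], parse_dates=["a", [1, 2]] returns
        # ["a", "b", "c"], and parse_dates={"when": ["a", "b"]} returns
        # ["a", "b"]; a string entry missing from columns raises ValueError.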

        cols_needed: Iterable
        if is_dict_like(self.parse_dates):
            cols_needed = itertools.chain(*self.parse_dates.values())
        elif is_list_like(self.parse_dates):
            # a column in parse_dates could be represented
            # ColReference = Union[int, str]
            # DateGroups = List[ColReference]
            # ParseDates = Union[DateGroups, List[DateGroups],
            #     Dict[ColReference, DateGroups]]
            cols_needed = itertools.chain.from_iterable(
                col if is_list_like(col) and not isinstance(col, tuple) else [col]
                for col in self.parse_dates
            )
        else:
            cols_needed = []

        cols_needed = list(cols_needed)

        # get only columns that are references using names (str), not by index
        missing_cols = ", ".join(
            sorted(
                {
                    col
                    for col in cols_needed
                    if isinstance(col, str) and col not in columns
                }
            )
        )
        if missing_cols:
            raise ValueError(
                f"Missing column provided to 'parse_dates': '{missing_cols}'"
            )
        # Convert positions to actual column names
        return [
            col if (isinstance(col, str) or col in columns) else columns[col]
            for col in cols_needed
        ]

    def close(self) -> None:
        pass

    @final
    @property
    def _has_complex_date_col(self) -> bool:
        return isinstance(self.parse_dates, dict) or (
            isinstance(self.parse_dates, list)
            and len(self.parse_dates) > 0
            and isinstance(self.parse_dates[0], list)
        )

    @final
    def _should_parse_dates(self, i: int) -> bool:
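        # `i` is the position of an index level: compare both its source
        # column position (via index_col) and its name against parse_dates.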

        if isinstance(self.parse_dates, bool):
            return self.parse_dates
        else:
            if self.index_names is not None:
                name = self.index_names[i]
            else:
                name = None
            j = i if self.index_col is None else self.index_col[i]

            if is_scalar(self.parse_dates):
                return (j == self.parse_dates) or (
                    name is not None and name == self.parse_dates
                )
            else:
                return (j in self.parse_dates) or (
                    name is not None and name in self.parse_dates
                )

    @final
    def _extract_multi_indexer_columns(
        self,
        header,
        index_names: Sequence[Hashable] | None,
        passed_names: bool = False,
    ) -> tuple[
        Sequence[Hashable], Sequence[Hashable] | None, Sequence[Hashable] | None, bool
    ]:
        """
        Extract and return the names, index_names, col_names if the column
        names are a MultiIndex.

        Parameters
        ----------
        header: list of lists
            The header rows
        index_names: list, optional
            The names of the future index
        passed_names: bool, default False
            A flag specifying if names were passed

        """
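        # Sketch of the shapes involved (assumed example): the last entry of
        # `header` is the row holding the index names; the remaining rows
        # are zipped column-wise, so rows ["a", "a"] and ["x", "y"] become
        # the column tuples ("a", "x") and ("a", "y").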

        if len(header) < 2:
            return header[0], index_names, None, passed_names

        # the names are the tuples of the header that are not the index cols
        # 0 is the name of the index, assuming index_col is a list of column
        # numbers
        ic = self.index_col
        if ic is None:
            ic = []

        if not isinstance(ic, (list, tuple, np.ndarray)):
            ic = [ic]
        sic = set(ic)

        # clean the index_names
        index_names = header.pop(-1)
        index_names, _, _ = self._clean_index_names(index_names, self.index_col)

        # extract the columns
        field_count = len(header[0])

        # check if header lengths are equal
        if not all(len(header_iter) == field_count for header_iter in header[1:]):
            raise ParserError("Header rows must have an equal number of columns.")

        def extract(r):
            return tuple(r[i] for i in range(field_count) if i not in sic)

        columns = list(zip(*(extract(r) for r in header)))
        names = columns.copy()
        for single_ic in sorted(ic):
            names.insert(single_ic, single_ic)

        # Clean the column names (if we have an index_col).
        if len(ic):
            col_names = [
                r[ic[0]]
                if ((r[ic[0]] is not None) and r[ic[0]] not in self.unnamed_cols)
                else None
                for r in header
            ]
        else:
            col_names = [None] * len(header)

        passed_names = True

        return names, index_names, col_names, passed_names

    @final
    def _maybe_make_multi_index_columns(
        self,
        columns: Sequence[Hashable],
        col_names: Sequence[Hashable] | None = None,
    ) -> Sequence[Hashable] | MultiIndex:
        # possibly create a column mi here
        if is_potential_multi_index(columns):
            list_columns = cast(List[Tuple], columns)
            return MultiIndex.from_tuples(list_columns, names=col_names)
        return columns

    @final
    def _make_index(
        self, data, alldata, columns, indexnamerow: list[Scalar] | None = None
    ) -> tuple[Index | None, Sequence[Hashable] | MultiIndex]:
        index: Index | None
        if not is_index_col(self.index_col) or not self.index_col:
            index = None

        elif not self._has_complex_date_col:
            simple_index = self._get_simple_index(alldata, columns)
            index = self._agg_index(simple_index)
        elif self._has_complex_date_col:
            if not self._name_processed:
                (self.index_names, _, self.index_col) = self._clean_index_names(
                    list(columns), self.index_col
                )
                self._name_processed = True
            date_index = self._get_complex_date_index(data, columns)
            index = self._agg_index(date_index, try_parse_dates=False)

        # add names for the index
        if indexnamerow:
            coffset = len(indexnamerow) - len(columns)
            assert index is not None
            index = index.set_names(indexnamerow[:coffset])

        # maybe create a mi on the columns
        columns = self._maybe_make_multi_index_columns(columns, self.col_names)

        return index, columns

    @final
    def _get_simple_index(self, data, columns):
        def ix(col):
            if not isinstance(col, str):
                return col
            raise ValueError(f"Index {col} invalid")

        to_remove = []
        index = []
        for idx in self.index_col:
            i = ix(idx)
            to_remove.append(i)
            index.append(data[i])

        # remove index items from content and columns, don't pop in
        # loop
        for i in sorted(to_remove, reverse=True):
            data.pop(i)
            if not self._implicit_index:
                columns.pop(i)

        return index

    @final
    def _get_complex_date_index(self, data, col_names):
        def _get_name(icol):
            if isinstance(icol, str):
                return icol

            if col_names is None:
                raise ValueError(f"Must supply column order to use {icol!s} as index")

            for i, c in enumerate(col_names):
                if i == icol:
                    return c

        to_remove = []
        index = []
        for idx in self.index_col:
            name = _get_name(idx)
            to_remove.append(name)
            index.append(data[name])

        # remove index items from content and columns, don't pop in
        # loop
        for c in sorted(to_remove, reverse=True):
            data.pop(c)
            col_names.remove(c)

        return index

    def _clean_mapping(self, mapping):
        """Convert column numbers in a mapping to column names."""
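        # e.g. (illustrative) {0: np.float64} with orig_names ["a", "b"]
        # becomes {"a": np.float64}; string keys pass through unchanged.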

        if not isinstance(mapping, dict):
            return mapping
        clean = {}
        # for mypy
        assert self.orig_names is not None

        for col, v in mapping.items():
            if isinstance(col, int) and col not in self.orig_names:
                col = self.orig_names[col]
            clean[col] = v
        if isinstance(mapping, defaultdict):
            remaining_cols = set(self.orig_names) - set(clean.keys())
            clean.update({col: mapping[col] for col in remaining_cols})
        return clean

    @final
    def _agg_index(self, index, try_parse_dates: bool = True) -> Index:
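        # Build the final Index from the raw index arrays: optionally parse
        # dates, apply per-level NA handling and dtype casts, then assemble
        # with ensure_index_from_sequences.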

        arrays = []
        converters = self._clean_mapping(self.converters)

        for i, arr in enumerate(index):
            if try_parse_dates and self._should_parse_dates(i):
                arr = self._date_conv(
                    arr,
                    col=self.index_names[i] if self.index_names is not None else None,
                )

            if self.na_filter:
                col_na_values = self.na_values
                col_na_fvalues = self.na_fvalues
            else:
                col_na_values = set()
                col_na_fvalues = set()

            if isinstance(self.na_values, dict):
                assert self.index_names is not None
                col_name = self.index_names[i]
                if col_name is not None:
                    col_na_values, col_na_fvalues = _get_na_values(
                        col_name, self.na_values, self.na_fvalues, self.keep_default_na
                    )

            clean_dtypes = self._clean_mapping(self.dtype)

            cast_type = None
            index_converter = False
            if self.index_names is not None:
                if isinstance(clean_dtypes, dict):
                    cast_type = clean_dtypes.get(self.index_names[i], None)

                if isinstance(converters, dict):
                    index_converter = converters.get(self.index_names[i]) is not None

            try_num_bool = not (
                cast_type and is_string_dtype(cast_type) or index_converter
            )

            arr, _ = self._infer_types(
                arr, col_na_values | col_na_fvalues, cast_type is None, try_num_bool
            )
            arrays.append(arr)

        names = self.index_names
        index = ensure_index_from_sequences(arrays, names)

        return index

    @final
    def _convert_to_ndarrays(
        self,
        dct: Mapping,
        na_values,
        na_fvalues,
        verbose: bool = False,
        converters=None,
        dtypes=None,
    ):
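        # For each column: apply a user converter if given (it wins over any
        # dtype), mask NA values, run type inference or an explicit cast,
        # and collect the converted arrays keyed by column name.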

        result = {}
        for c, values in dct.items():
            conv_f = None if converters is None else converters.get(c, None)
            if isinstance(dtypes, dict):
                cast_type = dtypes.get(c, None)
            else:
                # single dtype or None
                cast_type = dtypes

            if self.na_filter:
                col_na_values, col_na_fvalues = _get_na_values(
                    c, na_values, na_fvalues, self.keep_default_na
                )
            else:
                col_na_values, col_na_fvalues = set(), set()

            if c in self._parse_date_cols:
                # GH#26203 Do not convert columns which get converted to dates
                # but replace nans to ensure to_datetime works
                mask = algorithms.isin(values, set(col_na_values) | col_na_fvalues)
                np.putmask(values, mask, np.nan)
                result[c] = values
                continue

            if conv_f is not None:
                # conv_f applied to data before inference
                if cast_type is not None:
                    warnings.warn(
                        (
                            "Both a converter and dtype were specified "
                            f"for column {c} - only the converter will be used."
                        ),
                        ParserWarning,
                        stacklevel=find_stack_level(),
                    )

                try:
                    values = lib.map_infer(values, conv_f)
                except ValueError:
                    # error: Argument 2 to "isin" has incompatible type "List[Any]";
                    # expected "Union[Union[ExtensionArray, ndarray], Index, Series]"
                    mask = algorithms.isin(
                        values, list(na_values)  # type: ignore[arg-type]
                    ).view(np.uint8)
                    values = lib.map_infer_mask(values, conv_f, mask)

                cvals, na_count = self._infer_types(
                    values,
                    set(col_na_values) | col_na_fvalues,
                    cast_type is None,
                    try_num_bool=False,
                )
            else:
                is_ea = is_extension_array_dtype(cast_type)
                is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type)
                # skip inference if specified dtype is object
                # or casting to an EA
                try_num_bool = not (cast_type and is_str_or_ea_dtype)

                # general type inference and conversion
                cvals, na_count = self._infer_types(
                    values,
                    set(col_na_values) | col_na_fvalues,
                    cast_type is None,
                    try_num_bool,
                )

                # type specified in dtype param or cast_type is an EA
                if cast_type and (not is_dtype_equal(cvals, cast_type) or is_ea):
                    if not is_ea and na_count > 0:
                        if is_bool_dtype(cast_type):
                            raise ValueError(
                                f"Bool column has NA values in column {c}"
                            )
                    cast_type = pandas_dtype(cast_type)
                    cvals = self._cast_types(cvals, cast_type, c)

            result[c] = cvals
            if verbose and na_count:
                print(f"Filled {na_count} NA values in column {c!s}")
        return result

    @final
    def _set_noconvert_dtype_columns(
        self, col_indices: list[int], names: Sequence[Hashable]
    ) -> set[int]:
        """
        Set the columns that should not undergo dtype conversions.

        Currently, any column that is involved with date parsing will not
        undergo such conversions. If usecols is specified, the positions of
        the columns not to cast are relative to usecols, not to all columns.

        Parameters
        ----------
        col_indices: The indices specifying order and positions of the columns
        names: The column names, in an order corresponding to col_indices

        Returns
        -------
        A set of integers containing the positions of the columns not to convert.
        """
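        # e.g. (a sketch, assuming no usecols) parse_dates=["b"] with
        # names ["a", "b"] and col_indices [0, 1] returns {1}, so column
        # "b" keeps its raw values for later date parsing.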

        usecols: list[int] | list[str] | None
        noconvert_columns = set()
        if self.usecols_dtype == "integer":
            # A set of integers will be converted to a list in
            # the correct order every single time.
            usecols = sorted(self.usecols)
        elif callable(self.usecols) or self.usecols_dtype not in ("empty", None):
            # The names attribute should have the correct columns
            # in the proper order for indexing with parse_dates.
            usecols = col_indices
        else:
            # Usecols is empty.
            usecols = None

        def _set(x) -> int:
            if usecols is not None and is_integer(x):
                x = usecols[x]

            if not is_integer(x):
                x = col_indices[names.index(x)]

            return x

        if isinstance(self.parse_dates, list):
            for val in self.parse_dates:
                if isinstance(val, list):
                    for k in val:
                        noconvert_columns.add(_set(k))
                else:
                    noconvert_columns.add(_set(val))

        elif isinstance(self.parse_dates, dict):
            for val in self.parse_dates.values():
                if isinstance(val, list):
                    for k in val:
                        noconvert_columns.add(_set(k))
                else:
                    noconvert_columns.add(_set(val))

        elif self.parse_dates:
            if isinstance(self.index_col, list):
                for k in self.index_col:
                    noconvert_columns.add(_set(k))
            elif self.index_col is not None:
                noconvert_columns.add(_set(self.index_col))

        return noconvert_columns

    def _infer_types(
        self, values, na_values, no_dtype_specified, try_num_bool: bool = True
    ) -> tuple[ArrayLike, int]:
        """
        Infer types of values, possibly casting

        Parameters
        ----------
        values : ndarray
        na_values : set
        no_dtype_specified : bool
            True when the user did not specify a dtype for this column, so
            inferred casting (e.g. to a nullable dtype backend) is allowed.
        try_num_bool : bool, default True
            try to cast values to numeric (first preference) or boolean

        Returns
        -------
        converted : ndarray or ExtensionArray
        na_count : int
        """
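        # e.g. (illustrative) an object array ["1", "2", "NA"] with
        # na_values {"NA"} comes back as float64 [1.0, 2.0, nan] with
        # na_count == 1 under the default dtype backend.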

        na_count = 0
        if issubclass(values.dtype.type, (np.number, np.bool_)):
            # If our array has numeric dtype, we don't have to check for strings in isin
            na_values = np.array([val for val in na_values if not isinstance(val, str)])
            mask = algorithms.isin(values, na_values)
            na_count = mask.astype("uint8", copy=False).sum()
            if na_count > 0:
                if is_integer_dtype(values):
                    values = values.astype(np.float64)
                np.putmask(values, mask, np.nan)
            return values, na_count

        dtype_backend = self.dtype_backend
        non_default_dtype_backend = (
            no_dtype_specified and dtype_backend is not lib.no_default
        )
        result: ArrayLike

        if try_num_bool and is_object_dtype(values.dtype):
            # exclude e.g. DatetimeIndex here
            try:
                result, result_mask = lib.maybe_convert_numeric(
                    values,
                    na_values,
                    False,
                    convert_to_masked_nullable=non_default_dtype_backend,  # type: ignore[arg-type]  # noqa
                )
            except (ValueError, TypeError):
                # e.g. encountering datetime string gets ValueError
                # TypeError can be raised in floatify
                na_count = parsers.sanitize_objects(values, na_values)
                result = values
            else:
                if non_default_dtype_backend:
                    if result_mask is None:
                        result_mask = np.zeros(result.shape, dtype=np.bool_)

                    if result_mask.all():
                        result = IntegerArray(
                            np.ones(result_mask.shape, dtype=np.int64), result_mask
                        )
                    elif is_integer_dtype(result):
                        result = IntegerArray(result, result_mask)
                    elif is_bool_dtype(result):
                        result = BooleanArray(result, result_mask)
                    elif is_float_dtype(result):
                        result = FloatingArray(result, result_mask)

                    na_count = result_mask.sum()
                else:
                    na_count = isna(result).sum()
        else:
            result = values
            if values.dtype == np.object_:
                na_count = parsers.sanitize_objects(values, na_values)

        if result.dtype == np.object_ and try_num_bool:
            result, bool_mask = libops.maybe_convert_bool(
                np.asarray(values),
                true_values=self.true_values,
                false_values=self.false_values,
                convert_to_masked_nullable=non_default_dtype_backend,  # type: ignore[arg-type]  # noqa
            )
            if result.dtype == np.bool_ and non_default_dtype_backend:
                if bool_mask is None:
                    bool_mask = np.zeros(result.shape, dtype=np.bool_)
                result = BooleanArray(result, bool_mask)
            elif result.dtype == np.object_ and non_default_dtype_backend:
                # read_excel sends array of datetime objects
                inferred_type = lib.infer_dtype(result)
                if inferred_type != "datetime":
                    result = StringDtype().construct_array_type()._from_sequence(values)

        if dtype_backend == "pyarrow":
            pa = import_optional_dependency("pyarrow")
            if isinstance(result, np.ndarray):
                result = ArrowExtensionArray(pa.array(result, from_pandas=True))
            else:
                # ExtensionArray
                result = ArrowExtensionArray(
                    pa.array(result.to_numpy(), from_pandas=True)
                )

        return result, na_count

    def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLike:
        """
        Cast values to specified type

        Parameters
        ----------
        values : ndarray or ExtensionArray
        cast_type : np.dtype or ExtensionDtype
            dtype to cast values to
        column : string
            column name - used only for error reporting

        Returns
        -------
        converted : ndarray or ExtensionArray
        """
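        # e.g. (illustrative) object values ["a", "b", "a"] cast with a
        # bare CategoricalDtype() infer the categories ["a", "b"]; a dtype
        # with known categories keeps only its declared categories.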

        if isinstance(cast_type, CategoricalDtype):
            known_cats = cast_type.categories is not None

            if not is_object_dtype(values.dtype) and not known_cats:
                # TODO: this is for consistency with
                # c-parser which parses all categories
                # as strings
                values = lib.ensure_string_array(
                    values, skipna=False, convert_na_value=False
                )

            cats = Index(values).unique().dropna()
            values = Categorical._from_inferred_categories(
                cats, cats.get_indexer(values), cast_type, true_values=self.true_values
            )

        # use the EA's implementation of casting
        elif isinstance(cast_type, ExtensionDtype):
            array_type = cast_type.construct_array_type()
            try:
                if isinstance(cast_type, BooleanDtype):
                    # error: Unexpected keyword argument "true_values" for
                    # "_from_sequence_of_strings" of "ExtensionArray"
                    return array_type._from_sequence_of_strings(  # type: ignore[call-arg]  # noqa: E501
                        values,
                        dtype=cast_type,
                        true_values=self.true_values,
                        false_values=self.false_values,
                    )
                else:
                    return array_type._from_sequence_of_strings(values, dtype=cast_type)
            except NotImplementedError as err:
                raise NotImplementedError(
                    f"Extension Array: {array_type} must implement "
                    "_from_sequence_of_strings in order to be used in parser methods"
                ) from err

        elif isinstance(values, ExtensionArray):
            values = values.astype(cast_type, copy=False)
        elif issubclass(cast_type.type, str):
            # TODO: why skipna=True here and False above? some tests depend
            # on it here, but nothing fails if we change it above
            # (as no tests get there as of 2022-12-06)
            values = lib.ensure_string_array(
                values, skipna=True, convert_na_value=False
            )
        else:
            try:
                values = astype_array(values, cast_type, copy=True)
            except ValueError as err:
                raise ValueError(
                    f"Unable to convert column {column} to type {cast_type}"
                ) from err
        return values

    @overload
    def _do_date_conversions(
        self,
        names: Index,
        data: DataFrame,
    ) -> tuple[Sequence[Hashable] | Index, DataFrame]:
        ...

    @overload
    def _do_date_conversions(
        self,
        names: Sequence[Hashable],
        data: Mapping[Hashable, ArrayLike],
    ) -> tuple[Sequence[Hashable], Mapping[Hashable, ArrayLike]]:
        ...

    def _do_date_conversions(
        self,
        names: Sequence[Hashable] | Index,
        data: Mapping[Hashable, ArrayLike] | DataFrame,
    ) -> tuple[Sequence[Hashable] | Index, Mapping[Hashable, ArrayLike] | DataFrame]:
        # returns data, columns

        if self.parse_dates is not None:
            data, names = _process_date_conversion(
                data,
                self._date_conv,
                self.parse_dates,
                self.index_col,
                self.index_names,
                names,
                keep_date_col=self.keep_date_col,
                dtype_backend=self.dtype_backend,
            )

        return names, data

    def _check_data_length(
        self,
        columns: Sequence[Hashable],
        data: Sequence[ArrayLike],
    ) -> None:
        """Checks if length of data is equal to length of column names.

        One set of trailing commas is allowed. If self.index_col is not False,
        a ParserError is raised earlier when the lengths do not match.

        Parameters
        ----------
        columns: list of column names
        data: list of array-likes containing the data column-wise.
        """
        if not self.index_col and len(columns) != len(data) and columns:
            empty_str = is_object_dtype(data[-1]) and data[-1] == ""
            # error: No overload variant of "__ror__" of "ndarray" matches
            # argument type "ExtensionArray"
            empty_str_or_na = empty_str | isna(data[-1])  # type: ignore[operator]
            if len(columns) == len(data) - 1 and np.all(empty_str_or_na):
                return
            warnings.warn(
                "Length of header or names does not match length of data. This leads "
                "to a loss of data with index_col=False.",
                ParserWarning,
                stacklevel=find_stack_level(),
            )

    @overload
    def _evaluate_usecols(
        self,
        usecols: set[int] | Callable[[Hashable], object],
        names: Sequence[Hashable],
    ) -> set[int]:
        ...

    @overload
    def _evaluate_usecols(
        self, usecols: set[str], names: Sequence[Hashable]
    ) -> set[str]:
        ...

    def _evaluate_usecols(
        self,
        usecols: Callable[[Hashable], object] | set[str] | set[int],
        names: Sequence[Hashable],
    ) -> set[str] | set[int]:
        """
        Check whether or not the 'usecols' parameter
        is a callable. If so, enumerates the 'names'
        parameter and returns a set of indices for
        each entry in 'names' that evaluates to True.
        If not a callable, returns 'usecols'.
        """
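        # e.g. usecols=lambda name: name.startswith("a") with
        # names ["ab", "b", "ac"] returns {0, 2}.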

        if callable(usecols):
            return {i for i, name in enumerate(names) if usecols(name)}
        return usecols

    def _validate_usecols_names(self, usecols, names):
        """
        Validates that all usecols are present in a given
        list of names. If not, raise a ValueError that
        shows what usecols are missing.

        Parameters
        ----------
        usecols : iterable of usecols
            The columns to validate are present in names.
        names : iterable of names
            The column names to check against.

        Returns
        -------
        usecols : iterable of usecols
            The `usecols` parameter if the validation succeeds.

        Raises
        ------
        ValueError : Columns were missing. Error message will list them.
        """
        missing = [c for c in usecols if c not in names]
        if len(missing) > 0:
            raise ValueError(
                f"Usecols do not match columns, columns expected but not found: "
                f"{missing}"
            )

        return usecols

    def _validate_usecols_arg(self, usecols):
        """
        Validate the 'usecols' parameter.

        Checks whether or not the 'usecols' parameter contains all integers
        (column selection by index), strings (column by name) or is a callable.
        Raises a ValueError if that is not the case.

        Parameters
        ----------
        usecols : list-like, callable, or None
            List of columns to use when parsing or a callable that can be used
            to filter a list of table columns.

        Returns
        -------
        usecols_tuple : tuple
            A tuple of (verified_usecols, usecols_dtype).

            'verified_usecols' is either a set if an array-like is passed in or
            'usecols' if a callable or None is passed in.

            'usecols_dtype' is the inferred dtype of 'usecols' if an array-like
            is passed in or None if a callable or None is passed in.
        """
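        # e.g. ["a", "b"] -> ({"a", "b"}, "string") and [0, 2] ->
        # ({0, 2}, "integer"); a callable or None passes through with
        # dtype None.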

        msg = (
            "'usecols' must either be list-like of all strings, all unicode, "
            "all integers or a callable."
        )
        if usecols is not None:
            if callable(usecols):
                return usecols, None

            if not is_list_like(usecols):
                # see gh-20529
                #
                # Ensure it is iterable container but not string.
                raise ValueError(msg)

            usecols_dtype = lib.infer_dtype(usecols, skipna=False)

            if usecols_dtype not in ("empty", "integer", "string"):
                raise ValueError(msg)

            usecols = set(usecols)

            return usecols, usecols_dtype
        return usecols, None

    def _clean_index_names(self, columns, index_col) -> tuple[list | None, list, list]:
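        # Resolve index_col entries to names and positions, removing them
        # from `columns`; e.g. (illustrative) columns ["a", "b", "c"] with
        # index_col ["a"] gives (["a"], ["b", "c"], [0]).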

        if not is_index_col(index_col):
            return None, columns, index_col

        columns = list(columns)

        # In case of no rows and multiindex columns we have to set index_names to
        # list of Nones GH#38292
        if not columns:
            return [None] * len(index_col), columns, index_col

        cp_cols = list(columns)
        index_names: list[str | int | None] = []

        # don't mutate
        index_col = list(index_col)

        for i, c in enumerate(index_col):
            if isinstance(c, str):
                index_names.append(c)
                for j, name in enumerate(cp_cols):
                    if name == c:
                        index_col[i] = j
                        columns.remove(name)
                        break
            else:
                name = cp_cols[c]
                columns.remove(name)
                index_names.append(name)

        # Only clean index names that were placeholders.
        for i, name in enumerate(index_names):
            if isinstance(name, str) and name in self.unnamed_cols:
                index_names[i] = None

        return index_names, columns, index_col

    def _get_empty_meta(
        self, columns, index_col, index_names, dtype: DtypeArg | None = None
    ):
        columns = list(columns)

        # Convert `dtype` to a defaultdict of some kind.
        # This will enable us to write `dtype[col_name]`
        # without worrying about KeyError issues later on.
        dtype_dict: defaultdict[Hashable, Any]
        if not is_dict_like(dtype):
            # if dtype == None, default will be object.
            default_dtype = dtype or object
            dtype_dict = defaultdict(lambda: default_dtype)
        else:
            dtype = cast(dict, dtype)
            dtype_dict = defaultdict(
                lambda: object,
                {columns[k] if is_integer(k) else k: v for k, v in dtype.items()},
            )

        # Even though we have no data, the "index" of the empty DataFrame
        # could for example still be an empty MultiIndex. Thus, we need to
        # check whether we have any index columns specified, via either:
        #
        # 1) index_col (column indices)
        # 2) index_names (column names)
        #
        # Both must be non-null to ensure a successful construction. Otherwise,
        # we have to create a generic empty Index.
        index: Index
        if (index_col is None or index_col is False) or index_names is None:
            index = default_index(0)
        else:
            data = [Series([], dtype=dtype_dict[name]) for name in index_names]
            index = ensure_index_from_sequences(data, names=index_names)
            index_col.sort()

            for i, n in enumerate(index_col):
                columns.pop(n - i)

        col_dict = {
            col_name: Series([], dtype=dtype_dict[col_name]) for col_name in columns
        }

        return index, columns, col_dict


def _make_date_converter(
    date_parser=lib.no_default,
    dayfirst: bool = False,
    cache_dates: bool = True,
    date_format: dict[Hashable, str] | str | None = None,
):

    if date_parser is not lib.no_default:
        warnings.warn(
            "The argument 'date_parser' is deprecated and will "
            "be removed in a future version. "
            "Please use 'date_format' instead, or read your data in as 'object' dtype "
            "and then call 'to_datetime'.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    if date_parser is not lib.no_default and date_format is not None:
        raise TypeError("Cannot use both 'date_parser' and 'date_format'")

    def unpack_if_single_element(arg):
        # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615
        if isinstance(arg, np.ndarray) and arg.ndim == 1 and len(arg) == 1:
            return arg[0]
        return arg

    def converter(*date_cols, col: Hashable):
        if len(date_cols) == 1 and date_cols[0].dtype.kind in "Mm":
            return date_cols[0]

        if date_parser is lib.no_default:
            strs = parsing.concat_date_cols(date_cols)
            date_fmt = (
                date_format.get(col) if isinstance(date_format, dict) else date_format
            )

            result = tools.to_datetime(
                ensure_object(strs),
                format=date_fmt,
                utc=False,
                dayfirst=dayfirst,
                errors="ignore",
                cache=cache_dates,
            )
            if isinstance(result, DatetimeIndex):
                arr = result.to_numpy()
                arr.flags.writeable = True
                return arr
            return result._values
        else:
            try:
                result = tools.to_datetime(
                    date_parser(*(unpack_if_single_element(arg) for arg in date_cols)),
                    errors="ignore",
                    cache=cache_dates,
                )
                if isinstance(result, datetime.datetime):
                    raise Exception("scalar parser")
                return result
            except Exception:
                return tools.to_datetime(
                    parsing.try_parse_dates(
                        parsing.concat_date_cols(date_cols),
                        parser=date_parser,
                    ),
                    errors="ignore",
                )

    return converter


parser_defaults = {
    "delimiter": None,
    "escapechar": None,
    "quotechar": '"',
    "quoting": csv.QUOTE_MINIMAL,
    "doublequote": True,
    "skipinitialspace": False,
    "lineterminator": None,
    "header": "infer",
    "index_col": None,
    "names": None,
    "skiprows": None,
    "skipfooter": 0,
    "nrows": None,
    "na_values": None,
    "keep_default_na": True,
    "true_values": None,
    "false_values": None,
    "converters": None,
    "dtype": None,
    "cache_dates": True,
    "thousands": None,
    "comment": None,
    "decimal": ".",
    # 'engine': 'c',
    "parse_dates": False,
    "keep_date_col": False,
    "dayfirst": False,
    "date_parser": lib.no_default,
    "date_format": None,
    "usecols": None,
    # 'iterator': False,
    "chunksize": None,
    "verbose": False,
    "encoding": None,
    "compression": None,
    "skip_blank_lines": True,
    "encoding_errors": "strict",
    "on_bad_lines": ParserBase.BadLineHandleMethod.ERROR,
    "dtype_backend": lib.no_default,
}


def _process_date_conversion(
    data_dict,
    converter: Callable,
    parse_spec,
    index_col,
    index_names,
    columns,
    keep_date_col: bool = False,
    dtype_backend=lib.no_default,
):

    def _isindex(colspec):
        return (isinstance(index_col, list) and colspec in index_col) or (
            isinstance(index_names, list) and colspec in index_names
        )

    new_cols = []
    new_data = {}

    orig_names = columns
    columns = list(columns)

    date_cols = set()

    if parse_spec is None or isinstance(parse_spec, bool):
        return data_dict, columns

    if isinstance(parse_spec, list):
        # list of column lists
        for colspec in parse_spec:
            if is_scalar(colspec) or isinstance(colspec, tuple):
                if isinstance(colspec, int) and colspec not in data_dict:
                    colspec = orig_names[colspec]
                if _isindex(colspec):
                    continue
                elif dtype_backend == "pyarrow":
                    import pyarrow as pa

                    dtype = data_dict[colspec].dtype
                    if isinstance(dtype, ArrowDtype) and (
                        pa.types.is_timestamp(dtype.pyarrow_dtype)
                        or pa.types.is_date(dtype.pyarrow_dtype)
                    ):
                        continue

                # The pyarrow engine returns Series, which we need to convert
                # to a numpy array before the converter; it's a no-op for
                # other parsers
                data_dict[colspec] = converter(
                    np.asarray(data_dict[colspec]), col=colspec
                )
            else:
                new_name, col, old_names = _try_convert_dates(
                    converter, colspec, data_dict, orig_names
                )
                if new_name in data_dict:
                    raise ValueError(f"New date column already in dict {new_name}")
                new_data[new_name] = col
                new_cols.append(new_name)
                date_cols.update(old_names)

    elif isinstance(parse_spec, dict):
        # dict of new name to column list
        for new_name, colspec in parse_spec.items():
            if new_name in data_dict:
                raise ValueError(f"Date column {new_name} already in dict")

            _, col, old_names = _try_convert_dates(
                converter,
                colspec,
                data_dict,
                orig_names,
                target_name=new_name,
            )

            new_data[new_name] = col

            # If original column can be converted to date we keep the converted values
            # This can only happen if values are from single column
            if len(colspec) == 1:
                new_data[colspec[0]] = col

            new_cols.append(new_name)
            date_cols.update(old_names)

    data_dict.update(new_data)
    new_cols.extend(columns)

    if not keep_date_col:
        for c in list(date_cols):
            data_dict.pop(c)
            new_cols.remove(c)

    return data_dict, new_cols


def _try_convert_dates(
    parser: Callable, colspec, data_dict, columns, target_name: str | None = None
):

    colset = set(columns)
    colnames = []

    for c in colspec:
        if c in colset:
            colnames.append(c)
        elif isinstance(c, int) and c not in columns:
            colnames.append(columns[c])
        else:
            colnames.append(c)

    new_name: tuple | str
    if all(isinstance(x, tuple) for x in colnames):
        new_name = tuple(map("_".join, zip(*colnames)))
    else:
        new_name = "_".join([str(x) for x in colnames])
    to_parse = [np.asarray(data_dict[c]) for c in colnames if c in data_dict]

    new_col = parser(*to_parse, col=new_name if target_name is None else target_name)
    return new_name, new_col, colnames


def _get_na_values(col, na_values, na_fvalues, keep_default_na):
    """
    Get the NaN values for a given column.

    Parameters
    ----------
    col : str
        The name of the column.
    na_values : array-like, dict
        The object listing the NaN values as strings.
    na_fvalues : array-like, dict
        The object listing the NaN values as floats.
    keep_default_na : bool
        If `na_values` is a dict, and the column is not mapped in the
        dictionary, whether to return the default NaN values or the empty set.

    Returns
    -------
    nan_tuple : A length-two tuple composed of

        1) na_values : the string NaN values for that column.
        2) na_fvalues : the float NaN values for that column.
    """
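    # e.g. (illustrative) with na_values={"a": {"missing"}} and a matching
    # na_fvalues dict, column "a" gets its own sets; any other column falls
    # back to STR_NA_VALUES when keep_default_na is True, else empty sets.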

    if isinstance(na_values, dict):
        if col in na_values:
            return na_values[col], na_fvalues[col]
        else:
            if keep_default_na:
                return STR_NA_VALUES, set()

            return set(), set()
    else:
        return na_values, na_fvalues


def _validate_parse_dates_arg(parse_dates):
    """
    Check whether or not the 'parse_dates' parameter
    is a non-boolean scalar. Raises a TypeError if
    that is the case.
    """
    msg = (
        "Only booleans, lists, and dictionaries are accepted "
        "for the 'parse_dates' parameter"
    )

    if parse_dates is not None:
        if is_scalar(parse_dates):
            if not lib.is_bool(parse_dates):
                raise TypeError(msg)

        elif not isinstance(parse_dates, (list, dict)):
            raise TypeError(msg)

    return parse_dates


def is_index_col(col) -> bool:
    return col is not None and col is not False