# Source: pandas/io/parsers/base_parser.py (from a coverage.py report; 619 statements, 45% covered)

from __future__ import annotations

from collections import defaultdict
from copy import copy
import csv
import datetime
from enum import Enum
import itertools
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    cast,
    final,
    overload,
)
import warnings

import numpy as np

from pandas._libs import (
    lib,
    parsers,
)
import pandas._libs.ops as libops
from pandas._libs.parsers import STR_NA_VALUES
from pandas._libs.tslibs import parsing
from pandas.compat._optional import import_optional_dependency
from pandas.errors import (
    ParserError,
    ParserWarning,
)
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.astype import astype_array
from pandas.core.dtypes.common import (
    ensure_object,
    is_bool_dtype,
    is_dict_like,
    is_extension_array_dtype,
    is_float_dtype,
    is_integer,
    is_integer_dtype,
    is_list_like,
    is_object_dtype,
    is_scalar,
    is_string_dtype,
    pandas_dtype,
)
from pandas.core.dtypes.dtypes import (
    CategoricalDtype,
    ExtensionDtype,
)
from pandas.core.dtypes.missing import isna

from pandas import (
    ArrowDtype,
    DataFrame,
    DatetimeIndex,
    StringDtype,
    concat,
)
from pandas.core import algorithms
from pandas.core.arrays import (
    ArrowExtensionArray,
    BaseMaskedArray,
    BooleanArray,
    Categorical,
    ExtensionArray,
    FloatingArray,
    IntegerArray,
)
from pandas.core.arrays.boolean import BooleanDtype
from pandas.core.indexes.api import (
    Index,
    MultiIndex,
    default_index,
    ensure_index_from_sequences,
)
from pandas.core.series import Series
from pandas.core.tools import datetimes as tools

from pandas.io.common import is_potential_multi_index

if TYPE_CHECKING:
    from collections.abc import (
        Hashable,
        Iterable,
        Mapping,
        Sequence,
    )

    from pandas._typing import (
        ArrayLike,
        DtypeArg,
        DtypeObj,
        Scalar,
    )


class ParserBase:
    class BadLineHandleMethod(Enum):
        ERROR = 0
        WARN = 1
        SKIP = 2

    _implicit_index: bool
    _first_chunk: bool
    keep_default_na: bool
    dayfirst: bool
    cache_dates: bool
    keep_date_col: bool
    usecols_dtype: str | None

    def __init__(self, kwds) -> None:
        self._implicit_index = False

        self.names = kwds.get("names")
        self.orig_names: Sequence[Hashable] | None = None

        self.index_col = kwds.get("index_col", None)
        self.unnamed_cols: set = set()
        self.index_names: Sequence[Hashable] | None = None
        self.col_names: Sequence[Hashable] | None = None

        self.parse_dates = _validate_parse_dates_arg(kwds.pop("parse_dates", False))
        self._parse_date_cols: Iterable = []
        self.date_parser = kwds.pop("date_parser", lib.no_default)
        self.date_format = kwds.pop("date_format", None)
        self.dayfirst = kwds.pop("dayfirst", False)
        self.keep_date_col = kwds.pop("keep_date_col", False)

        self.na_values = kwds.get("na_values")
        self.na_fvalues = kwds.get("na_fvalues")
        self.na_filter = kwds.get("na_filter", False)
        self.keep_default_na = kwds.get("keep_default_na", True)

        self.dtype = copy(kwds.get("dtype", None))
        self.converters = kwds.get("converters")
        self.dtype_backend = kwds.get("dtype_backend")

        self.true_values = kwds.get("true_values")
        self.false_values = kwds.get("false_values")
        self.cache_dates = kwds.pop("cache_dates", True)

        self._date_conv = _make_date_converter(
            date_parser=self.date_parser,
            date_format=self.date_format,
            dayfirst=self.dayfirst,
            cache_dates=self.cache_dates,
        )

        # validate header options for a multi-index header
        self.header = kwds.get("header")
        if is_list_like(self.header, allow_sets=False):
            if kwds.get("usecols"):
                raise ValueError(
                    "cannot specify usecols when specifying a multi-index header"
                )
            if kwds.get("names"):
                raise ValueError(
                    "cannot specify names when specifying a multi-index header"
                )

            # validate that index_col only contains integers
            if self.index_col is not None:
                # In this case we can pin down index_col as list[int]
                if is_integer(self.index_col):
                    self.index_col = [self.index_col]
                elif not (
                    is_list_like(self.index_col, allow_sets=False)
                    and all(map(is_integer, self.index_col))
                ):
                    raise ValueError(
                        "index_col must only contain row numbers "
                        "when specifying a multi-index header"
                    )
                else:
                    self.index_col = list(self.index_col)

        self._name_processed = False

        self._first_chunk = True

        self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"])

        # Fall back to ERROR to pass a sketchy test (test_override_set_noconvert_columns).
        # Normally, this arg would get pre-processed earlier on.
        self.on_bad_lines = kwds.get("on_bad_lines", self.BadLineHandleMethod.ERROR)

    def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> Iterable:
        """
        Check if parse_dates are in columns.

        If the user has provided names for parse_dates, check if those columns
        are available.

        Parameters
        ----------
        columns : list
            List of names of the dataframe.

        Returns
        -------
        The names of the columns which will get parsed later if a dict or list
        is given as specification.

        Raises
        ------
        ValueError
            If a column to parse dates from is not in the dataframe.

        """
        cols_needed: Iterable
        if is_dict_like(self.parse_dates):
            cols_needed = itertools.chain(*self.parse_dates.values())
        elif is_list_like(self.parse_dates):
            # a column in parse_dates could be represented as
            # ColReference = Union[int, str]
            # DateGroups = List[ColReference]
            # ParseDates = Union[DateGroups, List[DateGroups],
            #   Dict[ColReference, DateGroups]]
            cols_needed = itertools.chain.from_iterable(
                col if is_list_like(col) and not isinstance(col, tuple) else [col]
                for col in self.parse_dates
            )
        else:
            cols_needed = []

        cols_needed = list(cols_needed)

        # get only columns that are referenced by name (str), not by index
        missing_cols = ", ".join(
            sorted(
                {
                    col
                    for col in cols_needed
                    if isinstance(col, str) and col not in columns
                }
            )
        )
        if missing_cols:
            raise ValueError(
                f"Missing column provided to 'parse_dates': '{missing_cols}'"
            )
        # Convert positions to actual column names
        return [
            col if (isinstance(col, str) or col in columns) else columns[col]
            for col in cols_needed
        ]
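
    # Illustrative examples (not part of the original module): with
    # parse_dates=[[0, 1], 2] the needed columns are [0, 1, 2], and with
    # parse_dates={"ts": ["date", "time"]} they are ["date", "time"]. A
    # string entry missing from `columns` raises ValueError, while integer
    # positions are translated into the corresponding names from `columns`.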

    def close(self) -> None:
        pass

    @final
    @property
    def _has_complex_date_col(self) -> bool:
        return isinstance(self.parse_dates, dict) or (
            isinstance(self.parse_dates, list)
            and len(self.parse_dates) > 0
            and isinstance(self.parse_dates[0], list)
        )

    @final
    def _should_parse_dates(self, i: int) -> bool:
        if lib.is_bool(self.parse_dates):
            return bool(self.parse_dates)
        else:
            if self.index_names is not None:
                name = self.index_names[i]
            else:
                name = None
            j = i if self.index_col is None else self.index_col[i]

            return (j in self.parse_dates) or (
                name is not None and name in self.parse_dates
            )
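
    # Illustrative example (not part of the original module): with
    # index_col=[0], index_names=["date"] and parse_dates=["date"],
    # _should_parse_dates(0) is True because the level name "date" appears
    # in parse_dates; with parse_dates=True every index level is parsed.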

    @final
    def _extract_multi_indexer_columns(
        self,
        header,
        index_names: Sequence[Hashable] | None,
        passed_names: bool = False,
    ) -> tuple[
        Sequence[Hashable], Sequence[Hashable] | None, Sequence[Hashable] | None, bool
    ]:
        """
        Extract and return the names, index_names, col_names if the column
        names are a MultiIndex.

        Parameters
        ----------
        header : list of lists
            The header rows
        index_names : list, optional
            The names of the future index
        passed_names : bool, default False
            A flag specifying if names were passed

        """
        if len(header) < 2:
            return header[0], index_names, None, passed_names

        # the names are the tuples of the header that are not the index cols
        # 0 is the name of the index, assuming index_col is a list of column
        # numbers
        ic = self.index_col
        if ic is None:
            ic = []

        if not isinstance(ic, (list, tuple, np.ndarray)):
            ic = [ic]
        sic = set(ic)

        # clean the index_names
        index_names = header.pop(-1)
        index_names, _, _ = self._clean_index_names(index_names, self.index_col)

        # extract the columns
        field_count = len(header[0])

        # check if header lengths are equal
        if not all(len(header_iter) == field_count for header_iter in header[1:]):
            raise ParserError("Header rows must have an equal number of columns.")

        def extract(r):
            return tuple(r[i] for i in range(field_count) if i not in sic)

        columns = list(zip(*(extract(r) for r in header)))
        names = columns.copy()
        for single_ic in sorted(ic):
            names.insert(single_ic, single_ic)

        # Clean the column names (if we have an index_col).
        if len(ic):
            col_names = [
                r[ic[0]]
                if ((r[ic[0]] is not None) and r[ic[0]] not in self.unnamed_cols)
                else None
                for r in header
            ]
        else:
            col_names = [None] * len(header)

        passed_names = True

        return names, index_names, col_names, passed_names
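
    # Illustrative example (not part of the original module): given header
    # rows [["a", "a"], ["x", "y"], [None, None]] and no index_col, the last
    # row is popped as the index-name row and the extracted column names are
    # [("a", "x"), ("a", "y")]; positions listed in index_col are removed
    # from the tuples and re-inserted as placeholder level positions.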

    @final
    def _maybe_make_multi_index_columns(
        self,
        columns: Sequence[Hashable],
        col_names: Sequence[Hashable] | None = None,
    ) -> Sequence[Hashable] | MultiIndex:
        # possibly create a column MultiIndex here
        if is_potential_multi_index(columns):
            list_columns = cast(list[tuple], columns)
            return MultiIndex.from_tuples(list_columns, names=col_names)
        return columns

    @final
    def _make_index(
        self, data, alldata, columns, indexnamerow: list[Scalar] | None = None
    ) -> tuple[Index | None, Sequence[Hashable] | MultiIndex]:
        index: Index | None
        if not is_index_col(self.index_col) or not self.index_col:
            index = None

        elif not self._has_complex_date_col:
            simple_index = self._get_simple_index(alldata, columns)
            index = self._agg_index(simple_index)
        elif self._has_complex_date_col:
            if not self._name_processed:
                (self.index_names, _, self.index_col) = self._clean_index_names(
                    list(columns), self.index_col
                )
                self._name_processed = True
            date_index = self._get_complex_date_index(data, columns)
            index = self._agg_index(date_index, try_parse_dates=False)

        # add names for the index
        if indexnamerow:
            coffset = len(indexnamerow) - len(columns)
            assert index is not None
            index = index.set_names(indexnamerow[:coffset])

        # maybe create a MultiIndex on the columns
        columns = self._maybe_make_multi_index_columns(columns, self.col_names)

        return index, columns

    @final
    def _get_simple_index(self, data, columns):
        def ix(col):
            if not isinstance(col, str):
                return col
            raise ValueError(f"Index {col} invalid")

        to_remove = []
        index = []
        for idx in self.index_col:
            i = ix(idx)
            to_remove.append(i)
            index.append(data[i])

        # remove index items from content and columns, don't pop in
        # loop
        for i in sorted(to_remove, reverse=True):
            data.pop(i)
            if not self._implicit_index:
                columns.pop(i)

        return index

    @final
    def _get_complex_date_index(self, data, col_names):
        def _get_name(icol):
            if isinstance(icol, str):
                return icol

            if col_names is None:
                raise ValueError(f"Must supply column order to use {icol!s} as index")

            for i, c in enumerate(col_names):
                if i == icol:
                    return c

        to_remove = []
        index = []
        for idx in self.index_col:
            name = _get_name(idx)
            to_remove.append(name)
            index.append(data[name])

        # remove index items from content and columns, don't pop in
        # loop
        for c in sorted(to_remove, reverse=True):
            data.pop(c)
            col_names.remove(c)

        return index

    @final
    def _clean_mapping(self, mapping):
        """converts col numbers to names"""
        if not isinstance(mapping, dict):
            return mapping
        clean = {}
        # for mypy
        assert self.orig_names is not None

        for col, v in mapping.items():
            if isinstance(col, int) and col not in self.orig_names:
                col = self.orig_names[col]
            clean[col] = v
        if isinstance(mapping, defaultdict):
            remaining_cols = set(self.orig_names) - set(clean.keys())
            clean.update({col: mapping[col] for col in remaining_cols})
        return clean
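
    # Illustrative example (not part of the original module): with
    # orig_names=["a", "b"], _clean_mapping({0: float, "b": str}) returns
    # {"a": float, "b": str}; integer keys that are not themselves column
    # names are translated into the column name at that position.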

    @final
    def _agg_index(self, index, try_parse_dates: bool = True) -> Index:
        arrays = []
        converters = self._clean_mapping(self.converters)

        for i, arr in enumerate(index):
            if try_parse_dates and self._should_parse_dates(i):
                arr = self._date_conv(
                    arr,
                    col=self.index_names[i] if self.index_names is not None else None,
                )

            if self.na_filter:
                col_na_values = self.na_values
                col_na_fvalues = self.na_fvalues
            else:
                col_na_values = set()
                col_na_fvalues = set()

            if isinstance(self.na_values, dict):
                assert self.index_names is not None
                col_name = self.index_names[i]
                if col_name is not None:
                    col_na_values, col_na_fvalues = _get_na_values(
                        col_name, self.na_values, self.na_fvalues, self.keep_default_na
                    )

            clean_dtypes = self._clean_mapping(self.dtype)

            cast_type = None
            index_converter = False
            if self.index_names is not None:
                if isinstance(clean_dtypes, dict):
                    cast_type = clean_dtypes.get(self.index_names[i], None)

                if isinstance(converters, dict):
                    index_converter = converters.get(self.index_names[i]) is not None

            try_num_bool = not (
                cast_type and is_string_dtype(cast_type) or index_converter
            )

            arr, _ = self._infer_types(
                arr, col_na_values | col_na_fvalues, cast_type is None, try_num_bool
            )
            arrays.append(arr)

        names = self.index_names
        index = ensure_index_from_sequences(arrays, names)

        return index

    @final
    def _convert_to_ndarrays(
        self,
        dct: Mapping,
        na_values,
        na_fvalues,
        verbose: bool = False,
        converters=None,
        dtypes=None,
    ):
        result = {}
        for c, values in dct.items():
            conv_f = None if converters is None else converters.get(c, None)
            if isinstance(dtypes, dict):
                cast_type = dtypes.get(c, None)
            else:
                # single dtype or None
                cast_type = dtypes

            if self.na_filter:
                col_na_values, col_na_fvalues = _get_na_values(
                    c, na_values, na_fvalues, self.keep_default_na
                )
            else:
                col_na_values, col_na_fvalues = set(), set()

            if c in self._parse_date_cols:
                # GH#26203 Do not convert columns which get converted to dates
                # but replace nans to ensure to_datetime works
                mask = algorithms.isin(values, set(col_na_values) | col_na_fvalues)
                np.putmask(values, mask, np.nan)
                result[c] = values
                continue

            if conv_f is not None:
                # conv_f applied to data before inference
                if cast_type is not None:
                    warnings.warn(
                        (
                            "Both a converter and dtype were specified "
                            f"for column {c} - only the converter will be used."
                        ),
                        ParserWarning,
                        stacklevel=find_stack_level(),
                    )

                try:
                    values = lib.map_infer(values, conv_f)
                except ValueError:
                    mask = algorithms.isin(values, list(na_values)).view(np.uint8)
                    values = lib.map_infer_mask(values, conv_f, mask)

                cvals, na_count = self._infer_types(
                    values,
                    set(col_na_values) | col_na_fvalues,
                    cast_type is None,
                    try_num_bool=False,
                )
            else:
                is_ea = is_extension_array_dtype(cast_type)
                is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type)
                # skip inference if specified dtype is object
                # or casting to an EA
                try_num_bool = not (cast_type and is_str_or_ea_dtype)

                # general type inference and conversion
                cvals, na_count = self._infer_types(
                    values,
                    set(col_na_values) | col_na_fvalues,
                    cast_type is None,
                    try_num_bool,
                )

                # type specified in dtype param or cast_type is an EA
                if cast_type is not None:
                    cast_type = pandas_dtype(cast_type)
                if cast_type and (cvals.dtype != cast_type or is_ea):
                    if not is_ea and na_count > 0:
                        if is_bool_dtype(cast_type):
                            raise ValueError(f"Bool column has NA values in column {c}")
                    cvals = self._cast_types(cvals, cast_type, c)

            result[c] = cvals
            if verbose and na_count:
                print(f"Filled {na_count} NA values in column {c!s}")
        return result

    @final
    def _set_noconvert_dtype_columns(
        self, col_indices: list[int], names: Sequence[Hashable]
    ) -> set[int]:
        """
        Set the columns that should not undergo dtype conversions.

        Currently, any column that is involved with date parsing will not
        undergo such conversions. If usecols is specified, the positions of
        the columns not to cast are relative to usecols, not to all columns.

        Parameters
        ----------
        col_indices : The indices specifying order and positions of the columns
        names : The column names, in the same order as col_indices

        Returns
        -------
        A set of integers containing the positions of the columns not to convert.
        """
        usecols: list[int] | list[str] | None
        noconvert_columns = set()
        if self.usecols_dtype == "integer":
            # A set of integers will be converted to a list in
            # the correct order every single time.
            usecols = sorted(self.usecols)
        elif callable(self.usecols) or self.usecols_dtype not in ("empty", None):
            # The names attribute should have the correct columns
            # in the proper order for indexing with parse_dates.
            usecols = col_indices
        else:
            # Usecols is empty.
            usecols = None

        def _set(x) -> int:
            if usecols is not None and is_integer(x):
                x = usecols[x]

            if not is_integer(x):
                x = col_indices[names.index(x)]

            return x

        if isinstance(self.parse_dates, list):
            for val in self.parse_dates:
                if isinstance(val, list):
                    for k in val:
                        noconvert_columns.add(_set(k))
                else:
                    noconvert_columns.add(_set(val))

        elif isinstance(self.parse_dates, dict):
            for val in self.parse_dates.values():
                if isinstance(val, list):
                    for k in val:
                        noconvert_columns.add(_set(k))
                else:
                    noconvert_columns.add(_set(val))

        elif self.parse_dates:
            if isinstance(self.index_col, list):
                for k in self.index_col:
                    noconvert_columns.add(_set(k))
            elif self.index_col is not None:
                noconvert_columns.add(_set(self.index_col))

        return noconvert_columns
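
    # Illustrative example (not part of the original module): with
    # names=["a", "b", "c"], col_indices=[0, 1, 2], no usecols, and
    # parse_dates=["b", 2], this returns {1, 2}: "b" resolves through
    # names.index, and the bare integer 2 is kept as a position.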

    @final
    def _infer_types(
        self, values, na_values, no_dtype_specified, try_num_bool: bool = True
    ) -> tuple[ArrayLike, int]:
        """
        Infer types of values, possibly casting

        Parameters
        ----------
        values : ndarray
        na_values : set
        no_dtype_specified : Specifies if we want to cast explicitly
        try_num_bool : bool, default True
            try to cast values to numeric (first preference) or boolean

        Returns
        -------
        converted : ndarray or ExtensionArray
        na_count : int
        """
        na_count = 0
        if issubclass(values.dtype.type, (np.number, np.bool_)):
            # If our array has numeric dtype, we don't have to check for strings in isin
            na_values = np.array([val for val in na_values if not isinstance(val, str)])
            mask = algorithms.isin(values, na_values)
            na_count = mask.astype("uint8", copy=False).sum()
            if na_count > 0:
                if is_integer_dtype(values):
                    values = values.astype(np.float64)
                np.putmask(values, mask, np.nan)
            return values, na_count

        dtype_backend = self.dtype_backend
        non_default_dtype_backend = (
            no_dtype_specified and dtype_backend is not lib.no_default
        )
        result: ArrayLike

        if try_num_bool and is_object_dtype(values.dtype):
            # exclude e.g. DatetimeIndex here
            try:
                result, result_mask = lib.maybe_convert_numeric(
                    values,
                    na_values,
                    False,
                    convert_to_masked_nullable=non_default_dtype_backend,  # type: ignore[arg-type]
                )
            except (ValueError, TypeError):
                # e.g. encountering a datetime string gets ValueError
                # TypeError can be raised in floatify
                na_count = parsers.sanitize_objects(values, na_values)
                result = values
            else:
                if non_default_dtype_backend:
                    if result_mask is None:
                        result_mask = np.zeros(result.shape, dtype=np.bool_)

                    if result_mask.all():
                        result = IntegerArray(
                            np.ones(result_mask.shape, dtype=np.int64), result_mask
                        )
                    elif is_integer_dtype(result):
                        result = IntegerArray(result, result_mask)
                    elif is_bool_dtype(result):
                        result = BooleanArray(result, result_mask)
                    elif is_float_dtype(result):
                        result = FloatingArray(result, result_mask)

                    na_count = result_mask.sum()
                else:
                    na_count = isna(result).sum()
        else:
            result = values
            if values.dtype == np.object_:
                na_count = parsers.sanitize_objects(values, na_values)

        if result.dtype == np.object_ and try_num_bool:
            result, bool_mask = libops.maybe_convert_bool(
                np.asarray(values),
                true_values=self.true_values,
                false_values=self.false_values,
                convert_to_masked_nullable=non_default_dtype_backend,  # type: ignore[arg-type]
            )
            if result.dtype == np.bool_ and non_default_dtype_backend:
                if bool_mask is None:
                    bool_mask = np.zeros(result.shape, dtype=np.bool_)
                result = BooleanArray(result, bool_mask)
            elif result.dtype == np.object_ and non_default_dtype_backend:
                # read_excel sends array of datetime objects
                if not lib.is_datetime_array(result, skipna=True):
                    dtype = StringDtype()
                    cls = dtype.construct_array_type()
                    result = cls._from_sequence(values, dtype=dtype)

        if dtype_backend == "pyarrow":
            pa = import_optional_dependency("pyarrow")
            if isinstance(result, np.ndarray):
                result = ArrowExtensionArray(pa.array(result, from_pandas=True))
            elif isinstance(result, BaseMaskedArray):
                if result._mask.all():
                    # We want an arrow null array here
                    result = ArrowExtensionArray(pa.array([None] * len(result)))
                else:
                    result = ArrowExtensionArray(
                        pa.array(result._data, mask=result._mask)
                    )
            else:
                result = ArrowExtensionArray(
                    pa.array(result.to_numpy(), from_pandas=True)
                )

        return result, na_count
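
    # Illustrative example (not part of the original module): for object
    # values np.array(["1", "2", "NA"], dtype=object) with na_values={"NA"}
    # and the default backend, the numeric path yields a float array
    # [1.0, 2.0, nan] with na_count == 1; with dtype_backend="numpy_nullable"
    # the same input becomes an IntegerArray with one masked entry.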

    @final
    def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLike:
        """
        Cast values to specified type

        Parameters
        ----------
        values : ndarray or ExtensionArray
        cast_type : np.dtype or ExtensionDtype
            dtype to cast values to
        column : string
            column name - used only for error reporting

        Returns
        -------
        converted : ndarray or ExtensionArray
        """
        if isinstance(cast_type, CategoricalDtype):
            known_cats = cast_type.categories is not None

            if not is_object_dtype(values.dtype) and not known_cats:
                # TODO: this is for consistency with
                # c-parser which parses all categories
                # as strings
                values = lib.ensure_string_array(
                    values, skipna=False, convert_na_value=False
                )

            cats = Index(values).unique().dropna()
            values = Categorical._from_inferred_categories(
                cats, cats.get_indexer(values), cast_type, true_values=self.true_values
            )

        # use the EA's implementation of casting
        elif isinstance(cast_type, ExtensionDtype):
            array_type = cast_type.construct_array_type()
            try:
                if isinstance(cast_type, BooleanDtype):
                    # error: Unexpected keyword argument "true_values" for
                    # "_from_sequence_of_strings" of "ExtensionArray"
                    return array_type._from_sequence_of_strings(  # type: ignore[call-arg]
                        values,
                        dtype=cast_type,
                        true_values=self.true_values,
                        false_values=self.false_values,
                    )
                else:
                    return array_type._from_sequence_of_strings(values, dtype=cast_type)
            except NotImplementedError as err:
                raise NotImplementedError(
                    f"Extension Array: {array_type} must implement "
                    "_from_sequence_of_strings in order to be used in parser methods"
                ) from err

        elif isinstance(values, ExtensionArray):
            values = values.astype(cast_type, copy=False)
        elif issubclass(cast_type.type, str):
            # TODO: why skipna=True here and False above? some tests depend
            # on it here, but nothing fails if we change it above
            # (as no tests get there as of 2022-12-06)
            values = lib.ensure_string_array(
                values, skipna=True, convert_na_value=False
            )
        else:
            try:
                values = astype_array(values, cast_type, copy=True)
            except ValueError as err:
                raise ValueError(
                    f"Unable to convert column {column} to type {cast_type}"
                ) from err
        return values
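
    # Illustrative example (not part of the original module): casting
    # np.array(["a", "b", "a"], dtype=object) with CategoricalDtype() infers
    # the categories ["a", "b"] and returns a Categorical with codes
    # [0, 1, 0].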

    @overload
    def _do_date_conversions(
        self,
        names: Index,
        data: DataFrame,
    ) -> tuple[Sequence[Hashable] | Index, DataFrame]:
        ...

    @overload
    def _do_date_conversions(
        self,
        names: Sequence[Hashable],
        data: Mapping[Hashable, ArrayLike],
    ) -> tuple[Sequence[Hashable], Mapping[Hashable, ArrayLike]]:
        ...

    @final
    def _do_date_conversions(
        self,
        names: Sequence[Hashable] | Index,
        data: Mapping[Hashable, ArrayLike] | DataFrame,
    ) -> tuple[Sequence[Hashable] | Index, Mapping[Hashable, ArrayLike] | DataFrame]:
        # returns data, columns

        if self.parse_dates is not None:
            data, names = _process_date_conversion(
                data,
                self._date_conv,
                self.parse_dates,
                self.index_col,
                self.index_names,
                names,
                keep_date_col=self.keep_date_col,
                dtype_backend=self.dtype_backend,
            )

        return names, data

    @final
    def _check_data_length(
        self,
        columns: Sequence[Hashable],
        data: Sequence[ArrayLike],
    ) -> None:
        """Checks if length of data is equal to length of column names.

        One set of trailing commas is allowed. If self.index_col is not False,
        mismatched lengths already raise a ParserError earlier on.

        Parameters
        ----------
        columns : list of column names
        data : list of array-likes containing the data column-wise.
        """
        if not self.index_col and len(columns) != len(data) and columns:
            empty_str = is_object_dtype(data[-1]) and data[-1] == ""
            # error: No overload variant of "__ror__" of "ndarray" matches
            # argument type "ExtensionArray"
            empty_str_or_na = empty_str | isna(data[-1])  # type: ignore[operator]
            if len(columns) == len(data) - 1 and np.all(empty_str_or_na):
                return
            warnings.warn(
                "Length of header or names does not match length of data. This leads "
                "to a loss of data with index_col=False.",
                ParserWarning,
                stacklevel=find_stack_level(),
            )

    @overload
    def _evaluate_usecols(
        self,
        usecols: set[int] | Callable[[Hashable], object],
        names: Sequence[Hashable],
    ) -> set[int]:
        ...

    @overload
    def _evaluate_usecols(
        self, usecols: set[str], names: Sequence[Hashable]
    ) -> set[str]:
        ...

    @final
    def _evaluate_usecols(
        self,
        usecols: Callable[[Hashable], object] | set[str] | set[int],
        names: Sequence[Hashable],
    ) -> set[str] | set[int]:
        """
        Check whether or not the 'usecols' parameter
        is a callable. If so, enumerates the 'names'
        parameter and returns a set of indices for
        each entry in 'names' that evaluates to True.
        If not a callable, returns 'usecols'.
        """
        if callable(usecols):
            return {i for i, name in enumerate(names) if usecols(name)}
        return usecols
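
    # Illustrative example (not part of the original module): with
    # names=["a", "b", "c"], _evaluate_usecols(lambda n: n != "b", names)
    # returns {0, 2}; a plain set such as {"a", "c"} is passed through
    # unchanged.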

    @final
    def _validate_usecols_names(self, usecols, names: Sequence):
        """
        Validates that all usecols are present in a given
        list of names. If not, raise a ValueError that
        shows what usecols are missing.

        Parameters
        ----------
        usecols : iterable of usecols
            The columns to validate are present in names.
        names : iterable of names
            The column names to check against.

        Returns
        -------
        usecols : iterable of usecols
            The `usecols` parameter if the validation succeeds.

        Raises
        ------
        ValueError : Columns were missing. Error message will list them.
        """
        missing = [c for c in usecols if c not in names]
        if len(missing) > 0:
            raise ValueError(
                f"Usecols do not match columns, columns expected but not found: "
                f"{missing}"
            )

        return usecols

    @final
    def _validate_usecols_arg(self, usecols):
        """
        Validate the 'usecols' parameter.

        Checks whether or not the 'usecols' parameter contains all integers
        (column selection by index), strings (column by name) or is a callable.
        Raises a ValueError if that is not the case.

        Parameters
        ----------
        usecols : list-like, callable, or None
            List of columns to use when parsing or a callable that can be used
            to filter a list of table columns.

        Returns
        -------
        usecols_tuple : tuple
            A tuple of (verified_usecols, usecols_dtype).

            'verified_usecols' is either a set if an array-like is passed in or
            'usecols' if a callable or None is passed in.

            'usecols_dtype' is the inferred dtype of 'usecols' if an array-like
            is passed in or None if a callable or None is passed in.
        """
        msg = (
            "'usecols' must either be list-like of all strings, all unicode, "
            "all integers or a callable."
        )
        if usecols is not None:
            if callable(usecols):
                return usecols, None

            if not is_list_like(usecols):
                # see gh-20529
                #
                # Ensure it is iterable container but not string.
                raise ValueError(msg)

            usecols_dtype = lib.infer_dtype(usecols, skipna=False)

            if usecols_dtype not in ("empty", "integer", "string"):
                raise ValueError(msg)

            usecols = set(usecols)

            return usecols, usecols_dtype
        return usecols, None
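
    # Illustrative examples (not part of the original module):
    # _validate_usecols_arg(["a", "b"]) returns ({"a", "b"}, "string"),
    # _validate_usecols_arg([0, 2]) returns ({0, 2}, "integer"), and a
    # mixed list such as ["a", 0] raises ValueError.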

    @final
    def _clean_index_names(self, columns, index_col) -> tuple[list | None, list, list]:
        if not is_index_col(index_col):
            return None, columns, index_col

        columns = list(columns)

        # In case of no rows and multiindex columns we have to set index_names to
        # list of Nones GH#38292
        if not columns:
            return [None] * len(index_col), columns, index_col

        cp_cols = list(columns)
        index_names: list[str | int | None] = []

        # don't mutate
        index_col = list(index_col)

        for i, c in enumerate(index_col):
            if isinstance(c, str):
                index_names.append(c)
                for j, name in enumerate(cp_cols):
                    if name == c:
                        index_col[i] = j
                        columns.remove(name)
                        break
            else:
                name = cp_cols[c]
                columns.remove(name)
                index_names.append(name)

        # Only clean index names that were placeholders.
        for i, name in enumerate(index_names):
            if isinstance(name, str) and name in self.unnamed_cols:
                index_names[i] = None

        return index_names, columns, index_col
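
    # Illustrative example (not part of the original module): with
    # columns=["date", "a", "b"] and index_col=["date"], this returns
    # (["date"], ["a", "b"], [0]): the name is resolved to position 0 and
    # removed from the remaining columns.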

    @final
    def _get_empty_meta(self, columns, dtype: DtypeArg | None = None):
        columns = list(columns)

        index_col = self.index_col
        index_names = self.index_names

        # Convert `dtype` to a defaultdict of some kind.
        # This will enable us to write `dtype[col_name]`
        # without worrying about KeyError issues later on.
        dtype_dict: defaultdict[Hashable, Any]
        if not is_dict_like(dtype):
            # if dtype is None, the default will be object.
            default_dtype = dtype or object
            dtype_dict = defaultdict(lambda: default_dtype)
        else:
            dtype = cast(dict, dtype)
            dtype_dict = defaultdict(
                lambda: object,
                {columns[k] if is_integer(k) else k: v for k, v in dtype.items()},
            )

        # Even though we have no data, the "index" of the empty DataFrame
        # could for example still be an empty MultiIndex. Thus, we need to
        # check whether we have any index columns specified, via either:
        #
        # 1) index_col (column indices)
        # 2) index_names (column names)
        #
        # Both must be non-null to ensure a successful construction. Otherwise,
        # we have to create a generic empty Index.
        index: Index
        if (index_col is None or index_col is False) or index_names is None:
            index = default_index(0)
        else:
            data = [Series([], dtype=dtype_dict[name]) for name in index_names]
            index = ensure_index_from_sequences(data, names=index_names)
            index_col.sort()

            for i, n in enumerate(index_col):
                columns.pop(n - i)

        col_dict = {
            col_name: Series([], dtype=dtype_dict[col_name]) for col_name in columns
        }

        return index, columns, col_dict


def _make_date_converter(
    date_parser=lib.no_default,
    dayfirst: bool = False,
    cache_dates: bool = True,
    date_format: dict[Hashable, str] | str | None = None,
):
    if date_parser is not lib.no_default:
        warnings.warn(
            "The argument 'date_parser' is deprecated and will "
            "be removed in a future version. "
            "Please use 'date_format' instead, or read your data in as 'object' dtype "
            "and then call 'to_datetime'.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    if date_parser is not lib.no_default and date_format is not None:
        raise TypeError("Cannot use both 'date_parser' and 'date_format'")

    def unpack_if_single_element(arg):
        # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615
        if isinstance(arg, np.ndarray) and arg.ndim == 1 and len(arg) == 1:
            return arg[0]
        return arg

    def converter(*date_cols, col: Hashable):
        if len(date_cols) == 1 and date_cols[0].dtype.kind in "Mm":
            return date_cols[0]

        if date_parser is lib.no_default:
            strs = parsing.concat_date_cols(date_cols)
            date_fmt = (
                date_format.get(col) if isinstance(date_format, dict) else date_format
            )

            with warnings.catch_warnings():
                warnings.filterwarnings(
                    "ignore",
                    ".*parsing datetimes with mixed time zones will raise an error",
                    category=FutureWarning,
                )
                str_objs = ensure_object(strs)
                try:
                    result = tools.to_datetime(
                        str_objs,
                        format=date_fmt,
                        utc=False,
                        dayfirst=dayfirst,
                        cache=cache_dates,
                    )
                except (ValueError, TypeError):
                    # test_usecols_with_parse_dates4
                    return str_objs

            if isinstance(result, DatetimeIndex):
                arr = result.to_numpy()
                arr.flags.writeable = True
                return arr
            return result._values
        else:
            try:
                with warnings.catch_warnings():
                    warnings.filterwarnings(
                        "ignore",
                        ".*parsing datetimes with mixed time zones "
                        "will raise an error",
                        category=FutureWarning,
                    )
                    pre_parsed = date_parser(
                        *(unpack_if_single_element(arg) for arg in date_cols)
                    )
                    try:
                        result = tools.to_datetime(
                            pre_parsed,
                            cache=cache_dates,
                        )
                    except (ValueError, TypeError):
                        # test_read_csv_with_custom_date_parser
                        result = pre_parsed
                    if isinstance(result, datetime.datetime):
                        raise Exception("scalar parser")
                    return result
            except Exception:
                # e.g. test_datetime_fractional_seconds
                with warnings.catch_warnings():
                    warnings.filterwarnings(
                        "ignore",
                        ".*parsing datetimes with mixed time zones "
                        "will raise an error",
                        category=FutureWarning,
                    )
                    pre_parsed = parsing.try_parse_dates(
                        parsing.concat_date_cols(date_cols),
                        parser=date_parser,
                    )
                    try:
                        return tools.to_datetime(pre_parsed)
                    except (ValueError, TypeError):
                        # TODO: not reached in tests 2023-10-27; needed?
                        return pre_parsed

    return converter
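
# Illustrative example (not part of the original module): a converter built
# with _make_date_converter(date_format="%Y-%m-%d") turns an object array
# like np.array(["2021-01-01", "2021-01-02"], dtype=object) into a
# datetime64[ns] array via to_datetime; unparseable input is returned
# unconverted.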


parser_defaults = {
    "delimiter": None,
    "escapechar": None,
    "quotechar": '"',
    "quoting": csv.QUOTE_MINIMAL,
    "doublequote": True,
    "skipinitialspace": False,
    "lineterminator": None,
    "header": "infer",
    "index_col": None,
    "names": None,
    "skiprows": None,
    "skipfooter": 0,
    "nrows": None,
    "na_values": None,
    "keep_default_na": True,
    "true_values": None,
    "false_values": None,
    "converters": None,
    "dtype": None,
    "cache_dates": True,
    "thousands": None,
    "comment": None,
    "decimal": ".",
    # 'engine': 'c',
    "parse_dates": False,
    "keep_date_col": False,
    "dayfirst": False,
    "date_parser": lib.no_default,
    "date_format": None,
    "usecols": None,
    # 'iterator': False,
    "chunksize": None,
    "verbose": False,
    "encoding": None,
    "compression": None,
    "skip_blank_lines": True,
    "encoding_errors": "strict",
    "on_bad_lines": ParserBase.BadLineHandleMethod.ERROR,
    "dtype_backend": lib.no_default,
}


def _process_date_conversion(
    data_dict,
    converter: Callable,
    parse_spec,
    index_col,
    index_names,
    columns,
    keep_date_col: bool = False,
    dtype_backend=lib.no_default,
):
    def _isindex(colspec):
        return (isinstance(index_col, list) and colspec in index_col) or (
            isinstance(index_names, list) and colspec in index_names
        )

    new_cols = []
    new_data = {}

    orig_names = columns
    columns = list(columns)

    date_cols = set()

    if parse_spec is None or isinstance(parse_spec, bool):
        return data_dict, columns

    if isinstance(parse_spec, list):
        # list of column lists
        for colspec in parse_spec:
            if is_scalar(colspec) or isinstance(colspec, tuple):
                if isinstance(colspec, int) and colspec not in data_dict:
                    colspec = orig_names[colspec]
                if _isindex(colspec):
                    continue
                elif dtype_backend == "pyarrow":
                    import pyarrow as pa

                    dtype = data_dict[colspec].dtype
                    if isinstance(dtype, ArrowDtype) and (
                        pa.types.is_timestamp(dtype.pyarrow_dtype)
                        or pa.types.is_date(dtype.pyarrow_dtype)
                    ):
                        continue

                # The pyarrow engine returns Series, which we need to convert to a
                # numpy array before the converter; it's a no-op for other parsers
                data_dict[colspec] = converter(
                    np.asarray(data_dict[colspec]), col=colspec
                )
            else:
                new_name, col, old_names = _try_convert_dates(
                    converter, colspec, data_dict, orig_names
                )
                if new_name in data_dict:
                    raise ValueError(f"New date column already in dict {new_name}")
                new_data[new_name] = col
                new_cols.append(new_name)
                date_cols.update(old_names)

    elif isinstance(parse_spec, dict):
        # dict of new name to column list
        for new_name, colspec in parse_spec.items():
            if new_name in data_dict:
                raise ValueError(f"Date column {new_name} already in dict")

            _, col, old_names = _try_convert_dates(
                converter,
                colspec,
                data_dict,
                orig_names,
                target_name=new_name,
            )

            new_data[new_name] = col

            # If the original column can be converted to dates we keep the
            # converted values. This can only happen if values are from a
            # single column.
            if len(colspec) == 1:
                new_data[colspec[0]] = col

            new_cols.append(new_name)
            date_cols.update(old_names)

    if isinstance(data_dict, DataFrame):
        data_dict = concat([DataFrame(new_data), data_dict], axis=1, copy=False)
    else:
        data_dict.update(new_data)
    new_cols.extend(columns)

    if not keep_date_col:
        for c in list(date_cols):
            data_dict.pop(c)
            new_cols.remove(c)

    return data_dict, new_cols


def _try_convert_dates(
    parser: Callable, colspec, data_dict, columns, target_name: str | None = None
):
    colset = set(columns)
    colnames = []

    for c in colspec:
        if c in colset:
            colnames.append(c)
        elif isinstance(c, int) and c not in columns:
            colnames.append(columns[c])
        else:
            colnames.append(c)

    new_name: tuple | str
    if all(isinstance(x, tuple) for x in colnames):
        new_name = tuple(map("_".join, zip(*colnames)))
    else:
        new_name = "_".join([str(x) for x in colnames])
    to_parse = [np.asarray(data_dict[c]) for c in colnames if c in data_dict]

    new_col = parser(*to_parse, col=new_name if target_name is None else target_name)
    return new_name, new_col, colnames
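
# Illustrative example (not part of the original module): with
# columns=["date", "time", "x"] and colspec=["date", "time"], the combined
# column is named "date_time" and the parser receives both source arrays;
# integer entries in colspec are resolved through `columns` first.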


def _get_na_values(col, na_values, na_fvalues, keep_default_na: bool):
    """
    Get the NaN values for a given column.

    Parameters
    ----------
    col : str
        The name of the column.
    na_values : array-like, dict
        The object listing the NaN values as strings.
    na_fvalues : array-like, dict
        The object listing the NaN values as floats.
    keep_default_na : bool
        If `na_values` is a dict, and the column is not mapped in the
        dictionary, whether to return the default NaN values or the empty set.

    Returns
    -------
    nan_tuple : A length-two tuple composed of

        1) na_values : the string NaN values for that column.
        2) na_fvalues : the float NaN values for that column.
    """
    if isinstance(na_values, dict):
        if col in na_values:
            return na_values[col], na_fvalues[col]
        else:
            if keep_default_na:
                return STR_NA_VALUES, set()

            return set(), set()
    else:
        return na_values, na_fvalues
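
# Illustrative example (not part of the original module): with
# na_values={"a": {"missing"}} and na_fvalues={"a": set()},
# _get_na_values("a", na_values, na_fvalues, True) returns
# ({"missing"}, set()); for a column not in the dict it falls back to
# STR_NA_VALUES when keep_default_na is True.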


def _validate_parse_dates_arg(parse_dates):
    """
    Check whether or not the 'parse_dates' parameter
    is a non-boolean scalar. Raises a TypeError if
    that is the case.
    """
    msg = (
        "Only booleans, lists, and dictionaries are accepted "
        "for the 'parse_dates' parameter"
    )

    if not (
        parse_dates is None
        or lib.is_bool(parse_dates)
        or isinstance(parse_dates, (list, dict))
    ):
        raise TypeError(msg)

    return parse_dates


def is_index_col(col) -> bool:
    return col is not None and col is not False