Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/io/parsers/base

1from __future__ import annotations

3from collections import defaultdict

4from copy import copy

5import csv

6import datetime

7from enum import Enum

8import itertools

9from typing import (

10 TYPE_CHECKING,

11 Any,

12 Callable,

13 Hashable,

14 Iterable,

15 List,

16 Mapping,

17 Sequence,

18 Tuple,

19 cast,

20 final,

21 overload,

22)

23import warnings

25import numpy as np

27from pandas._libs import (

28 lib,

29 parsers,

30)

31import pandas._libs.ops as libops

32from pandas._libs.parsers import STR_NA_VALUES

33from pandas._libs.tslibs import parsing

34from pandas._typing import (

35 ArrayLike,

36 DtypeArg,

37 DtypeObj,

38 Scalar,

39)

40from pandas.compat._optional import import_optional_dependency

41from pandas.errors import (

42 ParserError,

43 ParserWarning,

44)

45from pandas.util._exceptions import find_stack_level

47from pandas.core.dtypes.astype import astype_array

48from pandas.core.dtypes.common import (

49 ensure_object,

50 is_bool_dtype,

51 is_dict_like,

52 is_dtype_equal,

53 is_extension_array_dtype,

54 is_float_dtype,

55 is_integer,

56 is_integer_dtype,

57 is_list_like,

58 is_object_dtype,

59 is_scalar,

60 is_string_dtype,

61 pandas_dtype,

62)

63from pandas.core.dtypes.dtypes import (

64 CategoricalDtype,

65 ExtensionDtype,

66)

67from pandas.core.dtypes.missing import isna

69from pandas import (

70 ArrowDtype,

71 DatetimeIndex,

72 StringDtype,

73)

74from pandas.core import algorithms

75from pandas.core.arrays import (

76 ArrowExtensionArray,

77 BooleanArray,

78 Categorical,

79 ExtensionArray,

80 FloatingArray,

81 IntegerArray,

82)

83from pandas.core.arrays.boolean import BooleanDtype

84from pandas.core.indexes.api import (

85 Index,

86 MultiIndex,

87 default_index,

88 ensure_index_from_sequences,

89)

90from pandas.core.series import Series

91from pandas.core.tools import datetimes as tools

93from pandas.io.common import is_potential_multi_index

95if TYPE_CHECKING:

96 from pandas import DataFrame

99class ParserBase:

100 class BadLineHandleMethod(Enum):

101 ERROR = 0

102 WARN = 1

103 SKIP = 2

104

105 _implicit_index: bool = False

106 _first_chunk: bool

107

108 def __init__(self, kwds) -> None:

109 self.names = kwds.get("names")

110 self.orig_names: Sequence[Hashable] | None = None

111

112 self.index_col = kwds.get("index_col", None)

113 self.unnamed_cols: set = set()

114 self.index_names: Sequence[Hashable] | None = None

115 self.col_names: Sequence[Hashable] | None = None

116

117 self.parse_dates = _validate_parse_dates_arg(kwds.pop("parse_dates", False))

118 self._parse_date_cols: Iterable = []

119 self.date_parser = kwds.pop("date_parser", lib.no_default)

120 self.date_format = kwds.pop("date_format", None)

121 self.dayfirst = kwds.pop("dayfirst", False)

122 self.keep_date_col = kwds.pop("keep_date_col", False)

123

124 self.na_values = kwds.get("na_values")

125 self.na_fvalues = kwds.get("na_fvalues")

126 self.na_filter = kwds.get("na_filter", False)

127 self.keep_default_na = kwds.get("keep_default_na", True)

128

129 self.dtype = copy(kwds.get("dtype", None))

130 self.converters = kwds.get("converters")

131 self.dtype_backend = kwds.get("dtype_backend")

132

133 self.true_values = kwds.get("true_values")

134 self.false_values = kwds.get("false_values")

135 self.cache_dates = kwds.pop("cache_dates", True)

136

137 self._date_conv = _make_date_converter(

138 date_parser=self.date_parser,

139 date_format=self.date_format,

140 dayfirst=self.dayfirst,

141 cache_dates=self.cache_dates,

142 )

143

144 # validate header options for mi

145 self.header = kwds.get("header")

146 if is_list_like(self.header, allow_sets=False):

147 if kwds.get("usecols"):

148 raise ValueError(

149 "cannot specify usecols when specifying a multi-index header"

150 )

151 if kwds.get("names"):

152 raise ValueError(

153 "cannot specify names when specifying a multi-index header"

154 )

155

156 # validate index_col that only contains integers

157 if self.index_col is not None:

158 if not (

159 is_list_like(self.index_col, allow_sets=False)

160 and all(map(is_integer, self.index_col))

161 or is_integer(self.index_col)

162 ):

163 raise ValueError(

164 "index_col must only contain row numbers "

165 "when specifying a multi-index header"

166 )

167

168 self._name_processed = False

169

170 self._first_chunk = True

171

172 self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"])

173

174 # Fallback to error to pass a sketchy test(test_override_set_noconvert_columns)

175 # Normally, this arg would get pre-processed earlier on

176 self.on_bad_lines = kwds.get("on_bad_lines", self.BadLineHandleMethod.ERROR)

177

178 def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> Iterable:

179 """

180 Check if parse_dates are in columns.

181

182 If user has provided names for parse_dates, check if those columns

183 are available.

184

185 Parameters

186 ----------

187 columns : list

188 List of names of the dataframe.

189

190 Returns

191 -------

192 The names of the columns which will get parsed later if a dict or list

193 is given as specification.

194

195 Raises

196 ------

197 ValueError

198 If column to parse_date is not in dataframe.

199

200 """

201 cols_needed: Iterable

202 if is_dict_like(self.parse_dates):

203 cols_needed = itertools.chain(*self.parse_dates.values())

204 elif is_list_like(self.parse_dates):

205 # a column in parse_dates could be represented

206 # ColReference = Union[int, str]

207 # DateGroups = List[ColReference]

208 # ParseDates = Union[DateGroups, List[DateGroups],

209 # Dict[ColReference, DateGroups]]

210 cols_needed = itertools.chain.from_iterable(

211 col if is_list_like(col) and not isinstance(col, tuple) else [col]

212 for col in self.parse_dates

213 )

214 else:

215 cols_needed = []

216

217 cols_needed = list(cols_needed)

218

219 # get only columns that are references using names (str), not by index

220 missing_cols = ", ".join(

221 sorted(

222 {

223 col

224 for col in cols_needed

225 if isinstance(col, str) and col not in columns

226 }

227 )

228 )

229 if missing_cols:

230 raise ValueError(

231 f"Missing column provided to 'parse_dates': '{missing_cols}'"

232 )

233 # Convert positions to actual column names

234 return [

235 col if (isinstance(col, str) or col in columns) else columns[col]

236 for col in cols_needed

237 ]

238

239 def close(self) -> None:

240 pass

241

242 @final

243 @property

244 def _has_complex_date_col(self) -> bool:

245 return isinstance(self.parse_dates, dict) or (

246 isinstance(self.parse_dates, list)

247 and len(self.parse_dates) > 0

248 and isinstance(self.parse_dates[0], list)

249 )

250

251 @final

252 def _should_parse_dates(self, i: int) -> bool:

253 if isinstance(self.parse_dates, bool):

254 return self.parse_dates

255 else:

256 if self.index_names is not None:

257 name = self.index_names[i]

258 else:

259 name = None

260 j = i if self.index_col is None else self.index_col[i]

261

262 if is_scalar(self.parse_dates):

263 return (j == self.parse_dates) or (

264 name is not None and name == self.parse_dates

265 )

266 else:

267 return (j in self.parse_dates) or (

268 name is not None and name in self.parse_dates

269 )

270

271 @final

272 def _extract_multi_indexer_columns(

273 self,

274 header,

275 index_names: Sequence[Hashable] | None,

276 passed_names: bool = False,

277 ) -> tuple[

278 Sequence[Hashable], Sequence[Hashable] | None, Sequence[Hashable] | None, bool

279 ]:

280 """

281 Extract and return the names, index_names, col_names if the column

282 names are a MultiIndex.

283

284 Parameters

285 ----------

286 header: list of lists

287 The header rows

288 index_names: list, optional

289 The names of the future index

290 passed_names: bool, default False

291 A flag specifying if names where passed

292

293 """

294 if len(header) < 2:

295 return header[0], index_names, None, passed_names

296

297 # the names are the tuples of the header that are not the index cols

298 # 0 is the name of the index, assuming index_col is a list of column

299 # numbers

300 ic = self.index_col

301 if ic is None:

302 ic = []

303

304 if not isinstance(ic, (list, tuple, np.ndarray)):

305 ic = [ic]

306 sic = set(ic)

307

308 # clean the index_names

309 index_names = header.pop(-1)

310 index_names, _, _ = self._clean_index_names(index_names, self.index_col)

311

312 # extract the columns

313 field_count = len(header[0])

314

315 # check if header lengths are equal

316 if not all(len(header_iter) == field_count for header_iter in header[1:]):

317 raise ParserError("Header rows must have an equal number of columns.")

318

319 def extract(r):

320 return tuple(r[i] for i in range(field_count) if i not in sic)

321

322 columns = list(zip(*(extract(r) for r in header)))

323 names = columns.copy()

324 for single_ic in sorted(ic):

325 names.insert(single_ic, single_ic)

326

327 # Clean the column names (if we have an index_col).

328 if len(ic):

329 col_names = [

330 r[ic[0]]

331 if ((r[ic[0]] is not None) and r[ic[0]] not in self.unnamed_cols)

332 else None

333 for r in header

334 ]

335 else:

336 col_names = [None] * len(header)

337

338 passed_names = True

339

340 return names, index_names, col_names, passed_names

341

342 @final

343 def _maybe_make_multi_index_columns(

344 self,

345 columns: Sequence[Hashable],

346 col_names: Sequence[Hashable] | None = None,

347 ) -> Sequence[Hashable] | MultiIndex:

348 # possibly create a column mi here

349 if is_potential_multi_index(columns):

350 list_columns = cast(List[Tuple], columns)

351 return MultiIndex.from_tuples(list_columns, names=col_names)

352 return columns

353

354 @final

355 def _make_index(

356 self, data, alldata, columns, indexnamerow: list[Scalar] | None = None

357 ) -> tuple[Index | None, Sequence[Hashable] | MultiIndex]:

358 index: Index | None

359 if not is_index_col(self.index_col) or not self.index_col:

360 index = None

361

362 elif not self._has_complex_date_col:

363 simple_index = self._get_simple_index(alldata, columns)

364 index = self._agg_index(simple_index)

365 elif self._has_complex_date_col:

366 if not self._name_processed:

367 (self.index_names, _, self.index_col) = self._clean_index_names(

368 list(columns), self.index_col

369 )

370 self._name_processed = True

371 date_index = self._get_complex_date_index(data, columns)

372 index = self._agg_index(date_index, try_parse_dates=False)

373

374 # add names for the index

375 if indexnamerow:

376 coffset = len(indexnamerow) - len(columns)

377 assert index is not None

378 index = index.set_names(indexnamerow[:coffset])

379

380 # maybe create a mi on the columns

381 columns = self._maybe_make_multi_index_columns(columns, self.col_names)

382

383 return index, columns

384

385 @final

386 def _get_simple_index(self, data, columns):

387 def ix(col):

388 if not isinstance(col, str):

389 return col

390 raise ValueError(f"Index {col} invalid")

391

392 to_remove = []

393 index = []

394 for idx in self.index_col:

395 i = ix(idx)

396 to_remove.append(i)

397 index.append(data[i])

398

399 # remove index items from content and columns, don't pop in

400 # loop

401 for i in sorted(to_remove, reverse=True):

402 data.pop(i)

403 if not self._implicit_index:

404 columns.pop(i)

405

406 return index

407

408 @final

409 def _get_complex_date_index(self, data, col_names):

410 def _get_name(icol):

411 if isinstance(icol, str):

412 return icol

413

414 if col_names is None:

415 raise ValueError(f"Must supply column order to use {icol!s} as index")

416

417 for i, c in enumerate(col_names):

418 if i == icol:

419 return c

420

421 to_remove = []

422 index = []

423 for idx in self.index_col:

424 name = _get_name(idx)

425 to_remove.append(name)

426 index.append(data[name])

427

428 # remove index items from content and columns, don't pop in

429 # loop

430 for c in sorted(to_remove, reverse=True):

431 data.pop(c)

432 col_names.remove(c)

433

434 return index

435

436 def _clean_mapping(self, mapping):

437 """converts col numbers to names"""

438 if not isinstance(mapping, dict):

439 return mapping

440 clean = {}

441 # for mypy

442 assert self.orig_names is not None

443

444 for col, v in mapping.items():

445 if isinstance(col, int) and col not in self.orig_names:

446 col = self.orig_names[col]

447 clean[col] = v

448 if isinstance(mapping, defaultdict):

449 remaining_cols = set(self.orig_names) - set(clean.keys())

450 clean.update({col: mapping[col] for col in remaining_cols})

451 return clean

452

453 @final

454 def _agg_index(self, index, try_parse_dates: bool = True) -> Index:

455 arrays = []

456 converters = self._clean_mapping(self.converters)

457

458 for i, arr in enumerate(index):

459 if try_parse_dates and self._should_parse_dates(i):

460 arr = self._date_conv(

461 arr,

462 col=self.index_names[i] if self.index_names is not None else None,

463 )

464

465 if self.na_filter:

466 col_na_values = self.na_values

467 col_na_fvalues = self.na_fvalues

468 else:

469 col_na_values = set()

470 col_na_fvalues = set()

471

472 if isinstance(self.na_values, dict):

473 assert self.index_names is not None

474 col_name = self.index_names[i]

475 if col_name is not None:

476 col_na_values, col_na_fvalues = _get_na_values(

477 col_name, self.na_values, self.na_fvalues, self.keep_default_na

478 )

479

480 clean_dtypes = self._clean_mapping(self.dtype)

481

482 cast_type = None

483 index_converter = False

484 if self.index_names is not None:

485 if isinstance(clean_dtypes, dict):

486 cast_type = clean_dtypes.get(self.index_names[i], None)

487

488 if isinstance(converters, dict):

489 index_converter = converters.get(self.index_names[i]) is not None

490

491 try_num_bool = not (

492 cast_type and is_string_dtype(cast_type) or index_converter

493 )

494

495 arr, _ = self._infer_types(

496 arr, col_na_values | col_na_fvalues, cast_type is None, try_num_bool

497 )

498 arrays.append(arr)

499

500 names = self.index_names

501 index = ensure_index_from_sequences(arrays, names)

502

503 return index

504

505 @final

506 def _convert_to_ndarrays(

507 self,

508 dct: Mapping,

509 na_values,

510 na_fvalues,

511 verbose: bool = False,

512 converters=None,

513 dtypes=None,

514 ):

515 result = {}

516 for c, values in dct.items():

517 conv_f = None if converters is None else converters.get(c, None)

518 if isinstance(dtypes, dict):

519 cast_type = dtypes.get(c, None)

520 else:

521 # single dtype or None

522 cast_type = dtypes

523

524 if self.na_filter:

525 col_na_values, col_na_fvalues = _get_na_values(

526 c, na_values, na_fvalues, self.keep_default_na

527 )

528 else:

529 col_na_values, col_na_fvalues = set(), set()

530

531 if c in self._parse_date_cols:

532 # GH#26203 Do not convert columns which get converted to dates

533 # but replace nans to ensure to_datetime works

534 mask = algorithms.isin(values, set(col_na_values) | col_na_fvalues)

535 np.putmask(values, mask, np.nan)

536 result[c] = values

537 continue

538

539 if conv_f is not None:

540 # conv_f applied to data before inference

541 if cast_type is not None:

542 warnings.warn(

543 (

544 "Both a converter and dtype were specified "

545 f"for column {c} - only the converter will be used."

546 ),

547 ParserWarning,

548 stacklevel=find_stack_level(),

549 )

550

551 try:

552 values = lib.map_infer(values, conv_f)

553 except ValueError:

554 # error: Argument 2 to "isin" has incompatible type "List[Any]";

555 # expected "Union[Union[ExtensionArray, ndarray], Index, Series]"

556 mask = algorithms.isin(

557 values, list(na_values) # type: ignore[arg-type]

558 ).view(np.uint8)

559 values = lib.map_infer_mask(values, conv_f, mask)

560

561 cvals, na_count = self._infer_types(

562 values,

563 set(col_na_values) | col_na_fvalues,

564 cast_type is None,

565 try_num_bool=False,

566 )

567 else:

568 is_ea = is_extension_array_dtype(cast_type)

569 is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type)

570 # skip inference if specified dtype is object

571 # or casting to an EA

572 try_num_bool = not (cast_type and is_str_or_ea_dtype)

573

574 # general type inference and conversion

575 cvals, na_count = self._infer_types(

576 values,

577 set(col_na_values) | col_na_fvalues,

578 cast_type is None,

579 try_num_bool,

580 )

581

582 # type specified in dtype param or cast_type is an EA

583 if cast_type and (not is_dtype_equal(cvals, cast_type) or is_ea):

584 if not is_ea and na_count > 0:

585 if is_bool_dtype(cast_type):

586 raise ValueError(f"Bool column has NA values in column {c}")

587 cast_type = pandas_dtype(cast_type)

588 cvals = self._cast_types(cvals, cast_type, c)

589

590 result[c] = cvals

591 if verbose and na_count:

592 print(f"Filled {na_count} NA values in column {c!s}")

593 return result

594

595 @final

596 def _set_noconvert_dtype_columns(

597 self, col_indices: list[int], names: Sequence[Hashable]

598 ) -> set[int]:

599 """

600 Set the columns that should not undergo dtype conversions.

601

602 Currently, any column that is involved with date parsing will not

603 undergo such conversions. If usecols is specified, the positions of the columns

604 not to cast is relative to the usecols not to all columns.

605

606 Parameters

607 ----------

608 col_indices: The indices specifying order and positions of the columns

609 names: The column names which order is corresponding with the order

610 of col_indices

611

612 Returns

613 -------

614 A set of integers containing the positions of the columns not to convert.

615 """

616 usecols: list[int] | list[str] | None

617 noconvert_columns = set()

618 if self.usecols_dtype == "integer":

619 # A set of integers will be converted to a list in

620 # the correct order every single time.

621 usecols = sorted(self.usecols)

622 elif callable(self.usecols) or self.usecols_dtype not in ("empty", None):

623 # The names attribute should have the correct columns

624 # in the proper order for indexing with parse_dates.

625 usecols = col_indices

626 else:

627 # Usecols is empty.

628 usecols = None

629

630 def _set(x) -> int:

631 if usecols is not None and is_integer(x):

632 x = usecols[x]

633

634 if not is_integer(x):

635 x = col_indices[names.index(x)]

636

637 return x

638

639 if isinstance(self.parse_dates, list):

640 for val in self.parse_dates:

641 if isinstance(val, list):

642 for k in val:

643 noconvert_columns.add(_set(k))

644 else:

645 noconvert_columns.add(_set(val))

646

647 elif isinstance(self.parse_dates, dict):

648 for val in self.parse_dates.values():

649 if isinstance(val, list):

650 for k in val:

651 noconvert_columns.add(_set(k))

652 else:

653 noconvert_columns.add(_set(val))

654

655 elif self.parse_dates:

656 if isinstance(self.index_col, list):

657 for k in self.index_col:

658 noconvert_columns.add(_set(k))

659 elif self.index_col is not None:

660 noconvert_columns.add(_set(self.index_col))

661

662 return noconvert_columns

663

664 def _infer_types(

665 self, values, na_values, no_dtype_specified, try_num_bool: bool = True

666 ) -> tuple[ArrayLike, int]:

667 """

668 Infer types of values, possibly casting

669

670 Parameters

671 ----------

672 values : ndarray

673 na_values : set

674 no_dtype_specified: Specifies if we want to cast explicitly

675 try_num_bool : bool, default try

676 try to cast values to numeric (first preference) or boolean

677

678 Returns

679 -------

680 converted : ndarray or ExtensionArray

681 na_count : int

682 """

683 na_count = 0

684 if issubclass(values.dtype.type, (np.number, np.bool_)):

685 # If our array has numeric dtype, we don't have to check for strings in isin

686 na_values = np.array([val for val in na_values if not isinstance(val, str)])

687 mask = algorithms.isin(values, na_values)

688 na_count = mask.astype("uint8", copy=False).sum()

689 if na_count > 0:

690 if is_integer_dtype(values):

691 values = values.astype(np.float64)

692 np.putmask(values, mask, np.nan)

693 return values, na_count

694

695 dtype_backend = self.dtype_backend

696 non_default_dtype_backend = (

697 no_dtype_specified and dtype_backend is not lib.no_default

698 )

699 result: ArrayLike

700

701 if try_num_bool and is_object_dtype(values.dtype):

702 # exclude e.g DatetimeIndex here

703 try:

704 result, result_mask = lib.maybe_convert_numeric(

705 values,

706 na_values,

707 False,

708 convert_to_masked_nullable=non_default_dtype_backend, # type: ignore[arg-type] # noqa

709 )

710 except (ValueError, TypeError):

711 # e.g. encountering datetime string gets ValueError

712 # TypeError can be raised in floatify

713 na_count = parsers.sanitize_objects(values, na_values)

714 result = values

715 else:

716 if non_default_dtype_backend:

717 if result_mask is None:

718 result_mask = np.zeros(result.shape, dtype=np.bool_)

719

720 if result_mask.all():

721 result = IntegerArray(

722 np.ones(result_mask.shape, dtype=np.int64), result_mask

723 )

724 elif is_integer_dtype(result):

725 result = IntegerArray(result, result_mask)

726 elif is_bool_dtype(result):

727 result = BooleanArray(result, result_mask)

728 elif is_float_dtype(result):

729 result = FloatingArray(result, result_mask)

730

731 na_count = result_mask.sum()

732 else:

733 na_count = isna(result).sum()

734 else:

735 result = values

736 if values.dtype == np.object_:

737 na_count = parsers.sanitize_objects(values, na_values)

738

739 if result.dtype == np.object_ and try_num_bool:

740 result, bool_mask = libops.maybe_convert_bool(

741 np.asarray(values),

742 true_values=self.true_values,

743 false_values=self.false_values,

744 convert_to_masked_nullable=non_default_dtype_backend, # type: ignore[arg-type] # noqa

745 )

746 if result.dtype == np.bool_ and non_default_dtype_backend:

747 if bool_mask is None:

748 bool_mask = np.zeros(result.shape, dtype=np.bool_)

749 result = BooleanArray(result, bool_mask)

750 elif result.dtype == np.object_ and non_default_dtype_backend:

751 # read_excel sends array of datetime objects

752 inferred_type = lib.infer_dtype(result)

753 if inferred_type != "datetime":

754 result = StringDtype().construct_array_type()._from_sequence(values)

755

756 if dtype_backend == "pyarrow":

757 pa = import_optional_dependency("pyarrow")

758 if isinstance(result, np.ndarray):

759 result = ArrowExtensionArray(pa.array(result, from_pandas=True))

760 else:

761 # ExtensionArray

762 result = ArrowExtensionArray(

763 pa.array(result.to_numpy(), from_pandas=True)

764 )

765

766 return result, na_count

767

768 def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLike:

769 """

770 Cast values to specified type

771

772 Parameters

773 ----------

774 values : ndarray or ExtensionArray

775 cast_type : np.dtype or ExtensionDtype

776 dtype to cast values to

777 column : string

778 column name - used only for error reporting

779

780 Returns

781 -------

782 converted : ndarray or ExtensionArray

783 """

784 if isinstance(cast_type, CategoricalDtype):

785 known_cats = cast_type.categories is not None

786

787 if not is_object_dtype(values.dtype) and not known_cats:

788 # TODO: this is for consistency with

789 # c-parser which parses all categories

790 # as strings

791 values = lib.ensure_string_array(

792 values, skipna=False, convert_na_value=False

793 )

794

795 cats = Index(values).unique().dropna()

796 values = Categorical._from_inferred_categories(

797 cats, cats.get_indexer(values), cast_type, true_values=self.true_values

798 )

799

800 # use the EA's implementation of casting

801 elif isinstance(cast_type, ExtensionDtype):

802 array_type = cast_type.construct_array_type()

803 try:

804 if isinstance(cast_type, BooleanDtype):

805 # error: Unexpected keyword argument "true_values" for

806 # "_from_sequence_of_strings" of "ExtensionArray"

807 return array_type._from_sequence_of_strings( # type: ignore[call-arg] # noqa:E501

808 values,

809 dtype=cast_type,

810 true_values=self.true_values,

811 false_values=self.false_values,

812 )

813 else:

814 return array_type._from_sequence_of_strings(values, dtype=cast_type)

815 except NotImplementedError as err:

816 raise NotImplementedError(

817 f"Extension Array: {array_type} must implement "

818 "_from_sequence_of_strings in order to be used in parser methods"

819 ) from err

820

821 elif isinstance(values, ExtensionArray):

822 values = values.astype(cast_type, copy=False)

823 elif issubclass(cast_type.type, str):

824 # TODO: why skipna=True here and False above? some tests depend

825 # on it here, but nothing fails if we change it above

826 # (as no tests get there as of 2022-12-06)

827 values = lib.ensure_string_array(

828 values, skipna=True, convert_na_value=False

829 )

830 else:

831 try:

832 values = astype_array(values, cast_type, copy=True)

833 except ValueError as err:

834 raise ValueError(

835 f"Unable to convert column {column} to type {cast_type}"

836 ) from err

837 return values

838

839 @overload

840 def _do_date_conversions(

841 self,

842 names: Index,

843 data: DataFrame,

844 ) -> tuple[Sequence[Hashable] | Index, DataFrame]:

845 ...

846

847 @overload

848 def _do_date_conversions(

849 self,

850 names: Sequence[Hashable],

851 data: Mapping[Hashable, ArrayLike],

852 ) -> tuple[Sequence[Hashable], Mapping[Hashable, ArrayLike]]:

853 ...

854

855 def _do_date_conversions(

856 self,

857 names: Sequence[Hashable] | Index,

858 data: Mapping[Hashable, ArrayLike] | DataFrame,

859 ) -> tuple[Sequence[Hashable] | Index, Mapping[Hashable, ArrayLike] | DataFrame]:

860 # returns data, columns

861

862 if self.parse_dates is not None:

863 data, names = _process_date_conversion(

864 data,

865 self._date_conv,

866 self.parse_dates,

867 self.index_col,

868 self.index_names,

869 names,

870 keep_date_col=self.keep_date_col,

871 dtype_backend=self.dtype_backend,

872 )

873

874 return names, data

875

876 def _check_data_length(

877 self,

878 columns: Sequence[Hashable],

879 data: Sequence[ArrayLike],

880 ) -> None:

881 """Checks if length of data is equal to length of column names.

882

883 One set of trailing commas is allowed. self.index_col not False

884 results in a ParserError previously when lengths do not match.

885

886 Parameters

887 ----------

888 columns: list of column names

889 data: list of array-likes containing the data column-wise.

890 """

891 if not self.index_col and len(columns) != len(data) and columns:

892 empty_str = is_object_dtype(data[-1]) and data[-1] == ""

893 # error: No overload variant of "__ror__" of "ndarray" matches

894 # argument type "ExtensionArray"

895 empty_str_or_na = empty_str | isna(data[-1]) # type: ignore[operator]

896 if len(columns) == len(data) - 1 and np.all(empty_str_or_na):

897 return

898 warnings.warn(

899 "Length of header or names does not match length of data. This leads "

900 "to a loss of data with index_col=False.",

901 ParserWarning,

902 stacklevel=find_stack_level(),

903 )

904

905 @overload

906 def _evaluate_usecols(

907 self,

908 usecols: set[int] | Callable[[Hashable], object],

909 names: Sequence[Hashable],

910 ) -> set[int]:

911 ...

912

913 @overload

914 def _evaluate_usecols(

915 self, usecols: set[str], names: Sequence[Hashable]

916 ) -> set[str]:

917 ...

918

919 def _evaluate_usecols(

920 self,

921 usecols: Callable[[Hashable], object] | set[str] | set[int],

922 names: Sequence[Hashable],

923 ) -> set[str] | set[int]:

924 """

925 Check whether or not the 'usecols' parameter

926 is a callable. If so, enumerates the 'names'

927 parameter and returns a set of indices for

928 each entry in 'names' that evaluates to True.

929 If not a callable, returns 'usecols'.

930 """

931 if callable(usecols):

932 return {i for i, name in enumerate(names) if usecols(name)}

933 return usecols

934

935 def _validate_usecols_names(self, usecols, names):

936 """

937 Validates that all usecols are present in a given

938 list of names. If not, raise a ValueError that

939 shows what usecols are missing.

940

941 Parameters

942 ----------

943 usecols : iterable of usecols

944 The columns to validate are present in names.

945 names : iterable of names

946 The column names to check against.

947

948 Returns

949 -------

950 usecols : iterable of usecols

951 The `usecols` parameter if the validation succeeds.

952

953 Raises

954 ------

955 ValueError : Columns were missing. Error message will list them.

956 """

957 missing = [c for c in usecols if c not in names]

958 if len(missing) > 0:

959 raise ValueError(

960 f"Usecols do not match columns, columns expected but not found: "

961 f"{missing}"

962 )

963

964 return usecols

965

966 def _validate_usecols_arg(self, usecols):

967 """

968 Validate the 'usecols' parameter.

969

970 Checks whether or not the 'usecols' parameter contains all integers

971 (column selection by index), strings (column by name) or is a callable.

972 Raises a ValueError if that is not the case.

973

974 Parameters

975 ----------

976 usecols : list-like, callable, or None

977 List of columns to use when parsing or a callable that can be used

978 to filter a list of table columns.

979

980 Returns

981 -------

982 usecols_tuple : tuple

983 A tuple of (verified_usecols, usecols_dtype).

984

985 'verified_usecols' is either a set if an array-like is passed in or

986 'usecols' if a callable or None is passed in.

987

988 'usecols_dtype` is the inferred dtype of 'usecols' if an array-like

989 is passed in or None if a callable or None is passed in.

990 """

991 msg = (

992 "'usecols' must either be list-like of all strings, all unicode, "

993 "all integers or a callable."

994 )

995 if usecols is not None:

996 if callable(usecols):

997 return usecols, None

998

999 if not is_list_like(usecols):

1000 # see gh-20529

1001 #

1002 # Ensure it is iterable container but not string.

1003 raise ValueError(msg)

1004

1005 usecols_dtype = lib.infer_dtype(usecols, skipna=False)

1006

1007 if usecols_dtype not in ("empty", "integer", "string"):

1008 raise ValueError(msg)

1009

1010 usecols = set(usecols)

1011

1012 return usecols, usecols_dtype

1013 return usecols, None

1014

1015 def _clean_index_names(self, columns, index_col) -> tuple[list | None, list, list]:

1016 if not is_index_col(index_col):

1017 return None, columns, index_col

1018

1019 columns = list(columns)

1020

1021 # In case of no rows and multiindex columns we have to set index_names to

1022 # list of Nones GH#38292

1023 if not columns:

1024 return [None] * len(index_col), columns, index_col

1025

1026 cp_cols = list(columns)

1027 index_names: list[str | int | None] = []

1028

1029 # don't mutate

1030 index_col = list(index_col)

1031

1032 for i, c in enumerate(index_col):

1033 if isinstance(c, str):

1034 index_names.append(c)

1035 for j, name in enumerate(cp_cols):

1036 if name == c:

1037 index_col[i] = j

1038 columns.remove(name)

1039 break

1040 else:

1041 name = cp_cols[c]

1042 columns.remove(name)

1043 index_names.append(name)

1044

1045 # Only clean index names that were placeholders.

1046 for i, name in enumerate(index_names):

1047 if isinstance(name, str) and name in self.unnamed_cols:

1048 index_names[i] = None

1049

1050 return index_names, columns, index_col

1051

1052 def _get_empty_meta(

1053 self, columns, index_col, index_names, dtype: DtypeArg | None = None

1054 ):

1055 columns = list(columns)

1056

1057 # Convert `dtype` to a defaultdict of some kind.

1058 # This will enable us to write `dtype[col_name]`

1059 # without worrying about KeyError issues later on.

1060 dtype_dict: defaultdict[Hashable, Any]

1061 if not is_dict_like(dtype):

1062 # if dtype == None, default will be object.

1063 default_dtype = dtype or object

1064 dtype_dict = defaultdict(lambda: default_dtype)

1065 else:

1066 dtype = cast(dict, dtype)

1067 dtype_dict = defaultdict(

1068 lambda: object,

1069 {columns[k] if is_integer(k) else k: v for k, v in dtype.items()},

1070 )

1071

1072 # Even though we have no data, the "index" of the empty DataFrame

1073 # could for example still be an empty MultiIndex. Thus, we need to

1074 # check whether we have any index columns specified, via either:

1075 #

1076 # 1) index_col (column indices)

1077 # 2) index_names (column names)

1078 #

1079 # Both must be non-null to ensure a successful construction. Otherwise,

1080 # we have to create a generic empty Index.

1081 index: Index

1082 if (index_col is None or index_col is False) or index_names is None:

1083 index = default_index(0)

1084 else:

1085 data = [Series([], dtype=dtype_dict[name]) for name in index_names]

1086 index = ensure_index_from_sequences(data, names=index_names)

1087 index_col.sort()

1088

1089 for i, n in enumerate(index_col):

1090 columns.pop(n - i)

1091

1092 col_dict = {

1093 col_name: Series([], dtype=dtype_dict[col_name]) for col_name in columns

1094 }

1095

1096 return index, columns, col_dict

1097

1098

1099def _make_date_converter(

1100 date_parser=lib.no_default,

1101 dayfirst: bool = False,

1102 cache_dates: bool = True,

1103 date_format: dict[Hashable, str] | str | None = None,

1104):

1105 if date_parser is not lib.no_default:

1106 warnings.warn(

1107 "The argument 'date_parser' is deprecated and will "

1108 "be removed in a future version. "

1109 "Please use 'date_format' instead, or read your data in as 'object' dtype "

1110 "and then call 'to_datetime'.",

1111 FutureWarning,

1112 stacklevel=find_stack_level(),

1113 )

1114 if date_parser is not lib.no_default and date_format is not None:

1115 raise TypeError("Cannot use both 'date_parser' and 'date_format'")

1116

1117 def unpack_if_single_element(arg):

1118 # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615

1119 if isinstance(arg, np.ndarray) and arg.ndim == 1 and len(arg) == 1:

1120 return arg[0]

1121 return arg

1122

1123 def converter(*date_cols, col: Hashable):

1124 if len(date_cols) == 1 and date_cols[0].dtype.kind in "Mm":

1125 return date_cols[0]

1126

1127 if date_parser is lib.no_default:

1128 strs = parsing.concat_date_cols(date_cols)

1129 date_fmt = (

1130 date_format.get(col) if isinstance(date_format, dict) else date_format

1131 )

1132

1133 result = tools.to_datetime(

1134 ensure_object(strs),

1135 format=date_fmt,

1136 utc=False,

1137 dayfirst=dayfirst,

1138 errors="ignore",

1139 cache=cache_dates,

1140 )

1141 if isinstance(result, DatetimeIndex):

1142 arr = result.to_numpy()

1143 arr.flags.writeable = True

1144 return arr

1145 return result._values

1146 else:

1147 try:

1148 result = tools.to_datetime(

1149 date_parser(*(unpack_if_single_element(arg) for arg in date_cols)),

1150 errors="ignore",

1151 cache=cache_dates,

1152 )

1153 if isinstance(result, datetime.datetime):

1154 raise Exception("scalar parser")

1155 return result

1156 except Exception:

1157 return tools.to_datetime(

1158 parsing.try_parse_dates(

1159 parsing.concat_date_cols(date_cols),

1160 parser=date_parser,

1161 ),

1162 errors="ignore",

1163 )

1164

1165 return converter

1166

1167

1168parser_defaults = {

1169 "delimiter": None,

1170 "escapechar": None,

1171 "quotechar": '"',

1172 "quoting": csv.QUOTE_MINIMAL,

1173 "doublequote": True,

1174 "skipinitialspace": False,

1175 "lineterminator": None,

1176 "header": "infer",

1177 "index_col": None,

1178 "names": None,

1179 "skiprows": None,

1180 "skipfooter": 0,

1181 "nrows": None,

1182 "na_values": None,

1183 "keep_default_na": True,

1184 "true_values": None,

1185 "false_values": None,

1186 "converters": None,

1187 "dtype": None,

1188 "cache_dates": True,

1189 "thousands": None,

1190 "comment": None,

1191 "decimal": ".",

1192 # 'engine': 'c',

1193 "parse_dates": False,

1194 "keep_date_col": False,

1195 "dayfirst": False,

1196 "date_parser": lib.no_default,

1197 "date_format": None,

1198 "usecols": None,

1199 # 'iterator': False,

1200 "chunksize": None,

1201 "verbose": False,

1202 "encoding": None,

1203 "compression": None,

1204 "skip_blank_lines": True,

1205 "encoding_errors": "strict",

1206 "on_bad_lines": ParserBase.BadLineHandleMethod.ERROR,

1207 "dtype_backend": lib.no_default,

1208}

1209

1210

1211def _process_date_conversion(

1212 data_dict,

1213 converter: Callable,

1214 parse_spec,

1215 index_col,

1216 index_names,

1217 columns,

1218 keep_date_col: bool = False,

1219 dtype_backend=lib.no_default,

1220):

1221 def _isindex(colspec):

1222 return (isinstance(index_col, list) and colspec in index_col) or (

1223 isinstance(index_names, list) and colspec in index_names

1224 )

1225

1226 new_cols = []

1227 new_data = {}

1228

1229 orig_names = columns

1230 columns = list(columns)

1231

1232 date_cols = set()

1233

1234 if parse_spec is None or isinstance(parse_spec, bool):

1235 return data_dict, columns

1236

1237 if isinstance(parse_spec, list):

1238 # list of column lists

1239 for colspec in parse_spec:

1240 if is_scalar(colspec) or isinstance(colspec, tuple):

1241 if isinstance(colspec, int) and colspec not in data_dict:

1242 colspec = orig_names[colspec]

1243 if _isindex(colspec):

1244 continue

1245 elif dtype_backend == "pyarrow":

1246 import pyarrow as pa

1247

1248 dtype = data_dict[colspec].dtype

1249 if isinstance(dtype, ArrowDtype) and (

1250 pa.types.is_timestamp(dtype.pyarrow_dtype)

1251 or pa.types.is_date(dtype.pyarrow_dtype)

1252 ):

1253 continue

1254

1255 # Pyarrow engine returns Series which we need to convert to

1256 # numpy array before converter, its a no-op for other parsers

1257 data_dict[colspec] = converter(

1258 np.asarray(data_dict[colspec]), col=colspec

1259 )

1260 else:

1261 new_name, col, old_names = _try_convert_dates(

1262 converter, colspec, data_dict, orig_names

1263 )

1264 if new_name in data_dict:

1265 raise ValueError(f"New date column already in dict {new_name}")

1266 new_data[new_name] = col

1267 new_cols.append(new_name)

1268 date_cols.update(old_names)

1269

1270 elif isinstance(parse_spec, dict):

1271 # dict of new name to column list

1272 for new_name, colspec in parse_spec.items():

1273 if new_name in data_dict:

1274 raise ValueError(f"Date column {new_name} already in dict")

1275

1276 _, col, old_names = _try_convert_dates(

1277 converter,

1278 colspec,

1279 data_dict,

1280 orig_names,

1281 target_name=new_name,

1282 )

1283

1284 new_data[new_name] = col

1285

1286 # If original column can be converted to date we keep the converted values

1287 # This can only happen if values are from single column

1288 if len(colspec) == 1:

1289 new_data[colspec[0]] = col

1290

1291 new_cols.append(new_name)

1292 date_cols.update(old_names)

1293

1294 data_dict.update(new_data)

1295 new_cols.extend(columns)

1296

1297 if not keep_date_col:

1298 for c in list(date_cols):

1299 data_dict.pop(c)

1300 new_cols.remove(c)

1301

1302 return data_dict, new_cols

1303

1304

1305def _try_convert_dates(

1306 parser: Callable, colspec, data_dict, columns, target_name: str | None = None

1307):

1308 colset = set(columns)

1309 colnames = []

1310

1311 for c in colspec:

1312 if c in colset:

1313 colnames.append(c)

1314 elif isinstance(c, int) and c not in columns:

1315 colnames.append(columns[c])

1316 else:

1317 colnames.append(c)

1318

1319 new_name: tuple | str

1320 if all(isinstance(x, tuple) for x in colnames):

1321 new_name = tuple(map("_".join, zip(*colnames)))

1322 else:

1323 new_name = "_".join([str(x) for x in colnames])

1324 to_parse = [np.asarray(data_dict[c]) for c in colnames if c in data_dict]

1325

1326 new_col = parser(*to_parse, col=new_name if target_name is None else target_name)

1327 return new_name, new_col, colnames

1328

1329

1330def _get_na_values(col, na_values, na_fvalues, keep_default_na):

1331 """

1332 Get the NaN values for a given column.

1333

1334 Parameters

1335 ----------

1336 col : str

1337 The name of the column.

1338 na_values : array-like, dict

1339 The object listing the NaN values as strings.

1340 na_fvalues : array-like, dict

1341 The object listing the NaN values as floats.

1342 keep_default_na : bool

1343 If `na_values` is a dict, and the column is not mapped in the

1344 dictionary, whether to return the default NaN values or the empty set.

1345

1346 Returns

1347 -------

1348 nan_tuple : A length-two tuple composed of

1349

1350 1) na_values : the string NaN values for that column.

1351 2) na_fvalues : the float NaN values for that column.

1352 """

1353 if isinstance(na_values, dict):

1354 if col in na_values:

1355 return na_values[col], na_fvalues[col]

1356 else:

1357 if keep_default_na:

1358 return STR_NA_VALUES, set()

1359

1360 return set(), set()

1361 else:

1362 return na_values, na_fvalues

1363

1364

1365def _validate_parse_dates_arg(parse_dates):

1366 """

1367 Check whether or not the 'parse_dates' parameter

1368 is a non-boolean scalar. Raises a ValueError if

1369 that is the case.

1370 """

1371 msg = (

1372 "Only booleans, lists, and dictionaries are accepted "

1373 "for the 'parse_dates' parameter"

1374 )

1375

1376 if parse_dates is not None:

1377 if is_scalar(parse_dates):

1378 if not lib.is_bool(parse_dates):

1379 raise TypeError(msg)

1380

1381 elif not isinstance(parse_dates, (list, dict)):

1382 raise TypeError(msg)

1383

1384 return parse_dates

1385

1386

1387def is_index_col(col) -> bool:

1388 return col is not None and col is not False

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/io/parsers/base_parser.py: 15%

578 statements