from __future__ import annotations

from collections import defaultdict
from copy import copy
import csv
import datetime
from enum import Enum
import itertools
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    cast,
    final,
    overload,
)
import warnings

import numpy as np

from pandas._libs import (
    lib,
    parsers,
)
import pandas._libs.ops as libops
from pandas._libs.parsers import STR_NA_VALUES
from pandas._libs.tslibs import parsing
from pandas.compat._optional import import_optional_dependency
from pandas.errors import (
    ParserError,
    ParserWarning,
)
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.astype import astype_array
from pandas.core.dtypes.common import (
    ensure_object,
    is_bool_dtype,
    is_dict_like,
    is_extension_array_dtype,
    is_float_dtype,
    is_integer,
    is_integer_dtype,
    is_list_like,
    is_object_dtype,
    is_scalar,
    is_string_dtype,
    pandas_dtype,
)
from pandas.core.dtypes.dtypes import (
    CategoricalDtype,
    ExtensionDtype,
)
from pandas.core.dtypes.missing import isna

from pandas import (
    ArrowDtype,
    DataFrame,
    DatetimeIndex,
    StringDtype,
    concat,
)
from pandas.core import algorithms
from pandas.core.arrays import (
    ArrowExtensionArray,
    BaseMaskedArray,
    BooleanArray,
    Categorical,
    ExtensionArray,
    FloatingArray,
    IntegerArray,
)
from pandas.core.arrays.boolean import BooleanDtype
from pandas.core.indexes.api import (
    Index,
    MultiIndex,
    default_index,
    ensure_index_from_sequences,
)
from pandas.core.series import Series
from pandas.core.tools import datetimes as tools

from pandas.io.common import is_potential_multi_index

if TYPE_CHECKING:
    from collections.abc import (
        Hashable,
        Iterable,
        Mapping,
        Sequence,
    )

    from pandas._typing import (
        ArrayLike,
        DtypeArg,
        DtypeObj,
        Scalar,
    )


class ParserBase:
    class BadLineHandleMethod(Enum):
        ERROR = 0
        WARN = 1
        SKIP = 2

    _implicit_index: bool
    _first_chunk: bool
    keep_default_na: bool
    dayfirst: bool
    cache_dates: bool
    keep_date_col: bool
    usecols_dtype: str | None

    def __init__(self, kwds) -> None:
        self._implicit_index = False

        self.names = kwds.get("names")
        self.orig_names: Sequence[Hashable] | None = None

        self.index_col = kwds.get("index_col", None)
        self.unnamed_cols: set = set()
        self.index_names: Sequence[Hashable] | None = None
        self.col_names: Sequence[Hashable] | None = None

        self.parse_dates = _validate_parse_dates_arg(kwds.pop("parse_dates", False))
        self._parse_date_cols: Iterable = []
        self.date_parser = kwds.pop("date_parser", lib.no_default)
        self.date_format = kwds.pop("date_format", None)
        self.dayfirst = kwds.pop("dayfirst", False)
        self.keep_date_col = kwds.pop("keep_date_col", False)

        self.na_values = kwds.get("na_values")
        self.na_fvalues = kwds.get("na_fvalues")
        self.na_filter = kwds.get("na_filter", False)
        self.keep_default_na = kwds.get("keep_default_na", True)

        self.dtype = copy(kwds.get("dtype", None))
        self.converters = kwds.get("converters")
        self.dtype_backend = kwds.get("dtype_backend")

        self.true_values = kwds.get("true_values")
        self.false_values = kwds.get("false_values")
        self.cache_dates = kwds.pop("cache_dates", True)

        self._date_conv = _make_date_converter(
            date_parser=self.date_parser,
            date_format=self.date_format,
            dayfirst=self.dayfirst,
            cache_dates=self.cache_dates,
        )

        # validate header options for mi
        self.header = kwds.get("header")
        if is_list_like(self.header, allow_sets=False):
            if kwds.get("usecols"):
                raise ValueError(
                    "cannot specify usecols when specifying a multi-index header"
                )
            if kwds.get("names"):
                raise ValueError(
                    "cannot specify names when specifying a multi-index header"
                )

        # validate index_col that only contains integers
        if self.index_col is not None:
            # In this case we can pin down index_col as list[int]
            if is_integer(self.index_col):
                self.index_col = [self.index_col]
            elif not (
                is_list_like(self.index_col, allow_sets=False)
                and all(map(is_integer, self.index_col))
            ):
                raise ValueError(
                    "index_col must only contain row numbers "
                    "when specifying a multi-index header"
                )
            else:
                self.index_col = list(self.index_col)

        self._name_processed = False

        self._first_chunk = True

        self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"])

        # Fallback to error to pass a sketchy test (test_override_set_noconvert_columns)
        # Normally, this arg would get pre-processed earlier on
        self.on_bad_lines = kwds.get("on_bad_lines", self.BadLineHandleMethod.ERROR)

    def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> Iterable:
        """
        Check if parse_dates are in columns.

        If user has provided names for parse_dates, check if those columns
        are available.

        Parameters
        ----------
        columns : list
            List of names of the dataframe.

        Returns
        -------
        The names of the columns which will get parsed later if a dict or list
        is given as specification.

        Raises
        ------
        ValueError
            If a column named in parse_dates is not in the dataframe.

        """
        cols_needed: Iterable
        if is_dict_like(self.parse_dates):
            cols_needed = itertools.chain(*self.parse_dates.values())
        elif is_list_like(self.parse_dates):
            # a column in parse_dates could be represented
            # ColReference = Union[int, str]
            # DateGroups = List[ColReference]
            # ParseDates = Union[DateGroups, List[DateGroups],
            #     Dict[ColReference, DateGroups]]
            cols_needed = itertools.chain.from_iterable(
                col if is_list_like(col) and not isinstance(col, tuple) else [col]
                for col in self.parse_dates
            )
        else:
            cols_needed = []

        cols_needed = list(cols_needed)

        # get only columns that are references using names (str), not by index
        missing_cols = ", ".join(
            sorted(
                {
                    col
                    for col in cols_needed
                    if isinstance(col, str) and col not in columns
                }
            )
        )
        if missing_cols:
            raise ValueError(
                f"Missing column provided to 'parse_dates': '{missing_cols}'"
            )
        # Convert positions to actual column names
        return [
            col if (isinstance(col, str) or col in columns) else columns[col]
            for col in cols_needed
        ]
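
    # Illustrative sketch (not part of the pandas API; column names are
    # hypothetical): the three accepted shapes of `parse_dates` that the
    # method above walks over. Column references may be positions (int)
    # or names (str).
    #
    #     parse_dates=["date", 3]                # parse each listed column
    #     parse_dates=[["date", "time"]]         # combine two columns into one
    #     parse_dates={"ts": ["date", "time"]}   # combine and name the result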

    def close(self) -> None:
        pass

    @final
    @property
    def _has_complex_date_col(self) -> bool:
        return isinstance(self.parse_dates, dict) or (
            isinstance(self.parse_dates, list)
            and len(self.parse_dates) > 0
            and isinstance(self.parse_dates[0], list)
        )

    @final
    def _should_parse_dates(self, i: int) -> bool:
        if lib.is_bool(self.parse_dates):
            return bool(self.parse_dates)
        else:
            if self.index_names is not None:
                name = self.index_names[i]
            else:
                name = None
            j = i if self.index_col is None else self.index_col[i]

            return (j in self.parse_dates) or (
                name is not None and name in self.parse_dates
            )

    @final
    def _extract_multi_indexer_columns(
        self,
        header,
        index_names: Sequence[Hashable] | None,
        passed_names: bool = False,
    ) -> tuple[
        Sequence[Hashable], Sequence[Hashable] | None, Sequence[Hashable] | None, bool
    ]:
        """
        Extract and return the names, index_names, col_names if the column
        names are a MultiIndex.

        Parameters
        ----------
        header: list of lists
            The header rows
        index_names: list, optional
            The names of the future index
        passed_names: bool, default False
            A flag specifying if names were passed

        """
        if len(header) < 2:
            return header[0], index_names, None, passed_names

        # the names are the tuples of the header that are not the index cols
        # 0 is the name of the index, assuming index_col is a list of column
        # numbers
        ic = self.index_col
        if ic is None:
            ic = []

        if not isinstance(ic, (list, tuple, np.ndarray)):
            ic = [ic]
        sic = set(ic)

        # clean the index_names
        index_names = header.pop(-1)
        index_names, _, _ = self._clean_index_names(index_names, self.index_col)

        # extract the columns
        field_count = len(header[0])

        # check if header lengths are equal
        if not all(len(header_iter) == field_count for header_iter in header[1:]):
            raise ParserError("Header rows must have an equal number of columns.")

        def extract(r):
            return tuple(r[i] for i in range(field_count) if i not in sic)

        columns = list(zip(*(extract(r) for r in header)))
        names = columns.copy()
        for single_ic in sorted(ic):
            names.insert(single_ic, single_ic)

        # Clean the column names (if we have an index_col).
        if len(ic):
            col_names = [
                r[ic[0]]
                if ((r[ic[0]] is not None) and r[ic[0]] not in self.unnamed_cols)
                else None
                for r in header
            ]
        else:
            col_names = [None] * len(header)

        passed_names = True

        return names, index_names, col_names, passed_names

    @final
    def _maybe_make_multi_index_columns(
        self,
        columns: Sequence[Hashable],
        col_names: Sequence[Hashable] | None = None,
    ) -> Sequence[Hashable] | MultiIndex:
        # possibly create a column mi here
        if is_potential_multi_index(columns):
            list_columns = cast(list[tuple], columns)
            return MultiIndex.from_tuples(list_columns, names=col_names)
        return columns
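
    # Illustrative sketch (hypothetical labels): when every column label is a
    # tuple, e.g. after reading two header rows, the labels are promoted to a
    # MultiIndex:
    #
    #     columns = [("a", "one"), ("a", "two"), ("b", "one")]
    #     parser._maybe_make_multi_index_columns(columns, col_names=["l0", "l1"])
    #     # -> MultiIndex with levels named "l0" and "l1"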

    @final
    def _make_index(
        self, data, alldata, columns, indexnamerow: list[Scalar] | None = None
    ) -> tuple[Index | None, Sequence[Hashable] | MultiIndex]:
        index: Index | None
        if not is_index_col(self.index_col) or not self.index_col:
            index = None

        elif not self._has_complex_date_col:
            simple_index = self._get_simple_index(alldata, columns)
            index = self._agg_index(simple_index)
        elif self._has_complex_date_col:
            if not self._name_processed:
                (self.index_names, _, self.index_col) = self._clean_index_names(
                    list(columns), self.index_col
                )
                self._name_processed = True
            date_index = self._get_complex_date_index(data, columns)
            index = self._agg_index(date_index, try_parse_dates=False)

        # add names for the index
        if indexnamerow:
            coffset = len(indexnamerow) - len(columns)
            assert index is not None
            index = index.set_names(indexnamerow[:coffset])

        # maybe create a mi on the columns
        columns = self._maybe_make_multi_index_columns(columns, self.col_names)

        return index, columns

    @final
    def _get_simple_index(self, data, columns):
        def ix(col):
            if not isinstance(col, str):
                return col
            raise ValueError(f"Index {col} invalid")

        to_remove = []
        index = []
        for idx in self.index_col:
            i = ix(idx)
            to_remove.append(i)
            index.append(data[i])

        # remove index items from content and columns, don't pop in
        # loop
        for i in sorted(to_remove, reverse=True):
            data.pop(i)
            if not self._implicit_index:
                columns.pop(i)

        return index

    @final
    def _get_complex_date_index(self, data, col_names):
        def _get_name(icol):
            if isinstance(icol, str):
                return icol

            if col_names is None:
                raise ValueError(f"Must supply column order to use {icol!s} as index")

            for i, c in enumerate(col_names):
                if i == icol:
                    return c

        to_remove = []
        index = []
        for idx in self.index_col:
            name = _get_name(idx)
            to_remove.append(name)
            index.append(data[name])

        # remove index items from content and columns, don't pop in
        # loop
        for c in sorted(to_remove, reverse=True):
            data.pop(c)
            col_names.remove(c)

        return index

    @final
    def _clean_mapping(self, mapping):
        """converts col numbers to names"""
        if not isinstance(mapping, dict):
            return mapping
        clean = {}
        # for mypy
        assert self.orig_names is not None

        for col, v in mapping.items():
            if isinstance(col, int) and col not in self.orig_names:
                col = self.orig_names[col]
            clean[col] = v
        if isinstance(mapping, defaultdict):
            remaining_cols = set(self.orig_names) - set(clean.keys())
            clean.update({col: mapping[col] for col in remaining_cols})
        return clean
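
    # Illustrative sketch (hypothetical names): a per-column mapping keyed by
    # position is resolved to column names, so later lookups can be done by
    # name only:
    #
    #     parser.orig_names = ["a", "b", "c"]
    #     parser._clean_mapping({0: int, "c": float})
    #     # -> {"a": int, "c": float}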

    @final
    def _agg_index(self, index, try_parse_dates: bool = True) -> Index:
        arrays = []
        converters = self._clean_mapping(self.converters)

        for i, arr in enumerate(index):
            if try_parse_dates and self._should_parse_dates(i):
                arr = self._date_conv(
                    arr,
                    col=self.index_names[i] if self.index_names is not None else None,
                )

            if self.na_filter:
                col_na_values = self.na_values
                col_na_fvalues = self.na_fvalues
            else:
                col_na_values = set()
                col_na_fvalues = set()

            if isinstance(self.na_values, dict):
                assert self.index_names is not None
                col_name = self.index_names[i]
                if col_name is not None:
                    col_na_values, col_na_fvalues = _get_na_values(
                        col_name, self.na_values, self.na_fvalues, self.keep_default_na
                    )

            clean_dtypes = self._clean_mapping(self.dtype)

            cast_type = None
            index_converter = False
            if self.index_names is not None:
                if isinstance(clean_dtypes, dict):
                    cast_type = clean_dtypes.get(self.index_names[i], None)

                if isinstance(converters, dict):
                    index_converter = converters.get(self.index_names[i]) is not None

            try_num_bool = not (
                cast_type and is_string_dtype(cast_type) or index_converter
            )

            arr, _ = self._infer_types(
                arr, col_na_values | col_na_fvalues, cast_type is None, try_num_bool
            )
            arrays.append(arr)

        names = self.index_names
        index = ensure_index_from_sequences(arrays, names)

        return index

    @final
    def _convert_to_ndarrays(
        self,
        dct: Mapping,
        na_values,
        na_fvalues,
        verbose: bool = False,
        converters=None,
        dtypes=None,
    ):
        result = {}
        for c, values in dct.items():
            conv_f = None if converters is None else converters.get(c, None)
            if isinstance(dtypes, dict):
                cast_type = dtypes.get(c, None)
            else:
                # single dtype or None
                cast_type = dtypes

            if self.na_filter:
                col_na_values, col_na_fvalues = _get_na_values(
                    c, na_values, na_fvalues, self.keep_default_na
                )
            else:
                col_na_values, col_na_fvalues = set(), set()

            if c in self._parse_date_cols:
                # GH#26203 Do not convert columns which get converted to dates
                # but replace nans to ensure to_datetime works
                mask = algorithms.isin(values, set(col_na_values) | col_na_fvalues)
                np.putmask(values, mask, np.nan)
                result[c] = values
                continue

            if conv_f is not None:
                # conv_f applied to data before inference
                if cast_type is not None:
                    warnings.warn(
                        (
                            "Both a converter and dtype were specified "
                            f"for column {c} - only the converter will be used."
                        ),
                        ParserWarning,
                        stacklevel=find_stack_level(),
                    )

                try:
                    values = lib.map_infer(values, conv_f)
                except ValueError:
                    mask = algorithms.isin(values, list(na_values)).view(np.uint8)
                    values = lib.map_infer_mask(values, conv_f, mask)

                cvals, na_count = self._infer_types(
                    values,
                    set(col_na_values) | col_na_fvalues,
                    cast_type is None,
                    try_num_bool=False,
                )
            else:
                is_ea = is_extension_array_dtype(cast_type)
                is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type)
                # skip inference if specified dtype is object
                # or casting to an EA
                try_num_bool = not (cast_type and is_str_or_ea_dtype)

                # general type inference and conversion
                cvals, na_count = self._infer_types(
                    values,
                    set(col_na_values) | col_na_fvalues,
                    cast_type is None,
                    try_num_bool,
                )

                # type specified in dtype param or cast_type is an EA
                if cast_type is not None:
                    cast_type = pandas_dtype(cast_type)
                    if cast_type and (cvals.dtype != cast_type or is_ea):
                        if not is_ea and na_count > 0:
                            if is_bool_dtype(cast_type):
                                raise ValueError(f"Bool column has NA values in column {c}")
                        cvals = self._cast_types(cvals, cast_type, c)

            result[c] = cvals
            if verbose and na_count:
                print(f"Filled {na_count} NA values in column {c!s}")
        return result
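
    # Illustrative sketch (hypothetical column name): when both a converter
    # and a dtype are given for the same column, the converter wins and a
    # ParserWarning is emitted, mirroring the branch above:
    #
    #     pd.read_csv(buf, converters={"a": str.strip}, dtype={"a": "int64"})
    #     # ParserWarning: Both a converter and dtype were specified
    #     # for column a - only the converter will be used.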

    @final
    def _set_noconvert_dtype_columns(
        self, col_indices: list[int], names: Sequence[Hashable]
    ) -> set[int]:
        """
        Set the columns that should not undergo dtype conversions.

        Currently, any column that is involved with date parsing will not
        undergo such conversions. If usecols is specified, the positions of
        the columns not to cast are relative to usecols, not to all columns.

        Parameters
        ----------
        col_indices: The indices specifying order and positions of the columns
        names: The column names, in an order corresponding to col_indices

        Returns
        -------
        A set of integers containing the positions of the columns not to convert.
        """
        usecols: list[int] | list[str] | None
        noconvert_columns = set()
        if self.usecols_dtype == "integer":
            # A set of integers will be converted to a list in
            # the correct order every single time.
            usecols = sorted(self.usecols)
        elif callable(self.usecols) or self.usecols_dtype not in ("empty", None):
            # The names attribute should have the correct columns
            # in the proper order for indexing with parse_dates.
            usecols = col_indices
        else:
            # Usecols is empty.
            usecols = None

        def _set(x) -> int:
            if usecols is not None and is_integer(x):
                x = usecols[x]

            if not is_integer(x):
                x = col_indices[names.index(x)]

            return x

        if isinstance(self.parse_dates, list):
            for val in self.parse_dates:
                if isinstance(val, list):
                    for k in val:
                        noconvert_columns.add(_set(k))
                else:
                    noconvert_columns.add(_set(val))

        elif isinstance(self.parse_dates, dict):
            for val in self.parse_dates.values():
                if isinstance(val, list):
                    for k in val:
                        noconvert_columns.add(_set(k))
                else:
                    noconvert_columns.add(_set(val))

        elif self.parse_dates:
            if isinstance(self.index_col, list):
                for k in self.index_col:
                    noconvert_columns.add(_set(k))
            elif self.index_col is not None:
                noconvert_columns.add(_set(self.index_col))

        return noconvert_columns
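
    # Illustrative sketch (hypothetical inputs): with integer usecols, a
    # positional parse_dates reference is translated through usecols first;
    # a name is resolved to its position via col_indices:
    #
    #     usecols=[1, 3], parse_dates=[0]       # 0 -> usecols[0] -> column 1
    #     names=["b", "d"], parse_dates=["d"]   # "d" -> col_indices[names.index("d")]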

    @final
    def _infer_types(
        self, values, na_values, no_dtype_specified, try_num_bool: bool = True
    ) -> tuple[ArrayLike, int]:
        """
        Infer types of values, possibly casting

        Parameters
        ----------
        values : ndarray
        na_values : set
        no_dtype_specified: True if no dtype was given explicitly for this column
        try_num_bool : bool, default True
            try to cast values to numeric (first preference) or boolean

        Returns
        -------
        converted : ndarray or ExtensionArray
        na_count : int
        """
        na_count = 0
        if issubclass(values.dtype.type, (np.number, np.bool_)):
            # If our array has numeric dtype, we don't have to check for strings in isin
            na_values = np.array([val for val in na_values if not isinstance(val, str)])
            mask = algorithms.isin(values, na_values)
            na_count = mask.astype("uint8", copy=False).sum()
            if na_count > 0:
                if is_integer_dtype(values):
                    values = values.astype(np.float64)
                np.putmask(values, mask, np.nan)
            return values, na_count

        dtype_backend = self.dtype_backend
        non_default_dtype_backend = (
            no_dtype_specified and dtype_backend is not lib.no_default
        )
        result: ArrayLike

        if try_num_bool and is_object_dtype(values.dtype):
            # exclude e.g. DatetimeIndex here
            try:
                result, result_mask = lib.maybe_convert_numeric(
                    values,
                    na_values,
                    False,
                    convert_to_masked_nullable=non_default_dtype_backend,  # type: ignore[arg-type]
                )
            except (ValueError, TypeError):
                # e.g. encountering datetime string gets ValueError
                # TypeError can be raised in floatify
                na_count = parsers.sanitize_objects(values, na_values)
                result = values
            else:
                if non_default_dtype_backend:
                    if result_mask is None:
                        result_mask = np.zeros(result.shape, dtype=np.bool_)

                    if result_mask.all():
                        result = IntegerArray(
                            np.ones(result_mask.shape, dtype=np.int64), result_mask
                        )
                    elif is_integer_dtype(result):
                        result = IntegerArray(result, result_mask)
                    elif is_bool_dtype(result):
                        result = BooleanArray(result, result_mask)
                    elif is_float_dtype(result):
                        result = FloatingArray(result, result_mask)

                    na_count = result_mask.sum()
                else:
                    na_count = isna(result).sum()
        else:
            result = values
            if values.dtype == np.object_:
                na_count = parsers.sanitize_objects(values, na_values)

        if result.dtype == np.object_ and try_num_bool:
            result, bool_mask = libops.maybe_convert_bool(
                np.asarray(values),
                true_values=self.true_values,
                false_values=self.false_values,
                convert_to_masked_nullable=non_default_dtype_backend,  # type: ignore[arg-type]
            )
            if result.dtype == np.bool_ and non_default_dtype_backend:
                if bool_mask is None:
                    bool_mask = np.zeros(result.shape, dtype=np.bool_)
                result = BooleanArray(result, bool_mask)
            elif result.dtype == np.object_ and non_default_dtype_backend:
                # read_excel sends array of datetime objects
                if not lib.is_datetime_array(result, skipna=True):
                    dtype = StringDtype()
                    cls = dtype.construct_array_type()
                    result = cls._from_sequence(values, dtype=dtype)

        if dtype_backend == "pyarrow":
            pa = import_optional_dependency("pyarrow")
            if isinstance(result, np.ndarray):
                result = ArrowExtensionArray(pa.array(result, from_pandas=True))
            elif isinstance(result, BaseMaskedArray):
                if result._mask.all():
                    # We want an arrow null array here
                    result = ArrowExtensionArray(pa.array([None] * len(result)))
                else:
                    result = ArrowExtensionArray(
                        pa.array(result._data, mask=result._mask)
                    )
            else:
                result = ArrowExtensionArray(
                    pa.array(result.to_numpy(), from_pandas=True)
                )

        return result, na_count
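
    # Illustrative sketch of the inference cascade above (hypothetical data):
    # numeric first, then boolean, then fall back to object/string:
    #
    #     ["1", "2", "NA"]   -> float64 [1.0, 2.0, nan], na_count=1
    #     ["True", "False"]  -> bool array (via maybe_convert_bool)
    #     ["x", "y"]         -> object (or StringDtype/Arrow-backed with a
    #                           non-default dtype_backend)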

    @final
    def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLike:
        """
        Cast values to specified type

        Parameters
        ----------
        values : ndarray or ExtensionArray
        cast_type : np.dtype or ExtensionDtype
            dtype to cast values to
        column : string
            column name - used only for error reporting

        Returns
        -------
        converted : ndarray or ExtensionArray
        """
        if isinstance(cast_type, CategoricalDtype):
            known_cats = cast_type.categories is not None

            if not is_object_dtype(values.dtype) and not known_cats:
                # TODO: this is for consistency with
                # c-parser which parses all categories
                # as strings
                values = lib.ensure_string_array(
                    values, skipna=False, convert_na_value=False
                )

            cats = Index(values).unique().dropna()
            values = Categorical._from_inferred_categories(
                cats, cats.get_indexer(values), cast_type, true_values=self.true_values
            )

        # use the EA's implementation of casting
        elif isinstance(cast_type, ExtensionDtype):
            array_type = cast_type.construct_array_type()
            try:
                if isinstance(cast_type, BooleanDtype):
                    # error: Unexpected keyword argument "true_values" for
                    # "_from_sequence_of_strings" of "ExtensionArray"
                    return array_type._from_sequence_of_strings(  # type: ignore[call-arg]
                        values,
                        dtype=cast_type,
                        true_values=self.true_values,
                        false_values=self.false_values,
                    )
                else:
                    return array_type._from_sequence_of_strings(values, dtype=cast_type)
            except NotImplementedError as err:
                raise NotImplementedError(
                    f"Extension Array: {array_type} must implement "
                    "_from_sequence_of_strings in order to be used in parser methods"
                ) from err

        elif isinstance(values, ExtensionArray):
            values = values.astype(cast_type, copy=False)
        elif issubclass(cast_type.type, str):
            # TODO: why skipna=True here and False above? some tests depend
            # on it here, but nothing fails if we change it above
            # (as no tests get there as of 2022-12-06)
            values = lib.ensure_string_array(
                values, skipna=True, convert_na_value=False
            )
        else:
            try:
                values = astype_array(values, cast_type, copy=True)
            except ValueError as err:
                raise ValueError(
                    f"Unable to convert column {column} to type {cast_type}"
                ) from err
        return values
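
    # Illustrative sketch (hypothetical data): with dtype="category" the parsed
    # strings themselves become the categories; with an explicit
    # CategoricalDtype the given categories are used instead:
    #
    #     pd.read_csv(buf, dtype={"a": "category"})
    #     pd.read_csv(buf, dtype={"a": pd.CategoricalDtype(["x", "y"])})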

    @overload
    def _do_date_conversions(
        self,
        names: Index,
        data: DataFrame,
    ) -> tuple[Sequence[Hashable] | Index, DataFrame]:
        ...

    @overload
    def _do_date_conversions(
        self,
        names: Sequence[Hashable],
        data: Mapping[Hashable, ArrayLike],
    ) -> tuple[Sequence[Hashable], Mapping[Hashable, ArrayLike]]:
        ...

    @final
    def _do_date_conversions(
        self,
        names: Sequence[Hashable] | Index,
        data: Mapping[Hashable, ArrayLike] | DataFrame,
    ) -> tuple[Sequence[Hashable] | Index, Mapping[Hashable, ArrayLike] | DataFrame]:
        # returns data, columns

        if self.parse_dates is not None:
            data, names = _process_date_conversion(
                data,
                self._date_conv,
                self.parse_dates,
                self.index_col,
                self.index_names,
                names,
                keep_date_col=self.keep_date_col,
                dtype_backend=self.dtype_backend,
            )

        return names, data

    @final
    def _check_data_length(
        self,
        columns: Sequence[Hashable],
        data: Sequence[ArrayLike],
    ) -> None:
        """Checks if length of data is equal to length of column names.

        One set of trailing commas is allowed. If self.index_col is not False,
        a ParserError is raised earlier when the lengths do not match.

        Parameters
        ----------
        columns: list of column names
        data: list of array-likes containing the data column-wise.
        """
        if not self.index_col and len(columns) != len(data) and columns:
            empty_str = is_object_dtype(data[-1]) and data[-1] == ""
            # error: No overload variant of "__ror__" of "ndarray" matches
            # argument type "ExtensionArray"
            empty_str_or_na = empty_str | isna(data[-1])  # type: ignore[operator]
            if len(columns) == len(data) - 1 and np.all(empty_str_or_na):
                return
            warnings.warn(
                "Length of header or names does not match length of data. This leads "
                "to a loss of data with index_col=False.",
                ParserWarning,
                stacklevel=find_stack_level(),
            )

    @overload
    def _evaluate_usecols(
        self,
        usecols: set[int] | Callable[[Hashable], object],
        names: Sequence[Hashable],
    ) -> set[int]:
        ...

    @overload
    def _evaluate_usecols(
        self, usecols: set[str], names: Sequence[Hashable]
    ) -> set[str]:
        ...

    @final
    def _evaluate_usecols(
        self,
        usecols: Callable[[Hashable], object] | set[str] | set[int],
        names: Sequence[Hashable],
    ) -> set[str] | set[int]:
        """
        Check whether or not the 'usecols' parameter
        is a callable. If so, enumerate the 'names'
        parameter and return a set of indices for
        each entry in 'names' that evaluates to True.
        If not a callable, return 'usecols'.
        """
        if callable(usecols):
            return {i for i, name in enumerate(names) if usecols(name)}
        return usecols
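
    # Illustrative sketch (hypothetical names): a callable usecols is applied
    # to each column name and the matching positions are kept:
    #
    #     parser._evaluate_usecols(lambda n: n.startswith("a"), ["ab", "b", "ac"])
    #     # -> {0, 2}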

    @final
    def _validate_usecols_names(self, usecols, names: Sequence):
        """
        Validates that all usecols are present in a given
        list of names. If not, raise a ValueError that
        shows what usecols are missing.

        Parameters
        ----------
        usecols : iterable of usecols
            The columns to validate are present in names.
        names : iterable of names
            The column names to check against.

        Returns
        -------
        usecols : iterable of usecols
            The `usecols` parameter if the validation succeeds.

        Raises
        ------
        ValueError : Columns were missing. Error message will list them.
        """
        missing = [c for c in usecols if c not in names]
        if len(missing) > 0:
            raise ValueError(
                f"Usecols do not match columns, columns expected but not found: "
                f"{missing}"
            )

        return usecols

    @final
    def _validate_usecols_arg(self, usecols):
        """
        Validate the 'usecols' parameter.

        Checks whether or not the 'usecols' parameter contains all integers
        (column selection by index), strings (column by name) or is a callable.
        Raises a ValueError if that is not the case.

        Parameters
        ----------
        usecols : list-like, callable, or None
            List of columns to use when parsing or a callable that can be used
            to filter a list of table columns.

        Returns
        -------
        usecols_tuple : tuple
            A tuple of (verified_usecols, usecols_dtype).

            'verified_usecols' is either a set if an array-like is passed in or
            'usecols' if a callable or None is passed in.

            'usecols_dtype' is the inferred dtype of 'usecols' if an array-like
            is passed in or None if a callable or None is passed in.
        """
        msg = (
            "'usecols' must either be list-like of all strings, all unicode, "
            "all integers or a callable."
        )
        if usecols is not None:
            if callable(usecols):
                return usecols, None

            if not is_list_like(usecols):
                # see gh-20529
                #
                # Ensure it is iterable container but not string.
                raise ValueError(msg)

            usecols_dtype = lib.infer_dtype(usecols, skipna=False)

            if usecols_dtype not in ("empty", "integer", "string"):
                raise ValueError(msg)

            usecols = set(usecols)

            return usecols, usecols_dtype
        return usecols, None
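
    # Illustrative sketch: the validated forms and their inferred dtypes
    # (values here are hypothetical):
    #
    #     parser._validate_usecols_arg(["a", "b"])  # -> ({"a", "b"}, "string")
    #     parser._validate_usecols_arg([0, 2])      # -> ({0, 2}, "integer")
    #     parser._validate_usecols_arg(len)         # -> (len, None)
    #     parser._validate_usecols_arg([0, "a"])    # -> ValueError (mixed types)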

    @final
    def _clean_index_names(self, columns, index_col) -> tuple[list | None, list, list]:
        if not is_index_col(index_col):
            return None, columns, index_col

        columns = list(columns)

        # In case of no rows and multiindex columns we have to set index_names to
        # list of Nones GH#38292
        if not columns:
            return [None] * len(index_col), columns, index_col

        cp_cols = list(columns)
        index_names: list[str | int | None] = []

        # don't mutate
        index_col = list(index_col)

        for i, c in enumerate(index_col):
            if isinstance(c, str):
                index_names.append(c)
                for j, name in enumerate(cp_cols):
                    if name == c:
                        index_col[i] = j
                        columns.remove(name)
                        break
            else:
                name = cp_cols[c]
                columns.remove(name)
                index_names.append(name)

        # Only clean index names that were placeholders.
        for i, name in enumerate(index_names):
            if isinstance(name, str) and name in self.unnamed_cols:
                index_names[i] = None

        return index_names, columns, index_col

    @final
    def _get_empty_meta(self, columns, dtype: DtypeArg | None = None):
        columns = list(columns)

        index_col = self.index_col
        index_names = self.index_names

        # Convert `dtype` to a defaultdict of some kind.
        # This will enable us to write `dtype[col_name]`
        # without worrying about KeyError issues later on.
        dtype_dict: defaultdict[Hashable, Any]
        if not is_dict_like(dtype):
            # if dtype == None, default will be object.
            default_dtype = dtype or object
            dtype_dict = defaultdict(lambda: default_dtype)
        else:
            dtype = cast(dict, dtype)
            dtype_dict = defaultdict(
                lambda: object,
                {columns[k] if is_integer(k) else k: v for k, v in dtype.items()},
            )

        # Even though we have no data, the "index" of the empty DataFrame
        # could for example still be an empty MultiIndex. Thus, we need to
        # check whether we have any index columns specified, via either:
        #
        # 1) index_col (column indices)
        # 2) index_names (column names)
        #
        # Both must be non-null to ensure a successful construction. Otherwise,
        # we have to create a generic empty Index.
        index: Index
        if (index_col is None or index_col is False) or index_names is None:
            index = default_index(0)
        else:
            data = [Series([], dtype=dtype_dict[name]) for name in index_names]
            index = ensure_index_from_sequences(data, names=index_names)
            index_col.sort()

            for i, n in enumerate(index_col):
                columns.pop(n - i)

        col_dict = {
            col_name: Series([], dtype=dtype_dict[col_name]) for col_name in columns
        }

        return index, columns, col_dict
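
    # Illustrative sketch (hypothetical columns, assuming no index_col is set):
    # for an empty file the per-column dtype mapping still drives the empty
    # Series that are built; positions in `dtype` are resolved through
    # `columns`:
    #
    #     parser._get_empty_meta(["a", "b"], dtype={0: "int64"})
    #     # -> empty index, columns ["a", "b"], "a" as int64 and "b" as object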


def _make_date_converter(
    date_parser=lib.no_default,
    dayfirst: bool = False,
    cache_dates: bool = True,
    date_format: dict[Hashable, str] | str | None = None,
):
    if date_parser is not lib.no_default:
        warnings.warn(
            "The argument 'date_parser' is deprecated and will "
            "be removed in a future version. "
            "Please use 'date_format' instead, or read your data in as 'object' dtype "
            "and then call 'to_datetime'.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    if date_parser is not lib.no_default and date_format is not None:
        raise TypeError("Cannot use both 'date_parser' and 'date_format'")

    def unpack_if_single_element(arg):
        # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615
        if isinstance(arg, np.ndarray) and arg.ndim == 1 and len(arg) == 1:
            return arg[0]
        return arg

    def converter(*date_cols, col: Hashable):
        if len(date_cols) == 1 and date_cols[0].dtype.kind in "Mm":
            return date_cols[0]

        if date_parser is lib.no_default:
            strs = parsing.concat_date_cols(date_cols)
            date_fmt = (
                date_format.get(col) if isinstance(date_format, dict) else date_format
            )

            with warnings.catch_warnings():
                warnings.filterwarnings(
                    "ignore",
                    ".*parsing datetimes with mixed time zones will raise an error",
                    category=FutureWarning,
                )
                str_objs = ensure_object(strs)
                try:
                    result = tools.to_datetime(
                        str_objs,
                        format=date_fmt,
                        utc=False,
                        dayfirst=dayfirst,
                        cache=cache_dates,
                    )
                except (ValueError, TypeError):
                    # test_usecols_with_parse_dates4
                    return str_objs

                if isinstance(result, DatetimeIndex):
                    arr = result.to_numpy()
                    arr.flags.writeable = True
                    return arr
                return result._values
        else:
            try:
                with warnings.catch_warnings():
                    warnings.filterwarnings(
                        "ignore",
                        ".*parsing datetimes with mixed time zones "
                        "will raise an error",
                        category=FutureWarning,
                    )
                    pre_parsed = date_parser(
                        *(unpack_if_single_element(arg) for arg in date_cols)
                    )
                    try:
                        result = tools.to_datetime(
                            pre_parsed,
                            cache=cache_dates,
                        )
                    except (ValueError, TypeError):
                        # test_read_csv_with_custom_date_parser
                        result = pre_parsed
                    if isinstance(result, datetime.datetime):
                        raise Exception("scalar parser")
                    return result
            except Exception:
                # e.g. test_datetime_fractional_seconds
                with warnings.catch_warnings():
                    warnings.filterwarnings(
                        "ignore",
                        ".*parsing datetimes with mixed time zones "
                        "will raise an error",
                        category=FutureWarning,
                    )
                    pre_parsed = parsing.try_parse_dates(
                        parsing.concat_date_cols(date_cols),
                        parser=date_parser,
                    )
                    try:
                        return tools.to_datetime(pre_parsed)
                    except (ValueError, TypeError):
                        # TODO: not reached in tests 2023-10-27; needed?
                        return pre_parsed

    return converter
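
# Illustrative sketch (hypothetical data): the default converter concatenates
# the given column(s) into strings and routes them through to_datetime;
# unparseable input falls back to the original strings:
#
#     conv = _make_date_converter(date_format="%Y-%m-%d")
#     conv(np.array(["2020-01-01", "2020-01-02"], dtype=object), col="date")
#     # -> datetime64[ns] values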


parser_defaults = {
    "delimiter": None,
    "escapechar": None,
    "quotechar": '"',
    "quoting": csv.QUOTE_MINIMAL,
    "doublequote": True,
    "skipinitialspace": False,
    "lineterminator": None,
    "header": "infer",
    "index_col": None,
    "names": None,
    "skiprows": None,
    "skipfooter": 0,
    "nrows": None,
    "na_values": None,
    "keep_default_na": True,
    "true_values": None,
    "false_values": None,
    "converters": None,
    "dtype": None,
    "cache_dates": True,
    "thousands": None,
    "comment": None,
    "decimal": ".",
    # 'engine': 'c',
    "parse_dates": False,
    "keep_date_col": False,
    "dayfirst": False,
    "date_parser": lib.no_default,
    "date_format": None,
    "usecols": None,
    # 'iterator': False,
    "chunksize": None,
    "verbose": False,
    "encoding": None,
    "compression": None,
    "skip_blank_lines": True,
    "encoding_errors": "strict",
    "on_bad_lines": ParserBase.BadLineHandleMethod.ERROR,
    "dtype_backend": lib.no_default,
}


def _process_date_conversion(
    data_dict,
    converter: Callable,
    parse_spec,
    index_col,
    index_names,
    columns,
    keep_date_col: bool = False,
    dtype_backend=lib.no_default,
):
    def _isindex(colspec):
        return (isinstance(index_col, list) and colspec in index_col) or (
            isinstance(index_names, list) and colspec in index_names
        )

    new_cols = []
    new_data = {}

    orig_names = columns
    columns = list(columns)

    date_cols = set()

    if parse_spec is None or isinstance(parse_spec, bool):
        return data_dict, columns

    if isinstance(parse_spec, list):
        # list of column lists
        for colspec in parse_spec:
            if is_scalar(colspec) or isinstance(colspec, tuple):
                if isinstance(colspec, int) and colspec not in data_dict:
                    colspec = orig_names[colspec]
                if _isindex(colspec):
                    continue
                elif dtype_backend == "pyarrow":
                    import pyarrow as pa

                    dtype = data_dict[colspec].dtype
                    if isinstance(dtype, ArrowDtype) and (
                        pa.types.is_timestamp(dtype.pyarrow_dtype)
                        or pa.types.is_date(dtype.pyarrow_dtype)
                    ):
                        continue

                # Pyarrow engine returns Series which we need to convert to
                # numpy array before converter; it's a no-op for other parsers
                data_dict[colspec] = converter(
                    np.asarray(data_dict[colspec]), col=colspec
                )
            else:
                new_name, col, old_names = _try_convert_dates(
                    converter, colspec, data_dict, orig_names
                )
                if new_name in data_dict:
                    raise ValueError(f"New date column already in dict {new_name}")
                new_data[new_name] = col
                new_cols.append(new_name)
                date_cols.update(old_names)

    elif isinstance(parse_spec, dict):
        # dict of new name to column list
        for new_name, colspec in parse_spec.items():
            if new_name in data_dict:
                raise ValueError(f"Date column {new_name} already in dict")

            _, col, old_names = _try_convert_dates(
                converter,
                colspec,
                data_dict,
                orig_names,
                target_name=new_name,
            )

            new_data[new_name] = col

            # If original column can be converted to date we keep the converted values
            # This can only happen if values are from single column
            if len(colspec) == 1:
                new_data[colspec[0]] = col

            new_cols.append(new_name)
            date_cols.update(old_names)

    if isinstance(data_dict, DataFrame):
        data_dict = concat([DataFrame(new_data), data_dict], axis=1, copy=False)
    else:
        data_dict.update(new_data)
    new_cols.extend(columns)

    if not keep_date_col:
        for c in list(date_cols):
            data_dict.pop(c)
            new_cols.remove(c)

    return data_dict, new_cols
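
# Illustrative sketch (hypothetical columns): a nested list combines columns
# into a new one named by joining the parts; a dict names the result itself:
#
#     parse_dates=[["date", "time"]]         # -> new column "date_time"
#     parse_dates={"ts": ["date", "time"]}   # -> new column "ts"
#
# With keep_date_col=False (the default) the source columns are dropped.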


def _try_convert_dates(
    parser: Callable, colspec, data_dict, columns, target_name: str | None = None
):
    colset = set(columns)
    colnames = []

    for c in colspec:
        if c in colset:
            colnames.append(c)
        elif isinstance(c, int) and c not in columns:
            colnames.append(columns[c])
        else:
            colnames.append(c)

    new_name: tuple | str
    if all(isinstance(x, tuple) for x in colnames):
        new_name = tuple(map("_".join, zip(*colnames)))
    else:
        new_name = "_".join([str(x) for x in colnames])
    to_parse = [np.asarray(data_dict[c]) for c in colnames if c in data_dict]

    new_col = parser(*to_parse, col=new_name if target_name is None else target_name)
    return new_name, new_col, colnames


def _get_na_values(col, na_values, na_fvalues, keep_default_na: bool):
    """
    Get the NaN values for a given column.

    Parameters
    ----------
    col : str
        The name of the column.
    na_values : array-like, dict
        The object listing the NaN values as strings.
    na_fvalues : array-like, dict
        The object listing the NaN values as floats.
    keep_default_na : bool
        If `na_values` is a dict, and the column is not mapped in the
        dictionary, whether to return the default NaN values or the empty set.

    Returns
    -------
    nan_tuple : A length-two tuple composed of

        1) na_values : the string NaN values for that column.
        2) na_fvalues : the float NaN values for that column.
    """
    if isinstance(na_values, dict):
        if col in na_values:
            return na_values[col], na_fvalues[col]
        else:
            if keep_default_na:
                return STR_NA_VALUES, set()

            return set(), set()
    else:
        return na_values, na_fvalues
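
# Illustrative sketch (hypothetical column names): with a dict na_values, an
# unmapped column falls back to the defaults only when keep_default_na=True:
#
#     _get_na_values("a", {"a": ["-"]}, {"a": set()}, True)   # -> (["-"], set())
#     _get_na_values("b", {"a": ["-"]}, {"a": set()}, True)   # -> (STR_NA_VALUES, set())
#     _get_na_values("b", {"a": ["-"]}, {"a": set()}, False)  # -> (set(), set())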


def _validate_parse_dates_arg(parse_dates):
    """
    Check whether or not the 'parse_dates' parameter
    is a non-boolean scalar. Raises a TypeError if
    that is the case.
    """
    msg = (
        "Only booleans, lists, and dictionaries are accepted "
        "for the 'parse_dates' parameter"
    )

    if not (
        parse_dates is None
        or lib.is_bool(parse_dates)
        or isinstance(parse_dates, (list, dict))
    ):
        raise TypeError(msg)

    return parse_dates


def is_index_col(col) -> bool:
    return col is not None and col is not False