from __future__ import annotations

from collections import defaultdict
from copy import copy
import csv
import datetime
from enum import Enum
import itertools
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Hashable,
    Iterable,
    List,
    Mapping,
    Sequence,
    Tuple,
    cast,
    final,
    overload,
)
import warnings

import numpy as np

from pandas._libs import (
    lib,
    parsers,
)
import pandas._libs.ops as libops
from pandas._libs.parsers import STR_NA_VALUES
from pandas._libs.tslibs import parsing
from pandas._typing import (
    ArrayLike,
    DtypeArg,
    DtypeObj,
    Scalar,
)
from pandas.compat._optional import import_optional_dependency
from pandas.errors import (
    ParserError,
    ParserWarning,
)
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.astype import astype_array
from pandas.core.dtypes.common import (
    ensure_object,
    is_bool_dtype,
    is_dict_like,
    is_dtype_equal,
    is_extension_array_dtype,
    is_float_dtype,
    is_integer,
    is_integer_dtype,
    is_list_like,
    is_object_dtype,
    is_scalar,
    is_string_dtype,
    pandas_dtype,
)
from pandas.core.dtypes.dtypes import (
    CategoricalDtype,
    ExtensionDtype,
)
from pandas.core.dtypes.missing import isna

from pandas import (
    ArrowDtype,
    DatetimeIndex,
    StringDtype,
)
from pandas.core import algorithms
from pandas.core.arrays import (
    ArrowExtensionArray,
    BooleanArray,
    Categorical,
    ExtensionArray,
    FloatingArray,
    IntegerArray,
)
from pandas.core.arrays.boolean import BooleanDtype
from pandas.core.indexes.api import (
    Index,
    MultiIndex,
    default_index,
    ensure_index_from_sequences,
)
from pandas.core.series import Series
from pandas.core.tools import datetimes as tools

from pandas.io.common import is_potential_multi_index

if TYPE_CHECKING:
    from pandas import DataFrame


class ParserBase:
    class BadLineHandleMethod(Enum):
        ERROR = 0
        WARN = 1
        SKIP = 2

    _implicit_index: bool = False
    _first_chunk: bool

    def __init__(self, kwds) -> None:
        self.names = kwds.get("names")
        self.orig_names: Sequence[Hashable] | None = None

        self.index_col = kwds.get("index_col", None)
        self.unnamed_cols: set = set()
        self.index_names: Sequence[Hashable] | None = None
        self.col_names: Sequence[Hashable] | None = None

        self.parse_dates = _validate_parse_dates_arg(kwds.pop("parse_dates", False))
        self._parse_date_cols: Iterable = []
        self.date_parser = kwds.pop("date_parser", lib.no_default)
        self.date_format = kwds.pop("date_format", None)
        self.dayfirst = kwds.pop("dayfirst", False)
        self.keep_date_col = kwds.pop("keep_date_col", False)

        self.na_values = kwds.get("na_values")
        self.na_fvalues = kwds.get("na_fvalues")
        self.na_filter = kwds.get("na_filter", False)
        self.keep_default_na = kwds.get("keep_default_na", True)

        self.dtype = copy(kwds.get("dtype", None))
        self.converters = kwds.get("converters")
        self.dtype_backend = kwds.get("dtype_backend")

        self.true_values = kwds.get("true_values")
        self.false_values = kwds.get("false_values")
        self.cache_dates = kwds.pop("cache_dates", True)

        self._date_conv = _make_date_converter(
            date_parser=self.date_parser,
            date_format=self.date_format,
            dayfirst=self.dayfirst,
            cache_dates=self.cache_dates,
        )

        # validate header options for mi
        self.header = kwds.get("header")
        if is_list_like(self.header, allow_sets=False):
            if kwds.get("usecols"):
                raise ValueError(
                    "cannot specify usecols when specifying a multi-index header"
                )
            if kwds.get("names"):
                raise ValueError(
                    "cannot specify names when specifying a multi-index header"
                )

            # validate index_col that only contains integers
            if self.index_col is not None:
                if not (
                    is_list_like(self.index_col, allow_sets=False)
                    and all(map(is_integer, self.index_col))
                    or is_integer(self.index_col)
                ):
                    raise ValueError(
                        "index_col must only contain row numbers "
                        "when specifying a multi-index header"
                    )

        self._name_processed = False

        self._first_chunk = True

        self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"])

        # Fall back to error to pass a sketchy test
        # (test_override_set_noconvert_columns). Normally, this arg would get
        # pre-processed earlier on.
        self.on_bad_lines = kwds.get("on_bad_lines", self.BadLineHandleMethod.ERROR)

    def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> Iterable:
        """
        Check if parse_dates are in columns.

        If the user has provided names for parse_dates, check if those columns
        are available.

        Parameters
        ----------
        columns : list
            List of names of the dataframe.

        Returns
        -------
        The names of the columns which will get parsed later if a dict or list
        is given as specification.

        Raises
        ------
        ValueError
            If a column to parse dates is not in the dataframe.

        """
        cols_needed: Iterable
        if is_dict_like(self.parse_dates):
            cols_needed = itertools.chain(*self.parse_dates.values())
        elif is_list_like(self.parse_dates):
            # a column in parse_dates could be represented
            # ColReference = Union[int, str]
            # DateGroups = List[ColReference]
            # ParseDates = Union[DateGroups, List[DateGroups],
            #     Dict[ColReference, DateGroups]]
            cols_needed = itertools.chain.from_iterable(
                col if is_list_like(col) and not isinstance(col, tuple) else [col]
                for col in self.parse_dates
            )
        else:
            cols_needed = []

        cols_needed = list(cols_needed)

        # get only columns that are referenced by name (str), not by index
        missing_cols = ", ".join(
            sorted(
                {
                    col
                    for col in cols_needed
                    if isinstance(col, str) and col not in columns
                }
            )
        )
        if missing_cols:
            raise ValueError(
                f"Missing column provided to 'parse_dates': '{missing_cols}'"
            )
        # Convert positions to actual column names
        return [
            col if (isinstance(col, str) or col in columns) else columns[col]
            for col in cols_needed
        ]
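
    # Illustrative sketch (editor's note, not part of the parser): with
    # hypothetical columns ["a", "b", "c"], the method above resolves a
    # parse_dates spec to column names, e.g.:
    #
    #     parse_dates=["a", 2]           -> ["a", "c"]  (position 2 resolved)
    #     parse_dates={"ab": ["a", "b"]} -> ["a", "b"]
    #     parse_dates=["missing"]        -> ValueError (column not found)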

    def close(self) -> None:
        pass

    @final
    @property
    def _has_complex_date_col(self) -> bool:
        return isinstance(self.parse_dates, dict) or (
            isinstance(self.parse_dates, list)
            and len(self.parse_dates) > 0
            and isinstance(self.parse_dates[0], list)
        )

    @final
    def _should_parse_dates(self, i: int) -> bool:
        if isinstance(self.parse_dates, bool):
            return self.parse_dates
        else:
            if self.index_names is not None:
                name = self.index_names[i]
            else:
                name = None
            j = i if self.index_col is None else self.index_col[i]

            if is_scalar(self.parse_dates):
                return (j == self.parse_dates) or (
                    name is not None and name == self.parse_dates
                )
            else:
                return (j in self.parse_dates) or (
                    name is not None and name in self.parse_dates
                )

    @final
    def _extract_multi_indexer_columns(
        self,
        header,
        index_names: Sequence[Hashable] | None,
        passed_names: bool = False,
    ) -> tuple[
        Sequence[Hashable], Sequence[Hashable] | None, Sequence[Hashable] | None, bool
    ]:
280 """
281 Extract and return the names, index_names, col_names if the column
282 names are a MultiIndex.
283
284 Parameters
285 ----------
286 header: list of lists
287 The header rows
288 index_names: list, optional
289 The names of the future index
290 passed_names: bool, default False
291 A flag specifying if names where passed
292
293 """
        if len(header) < 2:
            return header[0], index_names, None, passed_names

        # the names are the tuples of the header that are not the index cols
        # 0 is the name of the index, assuming index_col is a list of column
        # numbers
        ic = self.index_col
        if ic is None:
            ic = []

        if not isinstance(ic, (list, tuple, np.ndarray)):
            ic = [ic]
        sic = set(ic)

        # clean the index_names
        index_names = header.pop(-1)
        index_names, _, _ = self._clean_index_names(index_names, self.index_col)

        # extract the columns
        field_count = len(header[0])

        # check if header lengths are equal
        if not all(len(header_iter) == field_count for header_iter in header[1:]):
            raise ParserError("Header rows must have an equal number of columns.")

        def extract(r):
            return tuple(r[i] for i in range(field_count) if i not in sic)

        columns = list(zip(*(extract(r) for r in header)))
        names = columns.copy()
        for single_ic in sorted(ic):
            names.insert(single_ic, single_ic)

        # Clean the column names (if we have an index_col).
        if len(ic):
            col_names = [
                r[ic[0]]
                if ((r[ic[0]] is not None) and r[ic[0]] not in self.unnamed_cols)
                else None
                for r in header
            ]
        else:
            col_names = [None] * len(header)

        passed_names = True

        return names, index_names, col_names, passed_names

    @final
    def _maybe_make_multi_index_columns(
        self,
        columns: Sequence[Hashable],
        col_names: Sequence[Hashable] | None = None,
    ) -> Sequence[Hashable] | MultiIndex:
        # possibly create a column mi here
        if is_potential_multi_index(columns):
            list_columns = cast(List[Tuple], columns)
            return MultiIndex.from_tuples(list_columns, names=col_names)
        return columns
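
    # Illustrative sketch (editor's note): if every column label is a tuple,
    # e.g. columns = [("a", "one"), ("a", "two")], the helper above returns
    # MultiIndex.from_tuples(columns, names=col_names); any other sequence of
    # labels is passed through unchanged.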

    @final
    def _make_index(
        self, data, alldata, columns, indexnamerow: list[Scalar] | None = None
    ) -> tuple[Index | None, Sequence[Hashable] | MultiIndex]:
        index: Index | None
        if not is_index_col(self.index_col) or not self.index_col:
            index = None

        elif not self._has_complex_date_col:
            simple_index = self._get_simple_index(alldata, columns)
            index = self._agg_index(simple_index)
        elif self._has_complex_date_col:
            if not self._name_processed:
                (self.index_names, _, self.index_col) = self._clean_index_names(
                    list(columns), self.index_col
                )
                self._name_processed = True
            date_index = self._get_complex_date_index(data, columns)
            index = self._agg_index(date_index, try_parse_dates=False)

        # add names for the index
        if indexnamerow:
            coffset = len(indexnamerow) - len(columns)
            assert index is not None
            index = index.set_names(indexnamerow[:coffset])

        # maybe create a mi on the columns
        columns = self._maybe_make_multi_index_columns(columns, self.col_names)

        return index, columns

    @final
    def _get_simple_index(self, data, columns):
        def ix(col):
            if not isinstance(col, str):
                return col
            raise ValueError(f"Index {col} invalid")

        to_remove = []
        index = []
        for idx in self.index_col:
            i = ix(idx)
            to_remove.append(i)
            index.append(data[i])

        # remove index items from content and columns, don't pop in
        # loop
        for i in sorted(to_remove, reverse=True):
            data.pop(i)
            if not self._implicit_index:
                columns.pop(i)

        return index

    @final
    def _get_complex_date_index(self, data, col_names):
        def _get_name(icol):
            if isinstance(icol, str):
                return icol

            if col_names is None:
                raise ValueError(f"Must supply column order to use {icol!s} as index")

            for i, c in enumerate(col_names):
                if i == icol:
                    return c

        to_remove = []
        index = []
        for idx in self.index_col:
            name = _get_name(idx)
            to_remove.append(name)
            index.append(data[name])

        # remove index items from content and columns, don't pop in
        # loop
        for c in sorted(to_remove, reverse=True):
            data.pop(c)
            col_names.remove(c)

        return index

    def _clean_mapping(self, mapping):
        """converts col numbers to names"""
        if not isinstance(mapping, dict):
            return mapping
        clean = {}
        # for mypy
        assert self.orig_names is not None

        for col, v in mapping.items():
            if isinstance(col, int) and col not in self.orig_names:
                col = self.orig_names[col]
            clean[col] = v
        if isinstance(mapping, defaultdict):
            remaining_cols = set(self.orig_names) - set(clean.keys())
            clean.update({col: mapping[col] for col in remaining_cols})
        return clean

    @final
    def _agg_index(self, index, try_parse_dates: bool = True) -> Index:
        arrays = []
        converters = self._clean_mapping(self.converters)

        for i, arr in enumerate(index):
            if try_parse_dates and self._should_parse_dates(i):
                arr = self._date_conv(
                    arr,
                    col=self.index_names[i] if self.index_names is not None else None,
                )

            if self.na_filter:
                col_na_values = self.na_values
                col_na_fvalues = self.na_fvalues
            else:
                col_na_values = set()
                col_na_fvalues = set()

            if isinstance(self.na_values, dict):
                assert self.index_names is not None
                col_name = self.index_names[i]
                if col_name is not None:
                    col_na_values, col_na_fvalues = _get_na_values(
                        col_name, self.na_values, self.na_fvalues, self.keep_default_na
                    )

            clean_dtypes = self._clean_mapping(self.dtype)

            cast_type = None
            index_converter = False
            if self.index_names is not None:
                if isinstance(clean_dtypes, dict):
                    cast_type = clean_dtypes.get(self.index_names[i], None)

                if isinstance(converters, dict):
                    index_converter = converters.get(self.index_names[i]) is not None

            try_num_bool = not (
                cast_type and is_string_dtype(cast_type) or index_converter
            )

            arr, _ = self._infer_types(
                arr, col_na_values | col_na_fvalues, cast_type is None, try_num_bool
            )
            arrays.append(arr)

        names = self.index_names
        index = ensure_index_from_sequences(arrays, names)

        return index

    @final
    def _convert_to_ndarrays(
        self,
        dct: Mapping,
        na_values,
        na_fvalues,
        verbose: bool = False,
        converters=None,
        dtypes=None,
    ):
        result = {}
        for c, values in dct.items():
            conv_f = None if converters is None else converters.get(c, None)
            if isinstance(dtypes, dict):
                cast_type = dtypes.get(c, None)
            else:
                # single dtype or None
                cast_type = dtypes

            if self.na_filter:
                col_na_values, col_na_fvalues = _get_na_values(
                    c, na_values, na_fvalues, self.keep_default_na
                )
            else:
                col_na_values, col_na_fvalues = set(), set()

            if c in self._parse_date_cols:
                # GH#26203 Do not convert columns which get converted to dates
                # but replace nans to ensure to_datetime works
                mask = algorithms.isin(values, set(col_na_values) | col_na_fvalues)
                np.putmask(values, mask, np.nan)
                result[c] = values
                continue

            if conv_f is not None:
                # conv_f applied to data before inference
                if cast_type is not None:
                    warnings.warn(
                        (
                            "Both a converter and dtype were specified "
                            f"for column {c} - only the converter will be used."
                        ),
                        ParserWarning,
                        stacklevel=find_stack_level(),
                    )

                try:
                    values = lib.map_infer(values, conv_f)
                except ValueError:
                    # error: Argument 2 to "isin" has incompatible type "List[Any]";
                    # expected "Union[Union[ExtensionArray, ndarray], Index, Series]"
                    mask = algorithms.isin(
                        values, list(na_values)  # type: ignore[arg-type]
                    ).view(np.uint8)
                    values = lib.map_infer_mask(values, conv_f, mask)

                cvals, na_count = self._infer_types(
                    values,
                    set(col_na_values) | col_na_fvalues,
                    cast_type is None,
                    try_num_bool=False,
                )
            else:
                is_ea = is_extension_array_dtype(cast_type)
                is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type)
                # skip inference if specified dtype is object
                # or casting to an EA
                try_num_bool = not (cast_type and is_str_or_ea_dtype)

                # general type inference and conversion
                cvals, na_count = self._infer_types(
                    values,
                    set(col_na_values) | col_na_fvalues,
                    cast_type is None,
                    try_num_bool,
                )

                # type specified in dtype param or cast_type is an EA
                if cast_type and (not is_dtype_equal(cvals, cast_type) or is_ea):
                    if not is_ea and na_count > 0:
                        if is_bool_dtype(cast_type):
                            raise ValueError(f"Bool column has NA values in column {c}")
                    cast_type = pandas_dtype(cast_type)
                    cvals = self._cast_types(cvals, cast_type, c)

            result[c] = cvals
            if verbose and na_count:
                print(f"Filled {na_count} NA values in column {c!s}")
        return result

    @final
    def _set_noconvert_dtype_columns(
        self, col_indices: list[int], names: Sequence[Hashable]
    ) -> set[int]:
        """
        Set the columns that should not undergo dtype conversions.

        Currently, any column that is involved with date parsing will not
        undergo such conversions. If usecols is specified, the positions of
        the columns not to cast are relative to usecols, not to all columns.

        Parameters
        ----------
        col_indices: The indices specifying order and positions of the columns
        names: The column names, in an order corresponding with col_indices

        Returns
        -------
        A set of integers containing the positions of the columns not to convert.
        """
        usecols: list[int] | list[str] | None
        noconvert_columns = set()
        if self.usecols_dtype == "integer":
            # A set of integers will be converted to a list in
            # the correct order every single time.
            usecols = sorted(self.usecols)
        elif callable(self.usecols) or self.usecols_dtype not in ("empty", None):
            # The names attribute should have the correct columns
            # in the proper order for indexing with parse_dates.
            usecols = col_indices
        else:
            # Usecols is empty.
            usecols = None

        def _set(x) -> int:
            if usecols is not None and is_integer(x):
                x = usecols[x]

            if not is_integer(x):
                x = col_indices[names.index(x)]

            return x

        if isinstance(self.parse_dates, list):
            for val in self.parse_dates:
                if isinstance(val, list):
                    for k in val:
                        noconvert_columns.add(_set(k))
                else:
                    noconvert_columns.add(_set(val))

        elif isinstance(self.parse_dates, dict):
            for val in self.parse_dates.values():
                if isinstance(val, list):
                    for k in val:
                        noconvert_columns.add(_set(k))
                else:
                    noconvert_columns.add(_set(val))

        elif self.parse_dates:
            if isinstance(self.index_col, list):
                for k in self.index_col:
                    noconvert_columns.add(_set(k))
            elif self.index_col is not None:
                noconvert_columns.add(_set(self.index_col))

        return noconvert_columns
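
    # Illustrative sketch (editor's note): with hypothetical names
    # ["a", "b", "c"], col_indices=[0, 1, 2] and parse_dates=["b", 2],
    # _set_noconvert_dtype_columns returns {1, 2} - both the named and the
    # positional reference resolve to positions excluded from dtype
    # conversion.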

    def _infer_types(
        self, values, na_values, no_dtype_specified, try_num_bool: bool = True
    ) -> tuple[ArrayLike, int]:
        """
        Infer types of values, possibly casting

        Parameters
        ----------
        values : ndarray
        na_values : set
        no_dtype_specified: True when no dtype was explicitly given, so
            inference may cast the values freely
        try_num_bool : bool, default True
            try to cast values to numeric (first preference) or boolean

        Returns
        -------
        converted : ndarray or ExtensionArray
        na_count : int
        """
        na_count = 0
        if issubclass(values.dtype.type, (np.number, np.bool_)):
            # If our array has numeric dtype, we don't have to check for strings in isin
            na_values = np.array([val for val in na_values if not isinstance(val, str)])
            mask = algorithms.isin(values, na_values)
            na_count = mask.astype("uint8", copy=False).sum()
            if na_count > 0:
                if is_integer_dtype(values):
                    values = values.astype(np.float64)
                np.putmask(values, mask, np.nan)
            return values, na_count

        dtype_backend = self.dtype_backend
        non_default_dtype_backend = (
            no_dtype_specified and dtype_backend is not lib.no_default
        )
        result: ArrayLike

        if try_num_bool and is_object_dtype(values.dtype):
            # exclude e.g. DatetimeIndex here
            try:
                result, result_mask = lib.maybe_convert_numeric(
                    values,
                    na_values,
                    False,
                    convert_to_masked_nullable=non_default_dtype_backend,  # type: ignore[arg-type]  # noqa
                )
            except (ValueError, TypeError):
                # e.g. encountering datetime string gets ValueError
                # TypeError can be raised in floatify
                na_count = parsers.sanitize_objects(values, na_values)
                result = values
            else:
                if non_default_dtype_backend:
                    if result_mask is None:
                        result_mask = np.zeros(result.shape, dtype=np.bool_)

                    if result_mask.all():
                        result = IntegerArray(
                            np.ones(result_mask.shape, dtype=np.int64), result_mask
                        )
                    elif is_integer_dtype(result):
                        result = IntegerArray(result, result_mask)
                    elif is_bool_dtype(result):
                        result = BooleanArray(result, result_mask)
                    elif is_float_dtype(result):
                        result = FloatingArray(result, result_mask)

                    na_count = result_mask.sum()
                else:
                    na_count = isna(result).sum()
        else:
            result = values
            if values.dtype == np.object_:
                na_count = parsers.sanitize_objects(values, na_values)

        if result.dtype == np.object_ and try_num_bool:
            result, bool_mask = libops.maybe_convert_bool(
                np.asarray(values),
                true_values=self.true_values,
                false_values=self.false_values,
                convert_to_masked_nullable=non_default_dtype_backend,  # type: ignore[arg-type]  # noqa
            )
            if result.dtype == np.bool_ and non_default_dtype_backend:
                if bool_mask is None:
                    bool_mask = np.zeros(result.shape, dtype=np.bool_)
                result = BooleanArray(result, bool_mask)
        elif result.dtype == np.object_ and non_default_dtype_backend:
            # read_excel sends array of datetime objects
            inferred_type = lib.infer_dtype(result)
            if inferred_type != "datetime":
                result = StringDtype().construct_array_type()._from_sequence(values)

        if dtype_backend == "pyarrow":
            pa = import_optional_dependency("pyarrow")
            if isinstance(result, np.ndarray):
                result = ArrowExtensionArray(pa.array(result, from_pandas=True))
            else:
                # ExtensionArray
                result = ArrowExtensionArray(
                    pa.array(result.to_numpy(), from_pandas=True)
                )

        return result, na_count
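
    # Rough sketch of the masked path above (editor's note, illustrative
    # only): for a column like ["1", "2", ""] read with a non-default
    # dtype_backend, maybe_convert_numeric is asked for a masked result, so
    # the parsed integers come back with a boolean mask marking "" as
    # missing and get wrapped as IntegerArray(values, mask) instead of
    # falling back to float64 with NaN.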

    def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLike:
        """
        Cast values to specified type

        Parameters
        ----------
        values : ndarray or ExtensionArray
        cast_type : np.dtype or ExtensionDtype
            dtype to cast values to
        column : string
            column name - used only for error reporting

        Returns
        -------
        converted : ndarray or ExtensionArray
        """
        if isinstance(cast_type, CategoricalDtype):
            known_cats = cast_type.categories is not None

            if not is_object_dtype(values.dtype) and not known_cats:
                # TODO: this is for consistency with
                # c-parser which parses all categories
                # as strings
                values = lib.ensure_string_array(
                    values, skipna=False, convert_na_value=False
                )

            cats = Index(values).unique().dropna()
            values = Categorical._from_inferred_categories(
                cats, cats.get_indexer(values), cast_type, true_values=self.true_values
            )

        # use the EA's implementation of casting
        elif isinstance(cast_type, ExtensionDtype):
            array_type = cast_type.construct_array_type()
            try:
                if isinstance(cast_type, BooleanDtype):
                    # error: Unexpected keyword argument "true_values" for
                    # "_from_sequence_of_strings" of "ExtensionArray"
                    return array_type._from_sequence_of_strings(  # type: ignore[call-arg]  # noqa:E501
                        values,
                        dtype=cast_type,
                        true_values=self.true_values,
                        false_values=self.false_values,
                    )
                else:
                    return array_type._from_sequence_of_strings(values, dtype=cast_type)
            except NotImplementedError as err:
                raise NotImplementedError(
                    f"Extension Array: {array_type} must implement "
                    "_from_sequence_of_strings in order to be used in parser methods"
                ) from err

        elif isinstance(values, ExtensionArray):
            values = values.astype(cast_type, copy=False)
        elif issubclass(cast_type.type, str):
            # TODO: why skipna=True here and False above? some tests depend
            # on it here, but nothing fails if we change it above
            # (as no tests get there as of 2022-12-06)
            values = lib.ensure_string_array(
                values, skipna=True, convert_na_value=False
            )
        else:
            try:
                values = astype_array(values, cast_type, copy=True)
            except ValueError as err:
                raise ValueError(
                    f"Unable to convert column {column} to type {cast_type}"
                ) from err
        return values

    @overload
    def _do_date_conversions(
        self,
        names: Index,
        data: DataFrame,
    ) -> tuple[Sequence[Hashable] | Index, DataFrame]:
        ...

    @overload
    def _do_date_conversions(
        self,
        names: Sequence[Hashable],
        data: Mapping[Hashable, ArrayLike],
    ) -> tuple[Sequence[Hashable], Mapping[Hashable, ArrayLike]]:
        ...

    def _do_date_conversions(
        self,
        names: Sequence[Hashable] | Index,
        data: Mapping[Hashable, ArrayLike] | DataFrame,
    ) -> tuple[Sequence[Hashable] | Index, Mapping[Hashable, ArrayLike] | DataFrame]:
        # returns data, columns

        if self.parse_dates is not None:
            data, names = _process_date_conversion(
                data,
                self._date_conv,
                self.parse_dates,
                self.index_col,
                self.index_names,
                names,
                keep_date_col=self.keep_date_col,
                dtype_backend=self.dtype_backend,
            )

        return names, data

    def _check_data_length(
        self,
        columns: Sequence[Hashable],
        data: Sequence[ArrayLike],
    ) -> None:
        """Checks if length of data is equal to length of column names.

        One set of trailing commas is allowed. If self.index_col is not False,
        mismatched lengths already raise a ParserError earlier on.

        Parameters
        ----------
        columns: list of column names
        data: list of array-likes containing the data column-wise.
        """
        if not self.index_col and len(columns) != len(data) and columns:
            empty_str = is_object_dtype(data[-1]) and data[-1] == ""
            # error: No overload variant of "__ror__" of "ndarray" matches
            # argument type "ExtensionArray"
            empty_str_or_na = empty_str | isna(data[-1])  # type: ignore[operator]
            if len(columns) == len(data) - 1 and np.all(empty_str_or_na):
                return
            warnings.warn(
                "Length of header or names does not match length of data. This leads "
                "to a loss of data with index_col=False.",
                ParserWarning,
                stacklevel=find_stack_level(),
            )
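
    # Illustrative sketch (editor's note): a row "1,2," under the header
    # "a,b" yields three data columns, the last all empty strings/NaN. That
    # single trailing column is the tolerated trailing-comma case and passes
    # silently; any other mismatch triggers the warning above.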

    @overload
    def _evaluate_usecols(
        self,
        usecols: set[int] | Callable[[Hashable], object],
        names: Sequence[Hashable],
    ) -> set[int]:
        ...

    @overload
    def _evaluate_usecols(
        self, usecols: set[str], names: Sequence[Hashable]
    ) -> set[str]:
        ...

    def _evaluate_usecols(
        self,
        usecols: Callable[[Hashable], object] | set[str] | set[int],
        names: Sequence[Hashable],
    ) -> set[str] | set[int]:
        """
        Check whether or not the 'usecols' parameter
        is a callable. If so, enumerates the 'names'
        parameter and returns a set of indices for
        each entry in 'names' that evaluates to True.
        If not a callable, returns 'usecols'.
        """
        if callable(usecols):
            return {i for i, name in enumerate(names) if usecols(name)}
        return usecols
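
    # Illustrative sketch (editor's note): a callable usecols is evaluated
    # against each column name; with names ["a", "b", "ab"]:
    #
    #     usecols=lambda name: "a" in name -> {0, 2}
    #     usecols={"a", "ab"}              -> {"a", "ab"} (returned unchanged)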

    def _validate_usecols_names(self, usecols, names):
        """
        Validates that all usecols are present in a given
        list of names. If not, raise a ValueError that
        shows what usecols are missing.

        Parameters
        ----------
        usecols : iterable of usecols
            The columns to validate are present in names.
        names : iterable of names
            The column names to check against.

        Returns
        -------
        usecols : iterable of usecols
            The `usecols` parameter if the validation succeeds.

        Raises
        ------
        ValueError : Columns were missing. Error message will list them.
        """
        missing = [c for c in usecols if c not in names]
        if len(missing) > 0:
            raise ValueError(
                f"Usecols do not match columns, columns expected but not found: "
                f"{missing}"
            )

        return usecols

    def _validate_usecols_arg(self, usecols):
        """
        Validate the 'usecols' parameter.

        Checks whether or not the 'usecols' parameter contains all integers
        (column selection by index), strings (column by name) or is a callable.
        Raises a ValueError if that is not the case.

        Parameters
        ----------
        usecols : list-like, callable, or None
            List of columns to use when parsing or a callable that can be used
            to filter a list of table columns.

        Returns
        -------
        usecols_tuple : tuple
            A tuple of (verified_usecols, usecols_dtype).

            'verified_usecols' is either a set if an array-like is passed in or
            'usecols' if a callable or None is passed in.

            'usecols_dtype' is the inferred dtype of 'usecols' if an array-like
            is passed in or None if a callable or None is passed in.
        """
        msg = (
            "'usecols' must either be list-like of all strings, all unicode, "
            "all integers or a callable."
        )
        if usecols is not None:
            if callable(usecols):
                return usecols, None

            if not is_list_like(usecols):
                # see gh-20529
                #
                # Ensure it is iterable container but not string.
                raise ValueError(msg)

            usecols_dtype = lib.infer_dtype(usecols, skipna=False)

            if usecols_dtype not in ("empty", "integer", "string"):
                raise ValueError(msg)

            usecols = set(usecols)

            return usecols, usecols_dtype
        return usecols, None
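
    # Illustrative sketch (editor's note) of the (verified_usecols,
    # usecols_dtype) return value:
    #
    #     usecols=["a", "b"]         -> ({"a", "b"}, "string")
    #     usecols=[0, 2]             -> ({0, 2}, "integer")
    #     usecols=lambda c: c != "x" -> (<the callable>, None)
    #     usecols=["a", 1]           -> ValueError (mixed types)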

    def _clean_index_names(self, columns, index_col) -> tuple[list | None, list, list]:
        if not is_index_col(index_col):
            return None, columns, index_col

        columns = list(columns)

        # In case of no rows and multiindex columns we have to set index_names to
        # list of Nones GH#38292
        if not columns:
            return [None] * len(index_col), columns, index_col

        cp_cols = list(columns)
        index_names: list[str | int | None] = []

        # don't mutate
        index_col = list(index_col)

        for i, c in enumerate(index_col):
            if isinstance(c, str):
                index_names.append(c)
                for j, name in enumerate(cp_cols):
                    if name == c:
                        index_col[i] = j
                        columns.remove(name)
                        break
            else:
                name = cp_cols[c]
                columns.remove(name)
                index_names.append(name)

        # Only clean index names that were placeholders.
        for i, name in enumerate(index_names):
            if isinstance(name, str) and name in self.unnamed_cols:
                index_names[i] = None

        return index_names, columns, index_col
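
    # Illustrative sketch (editor's note): given columns ["a", "b", "c"] and
    # index_col=["b"], the method above returns (["b"], ["a", "c"], [1]):
    # the index name, the remaining columns, and the named reference
    # resolved to its integer position.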

    def _get_empty_meta(
        self, columns, index_col, index_names, dtype: DtypeArg | None = None
    ):
        columns = list(columns)

        # Convert `dtype` to a defaultdict of some kind.
        # This will enable us to write `dtype[col_name]`
        # without worrying about KeyError issues later on.
        dtype_dict: defaultdict[Hashable, Any]
        if not is_dict_like(dtype):
            # if dtype == None, default will be object.
            default_dtype = dtype or object
            dtype_dict = defaultdict(lambda: default_dtype)
        else:
            dtype = cast(dict, dtype)
            dtype_dict = defaultdict(
                lambda: object,
                {columns[k] if is_integer(k) else k: v for k, v in dtype.items()},
            )

        # Even though we have no data, the "index" of the empty DataFrame
        # could for example still be an empty MultiIndex. Thus, we need to
        # check whether we have any index columns specified, via either:
        #
        # 1) index_col (column indices)
        # 2) index_names (column names)
        #
        # Both must be non-null to ensure a successful construction. Otherwise,
        # we have to create a generic empty Index.
        index: Index
        if (index_col is None or index_col is False) or index_names is None:
            index = default_index(0)
        else:
            data = [Series([], dtype=dtype_dict[name]) for name in index_names]
            index = ensure_index_from_sequences(data, names=index_names)
            index_col.sort()

            for i, n in enumerate(index_col):
                columns.pop(n - i)

        col_dict = {
            col_name: Series([], dtype=dtype_dict[col_name]) for col_name in columns
        }

        return index, columns, col_dict


def _make_date_converter(
    date_parser=lib.no_default,
    dayfirst: bool = False,
    cache_dates: bool = True,
    date_format: dict[Hashable, str] | str | None = None,
):
    if date_parser is not lib.no_default:
        warnings.warn(
            "The argument 'date_parser' is deprecated and will "
            "be removed in a future version. "
            "Please use 'date_format' instead, or read your data in as 'object' dtype "
            "and then call 'to_datetime'.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    if date_parser is not lib.no_default and date_format is not None:
        raise TypeError("Cannot use both 'date_parser' and 'date_format'")

    def unpack_if_single_element(arg):
        # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615
        if isinstance(arg, np.ndarray) and arg.ndim == 1 and len(arg) == 1:
            return arg[0]
        return arg

    def converter(*date_cols, col: Hashable):
        if len(date_cols) == 1 and date_cols[0].dtype.kind in "Mm":
            return date_cols[0]

        if date_parser is lib.no_default:
            strs = parsing.concat_date_cols(date_cols)
            date_fmt = (
                date_format.get(col) if isinstance(date_format, dict) else date_format
            )

            result = tools.to_datetime(
                ensure_object(strs),
                format=date_fmt,
                utc=False,
                dayfirst=dayfirst,
                errors="ignore",
                cache=cache_dates,
            )
            if isinstance(result, DatetimeIndex):
                arr = result.to_numpy()
                arr.flags.writeable = True
                return arr
            return result._values
        else:
            try:
                result = tools.to_datetime(
                    date_parser(*(unpack_if_single_element(arg) for arg in date_cols)),
                    errors="ignore",
                    cache=cache_dates,
                )
                if isinstance(result, datetime.datetime):
                    raise Exception("scalar parser")
                return result
            except Exception:
                return tools.to_datetime(
                    parsing.try_parse_dates(
                        parsing.concat_date_cols(date_cols),
                        parser=date_parser,
                    ),
                    errors="ignore",
                )

    return converter
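
# Illustrative sketch (editor's note): because the converter looks up
# date_format.get(col) when a dict is passed, a per-column format can be
# applied, e.g. (hypothetical column names):
#
#     pd.read_csv(buf, parse_dates=["start", "end"],
#                 date_format={"start": "%Y-%m-%d", "end": "%d.%m.%Y"})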


parser_defaults = {
    "delimiter": None,
    "escapechar": None,
    "quotechar": '"',
    "quoting": csv.QUOTE_MINIMAL,
    "doublequote": True,
    "skipinitialspace": False,
    "lineterminator": None,
    "header": "infer",
    "index_col": None,
    "names": None,
    "skiprows": None,
    "skipfooter": 0,
    "nrows": None,
    "na_values": None,
    "keep_default_na": True,
    "true_values": None,
    "false_values": None,
    "converters": None,
    "dtype": None,
    "cache_dates": True,
    "thousands": None,
    "comment": None,
    "decimal": ".",
    # 'engine': 'c',
    "parse_dates": False,
    "keep_date_col": False,
    "dayfirst": False,
    "date_parser": lib.no_default,
    "date_format": None,
    "usecols": None,
    # 'iterator': False,
    "chunksize": None,
    "verbose": False,
    "encoding": None,
    "compression": None,
    "skip_blank_lines": True,
    "encoding_errors": "strict",
    "on_bad_lines": ParserBase.BadLineHandleMethod.ERROR,
    "dtype_backend": lib.no_default,
}


def _process_date_conversion(
    data_dict,
    converter: Callable,
    parse_spec,
    index_col,
    index_names,
    columns,
    keep_date_col: bool = False,
    dtype_backend=lib.no_default,
):
    def _isindex(colspec):
        return (isinstance(index_col, list) and colspec in index_col) or (
            isinstance(index_names, list) and colspec in index_names
        )

    new_cols = []
    new_data = {}

    orig_names = columns
    columns = list(columns)

    date_cols = set()

    if parse_spec is None or isinstance(parse_spec, bool):
        return data_dict, columns

    if isinstance(parse_spec, list):
        # list of column lists
        for colspec in parse_spec:
            if is_scalar(colspec) or isinstance(colspec, tuple):
                if isinstance(colspec, int) and colspec not in data_dict:
                    colspec = orig_names[colspec]
                if _isindex(colspec):
                    continue
                elif dtype_backend == "pyarrow":
                    import pyarrow as pa

                    dtype = data_dict[colspec].dtype
                    if isinstance(dtype, ArrowDtype) and (
                        pa.types.is_timestamp(dtype.pyarrow_dtype)
                        or pa.types.is_date(dtype.pyarrow_dtype)
                    ):
                        continue

                # Pyarrow engine returns Series which we need to convert to
                # numpy array before converter, it's a no-op for other parsers
                data_dict[colspec] = converter(
                    np.asarray(data_dict[colspec]), col=colspec
                )
            else:
                new_name, col, old_names = _try_convert_dates(
                    converter, colspec, data_dict, orig_names
                )
                if new_name in data_dict:
                    raise ValueError(f"New date column already in dict {new_name}")
                new_data[new_name] = col
                new_cols.append(new_name)
                date_cols.update(old_names)

    elif isinstance(parse_spec, dict):
        # dict of new name to column list
        for new_name, colspec in parse_spec.items():
            if new_name in data_dict:
                raise ValueError(f"Date column {new_name} already in dict")

            _, col, old_names = _try_convert_dates(
                converter,
                colspec,
                data_dict,
                orig_names,
                target_name=new_name,
            )

            new_data[new_name] = col

            # If original column can be converted to date we keep the converted values
            # This can only happen if values are from single column
            if len(colspec) == 1:
                new_data[colspec[0]] = col

            new_cols.append(new_name)
            date_cols.update(old_names)

    data_dict.update(new_data)
    new_cols.extend(columns)

    if not keep_date_col:
        for c in list(date_cols):
            data_dict.pop(c)
            new_cols.remove(c)

    return data_dict, new_cols


def _try_convert_dates(
    parser: Callable, colspec, data_dict, columns, target_name: str | None = None
):
    colset = set(columns)
    colnames = []

    for c in colspec:
        if c in colset:
            colnames.append(c)
        elif isinstance(c, int) and c not in columns:
            colnames.append(columns[c])
        else:
            colnames.append(c)

    new_name: tuple | str
    if all(isinstance(x, tuple) for x in colnames):
        new_name = tuple(map("_".join, zip(*colnames)))
    else:
        new_name = "_".join([str(x) for x in colnames])
    to_parse = [np.asarray(data_dict[c]) for c in colnames if c in data_dict]

    new_col = parser(*to_parse, col=new_name if target_name is None else target_name)
    return new_name, new_col, colnames
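
# Illustrative sketch (editor's note): parse_dates=[["date", "time"]] routes
# through _try_convert_dates, which joins the source names into a combined
# column "date_time", concatenates the two string columns row-wise, and runs
# the result through the date converter; the original "date" and "time"
# columns are then dropped unless keep_date_col=True.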


def _get_na_values(col, na_values, na_fvalues, keep_default_na):
    """
    Get the NaN values for a given column.

    Parameters
    ----------
    col : str
        The name of the column.
    na_values : array-like, dict
        The object listing the NaN values as strings.
    na_fvalues : array-like, dict
        The object listing the NaN values as floats.
    keep_default_na : bool
        If `na_values` is a dict, and the column is not mapped in the
        dictionary, whether to return the default NaN values or the empty set.

    Returns
    -------
    nan_tuple : A length-two tuple composed of

        1) na_values : the string NaN values for that column.
        2) na_fvalues : the float NaN values for that column.
    """
    if isinstance(na_values, dict):
        if col in na_values:
            return na_values[col], na_fvalues[col]
        else:
            if keep_default_na:
                return STR_NA_VALUES, set()

            return set(), set()
    else:
        return na_values, na_fvalues
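
# Illustrative sketch (editor's note): with a per-column mapping such as
# na_values={"score": ["n/a"]}, the "score" column gets only its mapped
# sentinels, while unmapped columns fall back to the default NA strings
# (STR_NA_VALUES) when keep_default_na=True, or to no sentinels at all when
# keep_default_na=False.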


def _validate_parse_dates_arg(parse_dates):
    """
    Check whether or not the 'parse_dates' parameter
    is a non-boolean scalar. Raises a TypeError if
    that is the case.
    """
    msg = (
        "Only booleans, lists, and dictionaries are accepted "
        "for the 'parse_dates' parameter"
    )

    if parse_dates is not None:
        if is_scalar(parse_dates):
            if not lib.is_bool(parse_dates):
                raise TypeError(msg)

        elif not isinstance(parse_dates, (list, dict)):
            raise TypeError(msg)

    return parse_dates


def is_index_col(col) -> bool:
    return col is not None and col is not False