1"""
2Module contains tools for processing files into DataFrames or other objects
3
4GH#48849 provides a convenient way of deprecating keyword arguments
5"""
6from __future__ import annotations
7
8from collections import (
9 abc,
10 defaultdict,
11)
12import csv
13import sys
14from textwrap import fill
15from typing import (
16 IO,
17 TYPE_CHECKING,
18 Any,
19 Callable,
20 Literal,
21 NamedTuple,
22 TypedDict,
23 overload,
24)
25import warnings
26
27import numpy as np
28
29from pandas._config import using_copy_on_write
30
31from pandas._libs import lib
32from pandas._libs.parsers import STR_NA_VALUES
33from pandas.errors import (
34 AbstractMethodError,
35 ParserWarning,
36)
37from pandas.util._decorators import Appender
38from pandas.util._exceptions import find_stack_level
39from pandas.util._validators import check_dtype_backend
40
41from pandas.core.dtypes.common import (
42 is_file_like,
43 is_float,
44 is_hashable,
45 is_integer,
46 is_list_like,
47 pandas_dtype,
48)
49
50from pandas import Series
51from pandas.core.frame import DataFrame
52from pandas.core.indexes.api import RangeIndex
53from pandas.core.shared_docs import _shared_docs
54
55from pandas.io.common import (
56 IOHandles,
57 get_handle,
58 stringify_path,
59 validate_header_arg,
60)
61from pandas.io.parsers.arrow_parser_wrapper import ArrowParserWrapper
62from pandas.io.parsers.base_parser import (
63 ParserBase,
64 is_index_col,
65 parser_defaults,
66)
67from pandas.io.parsers.c_parser_wrapper import CParserWrapper
68from pandas.io.parsers.python_parser import (
69 FixedWidthFieldParser,
70 PythonParser,
71)
72
73if TYPE_CHECKING:
74 from collections.abc import (
75 Hashable,
76 Iterable,
77 Mapping,
78 Sequence,
79 )
80 from types import TracebackType
81
82 from pandas._typing import (
83 CompressionOptions,
84 CSVEngine,
85 DtypeArg,
86 DtypeBackend,
87 FilePath,
88 IndexLabel,
89 ReadCsvBuffer,
90 Self,
91 StorageOptions,
92 UsecolsArgType,
93 )
94_doc_read_csv_and_table = (
95 r"""
96{summary}
97
Also supports optionally iterating over the file or breaking it
into chunks.

Additional help can be found in the online docs for
`IO Tools <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.

Parameters
----------
filepath_or_buffer : str, path object or file-like object
    Any valid string path is acceptable. The string could be a URL. Valid
    URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is
    expected. A local file could be: file://localhost/path/to/table.csv.

    If you want to pass in a path object, pandas accepts any ``os.PathLike``.

    By file-like object, we refer to objects with a ``read()`` method, such as
    a file handle (e.g. via builtin ``open`` function) or ``StringIO``.
sep : str, default {_default_sep}
    Character or regex pattern to treat as the delimiter. If ``sep=None``, the
    C engine cannot automatically detect the separator, but the Python
    parsing engine can, meaning the latter will be used, automatically
    detecting the separator from only the first valid row of the file with
    Python's builtin sniffer tool, ``csv.Sniffer``.
    In addition, separators longer than 1 character and different from
    ``'\s+'`` will be interpreted as regular expressions and will also force
    the use of the Python parsing engine. Note that regex delimiters are prone
    to ignoring quoted data. Regex example: ``'\r\t'``.
delimiter : str, optional
    Alias for ``sep``.
header : int, Sequence of int, 'infer' or None, default 'infer'
    Row number(s) containing column labels and marking the start of the
    data (zero-indexed). Default behavior is to infer the column names: if no
    ``names`` are passed the behavior is identical to ``header=0`` and column
    names are inferred from the first line of the file, if column
    names are passed explicitly to ``names`` then the behavior is identical to
    ``header=None``. Explicitly pass ``header=0`` to be able to
    replace existing names. The header can be a list of integers that
    specify row locations for a :class:`~pandas.MultiIndex` on the columns
    e.g. ``[0, 1, 3]``. Intervening rows that are not specified will be
    skipped (e.g. 2 in this example is skipped). Note that this
    parameter ignores commented lines and empty lines if
    ``skip_blank_lines=True``, so ``header=0`` denotes the first line of
    data rather than the first line of the file.
names : Sequence of Hashable, optional
    Sequence of column labels to apply. If the file contains a header row,
    then you should explicitly pass ``header=0`` to override the column names.
    Duplicates in this list are not allowed.
index_col : Hashable, Sequence of Hashable or False, optional
    Column(s) to use as row label(s), denoted either by column labels or column
    indices. If a sequence of labels or indices is given, :class:`~pandas.MultiIndex`
    will be formed for the row labels.

    Note: ``index_col=False`` can be used to force pandas to *not* use the first
    column as the index, e.g., when you have a malformed file with delimiters at
    the end of each line.
usecols : Sequence of Hashable or Callable, optional
    Subset of columns to select, denoted either by column labels or column indices.
    If list-like, all elements must either
    be positional (i.e. integer indices into the document columns) or strings
    that correspond to column names provided either by the user in ``names`` or
    inferred from the document header row(s). If ``names`` are given, the document
    header row(s) are not taken into account. For example, a valid list-like
    ``usecols`` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``.
    Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``.
    To instantiate a :class:`~pandas.DataFrame` from ``data`` with element order
    preserved use ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]``
    for columns in ``['foo', 'bar']`` order or
    ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]``
    for ``['bar', 'foo']`` order.

    If callable, the callable function will be evaluated against the column
    names, returning names where the callable function evaluates to ``True``. An
    example of a valid callable argument would be ``lambda x: x.upper() in
    ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster
    parsing time and lower memory usage.
dtype : dtype or dict of {{Hashable : dtype}}, optional
    Data type(s) to apply to either the whole dataset or individual columns.
    E.g., ``{{'a': np.float64, 'b': np.int32, 'c': 'Int64'}}``
    Use ``str`` or ``object`` together with suitable ``na_values`` settings
    to preserve and not interpret ``dtype``.
    If ``converters`` are specified, they will be applied INSTEAD
    of ``dtype`` conversion.

    .. versionadded:: 1.5.0

        Support for ``defaultdict`` was added. Specify a ``defaultdict`` as input
        where the default determines the ``dtype`` of the columns which are not
        explicitly listed.
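
        E.g. ``defaultdict(lambda: 'float64', a='int64')`` reads column ``a``
        as ``int64`` and every column not listed as ``float64``.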
engine : {{'c', 'python', 'pyarrow'}}, optional
    Parser engine to use. The C and pyarrow engines are faster, while the python engine
    is currently more feature-complete. Multithreading is currently only supported by
    the pyarrow engine.

    .. versionadded:: 1.4.0

        The 'pyarrow' engine was added as an *experimental* engine, and some features
        are unsupported, or may not work correctly, with this engine.
converters : dict of {{Hashable : Callable}}, optional
    Functions for converting values in specified columns. Keys can either
    be column labels or column indices.
true_values : list, optional
    Values to consider as ``True`` in addition to case-insensitive variants of 'True'.
false_values : list, optional
    Values to consider as ``False`` in addition to case-insensitive variants of 'False'.
skipinitialspace : bool, default False
    Skip spaces after delimiter.
skiprows : int, list of int or Callable, optional
    Line numbers to skip (0-indexed) or number of lines to skip (``int``)
    at the start of the file.

    If callable, the callable function will be evaluated against the row
    indices, returning ``True`` if the row should be skipped and ``False`` otherwise.
    An example of a valid callable argument would be ``lambda x: x in [0, 2]``.
skipfooter : int, default 0
    Number of lines at bottom of file to skip (Unsupported with ``engine='c'``).
nrows : int, optional
    Number of rows of file to read. Useful for reading pieces of large files.
na_values : Hashable, Iterable of Hashable or dict of {{Hashable : Iterable}}, optional
    Additional strings to recognize as ``NA``/``NaN``. If ``dict`` passed, specific
    per-column ``NA`` values. By default the following values are interpreted as
    ``NaN``: " """
    + fill('", "'.join(sorted(STR_NA_VALUES)), 70, subsequent_indent="    ")
    + """ ".

keep_default_na : bool, default True
    Whether or not to include the default ``NaN`` values when parsing the data.
    Depending on whether ``na_values`` is passed in, the behavior is as follows:

    * If ``keep_default_na`` is ``True``, and ``na_values`` are specified, ``na_values``
      is appended to the default ``NaN`` values used for parsing.
    * If ``keep_default_na`` is ``True``, and ``na_values`` are not specified, only
      the default ``NaN`` values are used for parsing.
    * If ``keep_default_na`` is ``False``, and ``na_values`` are specified, only
      the ``NaN`` values specified in ``na_values`` are used for parsing.
    * If ``keep_default_na`` is ``False``, and ``na_values`` are not specified, no
      strings will be parsed as ``NaN``.

    Note that if ``na_filter`` is passed in as ``False``, the ``keep_default_na`` and
    ``na_values`` parameters will be ignored.
na_filter : bool, default True
    Detect missing value markers (empty strings and the value of ``na_values``). In
    data without any ``NA`` values, passing ``na_filter=False`` can improve the
    performance of reading a large file.
verbose : bool, default False
    Indicate number of ``NA`` values placed in non-numeric columns.

    .. deprecated:: 2.2.0
skip_blank_lines : bool, default True
    If ``True``, skip over blank lines rather than interpreting as ``NaN`` values.
parse_dates : bool, list of Hashable, list of lists or dict of {{Hashable : list}}, \
default False
    The behavior is as follows:

    * ``bool``. If ``True`` -> try parsing the index. Note: Automatically set to
      ``True`` if ``date_format`` or ``date_parser`` arguments have been passed.
    * ``list`` of ``int`` or names. e.g. If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3
      each as a separate date column.
    * ``list`` of ``list``. e.g. If ``[[1, 3]]`` -> combine columns 1 and 3 and parse
      as a single date column. Values are joined with a space before parsing.
    * ``dict``, e.g. ``{{'foo' : [1, 3]}}`` -> parse columns 1, 3 as date and call
      result 'foo'. Values are joined with a space before parsing.

    If a column or index cannot be represented as an array of ``datetime``,
    say because of an unparsable value or a mixture of timezones, the column
    or index will be returned unaltered as an ``object`` data type. For
    non-standard ``datetime`` parsing, use :func:`~pandas.to_datetime` after
    :func:`~pandas.read_csv`.

    Note: A fast-path exists for iso8601-formatted dates.
infer_datetime_format : bool, default False
    If ``True`` and ``parse_dates`` is enabled, pandas will attempt to infer the
    format of the ``datetime`` strings in the columns, and if it can be inferred,
    switch to a faster method of parsing them. In some cases this can increase
    the parsing speed by 5-10x.

    .. deprecated:: 2.0.0
        A strict version of this argument is now the default, passing it has no effect.

keep_date_col : bool, default False
    If ``True`` and ``parse_dates`` specifies combining multiple columns then
    keep the original columns.
date_parser : Callable, optional
    Function to use for converting a sequence of string columns to an array of
    ``datetime`` instances. The default uses ``dateutil.parser.parser`` to do the
    conversion. pandas will try to call ``date_parser`` in three different ways,
    advancing to the next if an exception occurs: 1) Pass one or more arrays
    (as defined by ``parse_dates``) as arguments; 2) concatenate (row-wise) the
    string values from the columns defined by ``parse_dates`` into a single array
    and pass that; and 3) call ``date_parser`` once for each row using one or
    more strings (corresponding to the columns defined by ``parse_dates``) as
    arguments.

    .. deprecated:: 2.0.0
        Use ``date_format`` instead, or read in as ``object`` and then apply
        :func:`~pandas.to_datetime` as-needed.
date_format : str or dict of column -> format, optional
    Format to use for parsing dates when used in conjunction with ``parse_dates``.
    The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. See
    `strftime documentation
    <https://docs.python.org/3/library/datetime.html
    #strftime-and-strptime-behavior>`_ for more information on choices, though
    note that :const:`"%f"` will parse all the way up to nanoseconds.
    You can also pass:

    - "ISO8601", to parse any `ISO8601 <https://en.wikipedia.org/wiki/ISO_8601>`_
      time string (not necessarily in exactly the same format);
    - "mixed", to infer the format for each element individually. This is risky,
      and you should probably use it along with `dayfirst`.

    .. versionadded:: 2.0.0
dayfirst : bool, default False
    DD/MM format dates, international and European format.
cache_dates : bool, default True
    If ``True``, use a cache of unique, converted dates to apply the ``datetime``
    conversion. May produce significant speed-up when parsing duplicate
    date strings, especially ones with timezone offsets.

iterator : bool, default False
    Return ``TextFileReader`` object for iteration or getting chunks with
    ``get_chunk()``.
chunksize : int, optional
    Number of lines to read from the file per chunk. Passing a value will cause the
    function to return a ``TextFileReader`` object for iteration.
    See the `IO Tools docs
    <https://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking>`_
    for more information on ``iterator`` and ``chunksize``.
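
    Sketch of chunked iteration (``process`` is a hypothetical per-chunk
    handler)::

        with pd.{func_name}('data.csv', chunksize=1000) as reader:
            for chunk in reader:
                process(chunk)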

{decompression_options}

    .. versionchanged:: 1.4.0 Zstandard support.

thousands : str (length 1), optional
    Character acting as the thousands separator in numerical values.
decimal : str (length 1), default '.'
    Character to recognize as decimal point (e.g., use ',' for European data).
lineterminator : str (length 1), optional
    Character used to denote a line break. Only valid with C parser.
quotechar : str (length 1), optional
    Character used to denote the start and end of a quoted item. Quoted
    items can include the ``delimiter`` and it will be ignored.
quoting : {{0 or csv.QUOTE_MINIMAL, 1 or csv.QUOTE_ALL, 2 or csv.QUOTE_NONNUMERIC, \
3 or csv.QUOTE_NONE}}, default csv.QUOTE_MINIMAL
    Control field quoting behavior per ``csv.QUOTE_*`` constants. Default is
    ``csv.QUOTE_MINIMAL`` (i.e., 0) which implies that only fields containing special
    characters are quoted (e.g., characters defined in ``quotechar``, ``delimiter``,
    or ``lineterminator``).
doublequote : bool, default True
    When ``quotechar`` is specified and ``quoting`` is not ``QUOTE_NONE``, indicate
    whether or not to interpret two consecutive ``quotechar`` elements INSIDE a
    field as a single ``quotechar`` element.
escapechar : str (length 1), optional
    Character used to escape other characters.
comment : str (length 1), optional
    Character indicating that the remainder of line should not be parsed.
    If found at the beginning
    of a line, the line will be ignored altogether. This parameter must be a
    single character. Like empty lines (as long as ``skip_blank_lines=True``),
    fully commented lines are ignored by the parameter ``header`` but not by
    ``skiprows``. For example, if ``comment='#'``, parsing
    ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in ``'a,b,c'`` being
    treated as the header.
encoding : str, optional, default 'utf-8'
    Encoding to use when reading/writing (e.g. ``'utf-8'``). `List of Python
    standard encodings
    <https://docs.python.org/3/library/codecs.html#standard-encodings>`_ .

encoding_errors : str, optional, default 'strict'
    How encoding errors are treated. `List of possible values
    <https://docs.python.org/3/library/codecs.html#error-handlers>`_ .

    .. versionadded:: 1.3.0

dialect : str or csv.Dialect, optional
    If provided, this parameter will override values (default or not) for the
    following parameters: ``delimiter``, ``doublequote``, ``escapechar``,
    ``skipinitialspace``, ``quotechar``, and ``quoting``. If it is necessary to
    override values, a ``ParserWarning`` will be issued. See ``csv.Dialect``
    documentation for more details.
on_bad_lines : {{'error', 'warn', 'skip'}} or Callable, default 'error'
    Specifies what to do upon encountering a bad line (a line with too many fields).
    Allowed values are:

    - ``'error'``, raise an Exception when a bad line is encountered.
    - ``'warn'``, raise a warning when a bad line is encountered and skip that line.
    - ``'skip'``, skip bad lines without raising or warning when they are encountered.

    .. versionadded:: 1.3.0

    .. versionadded:: 1.4.0

    - Callable, function with signature
      ``(bad_line: list[str]) -> list[str] | None`` that will process a single
      bad line. ``bad_line`` is a list of strings split by the ``sep``.
      If the function returns ``None``, the bad line will be ignored.
      If the function returns a new ``list`` of strings with more elements than
      expected, a ``ParserWarning`` will be emitted while dropping extra elements.
      Only supported when ``engine='python'``.
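
      A minimal sketch of such a handler (truncating to the first three
      fields is an arbitrary choice)::

          pd.{func_name}('data.csv', engine='python',
                         on_bad_lines=lambda bad: bad[:3])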

    .. versionchanged:: 2.2.0

    - Callable, function with signature
      as described in `pyarrow documentation
      <https://arrow.apache.org/docs/python/generated/pyarrow.csv.ParseOptions.html
      #pyarrow.csv.ParseOptions.invalid_row_handler>`_ when ``engine='pyarrow'``

delim_whitespace : bool, default False
    Specifies whether or not whitespace (e.g. ``' '`` or ``'\\t'``) will be
    used as the ``sep`` delimiter. Equivalent to setting ``sep='\\s+'``. If this option
    is set to ``True``, nothing should be passed in for the ``delimiter``
    parameter.

    .. deprecated:: 2.2.0
        Use ``sep="\\s+"`` instead.
low_memory : bool, default True
    Internally process the file in chunks, resulting in lower memory use
    while parsing, but possibly mixed type inference. To ensure no mixed
    types either set ``False``, or specify the type with the ``dtype`` parameter.
    Note that the entire file is read into a single :class:`~pandas.DataFrame`
    regardless, use the ``chunksize`` or ``iterator`` parameter to return the data in
    chunks. (Only valid with C parser).
memory_map : bool, default False
    If a filepath is provided for ``filepath_or_buffer``, map the file object
    directly onto memory and access the data directly from there. Using this
    option can improve performance because there is no longer any I/O overhead.
float_precision : {{'high', 'legacy', 'round_trip'}}, optional
    Specifies which converter the C engine should use for floating-point
    values. The options are ``None`` or ``'high'`` for the ordinary converter,
    ``'legacy'`` for the original lower precision pandas converter, and
    ``'round_trip'`` for the round-trip converter.

{storage_options}

dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable'
    Back-end data type applied to the resultant :class:`DataFrame`
    (still experimental). Behaviour is as follows:

    * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
      (default).
    * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
      DataFrame.

    .. versionadded:: 2.0

Returns
-------
DataFrame or TextFileReader
    A comma-separated values (csv) file is returned as a two-dimensional
    data structure with labeled axes.

See Also
--------
DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
{see_also_func_name} : {see_also_func_summary}
read_fwf : Read a table of fixed-width formatted lines into DataFrame.

Examples
--------
>>> pd.{func_name}('data.csv')  # doctest: +SKIP
"""
)


class _C_Parser_Defaults(TypedDict):
    delim_whitespace: Literal[False]
    na_filter: Literal[True]
    low_memory: Literal[True]
    memory_map: Literal[False]
    float_precision: None


_c_parser_defaults: _C_Parser_Defaults = {
    "delim_whitespace": False,
    "na_filter": True,
    "low_memory": True,
    "memory_map": False,
    "float_precision": None,
}


class _Fwf_Defaults(TypedDict):
    colspecs: Literal["infer"]
    infer_nrows: Literal[100]
    widths: None


_fwf_defaults: _Fwf_Defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None}
_c_unsupported = {"skipfooter"}
_python_unsupported = {"low_memory", "float_precision"}
_pyarrow_unsupported = {
    "skipfooter",
    "float_precision",
    "chunksize",
    "comment",
    "nrows",
    "thousands",
    "memory_map",
    "dialect",
    "delim_whitespace",
    "quoting",
    "lineterminator",
    "converters",
    "iterator",
    "dayfirst",
    "verbose",
    "skipinitialspace",
    "low_memory",
}


class _DeprecationConfig(NamedTuple):
    default_value: Any
    msg: str | None


@overload
def validate_integer(name: str, val: None, min_val: int = ...) -> None:
    ...


@overload
def validate_integer(name: str, val: float, min_val: int = ...) -> int:
    ...


@overload
def validate_integer(name: str, val: int | None, min_val: int = ...) -> int | None:
    ...


def validate_integer(
    name: str, val: int | float | None, min_val: int = 0
) -> int | None:
    """
    Checks whether the 'name' parameter for parsing is either
    an integer OR float that can SAFELY be cast to an integer
    without losing accuracy. Raises a ValueError if that is
    not the case.

    Parameters
    ----------
    name : str
        Parameter name (used for error reporting)
    val : int or float
        The value to check
    min_val : int
        Minimum allowed value (val < min_val will result in a ValueError)
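
    Examples
    --------
    Illustrative calls (values chosen arbitrarily):

    >>> validate_integer("nrows", 5.0)
    5
    >>> validate_integer("nrows", None) is None
    True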
544 """
545 if val is None:
546 return val
547
548 msg = f"'{name:s}' must be an integer >={min_val:d}"
549 if is_float(val):
550 if int(val) != val:
551 raise ValueError(msg)
552 val = int(val)
553 elif not (is_integer(val) and val >= min_val):
554 raise ValueError(msg)
555
556 return int(val)
557
558
559def _validate_names(names: Sequence[Hashable] | None) -> None:
560 """
561 Raise ValueError if the `names` parameter contains duplicates or has an
562 invalid data type.
563
564 Parameters
565 ----------
566 names : array-like or None
567 An array containing a list of the names used for the output DataFrame.
568
569 Raises
570 ------
571 ValueError
572 If names are not unique or are not ordered (e.g. set).
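
    Examples
    --------
    Illustrative checks (hypothetical column names):

    >>> _validate_names(["a", "b"])  # unique and ordered: no error
    >>> _validate_names(None)  # None is allowed and skips validation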
573 """
574 if names is not None:
575 if len(names) != len(set(names)):
576 raise ValueError("Duplicate names are not allowed.")
577 if not (
578 is_list_like(names, allow_sets=False) or isinstance(names, abc.KeysView)
579 ):
580 raise ValueError("Names should be an ordered collection.")
581
582
583def _read(
584 filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], kwds
585) -> DataFrame | TextFileReader:
586 """Generic reader of line files."""
587 # if we pass a date_parser and parse_dates=False, we should not parse the
588 # dates GH#44366
589 if kwds.get("parse_dates", None) is None:
590 if (
591 kwds.get("date_parser", lib.no_default) is lib.no_default
592 and kwds.get("date_format", None) is None
593 ):
594 kwds["parse_dates"] = False
595 else:
596 kwds["parse_dates"] = True
597
598 # Extract some of the arguments (pass chunksize on).
599 iterator = kwds.get("iterator", False)
600 chunksize = kwds.get("chunksize", None)
601 if kwds.get("engine") == "pyarrow":
602 if iterator:
603 raise ValueError(
604 "The 'iterator' option is not supported with the 'pyarrow' engine"
605 )
606
607 if chunksize is not None:
608 raise ValueError(
609 "The 'chunksize' option is not supported with the 'pyarrow' engine"
610 )
611 else:
612 chunksize = validate_integer("chunksize", chunksize, 1)
613
614 nrows = kwds.get("nrows", None)
615
616 # Check for duplicates in names.
617 _validate_names(kwds.get("names", None))
618
619 # Create the parser.
620 parser = TextFileReader(filepath_or_buffer, **kwds)
621
622 if chunksize or iterator:
623 return parser
624
625 with parser:
626 return parser.read(nrows)
627
628
629# iterator=True -> TextFileReader
630@overload
631def read_csv(
632 filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
633 *,
634 sep: str | None | lib.NoDefault = ...,
635 delimiter: str | None | lib.NoDefault = ...,
636 header: int | Sequence[int] | None | Literal["infer"] = ...,
637 names: Sequence[Hashable] | None | lib.NoDefault = ...,
638 index_col: IndexLabel | Literal[False] | None = ...,
639 usecols: UsecolsArgType = ...,
640 dtype: DtypeArg | None = ...,
641 engine: CSVEngine | None = ...,
642 converters: Mapping[Hashable, Callable] | None = ...,
643 true_values: list | None = ...,
644 false_values: list | None = ...,
645 skipinitialspace: bool = ...,
646 skiprows: list[int] | int | Callable[[Hashable], bool] | None = ...,
647 skipfooter: int = ...,
648 nrows: int | None = ...,
    na_values: Hashable
    | Iterable[Hashable]
    | Mapping[Hashable, Iterable[Hashable]]
    | None = ...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool | lib.NoDefault = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] | None = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool | lib.NoDefault = ...,
    date_parser: Callable | lib.NoDefault = ...,
    date_format: str | dict[Hashable, str] | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: Literal[True],
    chunksize: int | None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool | lib.NoDefault = ...,
    low_memory: bool = ...,
    memory_map: bool = ...,
    float_precision: Literal["high", "legacy"] | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> TextFileReader:
    ...


# chunksize=int -> TextFileReader
@overload
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols: UsecolsArgType = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters: Mapping[Hashable, Callable] | None = ...,
    true_values: list | None = ...,
    false_values: list | None = ...,
    skipinitialspace: bool = ...,
    skiprows: list[int] | int | Callable[[Hashable], bool] | None = ...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values: Hashable
    | Iterable[Hashable]
    | Mapping[Hashable, Iterable[Hashable]]
    | None = ...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool | lib.NoDefault = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] | None = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool | lib.NoDefault = ...,
    date_parser: Callable | lib.NoDefault = ...,
    date_format: str | dict[Hashable, str] | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: bool = ...,
    chunksize: int,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool | lib.NoDefault = ...,
    low_memory: bool = ...,
    memory_map: bool = ...,
    float_precision: Literal["high", "legacy"] | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> TextFileReader:
    ...


# default case -> DataFrame
@overload
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols: UsecolsArgType = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters: Mapping[Hashable, Callable] | None = ...,
    true_values: list | None = ...,
    false_values: list | None = ...,
    skipinitialspace: bool = ...,
    skiprows: list[int] | int | Callable[[Hashable], bool] | None = ...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values: Hashable
    | Iterable[Hashable]
    | Mapping[Hashable, Iterable[Hashable]]
    | None = ...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool | lib.NoDefault = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] | None = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool | lib.NoDefault = ...,
    date_parser: Callable | lib.NoDefault = ...,
    date_format: str | dict[Hashable, str] | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: Literal[False] = ...,
    chunksize: None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool | lib.NoDefault = ...,
    low_memory: bool = ...,
    memory_map: bool = ...,
    float_precision: Literal["high", "legacy"] | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> DataFrame:
    ...


# Unions -> DataFrame | TextFileReader
@overload
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols: UsecolsArgType = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters: Mapping[Hashable, Callable] | None = ...,
    true_values: list | None = ...,
    false_values: list | None = ...,
    skipinitialspace: bool = ...,
    skiprows: list[int] | int | Callable[[Hashable], bool] | None = ...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values: Hashable
    | Iterable[Hashable]
    | Mapping[Hashable, Iterable[Hashable]]
    | None = ...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool | lib.NoDefault = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] | None = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool | lib.NoDefault = ...,
    date_parser: Callable | lib.NoDefault = ...,
    date_format: str | dict[Hashable, str] | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: bool = ...,
    chunksize: int | None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool | lib.NoDefault = ...,
    low_memory: bool = ...,
    memory_map: bool = ...,
    float_precision: Literal["high", "legacy"] | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> DataFrame | TextFileReader:
    ...


@Appender(
    _doc_read_csv_and_table.format(
        func_name="read_csv",
        summary="Read a comma-separated values (csv) file into DataFrame.",
        see_also_func_name="read_table",
        see_also_func_summary="Read general delimited file into DataFrame.",
        _default_sep="','",
        storage_options=_shared_docs["storage_options"],
        decompression_options=_shared_docs["decompression_options"]
        % "filepath_or_buffer",
    )
)
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = lib.no_default,
    delimiter: str | None | lib.NoDefault = None,
    # Column and Index Locations and Names
    header: int | Sequence[int] | None | Literal["infer"] = "infer",
    names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default,
    index_col: IndexLabel | Literal[False] | None = None,
    usecols: UsecolsArgType = None,
    # General Parsing Configuration
    dtype: DtypeArg | None = None,
    engine: CSVEngine | None = None,
    converters: Mapping[Hashable, Callable] | None = None,
    true_values: list | None = None,
    false_values: list | None = None,
    skipinitialspace: bool = False,
    skiprows: list[int] | int | Callable[[Hashable], bool] | None = None,
    skipfooter: int = 0,
    nrows: int | None = None,
    # NA and Missing Data Handling
    na_values: Hashable
    | Iterable[Hashable]
    | Mapping[Hashable, Iterable[Hashable]]
    | None = None,
    keep_default_na: bool = True,
    na_filter: bool = True,
    verbose: bool | lib.NoDefault = lib.no_default,
    skip_blank_lines: bool = True,
    # Datetime Handling
    parse_dates: bool | Sequence[Hashable] | None = None,
    infer_datetime_format: bool | lib.NoDefault = lib.no_default,
    keep_date_col: bool | lib.NoDefault = lib.no_default,
    date_parser: Callable | lib.NoDefault = lib.no_default,
    date_format: str | dict[Hashable, str] | None = None,
    dayfirst: bool = False,
    cache_dates: bool = True,
    # Iteration
    iterator: bool = False,
    chunksize: int | None = None,
    # Quoting, Compression, and File Format
    compression: CompressionOptions = "infer",
    thousands: str | None = None,
    decimal: str = ".",
    lineterminator: str | None = None,
    quotechar: str = '"',
    quoting: int = csv.QUOTE_MINIMAL,
    doublequote: bool = True,
    escapechar: str | None = None,
    comment: str | None = None,
    encoding: str | None = None,
    encoding_errors: str | None = "strict",
    dialect: str | csv.Dialect | None = None,
    # Error Handling
    on_bad_lines: str = "error",
    # Internal
    delim_whitespace: bool | lib.NoDefault = lib.no_default,
    low_memory: bool = _c_parser_defaults["low_memory"],
    memory_map: bool = False,
    float_precision: Literal["high", "legacy"] | None = None,
    storage_options: StorageOptions | None = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> DataFrame | TextFileReader:
    if keep_date_col is not lib.no_default:
        # GH#55569
        warnings.warn(
            "The 'keep_date_col' keyword in pd.read_csv is deprecated and "
            "will be removed in a future version. Explicitly remove unwanted "
            "columns after parsing instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    else:
        keep_date_col = False

    if lib.is_list_like(parse_dates):
        # GH#55569
        depr = False
        # error: Item "bool" of "bool | Sequence[Hashable] | None" has no
        # attribute "__iter__" (not iterable)
        if not all(is_hashable(x) for x in parse_dates):  # type: ignore[union-attr]
            depr = True
        elif isinstance(parse_dates, dict) and any(
            lib.is_list_like(x) for x in parse_dates.values()
        ):
            depr = True
        if depr:
            warnings.warn(
                "Support for nested sequences for 'parse_dates' in pd.read_csv "
                "is deprecated. Combine the desired columns with pd.to_datetime "
                "after parsing instead.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )

    if infer_datetime_format is not lib.no_default:
        warnings.warn(
            "The argument 'infer_datetime_format' is deprecated and will "
            "be removed in a future version. "
            "A strict version of it is now the default, see "
            "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. "
            "You can safely remove this argument.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

    if delim_whitespace is not lib.no_default:
        # GH#55569
        warnings.warn(
            "The 'delim_whitespace' keyword in pd.read_csv is deprecated and "
            "will be removed in a future version. Use ``sep='\\s+'`` instead",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    else:
        delim_whitespace = False

    if verbose is not lib.no_default:
        # GH#55569
        warnings.warn(
            "The 'verbose' keyword in pd.read_csv is deprecated and "
            "will be removed in a future version.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    else:
        verbose = False

    # locals() should never be modified
    kwds = locals().copy()
    del kwds["filepath_or_buffer"]
    del kwds["sep"]

    kwds_defaults = _refine_defaults_read(
        dialect,
        delimiter,
        delim_whitespace,
        engine,
        sep,
        on_bad_lines,
        names,
        defaults={"delimiter": ","},
        dtype_backend=dtype_backend,
    )
    kwds.update(kwds_defaults)

    return _read(filepath_or_buffer, kwds)


# iterator=True -> TextFileReader
@overload
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols: UsecolsArgType = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters: Mapping[Hashable, Callable] | None = ...,
    true_values: list | None = ...,
    false_values: list | None = ...,
    skipinitialspace: bool = ...,
    skiprows: list[int] | int | Callable[[Hashable], bool] | None = ...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool | lib.NoDefault = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool | lib.NoDefault = ...,
    date_parser: Callable | lib.NoDefault = ...,
    date_format: str | dict[Hashable, str] | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: Literal[True],
    chunksize: int | None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory: bool = ...,
    memory_map: bool = ...,
    float_precision: str | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> TextFileReader:
    ...


# chunksize=int -> TextFileReader
@overload
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols: UsecolsArgType = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters: Mapping[Hashable, Callable] | None = ...,
    true_values: list | None = ...,
    false_values: list | None = ...,
    skipinitialspace: bool = ...,
    skiprows: list[int] | int | Callable[[Hashable], bool] | None = ...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool | lib.NoDefault = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool | lib.NoDefault = ...,
    date_parser: Callable | lib.NoDefault = ...,
    date_format: str | dict[Hashable, str] | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: bool = ...,
    chunksize: int,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory: bool = ...,
    memory_map: bool = ...,
    float_precision: str | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> TextFileReader:
    ...


# default -> DataFrame
@overload
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols: UsecolsArgType = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters: Mapping[Hashable, Callable] | None = ...,
    true_values: list | None = ...,
    false_values: list | None = ...,
    skipinitialspace: bool = ...,
    skiprows: list[int] | int | Callable[[Hashable], bool] | None = ...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool | lib.NoDefault = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool | lib.NoDefault = ...,
    date_parser: Callable | lib.NoDefault = ...,
    date_format: str | dict[Hashable, str] | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: Literal[False] = ...,
    chunksize: None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory: bool = ...,
    memory_map: bool = ...,
    float_precision: str | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> DataFrame:
    ...


# Unions -> DataFrame | TextFileReader
@overload
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols: UsecolsArgType = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters: Mapping[Hashable, Callable] | None = ...,
    true_values: list | None = ...,
    false_values: list | None = ...,
    skipinitialspace: bool = ...,
    skiprows: list[int] | int | Callable[[Hashable], bool] | None = ...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool | lib.NoDefault = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool | lib.NoDefault = ...,
    date_parser: Callable | lib.NoDefault = ...,
    date_format: str | dict[Hashable, str] | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: bool = ...,
    chunksize: int | None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory: bool = ...,
    memory_map: bool = ...,
    float_precision: str | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> DataFrame | TextFileReader:
    ...


@Appender(
    _doc_read_csv_and_table.format(
        func_name="read_table",
        summary="Read general delimited file into DataFrame.",
        see_also_func_name="read_csv",
        see_also_func_summary=(
            "Read a comma-separated values (csv) file into DataFrame."
        ),
        _default_sep=r"'\\t' (tab-stop)",
        storage_options=_shared_docs["storage_options"],
        decompression_options=_shared_docs["decompression_options"]
        % "filepath_or_buffer",
    )
)
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = lib.no_default,
    delimiter: str | None | lib.NoDefault = None,
    # Column and Index Locations and Names
    header: int | Sequence[int] | None | Literal["infer"] = "infer",
    names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default,
    index_col: IndexLabel | Literal[False] | None = None,
    usecols: UsecolsArgType = None,
    # General Parsing Configuration
    dtype: DtypeArg | None = None,
    engine: CSVEngine | None = None,
    converters: Mapping[Hashable, Callable] | None = None,
    true_values: list | None = None,
    false_values: list | None = None,
    skipinitialspace: bool = False,
    skiprows: list[int] | int | Callable[[Hashable], bool] | None = None,
    skipfooter: int = 0,
    nrows: int | None = None,
    # NA and Missing Data Handling
    na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = None,
    keep_default_na: bool = True,
    na_filter: bool = True,
    verbose: bool | lib.NoDefault = lib.no_default,
    skip_blank_lines: bool = True,
    # Datetime Handling
    parse_dates: bool | Sequence[Hashable] = False,
    infer_datetime_format: bool | lib.NoDefault = lib.no_default,
    keep_date_col: bool | lib.NoDefault = lib.no_default,
    date_parser: Callable | lib.NoDefault = lib.no_default,
    date_format: str | dict[Hashable, str] | None = None,
    dayfirst: bool = False,
    cache_dates: bool = True,
    # Iteration
    iterator: bool = False,
    chunksize: int | None = None,
    # Quoting, Compression, and File Format
    compression: CompressionOptions = "infer",
    thousands: str | None = None,
    decimal: str = ".",
    lineterminator: str | None = None,
    quotechar: str = '"',
    quoting: int = csv.QUOTE_MINIMAL,
    doublequote: bool = True,
    escapechar: str | None = None,
    comment: str | None = None,
    encoding: str | None = None,
    encoding_errors: str | None = "strict",
    dialect: str | csv.Dialect | None = None,
    # Error Handling
    on_bad_lines: str = "error",
    # Internal
    delim_whitespace: bool | lib.NoDefault = lib.no_default,
    low_memory: bool = _c_parser_defaults["low_memory"],
    memory_map: bool = False,
    float_precision: str | None = None,
    storage_options: StorageOptions | None = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> DataFrame | TextFileReader:
    if keep_date_col is not lib.no_default:
        # GH#55569
        warnings.warn(
            "The 'keep_date_col' keyword in pd.read_table is deprecated and "
            "will be removed in a future version. Explicitly remove unwanted "
            "columns after parsing instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    else:
        keep_date_col = False

    # error: Item "bool" of "bool | Sequence[Hashable]" has no attribute "__iter__"
    if lib.is_list_like(parse_dates) and not all(is_hashable(x) for x in parse_dates):  # type: ignore[union-attr]
        # GH#55569
        warnings.warn(
            "Support for nested sequences for 'parse_dates' in pd.read_table "
            "is deprecated. Combine the desired columns with pd.to_datetime "
            "after parsing instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

    if infer_datetime_format is not lib.no_default:
        warnings.warn(
            "The argument 'infer_datetime_format' is deprecated and will "
            "be removed in a future version. "
            "A strict version of it is now the default, see "
            "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. "
            "You can safely remove this argument.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

    if delim_whitespace is not lib.no_default:
        # GH#55569
        warnings.warn(
            "The 'delim_whitespace' keyword in pd.read_table is deprecated and "
            "will be removed in a future version. Use ``sep='\\s+'`` instead",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    else:
        delim_whitespace = False

    if verbose is not lib.no_default:
        # GH#55569
        warnings.warn(
            "The 'verbose' keyword in pd.read_table is deprecated and "
            "will be removed in a future version.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    else:
        verbose = False

    # locals() should never be modified
    kwds = locals().copy()
    del kwds["filepath_or_buffer"]
    del kwds["sep"]

    kwds_defaults = _refine_defaults_read(
        dialect,
        delimiter,
        delim_whitespace,
        engine,
        sep,
        on_bad_lines,
        names,
        defaults={"delimiter": "\t"},
        dtype_backend=dtype_backend,
    )
    kwds.update(kwds_defaults)

    return _read(filepath_or_buffer, kwds)


@overload
def read_fwf(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    colspecs: Sequence[tuple[int, int]] | str | None = ...,
    widths: Sequence[int] | None = ...,
    infer_nrows: int = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
    iterator: Literal[True],
    chunksize: int | None = ...,
    **kwds,
) -> TextFileReader:
    ...


@overload
def read_fwf(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    colspecs: Sequence[tuple[int, int]] | str | None = ...,
    widths: Sequence[int] | None = ...,
    infer_nrows: int = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
    iterator: bool = ...,
    chunksize: int,
    **kwds,
) -> TextFileReader:
    ...


@overload
def read_fwf(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    colspecs: Sequence[tuple[int, int]] | str | None = ...,
    widths: Sequence[int] | None = ...,
    infer_nrows: int = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
    iterator: Literal[False] = ...,
    chunksize: None = ...,
    **kwds,
) -> DataFrame:
    ...


def read_fwf(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    colspecs: Sequence[tuple[int, int]] | str | None = "infer",
    widths: Sequence[int] | None = None,
    infer_nrows: int = 100,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
    iterator: bool = False,
    chunksize: int | None = None,
    **kwds,
) -> DataFrame | TextFileReader:
    r"""
    Read a table of fixed-width formatted lines into DataFrame.

    Also supports optionally iterating over the file or breaking it
    into chunks.

    Additional help can be found in the `online docs for IO Tools
    <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.

    Parameters
    ----------
    filepath_or_buffer : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a text ``read()`` function. The string could be a URL.
        Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.csv``.
    colspecs : list of tuple (int, int) or 'infer', optional
        A list of tuples giving the extents of the fixed-width
        fields of each line as half-open intervals (i.e., ``[from, to)``).
1484 String value 'infer' can be used to instruct the parser to try
1485 detecting the column specifications from the first 100 rows of
1486 the data which are not being skipped via skiprows (default='infer').
    widths : list of int, optional
        A list of field widths which can be used instead of 'colspecs' if
        the intervals are contiguous.
    infer_nrows : int, default 100
        The number of rows to consider when letting the parser determine the
        `colspecs`.
    dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable'
        Back-end data type applied to the resultant :class:`DataFrame`
        (still experimental). Behaviour is as follows:

        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
          (default).
        * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
          DataFrame.

        .. versionadded:: 2.0

    **kwds : optional
        Optional keyword arguments can be passed to ``TextFileReader``.
    Returns
    -------
    DataFrame or TextFileReader
        A file of fixed-width formatted lines is returned as a two-dimensional
        data structure with labeled axes.
    See Also
    --------
    DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
    read_csv : Read a comma-separated values (csv) file into DataFrame.

    Examples
    --------
    >>> pd.read_fwf('data.csv')  # doctest: +SKIP
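
    A further illustrative call, assuming a hypothetical fixed-width file
    ``data.txt`` with two fields of widths 3 and 5:

    >>> pd.read_fwf('data.txt', widths=[3, 5], names=['a', 'b'])  # doctest: +SKIP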
    """
    # Check input arguments.
    if colspecs is None and widths is None:
        raise ValueError("Must specify either colspecs or widths")
    if colspecs not in (None, "infer") and widths is not None:
        raise ValueError("You must specify only one of 'widths' and 'colspecs'")

    # Compute 'colspecs' from 'widths', if specified.
    if widths is not None:
        colspecs, col = [], 0
        for w in widths:
            colspecs.append((col, col + w))
            col += w

    # for mypy
    assert colspecs is not None

    # GH#40830
    # Ensure length of `colspecs` matches length of `names`
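    # Illustrative (hypothetical) case: colspecs=[(0, 3), (3, 8)] passes with
    # names=["a"] plus index_col=0, because the unnamed index column counts
    # toward the colspecs length but needs no entry in `names`.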
    names = kwds.get("names")
    if names is not None:
        if len(names) != len(colspecs) and colspecs != "infer":
            # need to check len(index_col) as it might contain
            # unnamed indices, in which case its name is not required
            len_index = 0
            if kwds.get("index_col") is not None:
                index_col: Any = kwds.get("index_col")
                if index_col is not False:
                    if not is_list_like(index_col):
                        len_index = 1
                    else:
                        len_index = len(index_col)
            if kwds.get("usecols") is None and len(names) + len_index != len(colspecs):
                # If usecols is used, colspecs may be longer than names
                raise ValueError("Length of colspecs must match length of names")

    kwds["colspecs"] = colspecs
    kwds["infer_nrows"] = infer_nrows
    kwds["engine"] = "python-fwf"
    kwds["iterator"] = iterator
    kwds["chunksize"] = chunksize

    check_dtype_backend(dtype_backend)
    kwds["dtype_backend"] = dtype_backend
    return _read(filepath_or_buffer, kwds)


class TextFileReader(abc.Iterator):
    """
    Iterate over a parsed file chunk by chunk, returning DataFrames.

    A passed dialect overrides any of the related parser options.
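
    Examples
    --------
    A minimal sketch of chunked iteration, assuming a hypothetical file
    ``data.csv`` (hence the skipped doctest):

    >>> with pd.read_csv('data.csv', chunksize=1000) as reader:  # doctest: +SKIP
    ...     for chunk in reader:
    ...         print(chunk.shape)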
    """

    def __init__(
        self,
        f: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str] | list,
        engine: CSVEngine | None = None,
        **kwds,
    ) -> None:
        if engine is not None:
            engine_specified = True
        else:
            engine = "python"
            engine_specified = False
        self.engine = engine
        self._engine_specified = kwds.get("engine_specified", engine_specified)

        _validate_skipfooter(kwds)

        dialect = _extract_dialect(kwds)
        if dialect is not None:
            if engine == "pyarrow":
                raise ValueError(
                    "The 'dialect' option is not supported with the 'pyarrow' engine"
                )
            kwds = _merge_with_dialect_properties(dialect, kwds)

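        # Resolve header="infer": explicit names imply the file has no header
        # row (header=None); otherwise row 0 supplies the column labels.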
        if kwds.get("header", "infer") == "infer":
            kwds["header"] = 0 if kwds.get("names") is None else None

        self.orig_options = kwds

        # miscellanea
        self._currow = 0

        options = self._get_options_with_defaults(engine)
        options["storage_options"] = kwds.get("storage_options", None)

        self.chunksize = options.pop("chunksize", None)
        self.nrows = options.pop("nrows", None)

        self._check_file_or_buffer(f, engine)
        self.options, self.engine = self._clean_options(options, engine)

        if "has_index_names" in kwds:
            self.options["has_index_names"] = kwds["has_index_names"]

        self.handles: IOHandles | None = None
        self._engine = self._make_engine(f, self.engine)

    def close(self) -> None:
        if self.handles is not None:
            self.handles.close()
        self._engine.close()

    def _get_options_with_defaults(self, engine: CSVEngine) -> dict[str, Any]:
        kwds = self.orig_options

        options = {}
        default: object | None

        for argname, default in parser_defaults.items():
            value = kwds.get(argname, default)

            # see gh-12935
            if (
                engine == "pyarrow"
                and argname in _pyarrow_unsupported
                and value != default
                and value != getattr(value, "value", default)
            ):
                raise ValueError(
                    f"The {repr(argname)} option is not supported with the "
                    f"'pyarrow' engine"
                )
            options[argname] = value

        for argname, default in _c_parser_defaults.items():
            if argname in kwds:
                value = kwds[argname]

                if engine != "c" and value != default:
                    # TODO: Refactor this logic, it's pretty convoluted
                    if "python" in engine and argname not in _python_unsupported:
                        pass
                    elif "pyarrow" in engine and argname not in _pyarrow_unsupported:
                        pass
                    else:
                        raise ValueError(
                            f"The {repr(argname)} option is not supported with the "
                            f"{repr(engine)} engine"
                        )
            else:
                value = default
            options[argname] = value

        if engine == "python-fwf":
            for argname, default in _fwf_defaults.items():
                options[argname] = kwds.get(argname, default)

        return options

    def _check_file_or_buffer(self, f, engine: CSVEngine) -> None:
        # see gh-16530
        if is_file_like(f) and engine != "c" and not hasattr(f, "__iter__"):
            # The C engine doesn't need the file-like to have the "__iter__"
            # attribute. However, the Python engine needs "__iter__(...)"
            # when iterating through such an object, meaning it
            # needs to have that attribute
            raise ValueError(
                "The 'python' engine cannot iterate through this file buffer."
            )

    def _clean_options(
        self, options: dict[str, Any], engine: CSVEngine
    ) -> tuple[dict[str, Any], CSVEngine]:
        result = options.copy()

        fallback_reason = None

        # C engine not supported yet
        if engine == "c":
            if options["skipfooter"] > 0:
                fallback_reason = "the 'c' engine does not support skipfooter"
                engine = "python"

        sep = options["delimiter"]
        delim_whitespace = options["delim_whitespace"]

        if sep is None and not delim_whitespace:
            if engine in ("c", "pyarrow"):
                fallback_reason = (
                    f"the '{engine}' engine does not support "
                    "sep=None with delim_whitespace=False"
                )
                engine = "python"
        elif sep is not None and len(sep) > 1:
            if engine == "c" and sep == r"\s+":
                result["delim_whitespace"] = True
                del result["delimiter"]
            elif engine not in ("python", "python-fwf"):
                # wait until regex engine integrated
                fallback_reason = (
                    f"the '{engine}' engine does not support "
                    "regex separators (separators > 1 char and "
                    r"different from '\s+' are interpreted as regex)"
                )
                engine = "python"
        elif delim_whitespace:
            if "python" in engine:
                result["delimiter"] = r"\s+"
        elif sep is not None:
            encodeable = True
            encoding = sys.getfilesystemencoding() or "utf-8"
            try:
                if len(sep.encode(encoding)) > 1:
                    encodeable = False
            except UnicodeDecodeError:
                encodeable = False
            if not encodeable and engine not in ("python", "python-fwf"):
                fallback_reason = (
                    f"the separator encoded in {encoding} "
                    f"is > 1 char long, and the '{engine}' engine "
                    "does not support such separators"
                )
                engine = "python"

        quotechar = options["quotechar"]
        if quotechar is not None and isinstance(quotechar, (str, bytes)):
            if (
                len(quotechar) == 1
                and ord(quotechar) > 127
                and engine not in ("python", "python-fwf")
            ):
                fallback_reason = (
                    "ord(quotechar) > 127, meaning the "
                    "quotechar is larger than one byte, "
                    f"and the '{engine}' engine does not support such quotechars"
                )
                engine = "python"

        if fallback_reason and self._engine_specified:
            raise ValueError(fallback_reason)

        if engine == "c":
            for arg in _c_unsupported:
                del result[arg]

        if "python" in engine:
            for arg in _python_unsupported:
                if fallback_reason and result[arg] != _c_parser_defaults.get(arg):
                    raise ValueError(
                        "Falling back to the 'python' engine because "
                        f"{fallback_reason}, but this causes {repr(arg)} to be "
                        "ignored as it is not supported by the 'python' engine."
                    )
                del result[arg]

        if fallback_reason:
            warnings.warn(
                (
                    "Falling back to the 'python' engine because "
                    f"{fallback_reason}; you can avoid this warning by specifying "
                    "engine='python'."
                ),
                ParserWarning,
                stacklevel=find_stack_level(),
            )

        index_col = options["index_col"]
        names = options["names"]
        converters = options["converters"]
        na_values = options["na_values"]
        skiprows = options["skiprows"]

        validate_header_arg(options["header"])

        if index_col is True:
            raise ValueError("The value of index_col cannot be True")
        if is_index_col(index_col):
            if not isinstance(index_col, (list, tuple, np.ndarray)):
                index_col = [index_col]
        result["index_col"] = index_col

        names = list(names) if names is not None else names

        # type conversion-related
        if converters is not None:
            if not isinstance(converters, dict):
                raise TypeError(
                    "Type converters must be a dict or subclass, "
                    f"input was a {type(converters).__name__}"
                )
        else:
            converters = {}

        # Converting values to NA
        keep_default_na = options["keep_default_na"]
        floatify = engine != "pyarrow"
        na_values, na_fvalues = _clean_na_values(
            na_values, keep_default_na, floatify=floatify
        )

        # handle skiprows; this is internally handled by the
        # c-engine, so we only need it for the python and pyarrow parsers
        if engine == "pyarrow":
            if not is_integer(skiprows) and skiprows is not None:
                # pyarrow expects skiprows to be passed as an integer
                raise ValueError(
                    "skiprows argument must be an integer when using "
                    "engine='pyarrow'"
                )
        else:
            if is_integer(skiprows):
                skiprows = list(range(skiprows))
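                # e.g. skiprows=3 -> skip rows 0, 1 and 2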
            if skiprows is None:
                skiprows = set()
            elif not callable(skiprows):
                skiprows = set(skiprows)

        # put stuff back
        result["names"] = names
        result["converters"] = converters
        result["na_values"] = na_values
        result["na_fvalues"] = na_fvalues
        result["skiprows"] = skiprows

        return result, engine

    def __next__(self) -> DataFrame:
        try:
            return self.get_chunk()
        except StopIteration:
            self.close()
            raise

    def _make_engine(
        self,
        f: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str] | list | IO,
        engine: CSVEngine = "c",
    ) -> ParserBase:
        mapping: dict[str, type[ParserBase]] = {
            "c": CParserWrapper,
            "python": PythonParser,
            "pyarrow": ArrowParserWrapper,
            "python-fwf": FixedWidthFieldParser,
        }
        if engine not in mapping:
            raise ValueError(
                f"Unknown engine: {engine} (valid options are {mapping.keys()})"
            )
        if not isinstance(f, list):
            # open file here
            is_text = True
            mode = "r"
            if engine == "pyarrow":
                is_text = False
                mode = "rb"
            elif (
                engine == "c"
                and self.options.get("encoding", "utf-8") == "utf-8"
                and isinstance(stringify_path(f), str)
            ):
                # The C engine can decode utf-8 bytes itself; adding a
                # TextIOWrapper makes the C engine (especially with
                # memory_map=True) far slower
                is_text = False
                if "b" not in mode:
                    mode += "b"
            self.handles = get_handle(
                f,
                mode,
                encoding=self.options.get("encoding", None),
                compression=self.options.get("compression", None),
                memory_map=self.options.get("memory_map", False),
                is_text=is_text,
                errors=self.options.get("encoding_errors", "strict"),
                storage_options=self.options.get("storage_options", None),
            )
            assert self.handles is not None
            f = self.handles.handle

        elif engine != "python":
            msg = f"Invalid file path or buffer object type: {type(f)}"
            raise ValueError(msg)

        try:
            return mapping[engine](f, **self.options)
        except Exception:
            if self.handles is not None:
                self.handles.close()
            raise

    def _failover_to_python(self) -> None:
        raise AbstractMethodError(self)

    def read(self, nrows: int | None = None) -> DataFrame:
        if self.engine == "pyarrow":
            try:
                # error: "ParserBase" has no attribute "read"
                df = self._engine.read()  # type: ignore[attr-defined]
            except Exception:
                self.close()
                raise
        else:
            nrows = validate_integer("nrows", nrows)
            try:
                # error: "ParserBase" has no attribute "read"
                (
                    index,
                    columns,
                    col_dict,
                ) = self._engine.read(  # type: ignore[attr-defined]
                    nrows
                )
            except Exception:
                self.close()
                raise

            if index is None:
                if col_dict:
                    # Any column is actually fine:
                    new_rows = len(next(iter(col_dict.values())))
                    index = RangeIndex(self._currow, self._currow + new_rows)
                else:
                    new_rows = 0
            else:
                new_rows = len(index)

            if hasattr(self, "orig_options"):
                dtype_arg = self.orig_options.get("dtype", None)
            else:
                dtype_arg = None

            if isinstance(dtype_arg, dict):
                dtype = defaultdict(lambda: None)  # type: ignore[var-annotated]
                dtype.update(dtype_arg)
            elif dtype_arg is not None and pandas_dtype(dtype_arg) in (
                np.str_,
                np.object_,
            ):
                dtype = defaultdict(lambda: dtype_arg)
            else:
                dtype = None

            if dtype is not None:
                new_col_dict = {}
                for k, v in col_dict.items():
                    d = (
                        dtype[k]
                        if pandas_dtype(dtype[k]) in (np.str_, np.object_)
                        else None
                    )
                    new_col_dict[k] = Series(v, index=index, dtype=d, copy=False)
            else:
                new_col_dict = col_dict

            df = DataFrame(
                new_col_dict,
                columns=columns,
                index=index,
                copy=not using_copy_on_write(),
            )

            self._currow += new_rows
        return df

    def get_chunk(self, size: int | None = None) -> DataFrame:
        if size is None:
            size = self.chunksize
        if self.nrows is not None:
            if self._currow >= self.nrows:
                raise StopIteration
            size = min(size, self.nrows - self._currow)
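            # e.g. nrows=10 with chunksize=4 yields chunks of 4, 4 and 2 rows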
        return self.read(nrows=size)

    def __enter__(self) -> Self:
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        traceback: TracebackType | None,
    ) -> None:
        self.close()


def TextParser(*args, **kwds) -> TextFileReader:
    """
    Converts lists of lists/tuples into DataFrames with proper type inference
    and optional (e.g. string to datetime) conversion. Also enables iterating
    lazily over chunks of large files.

    Parameters
    ----------
    data : file-like object or list
    delimiter : separator character to use
    dialect : str or csv.Dialect instance, optional
        Ignored if delimiter is longer than 1 character
    names : sequence, optional
        List of column names to use.
    header : int, default 0
        Row to use to parse column labels. Defaults to the first row. Prior
        rows will be discarded.
    index_col : int or list, optional
        Column or columns to use as the (possibly hierarchical) index
    has_index_names : bool, default False
        True if the cols defined in index_col have an index name and are
        not in the header.
    na_values : scalar, str, list-like, or dict, optional
        Additional strings to recognize as NA/NaN.
    keep_default_na : bool, default True
    thousands : str, optional
        Thousands separator
    comment : str, optional
        Comment out remainder of line
    parse_dates : bool, default False
    keep_date_col : bool, default False
    date_parser : function, optional

        .. deprecated:: 2.0.0
    date_format : str or dict of column -> format, default ``None``

        .. versionadded:: 2.0.0
    skiprows : list of integers
        Row numbers to skip
    skipfooter : int
        Number of lines at bottom of file to skip
    converters : dict, optional
        Dict of functions for converting values in certain columns. Keys can
        either be integers or column labels, values are functions that take one
        input argument, the cell (not column) content, and return the
        transformed content.
    encoding : str, optional
        Encoding to use for UTF when reading/writing (ex. 'utf-8')
    float_precision : str, optional
        Specifies which converter the C engine should use for floating-point
        values. The options are `None` or `high` for the ordinary converter,
        `legacy` for the original lower precision pandas converter, and
        `round_trip` for the round-trip converter.
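
    Examples
    --------
    A minimal sketch with hypothetical in-memory rows (hence the skipped
    doctest); the first row supplies the column labels via the default
    ``header=0``:

    >>> data = [['a', 'b'], ['1', '2'], ['3', '4']]
    >>> with TextParser(data) as parser:  # doctest: +SKIP
    ...     df = parser.read()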
    """
    kwds["engine"] = "python"
    return TextFileReader(*args, **kwds)


def _clean_na_values(na_values, keep_default_na: bool = True, floatify: bool = True):
    na_fvalues: set | dict
    if na_values is None:
        if keep_default_na:
            na_values = STR_NA_VALUES
        else:
            na_values = set()
        na_fvalues = set()
    elif isinstance(na_values, dict):
        old_na_values = na_values.copy()
        na_values = {}  # Prevent aliasing.

        # Convert the values in the na_values dictionary
        # into array-likes for further use. This is also
        # where we append the default NaN values, provided
        # that `keep_default_na=True`.
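        # e.g. a hypothetical {"col": "missing"} becomes
        # {"col": {"missing"} | STR_NA_VALUES} when keep_default_na=True.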
        for k, v in old_na_values.items():
            if not is_list_like(v):
                v = [v]

            if keep_default_na:
                v = set(v) | STR_NA_VALUES

            na_values[k] = v
        na_fvalues = {k: _floatify_na_values(v) for k, v in na_values.items()}
    else:
        if not is_list_like(na_values):
            na_values = [na_values]
        na_values = _stringify_na_values(na_values, floatify)
        if keep_default_na:
            na_values = na_values | STR_NA_VALUES

        na_fvalues = _floatify_na_values(na_values)

    return na_values, na_fvalues


def _floatify_na_values(na_values):
    # create float versions of the na_values
    result = set()
    for v in na_values:
        try:
            v = float(v)
            if not np.isnan(v):
                result.add(v)
        except (TypeError, ValueError, OverflowError):
            pass
    return result


def _stringify_na_values(na_values, floatify: bool):
    """Return stringified and numeric variants of the given NA values."""
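    # e.g. 999 -> {"999", "999.0", 999} when floatify=True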
    result: list[str | float] = []
    for x in na_values:
        result.append(str(x))
        result.append(x)
        try:
            v = float(x)

            # for a value like 999, also register "999.0" and "999"
            if v == int(v):
                v = int(v)
                result.append(f"{v}.0")
                result.append(str(v))

            if floatify:
                result.append(v)
        except (TypeError, ValueError, OverflowError):
            pass
        if floatify:
            try:
                result.append(int(x))
            except (TypeError, ValueError, OverflowError):
                pass
    return set(result)


def _refine_defaults_read(
    dialect: str | csv.Dialect | None,
    delimiter: str | None | lib.NoDefault,
    delim_whitespace: bool,
    engine: CSVEngine | None,
    sep: str | None | lib.NoDefault,
    on_bad_lines: str | Callable,
    names: Sequence[Hashable] | None | lib.NoDefault,
    defaults: dict[str, Any],
    dtype_backend: DtypeBackend | lib.NoDefault,
):
    """Validate/refine default values of input parameters of read_csv, read_table.

    Parameters
    ----------
    dialect : str or csv.Dialect
        If provided, this parameter will override values (default or not) for the
        following parameters: `delimiter`, `doublequote`, `escapechar`,
        `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
        override values, a ParserWarning will be issued. See csv.Dialect
        documentation for more details.
    delimiter : str or object
        Alias for sep.
    delim_whitespace : bool
        Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be
        used as the sep. Equivalent to setting ``sep='\\s+'``. If this option
        is set to True, nothing should be passed in for the ``delimiter``
        parameter.

        .. deprecated:: 2.2.0
            Use ``sep="\\s+"`` instead.
    engine : {'c', 'python'}
        Parser engine to use. The C engine is faster while the python engine is
        currently more feature-complete.
    sep : str or object
        A delimiter provided by the user (str) or a sentinel value, i.e.
        pandas._libs.lib.no_default.
    on_bad_lines : str, callable
        An option for handling bad lines or a sentinel value (None).
    names : array-like, optional
        List of column names to use. If the file contains a header row,
        then you should explicitly pass ``header=0`` to override the column names.
        Duplicates in this list are not allowed.
    defaults : dict
        Default values of input parameters.

    Returns
    -------
    kwds : dict
        Input parameters with correct values.

    Raises
    ------
    ValueError :
        If a delimiter was specified with ``sep`` (or ``delimiter``) and
        ``delim_whitespace=True``.
    """
    # fix types for sep, delimiter to Union(str, Any)
    delim_default = defaults["delimiter"]
    kwds: dict[str, Any] = {}
    # gh-23761
    #
    # When a dialect is passed, it overrides any of the overlapping
    # parameters passed in directly. We don't want to warn if the
    # default parameters were passed in (since it probably means
    # that the user didn't pass them in explicitly in the first place).
    #
    # "delimiter" is the annoying corner case because we alias it to
    # "sep" before doing comparison to the dialect values later on.
    # Thus, we need a flag to indicate that we need to "override"
    # the comparison to dialect values by checking if default values
    # for BOTH "delimiter" and "sep" were provided.
    if dialect is not None:
        kwds["sep_override"] = delimiter is None and (
            sep is lib.no_default or sep == delim_default
        )

    if delimiter and (sep is not lib.no_default):
        raise ValueError("Specified a sep and a delimiter; you can only specify one.")

    kwds["names"] = None if names is lib.no_default else names

    # Alias sep -> delimiter.
    if delimiter is None:
        delimiter = sep

    if delim_whitespace and (delimiter is not lib.no_default):
        raise ValueError(
            "Specified a delimiter with both sep and "
            "delim_whitespace=True; you can only specify one."
        )

    if delimiter == "\n":
        raise ValueError(
            r"Specified \n as separator or delimiter. This forces the python engine "
            "which does not accept a line terminator. Hence it is not allowed to use "
            "the line terminator as separator.",
        )

    if delimiter is lib.no_default:
        # assign default separator value
        kwds["delimiter"] = delim_default
    else:
        kwds["delimiter"] = delimiter

    if engine is not None:
        kwds["engine_specified"] = True
    else:
        kwds["engine"] = "c"
        kwds["engine_specified"] = False
    if on_bad_lines == "error":
        kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.ERROR
    elif on_bad_lines == "warn":
        kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.WARN
    elif on_bad_lines == "skip":
        kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.SKIP
    elif callable(on_bad_lines):
        if engine not in ["python", "pyarrow"]:
            raise ValueError(
                "on_bad_lines can only be a callable function "
                "if engine='python' or 'pyarrow'"
            )
        kwds["on_bad_lines"] = on_bad_lines
    else:
        raise ValueError(f"Argument {on_bad_lines} is invalid for on_bad_lines")

    check_dtype_backend(dtype_backend)

    kwds["dtype_backend"] = dtype_backend

    return kwds


def _extract_dialect(kwds: dict[str, Any]) -> csv.Dialect | None:
    """
    Extract concrete csv dialect instance.

    Returns
    -------
    csv.Dialect or None
    """
    if kwds.get("dialect") is None:
        return None

    dialect = kwds["dialect"]
    if dialect in csv.list_dialects():
        dialect = csv.get_dialect(dialect)

    _validate_dialect(dialect)

    return dialect


MANDATORY_DIALECT_ATTRS = (
    "delimiter",
    "doublequote",
    "escapechar",
    "skipinitialspace",
    "quotechar",
    "quoting",
)


def _validate_dialect(dialect: csv.Dialect) -> None:
    """
    Validate csv dialect instance.

    Raises
    ------
    ValueError
        If incorrect dialect is provided.
    """
    for param in MANDATORY_DIALECT_ATTRS:
        if not hasattr(dialect, param):
            raise ValueError(f"Invalid dialect {dialect} provided")


def _merge_with_dialect_properties(
    dialect: csv.Dialect,
    defaults: dict[str, Any],
) -> dict[str, Any]:
    """
    Merge default kwargs in TextFileReader with dialect parameters.

    Parameters
    ----------
    dialect : csv.Dialect
        Concrete csv dialect. See csv.Dialect documentation for more details.
    defaults : dict
        Keyword arguments passed to TextFileReader.

    Returns
    -------
    kwds : dict
        Updated keyword arguments, merged with dialect parameters.
    """
    kwds = defaults.copy()

    for param in MANDATORY_DIALECT_ATTRS:
        dialect_val = getattr(dialect, param)

        parser_default = parser_defaults[param]
        provided = kwds.get(param, parser_default)

        # Messages for conflicting values between the dialect
        # instance and the actual parameters provided.
        conflict_msgs = []

        # Don't warn if the default parameter was passed in,
        # even if it conflicts with the dialect (gh-23761).
        if provided not in (parser_default, dialect_val):
            msg = (
                f"Conflicting values for '{param}': '{provided}' was "
                f"provided, but the dialect specifies '{dialect_val}'. "
                "Using the dialect-specified value."
            )

            # Annoying corner case for not warning about
            # conflicts between dialect and delimiter parameter.
            # Refer to the outer `_refine_defaults_read` function for more info.
            if not (param == "delimiter" and kwds.pop("sep_override", False)):
                conflict_msgs.append(msg)

        if conflict_msgs:
            warnings.warn(
                "\n\n".join(conflict_msgs), ParserWarning, stacklevel=find_stack_level()
            )
        kwds[param] = dialect_val
    return kwds


def _validate_skipfooter(kwds: dict[str, Any]) -> None:
    """
    Check whether skipfooter is compatible with other kwargs in TextFileReader.

    Parameters
    ----------
    kwds : dict
        Keyword arguments passed to TextFileReader.

    Raises
    ------
    ValueError
        If skipfooter is not compatible with other parameters.
    """
    if kwds.get("skipfooter"):
        if kwds.get("iterator") or kwds.get("chunksize"):
            raise ValueError("'skipfooter' not supported for iteration")
        if kwds.get("nrows"):
            raise ValueError("'skipfooter' not supported with 'nrows'")