1"""
2Module contains tools for processing files into DataFrames or other objects
3
4GH#48849 provides a convenient way of deprecating keyword arguments
5"""
6from __future__ import annotations
7
8from collections import (
9 abc,
10 defaultdict,
11)
12import csv
13import sys
14from textwrap import fill
15from typing import (
16 IO,
17 TYPE_CHECKING,
18 Any,
19 Callable,
20 Literal,
21 NamedTuple,
22 TypedDict,
23 overload,
24)
25import warnings
26
27import numpy as np
28
29from pandas._config import using_copy_on_write
30
31from pandas._libs import lib
32from pandas._libs.parsers import STR_NA_VALUES
33from pandas.errors import (
34 AbstractMethodError,
35 ParserWarning,
36)
37from pandas.util._decorators import Appender
38from pandas.util._exceptions import find_stack_level
39from pandas.util._validators import check_dtype_backend
40
41from pandas.core.dtypes.common import (
42 is_file_like,
43 is_float,
44 is_hashable,
45 is_integer,
46 is_list_like,
47 pandas_dtype,
48)
49
50from pandas import Series
51from pandas.core.frame import DataFrame
52from pandas.core.indexes.api import RangeIndex
53from pandas.core.shared_docs import _shared_docs
54
55from pandas.io.common import (
56 IOHandles,
57 get_handle,
58 stringify_path,
59 validate_header_arg,
60)
61from pandas.io.parsers.arrow_parser_wrapper import ArrowParserWrapper
62from pandas.io.parsers.base_parser import (
63 ParserBase,
64 is_index_col,
65 parser_defaults,
66)
67from pandas.io.parsers.c_parser_wrapper import CParserWrapper
68from pandas.io.parsers.python_parser import (
69 FixedWidthFieldParser,
70 PythonParser,
71)
72
73if TYPE_CHECKING:
74 from collections.abc import (
75 Hashable,
76 Iterable,
77 Mapping,
78 Sequence,
79 )
80 from types import TracebackType
81
82 from pandas._typing import (
83 CompressionOptions,
84 CSVEngine,
85 DtypeArg,
86 DtypeBackend,
87 FilePath,
88 IndexLabel,
89 ReadCsvBuffer,
90 Self,
91 StorageOptions,
92 UsecolsArgType,
93 )
94_doc_read_csv_and_table = (
95 r"""
96{summary}
97
Also supports optionally iterating over the file or breaking it
into chunks.

Additional help can be found in the online docs for
`IO Tools <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.

Parameters
----------
filepath_or_buffer : str, path object or file-like object
    Any valid string path is acceptable. The string could be a URL. Valid
    URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is
    expected. A local file could be: file://localhost/path/to/table.csv.

    If you want to pass in a path object, pandas accepts any ``os.PathLike``.

    By file-like object, we refer to objects with a ``read()`` method, such as
    a file handle (e.g. via builtin ``open`` function) or ``StringIO``.
sep : str, default {_default_sep}
    Character or regex pattern to treat as the delimiter. If ``sep=None``, the
    C engine cannot automatically detect the separator, but the Python
    parsing engine can, meaning the latter will be used, automatically
    detecting the separator from only the first valid row of the file with
    Python's builtin sniffer tool, ``csv.Sniffer``.
    In addition, separators longer than 1 character and different from
    ``'\s+'`` will be interpreted as regular expressions and will also force
    the use of the Python parsing engine. Note that regex delimiters are prone
    to ignoring quoted data. Regex example: ``'\r\t'``.
delimiter : str, optional
    Alias for ``sep``.
header : int, Sequence of int, 'infer' or None, default 'infer'
    Row number(s) containing column labels and marking the start of the
    data (zero-indexed). Default behavior is to infer the column names: if no
    ``names`` are passed the behavior is identical to ``header=0`` and column
    names are inferred from the first line of the file, if column
    names are passed explicitly to ``names`` then the behavior is identical to
    ``header=None``. Explicitly pass ``header=0`` to be able to
    replace existing names. The header can be a list of integers that
    specify row locations for a :class:`~pandas.MultiIndex` on the columns
    e.g. ``[0, 1, 3]``. Intervening rows that are not specified will be
    skipped (e.g. 2 in this example is skipped). Note that this
    parameter ignores commented lines and empty lines if
    ``skip_blank_lines=True``, so ``header=0`` denotes the first line of
    data rather than the first line of the file.
names : Sequence of Hashable, optional
    Sequence of column labels to apply. If the file contains a header row,
    then you should explicitly pass ``header=0`` to override the column names.
    Duplicates in this list are not allowed.
index_col : Hashable, Sequence of Hashable or False, optional
    Column(s) to use as row label(s), denoted either by column labels or column
    indices. If a sequence of labels or indices is given, :class:`~pandas.MultiIndex`
    will be formed for the row labels.

    Note: ``index_col=False`` can be used to force pandas to *not* use the first
    column as the index, e.g., when you have a malformed file with delimiters at
    the end of each line.
usecols : Sequence of Hashable or Callable, optional
    Subset of columns to select, denoted either by column labels or column indices.
    If list-like, all elements must either
    be positional (i.e. integer indices into the document columns) or strings
    that correspond to column names provided either by the user in ``names`` or
    inferred from the document header row(s). If ``names`` are given, the document
    header row(s) are not taken into account. For example, a valid list-like
    ``usecols`` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``.
    Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``.
    To instantiate a :class:`~pandas.DataFrame` from ``data`` with element order
    preserved use ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]``
    for columns in ``['foo', 'bar']`` order or
    ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]``
    for ``['bar', 'foo']`` order.

    If callable, the callable function will be evaluated against the column
    names, returning names where the callable function evaluates to ``True``. An
    example of a valid callable argument would be ``lambda x: x.upper() in
    ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster
    parsing time and lower memory usage.
dtype : dtype or dict of {{Hashable : dtype}}, optional
    Data type(s) to apply to either the whole dataset or individual columns.
    E.g., ``{{'a': np.float64, 'b': np.int32, 'c': 'Int64'}}``
    Use ``str`` or ``object`` together with suitable ``na_values`` settings
    to preserve and not interpret ``dtype``.
    If ``converters`` are specified, they will be applied INSTEAD
    of ``dtype`` conversion.

    .. versionadded:: 1.5.0

        Support for ``defaultdict`` was added. Specify a ``defaultdict`` as input
        where the default determines the ``dtype`` of the columns which are not
        explicitly listed.
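
        E.g. ``defaultdict(lambda: 'float64', a='int64')`` reads column ``a``
        as ``int64`` and every column not listed as ``float64``.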
engine : {{'c', 'python', 'pyarrow'}}, optional
    Parser engine to use. The C and pyarrow engines are faster, while the python engine
    is currently more feature-complete. Multithreading is currently only supported by
    the pyarrow engine.

    .. versionadded:: 1.4.0

        The 'pyarrow' engine was added as an *experimental* engine, and some features
        are unsupported, or may not work correctly, with this engine.
converters : dict of {{Hashable : Callable}}, optional
    Functions for converting values in specified columns. Keys can either
    be column labels or column indices.
true_values : list, optional
    Values to consider as ``True`` in addition to case-insensitive variants of 'True'.
false_values : list, optional
    Values to consider as ``False`` in addition to case-insensitive variants of 'False'.
skipinitialspace : bool, default False
    Skip spaces after delimiter.
skiprows : int, list of int or Callable, optional
    Line numbers to skip (0-indexed) or number of lines to skip (``int``)
    at the start of the file.

    If callable, the callable function will be evaluated against the row
    indices, returning ``True`` if the row should be skipped and ``False`` otherwise.
    An example of a valid callable argument would be ``lambda x: x in [0, 2]``.
skipfooter : int, default 0
    Number of lines at bottom of file to skip (Unsupported with ``engine='c'``).
nrows : int, optional
    Number of rows of file to read. Useful for reading pieces of large files.
na_values : Hashable, Iterable of Hashable or dict of {{Hashable : Iterable}}, optional
    Additional strings to recognize as ``NA``/``NaN``. If ``dict`` passed, specific
    per-column ``NA`` values. By default the following values are interpreted as
    ``NaN``: " """
    + fill('", "'.join(sorted(STR_NA_VALUES)), 70, subsequent_indent="    ")
    + """ ".

keep_default_na : bool, default True
    Whether or not to include the default ``NaN`` values when parsing the data.
    Depending on whether ``na_values`` is passed in, the behavior is as follows:

    * If ``keep_default_na`` is ``True``, and ``na_values`` are specified, ``na_values``
      is appended to the default ``NaN`` values used for parsing.
    * If ``keep_default_na`` is ``True``, and ``na_values`` are not specified, only
      the default ``NaN`` values are used for parsing.
    * If ``keep_default_na`` is ``False``, and ``na_values`` are specified, only
      the ``NaN`` values specified in ``na_values`` are used for parsing.
    * If ``keep_default_na`` is ``False``, and ``na_values`` are not specified, no
      strings will be parsed as ``NaN``.

    Note that if ``na_filter`` is passed in as ``False``, the ``keep_default_na`` and
    ``na_values`` parameters will be ignored.
na_filter : bool, default True
    Detect missing value markers (empty strings and the value of ``na_values``). In
    data without any ``NA`` values, passing ``na_filter=False`` can improve the
    performance of reading a large file.
verbose : bool, default False
    Indicate number of ``NA`` values placed in non-numeric columns.

    .. deprecated:: 2.2.0
skip_blank_lines : bool, default True
    If ``True``, skip over blank lines rather than interpreting as ``NaN`` values.
parse_dates : bool, list of Hashable, list of lists or dict of {{Hashable : list}}, \
default False
    The behavior is as follows:

    * ``bool``. If ``True`` -> try parsing the index. Note: Automatically set to
      ``True`` if ``date_format`` or ``date_parser`` arguments have been passed.
    * ``list`` of ``int`` or names. e.g. If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3
      each as a separate date column.
    * ``list`` of ``list``. e.g. If ``[[1, 3]]`` -> combine columns 1 and 3 and parse
      as a single date column. Values are joined with a space before parsing.
    * ``dict``, e.g. ``{{'foo' : [1, 3]}}`` -> parse columns 1, 3 as date and call
      result 'foo'. Values are joined with a space before parsing.

    If a column or index cannot be represented as an array of ``datetime``,
    say because of an unparsable value or a mixture of timezones, the column
    or index will be returned unaltered as an ``object`` data type. For
    non-standard ``datetime`` parsing, use :func:`~pandas.to_datetime` after
    :func:`~pandas.read_csv`.

    Note: A fast-path exists for iso8601-formatted dates.
infer_datetime_format : bool, default False
    If ``True`` and ``parse_dates`` is enabled, pandas will attempt to infer the
    format of the ``datetime`` strings in the columns, and if it can be inferred,
    switch to a faster method of parsing them. In some cases this can increase
    the parsing speed by 5-10x.

    .. deprecated:: 2.0.0
        A strict version of this argument is now the default, passing it has no effect.

keep_date_col : bool, default False
    If ``True`` and ``parse_dates`` specifies combining multiple columns then
    keep the original columns.
date_parser : Callable, optional
    Function to use for converting a sequence of string columns to an array of
    ``datetime`` instances. The default uses ``dateutil.parser.parser`` to do the
    conversion. pandas will try to call ``date_parser`` in three different ways,
    advancing to the next if an exception occurs: 1) Pass one or more arrays
    (as defined by ``parse_dates``) as arguments; 2) concatenate (row-wise) the
    string values from the columns defined by ``parse_dates`` into a single array
    and pass that; and 3) call ``date_parser`` once for each row using one or
    more strings (corresponding to the columns defined by ``parse_dates``) as
    arguments.

    .. deprecated:: 2.0.0
        Use ``date_format`` instead, or read in as ``object`` and then apply
        :func:`~pandas.to_datetime` as-needed.
date_format : str or dict of column -> format, optional
    Format to use for parsing dates when used in conjunction with ``parse_dates``.
    The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. See
    `strftime documentation
    <https://docs.python.org/3/library/datetime.html
    #strftime-and-strptime-behavior>`_ for more information on choices, though
    note that :const:`"%f"` will parse all the way up to nanoseconds.
    You can also pass:

    - "ISO8601", to parse any `ISO8601 <https://en.wikipedia.org/wiki/ISO_8601>`_
      time string (not necessarily in exactly the same format);
    - "mixed", to infer the format for each element individually. This is risky,
      and you should probably use it along with `dayfirst`.

    .. versionadded:: 2.0.0
dayfirst : bool, default False
    DD/MM format dates, international and European format.
cache_dates : bool, default True
    If ``True``, use a cache of unique, converted dates to apply the ``datetime``
    conversion. May produce significant speed-up when parsing duplicate
    date strings, especially ones with timezone offsets.

iterator : bool, default False
    Return ``TextFileReader`` object for iteration or getting chunks with
    ``get_chunk()``.
chunksize : int, optional
    Number of lines to read from the file per chunk. Passing a value will cause the
    function to return a ``TextFileReader`` object for iteration.
    See the `IO Tools docs
    <https://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking>`_
    for more information on ``iterator`` and ``chunksize``.
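
    Sketch of chunked iteration (``process`` is a hypothetical per-chunk
    handler)::

        with pd.{func_name}('data.csv', chunksize=1000) as reader:
            for chunk in reader:
                process(chunk)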

{decompression_options}

    .. versionchanged:: 1.4.0 Zstandard support.

thousands : str (length 1), optional
    Character acting as the thousands separator in numerical values.
decimal : str (length 1), default '.'
    Character to recognize as decimal point (e.g., use ',' for European data).
lineterminator : str (length 1), optional
    Character used to denote a line break. Only valid with C parser.
quotechar : str (length 1), optional
    Character used to denote the start and end of a quoted item. Quoted
    items can include the ``delimiter`` and it will be ignored.
quoting : {{0 or csv.QUOTE_MINIMAL, 1 or csv.QUOTE_ALL, 2 or csv.QUOTE_NONNUMERIC, \
3 or csv.QUOTE_NONE}}, default csv.QUOTE_MINIMAL
    Control field quoting behavior per ``csv.QUOTE_*`` constants. Default is
    ``csv.QUOTE_MINIMAL`` (i.e., 0) which implies that only fields containing special
    characters are quoted (e.g., characters defined in ``quotechar``, ``delimiter``,
    or ``lineterminator``).
doublequote : bool, default True
    When ``quotechar`` is specified and ``quoting`` is not ``QUOTE_NONE``, indicate
    whether or not to interpret two consecutive ``quotechar`` elements INSIDE a
    field as a single ``quotechar`` element.
escapechar : str (length 1), optional
    Character used to escape other characters.
comment : str (length 1), optional
    Character indicating that the remainder of line should not be parsed.
    If found at the beginning
    of a line, the line will be ignored altogether. This parameter must be a
    single character. Like empty lines (as long as ``skip_blank_lines=True``),
    fully commented lines are ignored by the parameter ``header`` but not by
    ``skiprows``. For example, if ``comment='#'``, parsing
    ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in ``'a,b,c'`` being
    treated as the header.
encoding : str, optional, default 'utf-8'
    Encoding to use when reading/writing (e.g. ``'utf-8'``). `List of Python
    standard encodings
    <https://docs.python.org/3/library/codecs.html#standard-encodings>`_ .

encoding_errors : str, optional, default 'strict'
    How encoding errors are treated. `List of possible values
    <https://docs.python.org/3/library/codecs.html#error-handlers>`_ .

    .. versionadded:: 1.3.0

dialect : str or csv.Dialect, optional
    If provided, this parameter will override values (default or not) for the
    following parameters: ``delimiter``, ``doublequote``, ``escapechar``,
    ``skipinitialspace``, ``quotechar``, and ``quoting``. If it is necessary to
    override values, a ``ParserWarning`` will be issued. See ``csv.Dialect``
    documentation for more details.
on_bad_lines : {{'error', 'warn', 'skip'}} or Callable, default 'error'
    Specifies what to do upon encountering a bad line (a line with too many fields).
    Allowed values are:

    - ``'error'``, raise an Exception when a bad line is encountered.
    - ``'warn'``, raise a warning when a bad line is encountered and skip that line.
    - ``'skip'``, skip bad lines without raising or warning when they are encountered.

    .. versionadded:: 1.3.0

    .. versionadded:: 1.4.0

    - Callable, function with signature
      ``(bad_line: list[str]) -> list[str] | None`` that will process a single
      bad line. ``bad_line`` is a list of strings split by the ``sep``.
      If the function returns ``None``, the bad line will be ignored.
      If the function returns a new ``list`` of strings with more elements than
      expected, a ``ParserWarning`` will be emitted while dropping extra elements.
      Only supported when ``engine='python'``.
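
      A minimal sketch of such a handler (truncating to the first three
      fields is an arbitrary choice)::

          pd.{func_name}('data.csv', engine='python',
                         on_bad_lines=lambda bad: bad[:3])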

    .. versionchanged:: 2.2.0

    - Callable, function with signature
      as described in `pyarrow documentation
      <https://arrow.apache.org/docs/python/generated/pyarrow.csv.ParseOptions.html
      #pyarrow.csv.ParseOptions.invalid_row_handler>`_ when ``engine='pyarrow'``

delim_whitespace : bool, default False
    Specifies whether or not whitespace (e.g. ``' '`` or ``'\\t'``) will be
    used as the ``sep`` delimiter. Equivalent to setting ``sep='\\s+'``. If this option
    is set to ``True``, nothing should be passed in for the ``delimiter``
    parameter.

    .. deprecated:: 2.2.0
        Use ``sep="\\s+"`` instead.
low_memory : bool, default True
    Internally process the file in chunks, resulting in lower memory use
    while parsing, but possibly mixed type inference. To ensure no mixed
    types either set ``False``, or specify the type with the ``dtype`` parameter.
    Note that the entire file is read into a single :class:`~pandas.DataFrame`
    regardless, use the ``chunksize`` or ``iterator`` parameter to return the data in
    chunks. (Only valid with C parser).
memory_map : bool, default False
    If a filepath is provided for ``filepath_or_buffer``, map the file object
    directly onto memory and access the data directly from there. Using this
    option can improve performance because there is no longer any I/O overhead.
float_precision : {{'high', 'legacy', 'round_trip'}}, optional
    Specifies which converter the C engine should use for floating-point
    values. The options are ``None`` or ``'high'`` for the ordinary converter,
    ``'legacy'`` for the original lower precision pandas converter, and
    ``'round_trip'`` for the round-trip converter.

{storage_options}

dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable'
    Back-end data type applied to the resultant :class:`DataFrame`
    (still experimental). Behaviour is as follows:

    * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
      (default).
    * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
      DataFrame.

    .. versionadded:: 2.0

Returns
-------
DataFrame or TextFileReader
    A comma-separated values (csv) file is returned as a two-dimensional
    data structure with labeled axes.

See Also
--------
DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
{see_also_func_name} : {see_also_func_summary}
read_fwf : Read a table of fixed-width formatted lines into DataFrame.

Examples
--------
>>> pd.{func_name}('data.csv')  # doctest: +SKIP
"""
)


class _C_Parser_Defaults(TypedDict):
    delim_whitespace: Literal[False]
    na_filter: Literal[True]
    low_memory: Literal[True]
    memory_map: Literal[False]
    float_precision: None


_c_parser_defaults: _C_Parser_Defaults = {
    "delim_whitespace": False,
    "na_filter": True,
    "low_memory": True,
    "memory_map": False,
    "float_precision": None,
}


class _Fwf_Defaults(TypedDict):
    colspecs: Literal["infer"]
    infer_nrows: Literal[100]
    widths: None


_fwf_defaults: _Fwf_Defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None}
_c_unsupported = {"skipfooter"}
_python_unsupported = {"low_memory", "float_precision"}
_pyarrow_unsupported = {
    "skipfooter",
    "float_precision",
    "chunksize",
    "comment",
    "nrows",
    "thousands",
    "memory_map",
    "dialect",
    "delim_whitespace",
    "quoting",
    "lineterminator",
    "converters",
    "iterator",
    "dayfirst",
    "verbose",
    "skipinitialspace",
    "low_memory",
}


class _DeprecationConfig(NamedTuple):
    default_value: Any
    msg: str | None


@overload
def validate_integer(name: str, val: None, min_val: int = ...) -> None:
    ...


@overload
def validate_integer(name: str, val: float, min_val: int = ...) -> int:
    ...


@overload
def validate_integer(name: str, val: int | None, min_val: int = ...) -> int | None:
    ...


def validate_integer(
    name: str, val: int | float | None, min_val: int = 0
) -> int | None:
    """
    Checks whether the 'name' parameter for parsing is either
    an integer OR float that can SAFELY be cast to an integer
    without losing accuracy. Raises a ValueError if that is
    not the case.

    Parameters
    ----------
    name : str
        Parameter name (used for error reporting)
    val : int or float
        The value to check
    min_val : int
        Minimum allowed value (val < min_val will result in a ValueError)
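
    Examples
    --------
    Illustrative calls (values chosen arbitrarily):

    >>> validate_integer("nrows", 5.0)
    5
    >>> validate_integer("nrows", None) is None
    True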
544 """
545 if val is None:
546 return val
547
548 msg = f"'{name:s}' must be an integer >={min_val:d}"
549 if is_float(val):
550 if int(val) != val:
551 raise ValueError(msg)
552 val = int(val)
553 elif not (is_integer(val) and val >= min_val):
554 raise ValueError(msg)
555
556 return int(val)
557
558
559def _validate_names(names: Sequence[Hashable] | None) -> None:
560 """
561 Raise ValueError if the `names` parameter contains duplicates or has an
562 invalid data type.
563
564 Parameters
565 ----------
566 names : array-like or None
567 An array containing a list of the names used for the output DataFrame.
568
569 Raises
570 ------
571 ValueError
572 If names are not unique or are not ordered (e.g. set).
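
    Examples
    --------
    Illustrative checks (hypothetical column names):

    >>> _validate_names(["a", "b"])  # unique and ordered: no error
    >>> _validate_names(None)  # None is allowed and skips validation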
573 """
574 if names is not None:
575 if len(names) != len(set(names)):
576 raise ValueError("Duplicate names are not allowed.")
577 if not (
578 is_list_like(names, allow_sets=False) or isinstance(names, abc.KeysView)
579 ):
580 raise ValueError("Names should be an ordered collection.")
581
582
583def _read(
584 filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], kwds
585) -> DataFrame | TextFileReader:
586 """Generic reader of line files."""
587 # if we pass a date_parser and parse_dates=False, we should not parse the
588 # dates GH#44366
589 if kwds.get("parse_dates", None) is None:
590 if (
591 kwds.get("date_parser", lib.no_default) is lib.no_default
592 and kwds.get("date_format", None) is None
593 ):
594 kwds["parse_dates"] = False
595 else:
596 kwds["parse_dates"] = True
597
598 # Extract some of the arguments (pass chunksize on).
599 iterator = kwds.get("iterator", False)
600 chunksize = kwds.get("chunksize", None)
601 if kwds.get("engine") == "pyarrow":
602 if iterator:
603 raise ValueError(
604 "The 'iterator' option is not supported with the 'pyarrow' engine"
605 )
606
607 if chunksize is not None:
608 raise ValueError(
609 "The 'chunksize' option is not supported with the 'pyarrow' engine"
610 )
611 else:
612 chunksize = validate_integer("chunksize", chunksize, 1)
613
614 nrows = kwds.get("nrows", None)
615
616 # Check for duplicates in names.
617 _validate_names(kwds.get("names", None))
618
619 # Create the parser.
620 parser = TextFileReader(filepath_or_buffer, **kwds)
621
622 if chunksize or iterator:
623 return parser
624
625 with parser:
626 return parser.read(nrows)
627
628
629# iterator=True -> TextFileReader
630@overload
631def read_csv(
632 filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
633 *,
634 sep: str | None | lib.NoDefault = ...,
635 delimiter: str | None | lib.NoDefault = ...,
636 header: int | Sequence[int] | None | Literal["infer"] = ...,
637 names: Sequence[Hashable] | None | lib.NoDefault = ...,
638 index_col: IndexLabel | Literal[False] | None = ...,
639 usecols: UsecolsArgType = ...,
640 dtype: DtypeArg | None = ...,
641 engine: CSVEngine | None = ...,
642 converters: Mapping[Hashable, Callable] | None = ...,
643 true_values: list | None = ...,
644 false_values: list | None = ...,
645 skipinitialspace: bool = ...,
646 skiprows: list[int] | int | Callable[[Hashable], bool] | None = ...,
647 skipfooter: int = ...,
648 nrows: int | None = ...,
    na_values: Hashable
    | Iterable[Hashable]
    | Mapping[Hashable, Iterable[Hashable]]
    | None = ...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool | lib.NoDefault = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] | None = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool | lib.NoDefault = ...,
    date_parser: Callable | lib.NoDefault = ...,
    date_format: str | dict[Hashable, str] | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: Literal[True],
    chunksize: int | None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool | lib.NoDefault = ...,
    low_memory: bool = ...,
    memory_map: bool = ...,
    float_precision: Literal["high", "legacy"] | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> TextFileReader:
    ...


# chunksize=int -> TextFileReader
@overload
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols: UsecolsArgType = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters: Mapping[Hashable, Callable] | None = ...,
    true_values: list | None = ...,
    false_values: list | None = ...,
    skipinitialspace: bool = ...,
    skiprows: list[int] | int | Callable[[Hashable], bool] | None = ...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values: Hashable
    | Iterable[Hashable]
    | Mapping[Hashable, Iterable[Hashable]]
    | None = ...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool | lib.NoDefault = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] | None = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool | lib.NoDefault = ...,
    date_parser: Callable | lib.NoDefault = ...,
    date_format: str | dict[Hashable, str] | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: bool = ...,
    chunksize: int,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool | lib.NoDefault = ...,
    low_memory: bool = ...,
    memory_map: bool = ...,
    float_precision: Literal["high", "legacy"] | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> TextFileReader:
    ...


# default case -> DataFrame
@overload
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols: UsecolsArgType = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters: Mapping[Hashable, Callable] | None = ...,
    true_values: list | None = ...,
    false_values: list | None = ...,
    skipinitialspace: bool = ...,
    skiprows: list[int] | int | Callable[[Hashable], bool] | None = ...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values: Hashable
    | Iterable[Hashable]
    | Mapping[Hashable, Iterable[Hashable]]
    | None = ...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool | lib.NoDefault = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] | None = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool | lib.NoDefault = ...,
    date_parser: Callable | lib.NoDefault = ...,
    date_format: str | dict[Hashable, str] | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: Literal[False] = ...,
    chunksize: None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool | lib.NoDefault = ...,
    low_memory: bool = ...,
    memory_map: bool = ...,
    float_precision: Literal["high", "legacy"] | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> DataFrame:
    ...


# Unions -> DataFrame | TextFileReader
@overload
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols: UsecolsArgType = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters: Mapping[Hashable, Callable] | None = ...,
    true_values: list | None = ...,
    false_values: list | None = ...,
    skipinitialspace: bool = ...,
    skiprows: list[int] | int | Callable[[Hashable], bool] | None = ...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values: Hashable
    | Iterable[Hashable]
    | Mapping[Hashable, Iterable[Hashable]]
    | None = ...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool | lib.NoDefault = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] | None = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool | lib.NoDefault = ...,
    date_parser: Callable | lib.NoDefault = ...,
    date_format: str | dict[Hashable, str] | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: bool = ...,
    chunksize: int | None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool | lib.NoDefault = ...,
    low_memory: bool = ...,
    memory_map: bool = ...,
    float_precision: Literal["high", "legacy"] | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> DataFrame | TextFileReader:
    ...


@Appender(
    _doc_read_csv_and_table.format(
        func_name="read_csv",
        summary="Read a comma-separated values (csv) file into DataFrame.",
        see_also_func_name="read_table",
        see_also_func_summary="Read general delimited file into DataFrame.",
        _default_sep="','",
        storage_options=_shared_docs["storage_options"],
        decompression_options=_shared_docs["decompression_options"]
        % "filepath_or_buffer",
    )
)
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = lib.no_default,
    delimiter: str | None | lib.NoDefault = None,
    # Column and Index Locations and Names
    header: int | Sequence[int] | None | Literal["infer"] = "infer",
    names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default,
    index_col: IndexLabel | Literal[False] | None = None,
    usecols: UsecolsArgType = None,
    # General Parsing Configuration
    dtype: DtypeArg | None = None,
    engine: CSVEngine | None = None,
    converters: Mapping[Hashable, Callable] | None = None,
    true_values: list | None = None,
    false_values: list | None = None,
    skipinitialspace: bool = False,
    skiprows: list[int] | int | Callable[[Hashable], bool] | None = None,
    skipfooter: int = 0,
    nrows: int | None = None,
    # NA and Missing Data Handling
    na_values: Hashable
    | Iterable[Hashable]
    | Mapping[Hashable, Iterable[Hashable]]
    | None = None,
    keep_default_na: bool = True,
    na_filter: bool = True,
    verbose: bool | lib.NoDefault = lib.no_default,
    skip_blank_lines: bool = True,
    # Datetime Handling
    parse_dates: bool | Sequence[Hashable] | None = None,
    infer_datetime_format: bool | lib.NoDefault = lib.no_default,
    keep_date_col: bool | lib.NoDefault = lib.no_default,
    date_parser: Callable | lib.NoDefault = lib.no_default,
    date_format: str | dict[Hashable, str] | None = None,
    dayfirst: bool = False,
    cache_dates: bool = True,
    # Iteration
    iterator: bool = False,
    chunksize: int | None = None,
    # Quoting, Compression, and File Format
    compression: CompressionOptions = "infer",
    thousands: str | None = None,
    decimal: str = ".",
    lineterminator: str | None = None,
    quotechar: str = '"',
    quoting: int = csv.QUOTE_MINIMAL,
    doublequote: bool = True,
    escapechar: str | None = None,
    comment: str | None = None,
    encoding: str | None = None,
    encoding_errors: str | None = "strict",
    dialect: str | csv.Dialect | None = None,
    # Error Handling
    on_bad_lines: str = "error",
    # Internal
    delim_whitespace: bool | lib.NoDefault = lib.no_default,
    low_memory: bool = _c_parser_defaults["low_memory"],
    memory_map: bool = False,
    float_precision: Literal["high", "legacy"] | None = None,
    storage_options: StorageOptions | None = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> DataFrame | TextFileReader:
    if keep_date_col is not lib.no_default:
        # GH#55569
        warnings.warn(
            "The 'keep_date_col' keyword in pd.read_csv is deprecated and "
            "will be removed in a future version. Explicitly remove unwanted "
            "columns after parsing instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    else:
        keep_date_col = False

    if lib.is_list_like(parse_dates):
        # GH#55569
        depr = False
        # error: Item "bool" of "bool | Sequence[Hashable] | None" has no
        # attribute "__iter__" (not iterable)
        if not all(is_hashable(x) for x in parse_dates):  # type: ignore[union-attr]
            depr = True
        elif isinstance(parse_dates, dict) and any(
            lib.is_list_like(x) for x in parse_dates.values()
        ):
            depr = True
        if depr:
            warnings.warn(
                "Support for nested sequences for 'parse_dates' in pd.read_csv "
                "is deprecated. Combine the desired columns with pd.to_datetime "
                "after parsing instead.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )

    if infer_datetime_format is not lib.no_default:
        warnings.warn(
            "The argument 'infer_datetime_format' is deprecated and will "
            "be removed in a future version. "
            "A strict version of it is now the default, see "
            "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. "
            "You can safely remove this argument.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

    if delim_whitespace is not lib.no_default:
        # GH#55569
        warnings.warn(
            "The 'delim_whitespace' keyword in pd.read_csv is deprecated and "
            "will be removed in a future version. Use ``sep='\\s+'`` instead",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    else:
        delim_whitespace = False

    if verbose is not lib.no_default:
        # GH#55569
        warnings.warn(
            "The 'verbose' keyword in pd.read_csv is deprecated and "
            "will be removed in a future version.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    else:
        verbose = False

    # locals() should never be modified
    kwds = locals().copy()
    del kwds["filepath_or_buffer"]
    del kwds["sep"]

    kwds_defaults = _refine_defaults_read(
        dialect,
        delimiter,
        delim_whitespace,
        engine,
        sep,
        on_bad_lines,
        names,
        defaults={"delimiter": ","},
        dtype_backend=dtype_backend,
    )
    kwds.update(kwds_defaults)

    return _read(filepath_or_buffer, kwds)


# iterator=True -> TextFileReader
@overload
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols: UsecolsArgType = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters: Mapping[Hashable, Callable] | None = ...,
    true_values: list | None = ...,
    false_values: list | None = ...,
    skipinitialspace: bool = ...,
    skiprows: list[int] | int | Callable[[Hashable], bool] | None = ...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool | lib.NoDefault = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool | lib.NoDefault = ...,
    date_parser: Callable | lib.NoDefault = ...,
    date_format: str | dict[Hashable, str] | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: Literal[True],
    chunksize: int | None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory: bool = ...,
    memory_map: bool = ...,
    float_precision: str | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> TextFileReader:
    ...


# chunksize=int -> TextFileReader
@overload
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols: UsecolsArgType = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters: Mapping[Hashable, Callable] | None = ...,
    true_values: list | None = ...,
    false_values: list | None = ...,
    skipinitialspace: bool = ...,
    skiprows: list[int] | int | Callable[[Hashable], bool] | None = ...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool | lib.NoDefault = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool | lib.NoDefault = ...,
    date_parser: Callable | lib.NoDefault = ...,
    date_format: str | dict[Hashable, str] | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: bool = ...,
    chunksize: int,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory: bool = ...,
    memory_map: bool = ...,
    float_precision: str | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> TextFileReader:
    ...


# default -> DataFrame
@overload
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols: UsecolsArgType = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters: Mapping[Hashable, Callable] | None = ...,
    true_values: list | None = ...,
    false_values: list | None = ...,
    skipinitialspace: bool = ...,
    skiprows: list[int] | int | Callable[[Hashable], bool] | None = ...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool | lib.NoDefault = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool | lib.NoDefault = ...,
    date_parser: Callable | lib.NoDefault = ...,
    date_format: str | dict[Hashable, str] | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: Literal[False] = ...,
    chunksize: None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory: bool = ...,
    memory_map: bool = ...,
    float_precision: str | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> DataFrame:
    ...


# Unions -> DataFrame | TextFileReader
@overload
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols: UsecolsArgType = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters: Mapping[Hashable, Callable] | None = ...,
    true_values: list | None = ...,
    false_values: list | None = ...,
    skipinitialspace: bool = ...,
    skiprows: list[int] | int | Callable[[Hashable], bool] | None = ...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool | lib.NoDefault = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool | lib.NoDefault = ...,
    date_parser: Callable | lib.NoDefault = ...,
    date_format: str | dict[Hashable, str] | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: bool = ...,
    chunksize: int | None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory: bool = ...,
    memory_map: bool = ...,
    float_precision: str | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> DataFrame | TextFileReader:
    ...


@Appender(
    _doc_read_csv_and_table.format(
        func_name="read_table",
        summary="Read general delimited file into DataFrame.",
        see_also_func_name="read_csv",
        see_also_func_summary=(
            "Read a comma-separated values (csv) file into DataFrame."
        ),
        _default_sep=r"'\\t' (tab-stop)",
        storage_options=_shared_docs["storage_options"],
        decompression_options=_shared_docs["decompression_options"]
        % "filepath_or_buffer",
    )
)
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = lib.no_default,
    delimiter: str | None | lib.NoDefault = None,
    # Column and Index Locations and Names
    header: int | Sequence[int] | None | Literal["infer"] = "infer",
    names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default,
    index_col: IndexLabel | Literal[False] | None = None,
    usecols: UsecolsArgType = None,
    # General Parsing Configuration
    dtype: DtypeArg | None = None,
    engine: CSVEngine | None = None,
    converters: Mapping[Hashable, Callable] | None = None,
    true_values: list | None = None,
    false_values: list | None = None,
    skipinitialspace: bool = False,
    skiprows: list[int] | int | Callable[[Hashable], bool] | None = None,
    skipfooter: int = 0,
    nrows: int | None = None,
    # NA and Missing Data Handling
    na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = None,
    keep_default_na: bool = True,
    na_filter: bool = True,
    verbose: bool | lib.NoDefault = lib.no_default,
    skip_blank_lines: bool = True,
    # Datetime Handling
    parse_dates: bool | Sequence[Hashable] = False,
    infer_datetime_format: bool | lib.NoDefault = lib.no_default,
    keep_date_col: bool | lib.NoDefault = lib.no_default,
    date_parser: Callable | lib.NoDefault = lib.no_default,
    date_format: str | dict[Hashable, str] | None = None,
    dayfirst: bool = False,
    cache_dates: bool = True,
    # Iteration
    iterator: bool = False,
    chunksize: int | None = None,
    # Quoting, Compression, and File Format
    compression: CompressionOptions = "infer",
    thousands: str | None = None,
    decimal: str = ".",
    lineterminator: str | None = None,
    quotechar: str = '"',
    quoting: int = csv.QUOTE_MINIMAL,
    doublequote: bool = True,
    escapechar: str | None = None,
    comment: str | None = None,
    encoding: str | None = None,
    encoding_errors: str | None = "strict",
    dialect: str | csv.Dialect | None = None,
    # Error Handling
    on_bad_lines: str = "error",
    # Internal
    delim_whitespace: bool | lib.NoDefault = lib.no_default,
    low_memory: bool = _c_parser_defaults["low_memory"],
    memory_map: bool = False,
    float_precision: str | None = None,
    storage_options: StorageOptions | None = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> DataFrame | TextFileReader:
    if keep_date_col is not lib.no_default:
        # GH#55569
        warnings.warn(
            "The 'keep_date_col' keyword in pd.read_table is deprecated and "
            "will be removed in a future version. Explicitly remove unwanted "
            "columns after parsing instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    else:
        keep_date_col = False

    # error: Item "bool" of "bool | Sequence[Hashable]" has no attribute "__iter__"
    if lib.is_list_like(parse_dates) and not all(is_hashable(x) for x in parse_dates):  # type: ignore[union-attr]
        # GH#55569
        warnings.warn(
            "Support for nested sequences for 'parse_dates' in pd.read_table "
            "is deprecated. Combine the desired columns with pd.to_datetime "
            "after parsing instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

    if infer_datetime_format is not lib.no_default:
        warnings.warn(
            "The argument 'infer_datetime_format' is deprecated and will "
            "be removed in a future version. "
            "A strict version of it is now the default, see "
            "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. "
            "You can safely remove this argument.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

    if delim_whitespace is not lib.no_default:
        # GH#55569
        warnings.warn(
            "The 'delim_whitespace' keyword in pd.read_table is deprecated and "
            "will be removed in a future version. Use ``sep='\\s+'`` instead",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    else:
        delim_whitespace = False

    if verbose is not lib.no_default:
        # GH#55569
        warnings.warn(
            "The 'verbose' keyword in pd.read_table is deprecated and "
            "will be removed in a future version.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    else:
        verbose = False

    # locals() should never be modified
    kwds = locals().copy()
    del kwds["filepath_or_buffer"]
    del kwds["sep"]

    kwds_defaults = _refine_defaults_read(
        dialect,
        delimiter,
        delim_whitespace,
        engine,
        sep,
        on_bad_lines,
        names,
        defaults={"delimiter": "\t"},
        dtype_backend=dtype_backend,
    )
    kwds.update(kwds_defaults)

    return _read(filepath_or_buffer, kwds)


@overload
def read_fwf(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    colspecs: Sequence[tuple[int, int]] | str | None = ...,
    widths: Sequence[int] | None = ...,
    infer_nrows: int = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
    iterator: Literal[True],
    chunksize: int | None = ...,
    **kwds,
) -> TextFileReader:
    ...


@overload
def read_fwf(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    colspecs: Sequence[tuple[int, int]] | str | None = ...,
    widths: Sequence[int] | None = ...,
    infer_nrows: int = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
    iterator: bool = ...,
    chunksize: int,
    **kwds,
) -> TextFileReader:
    ...


@overload
def read_fwf(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    colspecs: Sequence[tuple[int, int]] | str | None = ...,
    widths: Sequence[int] | None = ...,
    infer_nrows: int = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
    iterator: Literal[False] = ...,
    chunksize: None = ...,
    **kwds,
) -> DataFrame:
    ...


def read_fwf(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    colspecs: Sequence[tuple[int, int]] | str | None = "infer",
    widths: Sequence[int] | None = None,
    infer_nrows: int = 100,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
    iterator: bool = False,
    chunksize: int | None = None,
    **kwds,
) -> DataFrame | TextFileReader:
    r"""
    Read a table of fixed-width formatted lines into DataFrame.

    Also supports optionally iterating over the file or breaking it
    into chunks.

    Additional help can be found in the `online docs for IO Tools
    <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.

    Parameters
    ----------
    filepath_or_buffer : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a text ``read()`` function. The string could be a URL.
        Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.csv``.
    colspecs : list of tuple (int, int) or 'infer', optional
        A list of tuples giving the extents of the fixed-width
        fields of each line as half-open intervals (i.e., ``[from, to)``).
1484 String value 'infer' can be used to instruct the parser to try
1485 detecting the column specifications from the first 100 rows of
1486 the data which are not being skipped via skiprows (default='infer').
    widths : list of int, optional
        A list of field widths which can be used instead of 'colspecs' if
        the intervals are contiguous.
    infer_nrows : int, default 100
        The number of rows to consider when letting the parser determine the
        `colspecs`.
    dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable'
        Back-end data type applied to the resultant :class:`DataFrame`
        (still experimental). Behaviour is as follows:

        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
          (default).
        * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
          DataFrame.

        .. versionadded:: 2.0

    **kwds : optional
        Optional keyword arguments can be passed to ``TextFileReader``.
    Returns
    -------
    DataFrame or TextFileReader
        A file of fixed-width formatted lines is returned as a two-dimensional
        data structure with labeled axes.
    See Also
    --------
    DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
    read_csv : Read a comma-separated values (csv) file into DataFrame.

    Examples
    --------
    >>> pd.read_fwf('data.csv')  # doctest: +SKIP
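
    A further illustrative call, assuming a hypothetical fixed-width file
    ``data.txt`` with two fields of widths 3 and 5:

    >>> pd.read_fwf('data.txt', widths=[3, 5], names=['a', 'b'])  # doctest: +SKIP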
    """
    # Check input arguments.
    if colspecs is None and widths is None:
        raise ValueError("Must specify either colspecs or widths")
    if colspecs not in (None, "infer") and widths is not None:
        raise ValueError("You must specify only one of 'widths' and 'colspecs'")

    # Compute 'colspecs' from 'widths', if specified.
    if widths is not None:
        colspecs, col = [], 0
        for w in widths:
            colspecs.append((col, col + w))
            col += w

    # for mypy
    assert colspecs is not None

    # GH#40830
    # Ensure length of `colspecs` matches length of `names`
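    # Illustrative (hypothetical) case: colspecs=[(0, 3), (3, 8)] passes with
    # names=["a"] plus index_col=0, because the unnamed index column counts
    # toward the colspecs length but needs no entry in `names`.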
    names = kwds.get("names")
    if names is not None:
        if len(names) != len(colspecs) and colspecs != "infer":
            # need to check len(index_col) as it might contain
            # unnamed indices, in which case its name is not required
            len_index = 0
            if kwds.get("index_col") is not None:
                index_col: Any = kwds.get("index_col")
                if index_col is not False:
                    if not is_list_like(index_col):
                        len_index = 1
                    else:
                        len_index = len(index_col)
            if kwds.get("usecols") is None and len(names) + len_index != len(colspecs):
                # If usecols is used, colspecs may be longer than names
                raise ValueError("Length of colspecs must match length of names")

    kwds["colspecs"] = colspecs
    kwds["infer_nrows"] = infer_nrows
    kwds["engine"] = "python-fwf"
    kwds["iterator"] = iterator
    kwds["chunksize"] = chunksize

    check_dtype_backend(dtype_backend)
    kwds["dtype_backend"] = dtype_backend
    return _read(filepath_or_buffer, kwds)


class TextFileReader(abc.Iterator):
    """
    Iterate over a parsed file chunk by chunk, returning DataFrames.

    A passed dialect overrides any of the related parser options.
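
    Examples
    --------
    A minimal sketch of chunked iteration, assuming a hypothetical file
    ``data.csv`` (hence the skipped doctest):

    >>> with pd.read_csv('data.csv', chunksize=1000) as reader:  # doctest: +SKIP
    ...     for chunk in reader:
    ...         print(chunk.shape)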
    """

    def __init__(
        self,
        f: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str] | list,
        engine: CSVEngine | None = None,
        **kwds,
    ) -> None:
        if engine is not None:
            engine_specified = True
        else:
            engine = "python"
            engine_specified = False
        self.engine = engine
        self._engine_specified = kwds.get("engine_specified", engine_specified)

        _validate_skipfooter(kwds)

        dialect = _extract_dialect(kwds)
        if dialect is not None:
            if engine == "pyarrow":
                raise ValueError(
                    "The 'dialect' option is not supported with the 'pyarrow' engine"
                )
            kwds = _merge_with_dialect_properties(dialect, kwds)

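        # Resolve header="infer": explicit names imply the file has no header
        # row (header=None); otherwise row 0 supplies the column labels.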
        if kwds.get("header", "infer") == "infer":
            kwds["header"] = 0 if kwds.get("names") is None else None

        self.orig_options = kwds

        # miscellanea
        self._currow = 0

        options = self._get_options_with_defaults(engine)
        options["storage_options"] = kwds.get("storage_options", None)

        self.chunksize = options.pop("chunksize", None)
        self.nrows = options.pop("nrows", None)

        self._check_file_or_buffer(f, engine)
        self.options, self.engine = self._clean_options(options, engine)

        if "has_index_names" in kwds:
            self.options["has_index_names"] = kwds["has_index_names"]

        self.handles: IOHandles | None = None
        self._engine = self._make_engine(f, self.engine)

    def close(self) -> None:
        if self.handles is not None:
            self.handles.close()
        self._engine.close()

    def _get_options_with_defaults(self, engine: CSVEngine) -> dict[str, Any]:
        kwds = self.orig_options

        options = {}
        default: object | None

        for argname, default in parser_defaults.items():
            value = kwds.get(argname, default)

            # see gh-12935
            if (
                engine == "pyarrow"
                and argname in _pyarrow_unsupported
                and value != default
                and value != getattr(value, "value", default)
            ):
                raise ValueError(
                    f"The {repr(argname)} option is not supported with the "
                    f"'pyarrow' engine"
                )
            options[argname] = value

        for argname, default in _c_parser_defaults.items():
            if argname in kwds:
                value = kwds[argname]

                if engine != "c" and value != default:
                    # TODO: Refactor this logic, it's pretty convoluted
                    if "python" in engine and argname not in _python_unsupported:
                        pass
                    elif "pyarrow" in engine and argname not in _pyarrow_unsupported:
                        pass
                    else:
                        raise ValueError(
                            f"The {repr(argname)} option is not supported with the "
                            f"{repr(engine)} engine"
                        )
            else:
                value = default
            options[argname] = value

        if engine == "python-fwf":
            for argname, default in _fwf_defaults.items():
                options[argname] = kwds.get(argname, default)

        return options

    def _check_file_or_buffer(self, f, engine: CSVEngine) -> None:
        # see gh-16530
        if is_file_like(f) and engine != "c" and not hasattr(f, "__iter__"):
            # The C engine doesn't need the file-like to have the "__iter__"
            # attribute. However, the Python engine needs "__iter__(...)"
            # when iterating through such an object, meaning it
            # needs to have that attribute
            raise ValueError(
                "The 'python' engine cannot iterate through this file buffer."
            )

    def _clean_options(
        self, options: dict[str, Any], engine: CSVEngine
    ) -> tuple[dict[str, Any], CSVEngine]:
        result = options.copy()

        fallback_reason = None

        # C engine not supported yet
        if engine == "c":
            if options["skipfooter"] > 0:
                fallback_reason = "the 'c' engine does not support skipfooter"
                engine = "python"

        sep = options["delimiter"]
        delim_whitespace = options["delim_whitespace"]

        if sep is None and not delim_whitespace:
            if engine in ("c", "pyarrow"):
                fallback_reason = (
                    f"the '{engine}' engine does not support "
                    "sep=None with delim_whitespace=False"
                )
                engine = "python"
        elif sep is not None and len(sep) > 1:
            if engine == "c" and sep == r"\s+":
                result["delim_whitespace"] = True
                del result["delimiter"]
            elif engine not in ("python", "python-fwf"):
                # wait until regex engine integrated
                fallback_reason = (
                    f"the '{engine}' engine does not support "
                    "regex separators (separators > 1 char and "
                    r"different from '\s+' are interpreted as regex)"
                )
                engine = "python"
        elif delim_whitespace:
            if "python" in engine:
                result["delimiter"] = r"\s+"
        elif sep is not None:
            encodeable = True
            encoding = sys.getfilesystemencoding() or "utf-8"
            try:
                if len(sep.encode(encoding)) > 1:
                    encodeable = False
            except UnicodeDecodeError:
                encodeable = False
            if not encodeable and engine not in ("python", "python-fwf"):
                fallback_reason = (
                    f"the separator encoded in {encoding} "
                    f"is > 1 char long, and the '{engine}' engine "
                    "does not support such separators"
                )
                engine = "python"

        quotechar = options["quotechar"]
        if quotechar is not None and isinstance(quotechar, (str, bytes)):
            if (
                len(quotechar) == 1
                and ord(quotechar) > 127
                and engine not in ("python", "python-fwf")
            ):
                fallback_reason = (
                    "ord(quotechar) > 127, meaning the "
                    "quotechar is larger than one byte, "
                    f"and the '{engine}' engine does not support such quotechars"
                )
                engine = "python"

        if fallback_reason and self._engine_specified:
            raise ValueError(fallback_reason)

        if engine == "c":
            for arg in _c_unsupported:
                del result[arg]

        if "python" in engine:
            for arg in _python_unsupported:
                if fallback_reason and result[arg] != _c_parser_defaults.get(arg):
                    raise ValueError(
                        "Falling back to the 'python' engine because "
                        f"{fallback_reason}, but this causes {repr(arg)} to be "
                        "ignored as it is not supported by the 'python' engine."
                    )
                del result[arg]

        if fallback_reason:
            warnings.warn(
                (
                    "Falling back to the 'python' engine because "
                    f"{fallback_reason}; you can avoid this warning by specifying "
                    "engine='python'."
                ),
                ParserWarning,
                stacklevel=find_stack_level(),
            )

        index_col = options["index_col"]
        names = options["names"]
        converters = options["converters"]
        na_values = options["na_values"]
        skiprows = options["skiprows"]

        validate_header_arg(options["header"])

        if index_col is True:
            raise ValueError("The value of index_col cannot be True")
        if is_index_col(index_col):
            if not isinstance(index_col, (list, tuple, np.ndarray)):
                index_col = [index_col]
        result["index_col"] = index_col

        names = list(names) if names is not None else names

        # type conversion-related
        if converters is not None:
            if not isinstance(converters, dict):
                raise TypeError(
                    "Type converters must be a dict or subclass, "
                    f"input was a {type(converters).__name__}"
                )
        else:
            converters = {}

        # Converting values to NA
        keep_default_na = options["keep_default_na"]
        floatify = engine != "pyarrow"
        na_values, na_fvalues = _clean_na_values(
            na_values, keep_default_na, floatify=floatify
        )

        # handle skiprows; this is internally handled by the
        # c-engine, so we only need it for the python and pyarrow parsers
        if engine == "pyarrow":
            if not is_integer(skiprows) and skiprows is not None:
                # pyarrow expects skiprows to be passed as an integer
                raise ValueError(
                    "skiprows argument must be an integer when using "
                    "engine='pyarrow'"
                )
        else:
            if is_integer(skiprows):
                skiprows = list(range(skiprows))
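                # e.g. skiprows=3 -> skip rows 0, 1 and 2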
            if skiprows is None:
                skiprows = set()
            elif not callable(skiprows):
                skiprows = set(skiprows)

        # put stuff back
        result["names"] = names
        result["converters"] = converters
        result["na_values"] = na_values
        result["na_fvalues"] = na_fvalues
        result["skiprows"] = skiprows

        return result, engine

    def __next__(self) -> DataFrame:
        try:
            return self.get_chunk()
        except StopIteration:
            self.close()
            raise

    def _make_engine(
        self,
        f: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str] | list | IO,
        engine: CSVEngine = "c",
    ) -> ParserBase:
        mapping: dict[str, type[ParserBase]] = {
            "c": CParserWrapper,
            "python": PythonParser,
            "pyarrow": ArrowParserWrapper,
            "python-fwf": FixedWidthFieldParser,
        }
        if engine not in mapping:
            raise ValueError(
                f"Unknown engine: {engine} (valid options are {mapping.keys()})"
            )
        if not isinstance(f, list):
            # open file here
            is_text = True
            mode = "r"
            if engine == "pyarrow":
                is_text = False
                mode = "rb"
            elif (
                engine == "c"
                and self.options.get("encoding", "utf-8") == "utf-8"
                and isinstance(stringify_path(f), str)
            ):
                # The C engine can decode utf-8 bytes itself; adding a
                # TextIOWrapper makes the C engine (especially with
                # memory_map=True) far slower
                is_text = False
                if "b" not in mode:
                    mode += "b"
            self.handles = get_handle(
                f,
                mode,
                encoding=self.options.get("encoding", None),
                compression=self.options.get("compression", None),
                memory_map=self.options.get("memory_map", False),
                is_text=is_text,
                errors=self.options.get("encoding_errors", "strict"),
                storage_options=self.options.get("storage_options", None),
            )
            assert self.handles is not None
            f = self.handles.handle

        elif engine != "python":
            msg = f"Invalid file path or buffer object type: {type(f)}"
            raise ValueError(msg)

        try:
            return mapping[engine](f, **self.options)
        except Exception:
            if self.handles is not None:
                self.handles.close()
            raise

    def _failover_to_python(self) -> None:
        raise AbstractMethodError(self)

    def read(self, nrows: int | None = None) -> DataFrame:
        if self.engine == "pyarrow":
            try:
                # error: "ParserBase" has no attribute "read"
                df = self._engine.read()  # type: ignore[attr-defined]
            except Exception:
                self.close()
                raise
        else:
            nrows = validate_integer("nrows", nrows)
            try:
                # error: "ParserBase" has no attribute "read"
                (
                    index,
                    columns,
                    col_dict,
                ) = self._engine.read(  # type: ignore[attr-defined]
                    nrows
                )
            except Exception:
                self.close()
                raise

            if index is None:
                if col_dict:
                    # Any column is actually fine:
                    new_rows = len(next(iter(col_dict.values())))
                    index = RangeIndex(self._currow, self._currow + new_rows)
                else:
                    new_rows = 0
            else:
                new_rows = len(index)

            if hasattr(self, "orig_options"):
                dtype_arg = self.orig_options.get("dtype", None)
            else:
                dtype_arg = None

            if isinstance(dtype_arg, dict):
                dtype = defaultdict(lambda: None)  # type: ignore[var-annotated]
                dtype.update(dtype_arg)
            elif dtype_arg is not None and pandas_dtype(dtype_arg) in (
                np.str_,
                np.object_,
            ):
                dtype = defaultdict(lambda: dtype_arg)
            else:
                dtype = None

            if dtype is not None:
                new_col_dict = {}
                for k, v in col_dict.items():
                    d = (
                        dtype[k]
                        if pandas_dtype(dtype[k]) in (np.str_, np.object_)
                        else None
                    )
                    new_col_dict[k] = Series(v, index=index, dtype=d, copy=False)
            else:
                new_col_dict = col_dict

            df = DataFrame(
                new_col_dict,
                columns=columns,
                index=index,
                copy=not using_copy_on_write(),
            )

            self._currow += new_rows
        return df

    def get_chunk(self, size: int | None = None) -> DataFrame:
        if size is None:
            size = self.chunksize
        if self.nrows is not None:
            if self._currow >= self.nrows:
                raise StopIteration
            size = min(size, self.nrows - self._currow)
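            # e.g. nrows=10 with chunksize=4 yields chunks of 4, 4 and 2 rows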
        return self.read(nrows=size)

    def __enter__(self) -> Self:
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        traceback: TracebackType | None,
    ) -> None:
        self.close()


def TextParser(*args, **kwds) -> TextFileReader:
    """
    Converts lists of lists/tuples into DataFrames with proper type inference
    and optional (e.g. string to datetime) conversion. Also enables iterating
    lazily over chunks of large files.

    Parameters
    ----------
    data : file-like object or list
    delimiter : separator character to use
    dialect : str or csv.Dialect instance, optional
        Ignored if delimiter is longer than 1 character
    names : sequence, optional
        List of column names to use.
    header : int, default 0
        Row to use to parse column labels. Defaults to the first row. Prior
        rows will be discarded.
    index_col : int or list, optional
        Column or columns to use as the (possibly hierarchical) index
    has_index_names : bool, default False
        True if the cols defined in index_col have an index name and are
        not in the header.
    na_values : scalar, str, list-like, or dict, optional
        Additional strings to recognize as NA/NaN.
    keep_default_na : bool, default True
    thousands : str, optional
        Thousands separator
    comment : str, optional
        Comment out remainder of line
    parse_dates : bool, default False
    keep_date_col : bool, default False
    date_parser : function, optional

        .. deprecated:: 2.0.0
    date_format : str or dict of column -> format, default ``None``

        .. versionadded:: 2.0.0
    skiprows : list of integers
        Row numbers to skip
    skipfooter : int
        Number of lines at bottom of file to skip
    converters : dict, optional
        Dict of functions for converting values in certain columns. Keys can
        either be integers or column labels, values are functions that take one
        input argument, the cell (not column) content, and return the
        transformed content.
    encoding : str, optional
        Encoding to use for UTF when reading/writing (ex. 'utf-8')
    float_precision : str, optional
        Specifies which converter the C engine should use for floating-point
        values. The options are `None` or `high` for the ordinary converter,
        `legacy` for the original lower precision pandas converter, and
        `round_trip` for the round-trip converter.
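
    Examples
    --------
    A minimal sketch with hypothetical in-memory rows (hence the skipped
    doctest); the first row supplies the column labels via the default
    ``header=0``:

    >>> data = [['a', 'b'], ['1', '2'], ['3', '4']]
    >>> with TextParser(data) as parser:  # doctest: +SKIP
    ...     df = parser.read()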
    """
    kwds["engine"] = "python"
    return TextFileReader(*args, **kwds)


def _clean_na_values(na_values, keep_default_na: bool = True, floatify: bool = True):
    na_fvalues: set | dict
    if na_values is None:
        if keep_default_na:
            na_values = STR_NA_VALUES
        else:
            na_values = set()
        na_fvalues = set()
    elif isinstance(na_values, dict):
        old_na_values = na_values.copy()
        na_values = {}  # Prevent aliasing.

        # Convert the values in the na_values dictionary
        # into array-likes for further use. This is also
        # where we append the default NaN values, provided
        # that `keep_default_na=True`.
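        # e.g. a hypothetical {"col": "missing"} becomes
        # {"col": {"missing"} | STR_NA_VALUES} when keep_default_na=True.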
        for k, v in old_na_values.items():
            if not is_list_like(v):
                v = [v]

            if keep_default_na:
                v = set(v) | STR_NA_VALUES

            na_values[k] = v
        na_fvalues = {k: _floatify_na_values(v) for k, v in na_values.items()}
    else:
        if not is_list_like(na_values):
            na_values = [na_values]
        na_values = _stringify_na_values(na_values, floatify)
        if keep_default_na:
            na_values = na_values | STR_NA_VALUES

        na_fvalues = _floatify_na_values(na_values)

    return na_values, na_fvalues


def _floatify_na_values(na_values):
    # create float versions of the na_values
    result = set()
    for v in na_values:
        try:
            v = float(v)
            if not np.isnan(v):
                result.add(v)
        except (TypeError, ValueError, OverflowError):
            pass
    return result


def _stringify_na_values(na_values, floatify: bool):
    """Return stringified and numeric variants of the given NA values."""
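    # e.g. 999 -> {"999", "999.0", 999} when floatify=True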
    result: list[str | float] = []
    for x in na_values:
        result.append(str(x))
        result.append(x)
        try:
            v = float(x)

            # for a value like 999, also register "999.0" and "999"
            if v == int(v):
                v = int(v)
                result.append(f"{v}.0")
                result.append(str(v))

            if floatify:
                result.append(v)
        except (TypeError, ValueError, OverflowError):
            pass
        if floatify:
            try:
                result.append(int(x))
            except (TypeError, ValueError, OverflowError):
                pass
    return set(result)


def _refine_defaults_read(
    dialect: str | csv.Dialect | None,
    delimiter: str | None | lib.NoDefault,
    delim_whitespace: bool,
    engine: CSVEngine | None,
    sep: str | None | lib.NoDefault,
    on_bad_lines: str | Callable,
    names: Sequence[Hashable] | None | lib.NoDefault,
    defaults: dict[str, Any],
    dtype_backend: DtypeBackend | lib.NoDefault,
):
    """Validate/refine default values of input parameters of read_csv, read_table.

    Parameters
    ----------
    dialect : str or csv.Dialect
        If provided, this parameter will override values (default or not) for the
        following parameters: `delimiter`, `doublequote`, `escapechar`,
        `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
        override values, a ParserWarning will be issued. See csv.Dialect
        documentation for more details.
    delimiter : str or object
        Alias for sep.
    delim_whitespace : bool
        Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be
        used as the sep. Equivalent to setting ``sep='\\s+'``. If this option
        is set to True, nothing should be passed in for the ``delimiter``
        parameter.

        .. deprecated:: 2.2.0
            Use ``sep="\\s+"`` instead.
    engine : {'c', 'python'}
        Parser engine to use. The C engine is faster while the python engine is
        currently more feature-complete.
    sep : str or object
        A delimiter provided by the user (str) or a sentinel value, i.e.
        pandas._libs.lib.no_default.
    on_bad_lines : str, callable
        An option for handling bad lines or a sentinel value (None).
    names : array-like, optional
        List of column names to use. If the file contains a header row,
        then you should explicitly pass ``header=0`` to override the column names.
        Duplicates in this list are not allowed.
    defaults : dict
        Default values of input parameters.

    Returns
    -------
    kwds : dict
        Input parameters with correct values.

    Raises
    ------
    ValueError :
        If a delimiter was specified with ``sep`` (or ``delimiter``) and
        ``delim_whitespace=True``.
    """
    # fix types for sep, delimiter to Union(str, Any)
    delim_default = defaults["delimiter"]
    kwds: dict[str, Any] = {}
    # gh-23761
    #
    # When a dialect is passed, it overrides any of the overlapping
    # parameters passed in directly. We don't want to warn if the
    # default parameters were passed in (since it probably means
    # that the user didn't pass them in explicitly in the first place).
    #
    # "delimiter" is the annoying corner case because we alias it to
    # "sep" before doing comparison to the dialect values later on.
    # Thus, we need a flag to indicate that we need to "override"
    # the comparison to dialect values by checking if default values
    # for BOTH "delimiter" and "sep" were provided.
    if dialect is not None:
        kwds["sep_override"] = delimiter is None and (
            sep is lib.no_default or sep == delim_default
        )

    if delimiter and (sep is not lib.no_default):
        raise ValueError("Specified a sep and a delimiter; you can only specify one.")

    kwds["names"] = None if names is lib.no_default else names

    # Alias sep -> delimiter.
    if delimiter is None:
        delimiter = sep

    if delim_whitespace and (delimiter is not lib.no_default):
        raise ValueError(
            "Specified a delimiter with both sep and "
            "delim_whitespace=True; you can only specify one."
        )

    if delimiter == "\n":
        raise ValueError(
            r"Specified \n as separator or delimiter. This forces the python engine "
            "which does not accept a line terminator. Hence it is not allowed to use "
            "the line terminator as separator.",
        )

    if delimiter is lib.no_default:
        # assign default separator value
        kwds["delimiter"] = delim_default
    else:
        kwds["delimiter"] = delimiter

    if engine is not None:
        kwds["engine_specified"] = True
    else:
        kwds["engine"] = "c"
        kwds["engine_specified"] = False
    if on_bad_lines == "error":
        kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.ERROR
    elif on_bad_lines == "warn":
        kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.WARN
    elif on_bad_lines == "skip":
        kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.SKIP
    elif callable(on_bad_lines):
        if engine not in ["python", "pyarrow"]:
            raise ValueError(
                "on_bad_lines can only be a callable function "
                "if engine='python' or 'pyarrow'"
            )
        kwds["on_bad_lines"] = on_bad_lines
    else:
        raise ValueError(f"Argument {on_bad_lines} is invalid for on_bad_lines")

    check_dtype_backend(dtype_backend)

    kwds["dtype_backend"] = dtype_backend

    return kwds


def _extract_dialect(kwds: dict[str, Any]) -> csv.Dialect | None:
    """
    Extract concrete csv dialect instance.

    Returns
    -------
    csv.Dialect or None
    """
    if kwds.get("dialect") is None:
        return None

    dialect = kwds["dialect"]
    if dialect in csv.list_dialects():
        dialect = csv.get_dialect(dialect)

    _validate_dialect(dialect)

    return dialect


MANDATORY_DIALECT_ATTRS = (
    "delimiter",
    "doublequote",
    "escapechar",
    "skipinitialspace",
    "quotechar",
    "quoting",
)


def _validate_dialect(dialect: csv.Dialect) -> None:
    """
    Validate csv dialect instance.

    Raises
    ------
    ValueError
        If incorrect dialect is provided.
    """
    for param in MANDATORY_DIALECT_ATTRS:
        if not hasattr(dialect, param):
            raise ValueError(f"Invalid dialect {dialect} provided")


def _merge_with_dialect_properties(
    dialect: csv.Dialect,
    defaults: dict[str, Any],
) -> dict[str, Any]:
    """
    Merge default kwargs in TextFileReader with dialect parameters.

    Parameters
    ----------
    dialect : csv.Dialect
        Concrete csv dialect. See csv.Dialect documentation for more details.
    defaults : dict
        Keyword arguments passed to TextFileReader.

    Returns
    -------
    kwds : dict
        Updated keyword arguments, merged with dialect parameters.
    """
    kwds = defaults.copy()

    for param in MANDATORY_DIALECT_ATTRS:
        dialect_val = getattr(dialect, param)

        parser_default = parser_defaults[param]
        provided = kwds.get(param, parser_default)

        # Messages for conflicting values between the dialect
        # instance and the actual parameters provided.
        conflict_msgs = []

        # Don't warn if the default parameter was passed in,
        # even if it conflicts with the dialect (gh-23761).
        if provided not in (parser_default, dialect_val):
            msg = (
                f"Conflicting values for '{param}': '{provided}' was "
                f"provided, but the dialect specifies '{dialect_val}'. "
                "Using the dialect-specified value."
            )

            # Annoying corner case for not warning about
            # conflicts between dialect and delimiter parameter.
            # Refer to the outer `_refine_defaults_read` function for more info.
            if not (param == "delimiter" and kwds.pop("sep_override", False)):
                conflict_msgs.append(msg)

        if conflict_msgs:
            warnings.warn(
                "\n\n".join(conflict_msgs), ParserWarning, stacklevel=find_stack_level()
            )
        kwds[param] = dialect_val
    return kwds


def _validate_skipfooter(kwds: dict[str, Any]) -> None:
    """
    Check whether skipfooter is compatible with other kwargs in TextFileReader.

    Parameters
    ----------
    kwds : dict
        Keyword arguments passed to TextFileReader.

    Raises
    ------
    ValueError
        If skipfooter is not compatible with other parameters.
    """
    if kwds.get("skipfooter"):
        if kwds.get("iterator") or kwds.get("chunksize"):
            raise ValueError("'skipfooter' not supported for iteration")
        if kwds.get("nrows"):
            raise ValueError("'skipfooter' not supported with 'nrows'")