Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/io/parsers/readers.py: 18%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

486 statements  

1""" 

2Module contains tools for processing files into DataFrames or other objects 

3 

4GH#48849 provides a convenient way of deprecating keyword arguments 

5""" 

6from __future__ import annotations 

7 

8from collections import abc 

9import csv 

10import sys 

11from textwrap import fill 

12from types import TracebackType 

13from typing import ( 

14 IO, 

15 Any, 

16 Callable, 

17 Hashable, 

18 Literal, 

19 NamedTuple, 

20 Sequence, 

21 overload, 

22) 

23import warnings 

24 

25import numpy as np 

26 

27from pandas._libs import lib 

28from pandas._libs.parsers import STR_NA_VALUES 

29from pandas._typing import ( 

30 CompressionOptions, 

31 CSVEngine, 

32 DtypeArg, 

33 DtypeBackend, 

34 FilePath, 

35 IndexLabel, 

36 ReadCsvBuffer, 

37 StorageOptions, 

38) 

39from pandas.errors import ( 

40 AbstractMethodError, 

41 ParserWarning, 

42) 

43from pandas.util._decorators import Appender 

44from pandas.util._exceptions import find_stack_level 

45from pandas.util._validators import check_dtype_backend 

46 

47from pandas.core.dtypes.common import ( 

48 is_file_like, 

49 is_float, 

50 is_integer, 

51 is_list_like, 

52) 

53 

54from pandas.core.frame import DataFrame 

55from pandas.core.indexes.api import RangeIndex 

56from pandas.core.shared_docs import _shared_docs 

57 

58from pandas.io.common import ( 

59 IOHandles, 

60 get_handle, 

61 stringify_path, 

62 validate_header_arg, 

63) 

64from pandas.io.parsers.arrow_parser_wrapper import ArrowParserWrapper 

65from pandas.io.parsers.base_parser import ( 

66 ParserBase, 

67 is_index_col, 

68 parser_defaults, 

69) 

70from pandas.io.parsers.c_parser_wrapper import CParserWrapper 

71from pandas.io.parsers.python_parser import ( 

72 FixedWidthFieldParser, 

73 PythonParser, 

74) 

75 

76_doc_read_csv_and_table = ( 

77 r""" 

78{summary} 

79 

80Also supports optionally iterating or breaking of the file 

81into chunks. 

82 

83Additional help can be found in the online docs for 

84`IO Tools <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_. 

85 

86Parameters 

87---------- 

88filepath_or_buffer : str, path object or file-like object 

89 Any valid string path is acceptable. The string could be a URL. Valid 

90 URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is 

91 expected. A local file could be: file://localhost/path/to/table.csv. 

92 

93 If you want to pass in a path object, pandas accepts any ``os.PathLike``. 

94 

95 By file-like object, we refer to objects with a ``read()`` method, such as 

96 a file handle (e.g. via builtin ``open`` function) or ``StringIO``. 

97sep : str, default {_default_sep} 

98 Delimiter to use. If sep is None, the C engine cannot automatically detect 

99 the separator, but the Python parsing engine can, meaning the latter will 

100 be used and automatically detect the separator by Python's builtin sniffer 

101 tool, ``csv.Sniffer``. In addition, separators longer than 1 character and 

102 different from ``'\s+'`` will be interpreted as regular expressions and 

103 will also force the use of the Python parsing engine. Note that regex 

104 delimiters are prone to ignoring quoted data. Regex example: ``'\r\t'``. 

105delimiter : str, default ``None`` 

106 Alias for sep. 

107header : int, list of int, None, default 'infer' 

108 Row number(s) to use as the column names, and the start of the 

109 data. Default behavior is to infer the column names: if no names 

110 are passed the behavior is identical to ``header=0`` and column 

111 names are inferred from the first line of the file, if column 

112 names are passed explicitly then the behavior is identical to 

113 ``header=None``. Explicitly pass ``header=0`` to be able to 

114 replace existing names. The header can be a list of integers that 

115 specify row locations for a multi-index on the columns 

116 e.g. [0,1,3]. Intervening rows that are not specified will be 

117 skipped (e.g. 2 in this example is skipped). Note that this 

118 parameter ignores commented lines and empty lines if 

119 ``skip_blank_lines=True``, so ``header=0`` denotes the first line of 

120 data rather than the first line of the file. 

121names : array-like, optional 

122 List of column names to use. If the file contains a header row, 

123 then you should explicitly pass ``header=0`` to override the column names. 

124 Duplicates in this list are not allowed. 

125index_col : int, str, sequence of int / str, or False, optional, default ``None`` 

126 Column(s) to use as the row labels of the ``DataFrame``, either given as 

127 string name or column index. If a sequence of int / str is given, a 

128 MultiIndex is used. 

129 

130 Note: ``index_col=False`` can be used to force pandas to *not* use the first 

131 column as the index, e.g. when you have a malformed file with delimiters at 

132 the end of each line. 

133usecols : list-like or callable, optional 

134 Return a subset of the columns. If list-like, all elements must either 

135 be positional (i.e. integer indices into the document columns) or strings 

136 that correspond to column names provided either by the user in `names` or 

137 inferred from the document header row(s). If ``names`` are given, the document 

138 header row(s) are not taken into account. For example, a valid list-like 

139 `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``. 

140 Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. 

141 To instantiate a DataFrame from ``data`` with element order preserved use 

142 ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` for columns 

143 in ``['foo', 'bar']`` order or 

144 ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]`` 

145 for ``['bar', 'foo']`` order. 

146 

147 If callable, the callable function will be evaluated against the column 

148 names, returning names where the callable function evaluates to True. An 

149 example of a valid callable argument would be ``lambda x: x.upper() in 

150 ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster 

151 parsing time and lower memory usage. 

152dtype : Type name or dict of column -> type, optional 

153 Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32, 

154 'c': 'Int64'}} 

155 Use `str` or `object` together with suitable `na_values` settings 

156 to preserve and not interpret dtype. 

157 If converters are specified, they will be applied INSTEAD 

158 of dtype conversion. 

159 

160 .. versionadded:: 1.5.0 

161 

162 Support for defaultdict was added. Specify a defaultdict as input where 

163 the default determines the dtype of the columns which are not explicitly 

164 listed. 

165engine : {{'c', 'python', 'pyarrow'}}, optional 

166 Parser engine to use. The C and pyarrow engines are faster, while the python engine 

167 is currently more feature-complete. Multithreading is currently only supported by 

168 the pyarrow engine. 

169 

170 .. versionadded:: 1.4.0 

171 

172 The "pyarrow" engine was added as an *experimental* engine, and some features 

173 are unsupported, or may not work correctly, with this engine. 

174converters : dict, optional 

175 Dict of functions for converting values in certain columns. Keys can either 

176 be integers or column labels. 

177true_values : list, optional 

178 Values to consider as True in addition to case-insensitive variants of "True". 

179false_values : list, optional 

180 Values to consider as False in addition to case-insensitive variants of "False". 

181skipinitialspace : bool, default False 

182 Skip spaces after delimiter. 

183skiprows : list-like, int or callable, optional 

184 Line numbers to skip (0-indexed) or number of lines to skip (int) 

185 at the start of the file. 

186 

187 If callable, the callable function will be evaluated against the row 

188 indices, returning True if the row should be skipped and False otherwise. 

189 An example of a valid callable argument would be ``lambda x: x in [0, 2]``. 

190skipfooter : int, default 0 

191 Number of lines at bottom of file to skip (Unsupported with engine='c'). 

192nrows : int, optional 

193 Number of rows of file to read. Useful for reading pieces of large files. 

194na_values : scalar, str, list-like, or dict, optional 

195 Additional strings to recognize as NA/NaN. If dict passed, specific 

196 per-column NA values. By default the following values are interpreted as 

197 NaN: '""" 

198 + fill("', '".join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" ") 

199 + """'. 

200keep_default_na : bool, default True 

201 Whether or not to include the default NaN values when parsing the data. 

202 Depending on whether `na_values` is passed in, the behavior is as follows: 

203 

204 * If `keep_default_na` is True, and `na_values` are specified, `na_values` 

205 is appended to the default NaN values used for parsing. 

206 * If `keep_default_na` is True, and `na_values` are not specified, only 

207 the default NaN values are used for parsing. 

208 * If `keep_default_na` is False, and `na_values` are specified, only 

209 the NaN values specified `na_values` are used for parsing. 

210 * If `keep_default_na` is False, and `na_values` are not specified, no 

211 strings will be parsed as NaN. 

212 

213 Note that if `na_filter` is passed in as False, the `keep_default_na` and 

214 `na_values` parameters will be ignored. 

215na_filter : bool, default True 

216 Detect missing value markers (empty strings and the value of na_values). In 

217 data without any NAs, passing na_filter=False can improve the performance 

218 of reading a large file. 

219verbose : bool, default False 

220 Indicate number of NA values placed in non-numeric columns. 

221skip_blank_lines : bool, default True 

222 If True, skip over blank lines rather than interpreting as NaN values. 

223parse_dates : bool or list of int or names or list of lists or dict, \ 

224default False 

225 The behavior is as follows: 

226 

227 * boolean. If True -> try parsing the index. 

228 * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 

229 each as a separate date column. 

230 * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as 

231 a single date column. 

232 * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call 

233 result 'foo' 

234 

235 If a column or index cannot be represented as an array of datetimes, 

236 say because of an unparsable value or a mixture of timezones, the column 

237 or index will be returned unaltered as an object data type. For 

238 non-standard datetime parsing, use ``pd.to_datetime`` after 

239 ``pd.read_csv``. 

240 

241 Note: A fast-path exists for iso8601-formatted dates. 

242infer_datetime_format : bool, default False 

243 If True and `parse_dates` is enabled, pandas will attempt to infer the 

244 format of the datetime strings in the columns, and if it can be inferred, 

245 switch to a faster method of parsing them. In some cases this can increase 

246 the parsing speed by 5-10x. 

247 

248 .. deprecated:: 2.0.0 

249 A strict version of this argument is now the default, passing it has no effect. 

250 

251keep_date_col : bool, default False 

252 If True and `parse_dates` specifies combining multiple columns then 

253 keep the original columns. 

254date_parser : function, optional 

255 Function to use for converting a sequence of string columns to an array of 

256 datetime instances. The default uses ``dateutil.parser.parser`` to do the 

257 conversion. Pandas will try to call `date_parser` in three different ways, 

258 advancing to the next if an exception occurs: 1) Pass one or more arrays 

259 (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the 

260 string values from the columns defined by `parse_dates` into a single array 

261 and pass that; and 3) call `date_parser` once for each row using one or 

262 more strings (corresponding to the columns defined by `parse_dates`) as 

263 arguments. 

264 

265 .. deprecated:: 2.0.0 

266 Use ``date_format`` instead, or read in as ``object`` and then apply 

267 :func:`to_datetime` as-needed. 

268date_format : str or dict of column -> format, default ``None`` 

269 If used in conjunction with ``parse_dates``, will parse dates according to this 

270 format. For anything more complex, 

271 please read in as ``object`` and then apply :func:`to_datetime` as-needed. 

272 

273 .. versionadded:: 2.0.0 

274dayfirst : bool, default False 

275 DD/MM format dates, international and European format. 

276cache_dates : bool, default True 

277 If True, use a cache of unique, converted dates to apply the datetime 

278 conversion. May produce significant speed-up when parsing duplicate 

279 date strings, especially ones with timezone offsets. 

280 

281iterator : bool, default False 

282 Return TextFileReader object for iteration or getting chunks with 

283 ``get_chunk()``. 

284 

285 .. versionchanged:: 1.2 

286 

287 ``TextFileReader`` is a context manager. 

288chunksize : int, optional 

289 Return TextFileReader object for iteration. 

290 See the `IO Tools docs 

291 <https://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking>`_ 

292 for more information on ``iterator`` and ``chunksize``. 

293 

294 .. versionchanged:: 1.2 

295 

296 ``TextFileReader`` is a context manager. 

297{decompression_options} 

298 

299 .. versionchanged:: 1.4.0 Zstandard support. 

300 

301thousands : str, optional 

302 Thousands separator. 

303decimal : str, default '.' 

304 Character to recognize as decimal point (e.g. use ',' for European data). 

305lineterminator : str (length 1), optional 

306 Character to break file into lines. Only valid with C parser. 

307quotechar : str (length 1), optional 

308 The character used to denote the start and end of a quoted item. Quoted 

309 items can include the delimiter and it will be ignored. 

310quoting : int or csv.QUOTE_* instance, default 0 

311 Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of 

312 QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3). 

313doublequote : bool, default ``True`` 

314 When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate 

315 whether or not to interpret two consecutive quotechar elements INSIDE a 

316 field as a single ``quotechar`` element. 

317escapechar : str (length 1), optional 

318 One-character string used to escape other characters. 

319comment : str, optional 

320 Indicates remainder of line should not be parsed. If found at the beginning 

321 of a line, the line will be ignored altogether. This parameter must be a 

322 single character. Like empty lines (as long as ``skip_blank_lines=True``), 

323 fully commented lines are ignored by the parameter `header` but not by 

324 `skiprows`. For example, if ``comment='#'``, parsing 

325 ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in 'a,b,c' being 

326 treated as the header. 

327encoding : str, optional, default "utf-8" 

328 Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python 

329 standard encodings 

330 <https://docs.python.org/3/library/codecs.html#standard-encodings>`_ . 

331 

332 .. versionchanged:: 1.2 

333 

334 When ``encoding`` is ``None``, ``errors="replace"`` is passed to 

335 ``open()``. Otherwise, ``errors="strict"`` is passed to ``open()``. 

336 This behavior was previously only the case for ``engine="python"``. 

337 

338 .. versionchanged:: 1.3.0 

339 

340 ``encoding_errors`` is a new argument. ``encoding`` has no longer an 

341 influence on how encoding errors are handled. 

342 

343encoding_errors : str, optional, default "strict" 

344 How encoding errors are treated. `List of possible values 

345 <https://docs.python.org/3/library/codecs.html#error-handlers>`_ . 

346 

347 .. versionadded:: 1.3.0 

348 

349dialect : str or csv.Dialect, optional 

350 If provided, this parameter will override values (default or not) for the 

351 following parameters: `delimiter`, `doublequote`, `escapechar`, 

352 `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to 

353 override values, a ParserWarning will be issued. See csv.Dialect 

354 documentation for more details. 

355on_bad_lines : {{'error', 'warn', 'skip'}} or callable, default 'error' 

356 Specifies what to do upon encountering a bad line (a line with too many fields). 

357 Allowed values are : 

358 

359 - 'error', raise an Exception when a bad line is encountered. 

360 - 'warn', raise a warning when a bad line is encountered and skip that line. 

361 - 'skip', skip bad lines without raising or warning when they are encountered. 

362 

363 .. versionadded:: 1.3.0 

364 

365 .. versionadded:: 1.4.0 

366 

367 - callable, function with signature 

368 ``(bad_line: list[str]) -> list[str] | None`` that will process a single 

369 bad line. ``bad_line`` is a list of strings split by the ``sep``. 

370 If the function returns ``None``, the bad line will be ignored. 

371 If the function returns a new list of strings with more elements than 

372 expected, a ``ParserWarning`` will be emitted while dropping extra elements. 

373 Only supported when ``engine="python"`` 

374 

375delim_whitespace : bool, default False 

376 Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be 

377 used as the sep. Equivalent to setting ``sep='\\s+'``. If this option 

378 is set to True, nothing should be passed in for the ``delimiter`` 

379 parameter. 

380low_memory : bool, default True 

381 Internally process the file in chunks, resulting in lower memory use 

382 while parsing, but possibly mixed type inference. To ensure no mixed 

383 types either set False, or specify the type with the `dtype` parameter. 

384 Note that the entire file is read into a single DataFrame regardless, 

385 use the `chunksize` or `iterator` parameter to return the data in chunks. 

386 (Only valid with C parser). 

387memory_map : bool, default False 

388 If a filepath is provided for `filepath_or_buffer`, map the file object 

389 directly onto memory and access the data directly from there. Using this 

390 option can improve performance because there is no longer any I/O overhead. 

391float_precision : str, optional 

392 Specifies which converter the C engine should use for floating-point 

393 values. The options are ``None`` or 'high' for the ordinary converter, 

394 'legacy' for the original lower precision pandas converter, and 

395 'round_trip' for the round-trip converter. 

396 

397 .. versionchanged:: 1.2 

398 

399{storage_options} 

400 

401 .. versionadded:: 1.2 

402 

403dtype_backend : {{"numpy_nullable", "pyarrow"}}, defaults to NumPy backed DataFrames 

404 Which dtype_backend to use, e.g. whether a DataFrame should have NumPy 

405 arrays, nullable dtypes are used for all dtypes that have a nullable 

406 implementation when "numpy_nullable" is set, pyarrow is used for all 

407 dtypes if "pyarrow" is set. 

408 

 409 The dtype_backends are still experimental. 

410 

411 .. versionadded:: 2.0 

412 

413Returns 

414------- 

415DataFrame or TextFileReader 

416 A comma-separated values (csv) file is returned as two-dimensional 

417 data structure with labeled axes. 

418 

419See Also 

420-------- 

421DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file. 

422read_csv : Read a comma-separated values (csv) file into DataFrame. 

423read_fwf : Read a table of fixed-width formatted lines into DataFrame. 

424 

425Examples 

426-------- 

427>>> pd.{func_name}('data.csv') # doctest: +SKIP 

428""" 

429) 

430 

431 

# Defaults for options that only the C engine implements.  Presumably merged
# into the keyword dict so non-C engines can detect when a caller changed a
# C-only option from its default — confirm against TextFileReader.
_c_parser_defaults = {
    "delim_whitespace": False,
    "na_filter": True,
    "low_memory": True,
    "memory_map": False,
    "float_precision": None,
}

# Defaults specific to read_fwf (fixed-width field parsing).
_fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None}

# Option names each engine cannot handle.
_c_unsupported = {"skipfooter"}
_python_unsupported = {"low_memory", "float_precision"}
# The pyarrow engine accepts the smallest option surface of the three engines.
_pyarrow_unsupported = {
    "skipfooter",
    "float_precision",
    "chunksize",
    "comment",
    "nrows",
    "thousands",
    "memory_map",
    "dialect",
    "on_bad_lines",
    "delim_whitespace",
    "quoting",
    "lineterminator",
    "converters",
    "iterator",
    "dayfirst",
    "verbose",
    "skipinitialspace",
    "low_memory",
}

464 

465 

class _DeprecationConfig(NamedTuple):
    """Default value and associated message for a deprecated keyword."""

    # Value used when the caller does not supply the deprecated keyword.
    default_value: Any
    # Message shown in the deprecation warning, if any.
    msg: str | None

469 

470 

# Typing overloads for validate_integer: ``None`` passes straight through,
# while any accepted numeric value is normalized to ``int``.
@overload
def validate_integer(name, val: None, min_val: int = ...) -> None:
    ...


@overload
def validate_integer(name, val: float, min_val: int = ...) -> int:
    ...


@overload
def validate_integer(name, val: int | None, min_val: int = ...) -> int | None:
    ...

484 

485 

def validate_integer(name, val: int | float | None, min_val: int = 0) -> int | None:
    """
    Checks whether the 'name' parameter for parsing is either
    an integer OR float that can SAFELY be cast to an integer
    without losing accuracy. Raises a ValueError if that is
    not the case.

    Parameters
    ----------
    name : str
        Parameter name (used for error reporting)
    val : int or float
        The value to check
    min_val : int
        Minimum allowed value (val < min_val will result in a ValueError)

    Returns
    -------
    int or None
        ``val`` cast to int, or None when ``val`` is None.

    Raises
    ------
    ValueError
        If val is a non-integral float, not an integer, or below min_val.
    """
    if val is None:
        return val

    msg = f"'{name:s}' must be an integer >={min_val:d}"
    if is_float(val):
        if int(val) != val:
            # Non-integral float (e.g. 2.5) cannot be cast safely.
            raise ValueError(msg)
        val = int(val)
        # BUG FIX: floats previously bypassed the lower-bound check, so e.g.
        # chunksize=-1.0 was accepted while chunksize=-1 raised.  Enforce the
        # same min_val constraint on the float path.
        if val < min_val:
            raise ValueError(msg)
    elif not (is_integer(val) and val >= min_val):
        raise ValueError(msg)

    return int(val)

514 

515 

516def _validate_names(names: Sequence[Hashable] | None) -> None: 

517 """ 

518 Raise ValueError if the `names` parameter contains duplicates or has an 

519 invalid data type. 

520 

521 Parameters 

522 ---------- 

523 names : array-like or None 

524 An array containing a list of the names used for the output DataFrame. 

525 

526 Raises 

527 ------ 

528 ValueError 

529 If names are not unique or are not ordered (e.g. set). 

530 """ 

531 if names is not None: 

532 if len(names) != len(set(names)): 

533 raise ValueError("Duplicate names are not allowed.") 

534 if not ( 

535 is_list_like(names, allow_sets=False) or isinstance(names, abc.KeysView) 

536 ): 

537 raise ValueError("Names should be an ordered collection.") 

538 

539 

def _read(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], kwds
) -> DataFrame | TextFileReader:
    """Generic reader of line files."""
    # if we pass a date_parser and parse_dates=False, we should not parse the
    # dates GH#44366
    if kwds.get("parse_dates", None) is None:
        if (
            kwds.get("date_parser", lib.no_default) is lib.no_default
            and kwds.get("date_format", None) is None
        ):
            # No parsing hints at all -> do not parse dates.
            kwds["parse_dates"] = False
        else:
            # A date_parser/date_format was supplied, so default to parsing.
            kwds["parse_dates"] = True

    # Extract some of the arguments (pass chunksize on).
    iterator = kwds.get("iterator", False)
    chunksize = kwds.get("chunksize", None)
    if kwds.get("engine") == "pyarrow":
        # pyarrow has no chunked/iterative access; reject both options.
        if iterator:
            raise ValueError(
                "The 'iterator' option is not supported with the 'pyarrow' engine"
            )

        if chunksize is not None:
            raise ValueError(
                "The 'chunksize' option is not supported with the 'pyarrow' engine"
            )
    else:
        # Normalize chunksize to a positive int (floats like 2.0 accepted).
        chunksize = validate_integer("chunksize", chunksize, 1)

    nrows = kwds.get("nrows", None)

    # Check for duplicates in names.
    _validate_names(kwds.get("names", None))

    # Create the parser.
    parser = TextFileReader(filepath_or_buffer, **kwds)

    if chunksize or iterator:
        # Lazy access requested: hand the open reader back to the caller,
        # who becomes responsible for closing it.
        return parser

    # Eager read: consume (up to nrows of) the file and close handles.
    with parser:
        return parser.read(nrows)

584 

585 

# iterator=True -> TextFileReader
# (an explicit ``iterator=True`` alone is enough for a lazy reader,
# regardless of chunksize)
@overload
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] | None = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    date_format: str | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: Literal[True],
    chunksize: int | None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: Literal["high", "legacy"] | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> TextFileReader:
    ...

641 

642 

# chunksize=int -> TextFileReader
# (any explicit integer chunksize forces a lazy reader, even if iterator
# is left at its default)
@overload
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] | None = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    date_format: str | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: bool = ...,
    chunksize: int,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: Literal["high", "legacy"] | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> TextFileReader:
    ...

698 

699 

# default case -> DataFrame
# (no iteration requested: iterator is False and chunksize is None, so the
# whole file is read eagerly)
@overload
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] | None = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    date_format: str | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: Literal[False] = ...,
    chunksize: None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: Literal["high", "legacy"] | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> DataFrame:
    ...

755 

756 

# Unions -> DataFrame | TextFileReader
# (catch-all overload for calls where iterator/chunksize are not statically
# known, so the return type cannot be narrowed)
@overload
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] | None = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    date_format: str | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: bool = ...,
    chunksize: int | None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: Literal["high", "legacy"] | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> DataFrame | TextFileReader:
    ...

812 

813 

# The public docstring is assembled by @Appender from the shared
# _doc_read_csv_and_table template, parameterized for read_csv.
@Appender(
    _doc_read_csv_and_table.format(
        func_name="read_csv",
        summary="Read a comma-separated values (csv) file into DataFrame.",
        _default_sep="','",
        storage_options=_shared_docs["storage_options"],
        decompression_options=_shared_docs["decompression_options"]
        % "filepath_or_buffer",
    )
)
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = lib.no_default,
    delimiter: str | None | lib.NoDefault = None,
    # Column and Index Locations and Names
    header: int | Sequence[int] | None | Literal["infer"] = "infer",
    names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default,
    index_col: IndexLabel | Literal[False] | None = None,
    usecols=None,
    # General Parsing Configuration
    dtype: DtypeArg | None = None,
    engine: CSVEngine | None = None,
    converters=None,
    true_values=None,
    false_values=None,
    skipinitialspace: bool = False,
    skiprows=None,
    skipfooter: int = 0,
    nrows: int | None = None,
    # NA and Missing Data Handling
    na_values=None,
    keep_default_na: bool = True,
    na_filter: bool = True,
    verbose: bool = False,
    skip_blank_lines: bool = True,
    # Datetime Handling
    parse_dates: bool | Sequence[Hashable] | None = None,
    infer_datetime_format: bool | lib.NoDefault = lib.no_default,
    keep_date_col: bool = False,
    date_parser=lib.no_default,
    date_format: str | None = None,
    dayfirst: bool = False,
    cache_dates: bool = True,
    # Iteration
    iterator: bool = False,
    chunksize: int | None = None,
    # Quoting, Compression, and File Format
    compression: CompressionOptions = "infer",
    thousands: str | None = None,
    decimal: str = ".",
    lineterminator: str | None = None,
    quotechar: str = '"',
    quoting: int = csv.QUOTE_MINIMAL,
    doublequote: bool = True,
    escapechar: str | None = None,
    comment: str | None = None,
    encoding: str | None = None,
    encoding_errors: str | None = "strict",
    dialect: str | csv.Dialect | None = None,
    # Error Handling
    on_bad_lines: str = "error",
    # Internal
    delim_whitespace: bool = False,
    low_memory=_c_parser_defaults["low_memory"],
    memory_map: bool = False,
    float_precision: Literal["high", "legacy"] | None = None,
    storage_options: StorageOptions = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> DataFrame | TextFileReader:
    # 'infer_datetime_format' is deprecated: a strict version of the
    # inference is now always on, so passing any value only warns.
    if infer_datetime_format is not lib.no_default:
        warnings.warn(
            "The argument 'infer_datetime_format' is deprecated and will "
            "be removed in a future version. "
            "A strict version of it is now the default, see "
            "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. "
            "You can safely remove this argument.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    # locals() should never be modified
    # NOTE: every keyword parameter is forwarded to the parser via this
    # locals() snapshot, so no other local name may be bound before here.
    kwds = locals().copy()
    del kwds["filepath_or_buffer"]
    del kwds["sep"]

    # Resolve the interactions between sep/delimiter/dialect/engine and
    # normalize them into the final parser keyword set (',' is the default).
    kwds_defaults = _refine_defaults_read(
        dialect,
        delimiter,
        delim_whitespace,
        engine,
        sep,
        on_bad_lines,
        names,
        defaults={"delimiter": ","},
        dtype_backend=dtype_backend,
    )
    kwds.update(kwds_defaults)

    return _read(filepath_or_buffer, kwds)

913 

914 

915# iterator=True -> TextFileReader 

# Overload: iterator=True always yields a TextFileReader.
@overload
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    date_format: str | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: Literal[True],
    chunksize: int | None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: str | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> TextFileReader:
    # Overload stub only; the implementation follows the final overload.
    ...

970 

971 

972# chunksize=int -> TextFileReader 

# Overload: an explicit int chunksize always yields a TextFileReader.
@overload
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    date_format: str | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: bool = ...,
    chunksize: int,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: str | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> TextFileReader:
    # Overload stub only; the implementation follows the final overload.
    ...

1027 

1028 

1029# default -> DataFrame 

# Overload: default call (no iteration requested) yields a DataFrame.
@overload
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    date_format: str | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: Literal[False] = ...,
    chunksize: None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: str | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> DataFrame:
    # Overload stub only; the implementation follows the final overload.
    ...

1084 

1085 

1086# Unions -> DataFrame | TextFileReader 

# Catch-all overload: when ``iterator``/``chunksize`` are not statically
# known, the return type is the union ``DataFrame | TextFileReader``.
@overload
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    date_format: str | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: bool = ...,
    chunksize: int | None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: str | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> DataFrame | TextFileReader:
    # Overload stub only; the implementation follows the final overload.
    ...

1141 

1142 

# The public docstring is assembled by @Appender from the shared
# _doc_read_csv_and_table template, parameterized for read_table.
@Appender(
    _doc_read_csv_and_table.format(
        func_name="read_table",
        summary="Read general delimited file into DataFrame.",
        _default_sep=r"'\\t' (tab-stop)",
        storage_options=_shared_docs["storage_options"],
        decompression_options=_shared_docs["decompression_options"]
        % "filepath_or_buffer",
    )
)
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = lib.no_default,
    delimiter: str | None | lib.NoDefault = None,
    # Column and Index Locations and Names
    header: int | Sequence[int] | None | Literal["infer"] = "infer",
    names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default,
    index_col: IndexLabel | Literal[False] | None = None,
    usecols=None,
    # General Parsing Configuration
    dtype: DtypeArg | None = None,
    engine: CSVEngine | None = None,
    converters=None,
    true_values=None,
    false_values=None,
    skipinitialspace: bool = False,
    skiprows=None,
    skipfooter: int = 0,
    nrows: int | None = None,
    # NA and Missing Data Handling
    na_values=None,
    keep_default_na: bool = True,
    na_filter: bool = True,
    verbose: bool = False,
    skip_blank_lines: bool = True,
    # Datetime Handling
    parse_dates: bool | Sequence[Hashable] = False,
    infer_datetime_format: bool | lib.NoDefault = lib.no_default,
    keep_date_col: bool = False,
    date_parser=lib.no_default,
    date_format: str | None = None,
    dayfirst: bool = False,
    cache_dates: bool = True,
    # Iteration
    iterator: bool = False,
    chunksize: int | None = None,
    # Quoting, Compression, and File Format
    compression: CompressionOptions = "infer",
    thousands: str | None = None,
    decimal: str = ".",
    lineterminator: str | None = None,
    quotechar: str = '"',
    quoting: int = csv.QUOTE_MINIMAL,
    doublequote: bool = True,
    escapechar: str | None = None,
    comment: str | None = None,
    encoding: str | None = None,
    encoding_errors: str | None = "strict",
    dialect: str | csv.Dialect | None = None,
    # Error Handling
    on_bad_lines: str = "error",
    # Internal
    delim_whitespace: bool = False,
    low_memory=_c_parser_defaults["low_memory"],
    memory_map: bool = False,
    float_precision: str | None = None,
    storage_options: StorageOptions = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> DataFrame | TextFileReader:
    # 'infer_datetime_format' is deprecated: a strict version of the
    # inference is now always on, so passing any value only warns.
    if infer_datetime_format is not lib.no_default:
        warnings.warn(
            "The argument 'infer_datetime_format' is deprecated and will "
            "be removed in a future version. "
            "A strict version of it is now the default, see "
            "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. "
            "You can safely remove this argument.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

    # locals() should never be modified
    # NOTE: every keyword parameter is forwarded to the parser via this
    # locals() snapshot, so no other local name may be bound before here.
    kwds = locals().copy()
    del kwds["filepath_or_buffer"]
    del kwds["sep"]

    # Resolve the interactions between sep/delimiter/dialect/engine and
    # normalize them into the final parser keyword set ('\t' is the default).
    kwds_defaults = _refine_defaults_read(
        dialect,
        delimiter,
        delim_whitespace,
        engine,
        sep,
        on_bad_lines,
        names,
        defaults={"delimiter": "\t"},
        dtype_backend=dtype_backend,
    )
    kwds.update(kwds_defaults)

    return _read(filepath_or_buffer, kwds)

1243 

1244 

def read_fwf(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    colspecs: Sequence[tuple[int, int]] | str | None = "infer",
    widths: Sequence[int] | None = None,
    infer_nrows: int = 100,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
    **kwds,
) -> DataFrame | TextFileReader:
    r"""
    Read a table of fixed-width formatted lines into DataFrame.

    Also supports optionally iterating or breaking of the file
    into chunks.

    Additional help can be found in the `online docs for IO Tools
    <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.

    Parameters
    ----------
    filepath_or_buffer : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a text ``read()`` function. The string could be a URL.
        Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.csv``.
    colspecs : list of tuple (int, int) or 'infer', optional
        A list of tuples giving the extents of the fixed-width
        fields of each line as half-open intervals (i.e., [from, to[ ).
        String value 'infer' can be used to instruct the parser to try
        detecting the column specifications from the first 100 rows of
        the data which are not being skipped via skiprows (default='infer').
    widths : list of int, optional
        A list of field widths which can be used instead of 'colspecs' if
        the intervals are contiguous.
    infer_nrows : int, default 100
        The number of rows to consider when letting the parser determine the
        `colspecs`.
    dtype_backend : {"numpy_nullable", "pyarrow"}, defaults to NumPy backed DataFrames
        Which dtype_backend to use, e.g. whether a DataFrame should have NumPy
        arrays, nullable dtypes are used for all dtypes that have a nullable
        implementation when "numpy_nullable" is set, pyarrow is used for all
        dtypes if "pyarrow" is set.

        The dtype_backends are still experimental.

        .. versionadded:: 2.0

    **kwds : optional
        Optional keyword arguments can be passed to ``TextFileReader``.

    Returns
    -------
    DataFrame or TextFileReader
        A fixed-width file is returned as two-dimensional
        data structure with labeled axes.

    See Also
    --------
    DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
    read_csv : Read a comma-separated values (csv) file into DataFrame.

    Examples
    --------
    >>> pd.read_fwf('data.csv')  # doctest: +SKIP
    """
    # Check input arguments: exactly one of 'colspecs'/'widths' must be
    # supplied (the default colspecs='infer' may coexist with widths).
    if colspecs is None and widths is None:
        raise ValueError("Must specify either colspecs or widths")
    if colspecs not in (None, "infer") and widths is not None:
        raise ValueError("You must specify only one of 'widths' and 'colspecs'")

    # Compute 'colspecs' from 'widths', if specified: contiguous half-open
    # intervals starting at column 0.
    if widths is not None:
        colspecs, col = [], 0
        for w in widths:
            colspecs.append((col, col + w))
            col += w

    # for mypy
    assert colspecs is not None

    # GH#40830
    # Ensure length of `colspecs` matches length of `names`
    names = kwds.get("names")
    if names is not None:
        if len(names) != len(colspecs) and colspecs != "infer":
            # need to check len(index_col) as it might contain
            # unnamed indices, in which case its name is not required
            len_index = 0
            if kwds.get("index_col") is not None:
                index_col: Any = kwds.get("index_col")
                if index_col is not False:
                    if not is_list_like(index_col):
                        len_index = 1
                    else:
                        len_index = len(index_col)
            if kwds.get("usecols") is None and len(names) + len_index != len(colspecs):
                # If usecols is used colspec may be longer than names
                raise ValueError("Length of colspecs must match length of names")

    # Forward everything to the shared reader with the fixed-width engine.
    kwds["colspecs"] = colspecs
    kwds["infer_nrows"] = infer_nrows
    kwds["engine"] = "python-fwf"

    check_dtype_backend(dtype_backend)
    kwds["dtype_backend"] = dtype_backend
    return _read(filepath_or_buffer, kwds)

1353 

1354 

class TextFileReader(abc.Iterator):
    """
    Iterator over chunks of a parsed delimited/fixed-width file.

    Passed dialect overrides any of the related parser options.
    """

    def __init__(
        self,
        f: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str] | list,
        engine: CSVEngine | None = None,
        **kwds,
    ) -> None:
        # Remember whether the caller explicitly chose an engine: an
        # explicit choice turns later silent fallbacks into hard errors.
        if engine is not None:
            engine_specified = True
        else:
            engine = "python"
            engine_specified = False
        self.engine = engine
        self._engine_specified = kwds.get("engine_specified", engine_specified)

        _validate_skipfooter(kwds)

        dialect = _extract_dialect(kwds)
        if dialect is not None:
            if engine == "pyarrow":
                raise ValueError(
                    "The 'dialect' option is not supported with the 'pyarrow' engine"
                )
            # Dialect attributes override the corresponding keyword options.
            kwds = _merge_with_dialect_properties(dialect, kwds)

        # header='infer' means: first row is the header unless explicit
        # column names were supplied.
        if kwds.get("header", "infer") == "infer":
            kwds["header"] = 0 if kwds.get("names") is None else None

        self.orig_options = kwds

        # miscellanea
        self._currow = 0

        options = self._get_options_with_defaults(engine)
        options["storage_options"] = kwds.get("storage_options", None)

        self.chunksize = options.pop("chunksize", None)
        self.nrows = options.pop("nrows", None)

        self._check_file_or_buffer(f, engine)
        # _clean_options may fall back to a different engine.
        self.options, self.engine = self._clean_options(options, engine)

        if "has_index_names" in kwds:
            self.options["has_index_names"] = kwds["has_index_names"]

        self.handles: IOHandles | None = None
        self._engine = self._make_engine(f, self.engine)

    def close(self) -> None:
        """Close any file handles we opened, then the underlying engine."""
        if self.handles is not None:
            self.handles.close()
        self._engine.close()

    def _get_options_with_defaults(self, engine: CSVEngine) -> dict[str, Any]:
        """
        Merge user-supplied options with per-engine defaults, raising for
        options the selected engine does not support.
        """
        kwds = self.orig_options

        options = {}
        default: object | None

        for argname, default in parser_defaults.items():
            value = kwds.get(argname, default)

            # see gh-12935
            if (
                engine == "pyarrow"
                and argname in _pyarrow_unsupported
                and value != default
                and value != getattr(value, "value", default)
            ):
                raise ValueError(
                    f"The {repr(argname)} option is not supported with the "
                    f"'pyarrow' engine"
                )
            options[argname] = value

        for argname, default in _c_parser_defaults.items():
            if argname in kwds:
                value = kwds[argname]

                if engine != "c" and value != default:
                    # c-engine-only options are tolerated for the python
                    # engine only when it happens to support them too.
                    if "python" in engine and argname not in _python_unsupported:
                        pass
                    else:
                        raise ValueError(
                            f"The {repr(argname)} option is not supported with the "
                            f"{repr(engine)} engine"
                        )
            else:
                value = default
            options[argname] = value

        if engine == "python-fwf":
            for argname, default in _fwf_defaults.items():
                options[argname] = kwds.get(argname, default)

        return options

    def _check_file_or_buffer(self, f, engine: CSVEngine) -> None:
        # see gh-16530
        if is_file_like(f) and engine != "c" and not hasattr(f, "__iter__"):
            # The C engine doesn't need the file-like to have the "__iter__"
            # attribute. However, the Python engine needs "__iter__(...)"
            # when iterating through such an object, meaning it
            # needs to have that attribute
            raise ValueError(
                "The 'python' engine cannot iterate through this file buffer."
            )

    def _clean_options(
        self, options: dict[str, Any], engine: CSVEngine
    ) -> tuple[dict[str, Any], CSVEngine]:
        """
        Validate options against the requested engine, falling back to the
        'python' engine (or raising, if the engine was explicitly chosen)
        for unsupported combinations. Returns the cleaned options and the
        engine actually selected.
        """
        result = options.copy()

        fallback_reason = None

        # C engine not supported yet
        if engine == "c":
            if options["skipfooter"] > 0:
                fallback_reason = "the 'c' engine does not support skipfooter"
                engine = "python"

        sep = options["delimiter"]
        delim_whitespace = options["delim_whitespace"]

        if sep is None and not delim_whitespace:
            if engine in ("c", "pyarrow"):
                fallback_reason = (
                    f"the '{engine}' engine does not support "
                    "sep=None with delim_whitespace=False"
                )
                engine = "python"
        elif sep is not None and len(sep) > 1:
            if engine == "c" and sep == r"\s+":
                # The c engine handles whitespace-splitting natively.
                result["delim_whitespace"] = True
                del result["delimiter"]
            elif engine not in ("python", "python-fwf"):
                # wait until regex engine integrated
                fallback_reason = (
                    f"the '{engine}' engine does not support "
                    "regex separators (separators > 1 char and "
                    r"different from '\s+' are interpreted as regex)"
                )
                engine = "python"
        elif delim_whitespace:
            if "python" in engine:
                result["delimiter"] = r"\s+"
        elif sep is not None:
            # Multi-byte (when encoded) single-char separators are only
            # supported by the python engines.
            encodeable = True
            encoding = sys.getfilesystemencoding() or "utf-8"
            try:
                if len(sep.encode(encoding)) > 1:
                    encodeable = False
            # NOTE(review): str.encode raises UnicodeEncodeError, not
            # UnicodeDecodeError — confirm this except clause is reachable.
            except UnicodeDecodeError:
                encodeable = False
            if not encodeable and engine not in ("python", "python-fwf"):
                fallback_reason = (
                    f"the separator encoded in {encoding} "
                    f"is > 1 char long, and the '{engine}' engine "
                    "does not support such separators"
                )
                engine = "python"

        quotechar = options["quotechar"]
        if quotechar is not None and isinstance(quotechar, (str, bytes)):
            if (
                len(quotechar) == 1
                and ord(quotechar) > 127
                and engine not in ("python", "python-fwf")
            ):
                fallback_reason = (
                    "ord(quotechar) > 127, meaning the "
                    "quotechar is larger than one byte, "
                    f"and the '{engine}' engine does not support such quotechars"
                )
                engine = "python"

        # An explicitly requested engine is never silently swapped out.
        if fallback_reason and self._engine_specified:
            raise ValueError(fallback_reason)

        if engine == "c":
            for arg in _c_unsupported:
                del result[arg]

        if "python" in engine:
            for arg in _python_unsupported:
                if fallback_reason and result[arg] != _c_parser_defaults[arg]:
                    raise ValueError(
                        "Falling back to the 'python' engine because "
                        f"{fallback_reason}, but this causes {repr(arg)} to be "
                        "ignored as it is not supported by the 'python' engine."
                    )
                del result[arg]

        if fallback_reason:
            warnings.warn(
                (
                    "Falling back to the 'python' engine because "
                    f"{fallback_reason}; you can avoid this warning by specifying "
                    "engine='python'."
                ),
                ParserWarning,
                stacklevel=find_stack_level(),
            )

        index_col = options["index_col"]
        names = options["names"]
        converters = options["converters"]
        na_values = options["na_values"]
        skiprows = options["skiprows"]

        validate_header_arg(options["header"])

        if index_col is True:
            raise ValueError("The value of index_col couldn't be 'True'")
        if is_index_col(index_col):
            # Normalize a scalar index_col to a one-element list.
            if not isinstance(index_col, (list, tuple, np.ndarray)):
                index_col = [index_col]
        result["index_col"] = index_col

        names = list(names) if names is not None else names

        # type conversion-related
        if converters is not None:
            if not isinstance(converters, dict):
                raise TypeError(
                    "Type converters must be a dict or subclass, "
                    f"input was a {type(converters).__name__}"
                )
        else:
            converters = {}

        # Converting values to NA
        keep_default_na = options["keep_default_na"]
        na_values, na_fvalues = _clean_na_values(na_values, keep_default_na)

        # handle skiprows; this is internally handled by the
        # c-engine, so only need for python and pyarrow parsers
        if engine == "pyarrow":
            if not is_integer(skiprows) and skiprows is not None:
                # pyarrow expects skiprows to be passed as an integer
                raise ValueError(
                    "skiprows argument must be an integer when using "
                    "engine='pyarrow'"
                )
        else:
            if is_integer(skiprows):
                skiprows = list(range(skiprows))
            if skiprows is None:
                skiprows = set()
            elif not callable(skiprows):
                skiprows = set(skiprows)

        # put stuff back
        result["names"] = names
        result["converters"] = converters
        result["na_values"] = na_values
        result["na_fvalues"] = na_fvalues
        result["skiprows"] = skiprows

        return result, engine

    def __next__(self) -> DataFrame:
        # Close resources eagerly when iteration is exhausted.
        try:
            return self.get_chunk()
        except StopIteration:
            self.close()
            raise

    def _make_engine(
        self,
        f: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str] | list | IO,
        engine: CSVEngine = "c",
    ) -> ParserBase:
        """
        Open the input (unless it is an in-memory list) and instantiate the
        parser class for the selected engine. Handles opened here are
        recorded on ``self.handles`` and closed on failure.
        """
        mapping: dict[str, type[ParserBase]] = {
            "c": CParserWrapper,
            "python": PythonParser,
            "pyarrow": ArrowParserWrapper,
            "python-fwf": FixedWidthFieldParser,
        }
        if engine not in mapping:
            raise ValueError(
                f"Unknown engine: {engine} (valid options are {mapping.keys()})"
            )
        if not isinstance(f, list):
            # open file here
            is_text = True
            mode = "r"
            if engine == "pyarrow":
                is_text = False
                mode = "rb"
            elif (
                engine == "c"
                and self.options.get("encoding", "utf-8") == "utf-8"
                and isinstance(stringify_path(f), str)
            ):
                # c engine can decode utf-8 bytes, adding TextIOWrapper makes
                # the c-engine especially for memory_map=True far slower
                is_text = False
                if "b" not in mode:
                    mode += "b"
            self.handles = get_handle(
                f,
                mode,
                encoding=self.options.get("encoding", None),
                compression=self.options.get("compression", None),
                memory_map=self.options.get("memory_map", False),
                is_text=is_text,
                errors=self.options.get("encoding_errors", "strict"),
                storage_options=self.options.get("storage_options", None),
            )
            assert self.handles is not None
            f = self.handles.handle

        elif engine != "python":
            msg = f"Invalid file path or buffer object type: {type(f)}"
            raise ValueError(msg)

        try:
            return mapping[engine](f, **self.options)
        except Exception:
            if self.handles is not None:
                self.handles.close()
            raise

    def _failover_to_python(self) -> None:
        raise AbstractMethodError(self)

    def read(self, nrows: int | None = None) -> DataFrame:
        """Read up to ``nrows`` rows (all remaining rows if ``None``)."""
        if self.engine == "pyarrow":
            try:
                # error: "ParserBase" has no attribute "read"
                df = self._engine.read()  # type: ignore[attr-defined]
            except Exception:
                self.close()
                raise
        else:
            nrows = validate_integer("nrows", nrows)
            try:
                # error: "ParserBase" has no attribute "read"
                (
                    index,
                    columns,
                    col_dict,
                ) = self._engine.read(  # type: ignore[attr-defined]
                    nrows
                )
            except Exception:
                self.close()
                raise

            if index is None:
                if col_dict:
                    # Any column is actually fine:
                    new_rows = len(next(iter(col_dict.values())))
                    index = RangeIndex(self._currow, self._currow + new_rows)
                else:
                    new_rows = 0
            else:
                new_rows = len(index)

            df = DataFrame(col_dict, columns=columns, index=index)

            # Track the current position for RangeIndex continuation and
            # for honoring a caller-supplied overall nrows limit.
            self._currow += new_rows
        return df

    def get_chunk(self, size: int | None = None) -> DataFrame:
        """Read the next chunk (``size`` rows; defaults to ``chunksize``)."""
        if size is None:
            size = self.chunksize
            if self.nrows is not None:
                if self._currow >= self.nrows:
                    raise StopIteration
                # Never read past the overall nrows limit.
                size = min(size, self.nrows - self._currow)
        return self.read(nrows=size)

    def __enter__(self) -> TextFileReader:
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        traceback: TracebackType | None,
    ) -> None:
        self.close()

1745 

1746 

def TextParser(*args, **kwds) -> TextFileReader:
    """
    Converts lists of lists/tuples into DataFrames with proper type inference
    and optional (e.g. string to datetime) conversion. Also enables iterating
    lazily over chunks of large files

    Parameters
    ----------
    data : file-like object or list
    delimiter : separator character to use
    dialect : str or csv.Dialect instance, optional
        Ignored if delimiter is longer than 1 character
    names : sequence, default
    header : int, default 0
        Row to use to parse column labels. Defaults to the first row. Prior
        rows will be discarded
    index_col : int or list, optional
        Column or columns to use as the (possibly hierarchical) index
    has_index_names: bool, default False
        True if the cols defined in index_col have an index name and are
        not in the header.
    na_values : scalar, str, list-like, or dict, optional
        Additional strings to recognize as NA/NaN.
    keep_default_na : bool, default True
    thousands : str, optional
        Thousands separator
    comment : str, optional
        Comment out remainder of line
    parse_dates : bool, default False
    keep_date_col : bool, default False
    date_parser : function, optional

        .. deprecated:: 2.0.0
    date_format : str or dict of column -> format, default ``None``

        .. versionadded:: 2.0.0
    skiprows : list of integers
        Row numbers to skip
    skipfooter : int
        Number of line at bottom of file to skip
    converters : dict, optional
        Dict of functions for converting values in certain columns. Keys can
        either be integers or column labels, values are functions that take one
        input argument, the cell (not column) content, and return the
        transformed content.
    encoding : str, optional
        Encoding to use for UTF when reading/writing (ex. 'utf-8')
    float_precision : str, optional
        Specifies which converter the C engine should use for floating-point
        values. The options are `None` or `high` for the ordinary converter,
        `legacy` for the original lower precision pandas converter, and
        `round_trip` for the round-trip converter.

        .. versionchanged:: 1.2
    """
    # Force the python engine: TextParser exists for pre-tokenized
    # (list-of-lists) input, which the C parser cannot consume.
    return TextFileReader(*args, **{**kwds, "engine": "python"})

1804 

1805 

def _clean_na_values(na_values, keep_default_na: bool = True):
    """
    Normalize user-supplied ``na_values`` into string and float sentinel sets.

    Parameters
    ----------
    na_values : scalar, str, list-like, or dict, optional
        Additional values to recognize as NA/NaN. A dict maps column names
        to per-column NA values.
    keep_default_na : bool, default True
        Whether the default ``STR_NA_VALUES`` are included as well.

    Returns
    -------
    tuple
        ``(na_values, na_fvalues)`` where ``na_fvalues`` holds the float
        equivalents (per column when a dict was passed).
    """
    na_fvalues: set | dict
    if na_values is None:
        na_values = STR_NA_VALUES if keep_default_na else set()
        na_fvalues = set()
    elif isinstance(na_values, dict):
        # Build a fresh dict (never alias the caller's) whose values are
        # list-likes, optionally unioned with the default NA strings.
        cleaned: dict = {}
        for col, vals in na_values.items():
            if not is_list_like(vals):
                vals = [vals]
            if keep_default_na:
                vals = set(vals) | STR_NA_VALUES
            cleaned[col] = vals
        na_values = cleaned
        na_fvalues = {col: _floatify_na_values(vals) for col, vals in cleaned.items()}
    else:
        if not is_list_like(na_values):
            na_values = [na_values]
        na_values = _stringify_na_values(na_values)
        if keep_default_na:
            na_values = na_values | STR_NA_VALUES
        na_fvalues = _floatify_na_values(na_values)

    return na_values, na_fvalues

1841 

1842 

1843def _floatify_na_values(na_values): 

1844 # create float versions of the na_values 

1845 result = set() 

1846 for v in na_values: 

1847 try: 

1848 v = float(v) 

1849 if not np.isnan(v): 

1850 result.add(v) 

1851 except (TypeError, ValueError, OverflowError): 

1852 pass 

1853 return result 

1854 

1855 

1856def _stringify_na_values(na_values): 

1857 """return a stringified and numeric for these values""" 

1858 result: list[str | float] = [] 

1859 for x in na_values: 

1860 result.append(str(x)) 

1861 result.append(x) 

1862 try: 

1863 v = float(x) 

1864 

1865 # we are like 999 here 

1866 if v == int(v): 

1867 v = int(v) 

1868 result.append(f"{v}.0") 

1869 result.append(str(v)) 

1870 

1871 result.append(v) 

1872 except (TypeError, ValueError, OverflowError): 

1873 pass 

1874 try: 

1875 result.append(int(x)) 

1876 except (TypeError, ValueError, OverflowError): 

1877 pass 

1878 return set(result) 

1879 

1880 

1881def _refine_defaults_read( 

1882 dialect: str | csv.Dialect | None, 

1883 delimiter: str | None | lib.NoDefault, 

1884 delim_whitespace: bool, 

1885 engine: CSVEngine | None, 

1886 sep: str | None | lib.NoDefault, 

1887 on_bad_lines: str | Callable, 

1888 names: Sequence[Hashable] | None | lib.NoDefault, 

1889 defaults: dict[str, Any], 

1890 dtype_backend: DtypeBackend | lib.NoDefault, 

1891): 

1892 """Validate/refine default values of input parameters of read_csv, read_table. 

1893 

1894 Parameters 

1895 ---------- 

1896 dialect : str or csv.Dialect 

1897 If provided, this parameter will override values (default or not) for the 

1898 following parameters: `delimiter`, `doublequote`, `escapechar`, 

1899 `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to 

1900 override values, a ParserWarning will be issued. See csv.Dialect 

1901 documentation for more details. 

1902 delimiter : str or object 

1903 Alias for sep. 

1904 delim_whitespace : bool 

1905 Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be 

1906 used as the sep. Equivalent to setting ``sep='\\s+'``. If this option 

1907 is set to True, nothing should be passed in for the ``delimiter`` 

1908 parameter. 

1909 engine : {{'c', 'python'}} 

1910 Parser engine to use. The C engine is faster while the python engine is 

1911 currently more feature-complete. 

1912 sep : str or object 

1913 A delimiter provided by the user (str) or a sentinel value, i.e. 

1914 pandas._libs.lib.no_default. 

1915 on_bad_lines : str, callable 

1916 An option for handling bad lines or a sentinel value(None). 

1917 names : array-like, optional 

1918 List of column names to use. If the file contains a header row, 

1919 then you should explicitly pass ``header=0`` to override the column names. 

1920 Duplicates in this list are not allowed. 

1921 defaults: dict 

1922 Default values of input parameters. 

1923 

1924 Returns 

1925 ------- 

1926 kwds : dict 

1927 Input parameters with correct values. 

1928 

1929 Raises 

1930 ------ 

1931 ValueError : 

1932 If a delimiter was specified with ``sep`` (or ``delimiter``) and 

1933 ``delim_whitespace=True``. 

1934 """ 

1935 # fix types for sep, delimiter to Union(str, Any) 

1936 delim_default = defaults["delimiter"] 

1937 kwds: dict[str, Any] = {} 

1938 # gh-23761 

1939 # 

1940 # When a dialect is passed, it overrides any of the overlapping 

1941 # parameters passed in directly. We don't want to warn if the 

1942 # default parameters were passed in (since it probably means 

1943 # that the user didn't pass them in explicitly in the first place). 

1944 # 

1945 # "delimiter" is the annoying corner case because we alias it to 

1946 # "sep" before doing comparison to the dialect values later on. 

1947 # Thus, we need a flag to indicate that we need to "override" 

1948 # the comparison to dialect values by checking if default values 

1949 # for BOTH "delimiter" and "sep" were provided. 

1950 if dialect is not None: 

1951 kwds["sep_override"] = delimiter is None and ( 

1952 sep is lib.no_default or sep == delim_default 

1953 ) 

1954 

1955 if delimiter and (sep is not lib.no_default): 

1956 raise ValueError("Specified a sep and a delimiter; you can only specify one.") 

1957 

1958 kwds["names"] = None if names is lib.no_default else names 

1959 

1960 # Alias sep -> delimiter. 

1961 if delimiter is None: 

1962 delimiter = sep 

1963 

1964 if delim_whitespace and (delimiter is not lib.no_default): 

1965 raise ValueError( 

1966 "Specified a delimiter with both sep and " 

1967 "delim_whitespace=True; you can only specify one." 

1968 ) 

1969 

1970 if delimiter == "\n": 

1971 raise ValueError( 

1972 r"Specified \n as separator or delimiter. This forces the python engine " 

1973 "which does not accept a line terminator. Hence it is not allowed to use " 

1974 "the line terminator as separator.", 

1975 ) 

1976 

1977 if delimiter is lib.no_default: 

1978 # assign default separator value 

1979 kwds["delimiter"] = delim_default 

1980 else: 

1981 kwds["delimiter"] = delimiter 

1982 

1983 if engine is not None: 

1984 kwds["engine_specified"] = True 

1985 else: 

1986 kwds["engine"] = "c" 

1987 kwds["engine_specified"] = False 

1988 

1989 if on_bad_lines == "error": 

1990 kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.ERROR 

1991 elif on_bad_lines == "warn": 

1992 kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.WARN 

1993 elif on_bad_lines == "skip": 

1994 kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.SKIP 

1995 elif callable(on_bad_lines): 

1996 if engine != "python": 

1997 raise ValueError( 

1998 "on_bad_line can only be a callable function if engine='python'" 

1999 ) 

2000 kwds["on_bad_lines"] = on_bad_lines 

2001 else: 

2002 raise ValueError(f"Argument {on_bad_lines} is invalid for on_bad_lines") 

2003 

2004 check_dtype_backend(dtype_backend) 

2005 

2006 kwds["dtype_backend"] = dtype_backend 

2007 

2008 return kwds 

2009 

2010 

2011def _extract_dialect(kwds: dict[str, Any]) -> csv.Dialect | None: 

2012 """ 

2013 Extract concrete csv dialect instance. 

2014 

2015 Returns 

2016 ------- 

2017 csv.Dialect or None 

2018 """ 

2019 if kwds.get("dialect") is None: 

2020 return None 

2021 

2022 dialect = kwds["dialect"] 

2023 if dialect in csv.list_dialects(): 

2024 dialect = csv.get_dialect(dialect) 

2025 

2026 _validate_dialect(dialect) 

2027 

2028 return dialect 

2029 

2030 

2031MANDATORY_DIALECT_ATTRS = ( 

2032 "delimiter", 

2033 "doublequote", 

2034 "escapechar", 

2035 "skipinitialspace", 

2036 "quotechar", 

2037 "quoting", 

2038) 

2039 

2040 

2041def _validate_dialect(dialect: csv.Dialect) -> None: 

2042 """ 

2043 Validate csv dialect instance. 

2044 

2045 Raises 

2046 ------ 

2047 ValueError 

2048 If incorrect dialect is provided. 

2049 """ 

2050 for param in MANDATORY_DIALECT_ATTRS: 

2051 if not hasattr(dialect, param): 

2052 raise ValueError(f"Invalid dialect {dialect} provided") 

2053 

2054 

2055def _merge_with_dialect_properties( 

2056 dialect: csv.Dialect, 

2057 defaults: dict[str, Any], 

2058) -> dict[str, Any]: 

2059 """ 

2060 Merge default kwargs in TextFileReader with dialect parameters. 

2061 

2062 Parameters 

2063 ---------- 

2064 dialect : csv.Dialect 

2065 Concrete csv dialect. See csv.Dialect documentation for more details. 

2066 defaults : dict 

2067 Keyword arguments passed to TextFileReader. 

2068 

2069 Returns 

2070 ------- 

2071 kwds : dict 

2072 Updated keyword arguments, merged with dialect parameters. 

2073 """ 

2074 kwds = defaults.copy() 

2075 

2076 for param in MANDATORY_DIALECT_ATTRS: 

2077 dialect_val = getattr(dialect, param) 

2078 

2079 parser_default = parser_defaults[param] 

2080 provided = kwds.get(param, parser_default) 

2081 

2082 # Messages for conflicting values between the dialect 

2083 # instance and the actual parameters provided. 

2084 conflict_msgs = [] 

2085 

2086 # Don't warn if the default parameter was passed in, 

2087 # even if it conflicts with the dialect (gh-23761). 

2088 if provided not in (parser_default, dialect_val): 

2089 msg = ( 

2090 f"Conflicting values for '{param}': '{provided}' was " 

2091 f"provided, but the dialect specifies '{dialect_val}'. " 

2092 "Using the dialect-specified value." 

2093 ) 

2094 

2095 # Annoying corner case for not warning about 

2096 # conflicts between dialect and delimiter parameter. 

2097 # Refer to the outer "_read_" function for more info. 

2098 if not (param == "delimiter" and kwds.pop("sep_override", False)): 

2099 conflict_msgs.append(msg) 

2100 

2101 if conflict_msgs: 

2102 warnings.warn( 

2103 "\n\n".join(conflict_msgs), ParserWarning, stacklevel=find_stack_level() 

2104 ) 

2105 kwds[param] = dialect_val 

2106 return kwds 

2107 

2108 

2109def _validate_skipfooter(kwds: dict[str, Any]) -> None: 

2110 """ 

2111 Check whether skipfooter is compatible with other kwargs in TextFileReader. 

2112 

2113 Parameters 

2114 ---------- 

2115 kwds : dict 

2116 Keyword arguments passed to TextFileReader. 

2117 

2118 Raises 

2119 ------ 

2120 ValueError 

2121 If skipfooter is not compatible with other parameters. 

2122 """ 

2123 if kwds.get("skipfooter"): 

2124 if kwds.get("iterator") or kwds.get("chunksize"): 

2125 raise ValueError("'skipfooter' not supported for iteration") 

2126 if kwds.get("nrows"): 

2127 raise ValueError("'skipfooter' not supported with 'nrows'")