Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/io/parsers/readers.py: 18%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

486 statements  

1""" 

2Module contains tools for processing files into DataFrames or other objects 

3 

4GH#48849 provides a convenient way of deprecating keyword arguments 

5""" 

6from __future__ import annotations 

7 

8from collections import abc 

9import csv 

10import sys 

11from textwrap import fill 

12from types import TracebackType 

13from typing import ( 

14 IO, 

15 Any, 

16 Callable, 

17 Hashable, 

18 Literal, 

19 NamedTuple, 

20 Sequence, 

21 overload, 

22) 

23import warnings 

24 

25import numpy as np 

26 

27from pandas._libs import lib 

28from pandas._libs.parsers import STR_NA_VALUES 

29from pandas._typing import ( 

30 CompressionOptions, 

31 CSVEngine, 

32 DtypeArg, 

33 DtypeBackend, 

34 FilePath, 

35 IndexLabel, 

36 ReadCsvBuffer, 

37 StorageOptions, 

38) 

39from pandas.errors import ( 

40 AbstractMethodError, 

41 ParserWarning, 

42) 

43from pandas.util._decorators import Appender 

44from pandas.util._exceptions import find_stack_level 

45from pandas.util._validators import check_dtype_backend 

46 

47from pandas.core.dtypes.common import ( 

48 is_file_like, 

49 is_float, 

50 is_integer, 

51 is_list_like, 

52) 

53 

54from pandas.core.frame import DataFrame 

55from pandas.core.indexes.api import RangeIndex 

56from pandas.core.shared_docs import _shared_docs 

57 

58from pandas.io.common import ( 

59 IOHandles, 

60 get_handle, 

61 stringify_path, 

62 validate_header_arg, 

63) 

64from pandas.io.parsers.arrow_parser_wrapper import ArrowParserWrapper 

65from pandas.io.parsers.base_parser import ( 

66 ParserBase, 

67 is_index_col, 

68 parser_defaults, 

69) 

70from pandas.io.parsers.c_parser_wrapper import CParserWrapper 

71from pandas.io.parsers.python_parser import ( 

72 FixedWidthFieldParser, 

73 PythonParser, 

74) 

75 

76_doc_read_csv_and_table = ( 

77 r""" 

78{summary} 

79 

80Also supports optionally iterating or breaking of the file 

81into chunks. 

82 

83Additional help can be found in the online docs for 

84`IO Tools <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_. 

85 

86Parameters 

87---------- 

88filepath_or_buffer : str, path object or file-like object 

89 Any valid string path is acceptable. The string could be a URL. Valid 

90 URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is 

91 expected. A local file could be: file://localhost/path/to/table.csv. 

92 

93 If you want to pass in a path object, pandas accepts any ``os.PathLike``. 

94 

95 By file-like object, we refer to objects with a ``read()`` method, such as 

96 a file handle (e.g. via builtin ``open`` function) or ``StringIO``. 

97sep : str, default {_default_sep} 

98 Delimiter to use. If sep is None, the C engine cannot automatically detect 

99 the separator, but the Python parsing engine can, meaning the latter will 

100 be used and automatically detect the separator by Python's builtin sniffer 

101 tool, ``csv.Sniffer``. In addition, separators longer than 1 character and 

102 different from ``'\s+'`` will be interpreted as regular expressions and 

103 will also force the use of the Python parsing engine. Note that regex 

104 delimiters are prone to ignoring quoted data. Regex example: ``'\r\t'``. 

105delimiter : str, default ``None`` 

106 Alias for sep. 

107header : int, list of int, None, default 'infer' 

108 Row number(s) to use as the column names, and the start of the 

109 data. Default behavior is to infer the column names: if no names 

110 are passed the behavior is identical to ``header=0`` and column 

111 names are inferred from the first line of the file, if column 

112 names are passed explicitly then the behavior is identical to 

113 ``header=None``. Explicitly pass ``header=0`` to be able to 

114 replace existing names. The header can be a list of integers that 

115 specify row locations for a multi-index on the columns 

116 e.g. [0,1,3]. Intervening rows that are not specified will be 

117 skipped (e.g. 2 in this example is skipped). Note that this 

118 parameter ignores commented lines and empty lines if 

119 ``skip_blank_lines=True``, so ``header=0`` denotes the first line of 

120 data rather than the first line of the file. 

121names : array-like, optional 

122 List of column names to use. If the file contains a header row, 

123 then you should explicitly pass ``header=0`` to override the column names. 

124 Duplicates in this list are not allowed. 

125index_col : int, str, sequence of int / str, or False, optional, default ``None`` 

126 Column(s) to use as the row labels of the ``DataFrame``, either given as 

127 string name or column index. If a sequence of int / str is given, a 

128 MultiIndex is used. 

129 

130 Note: ``index_col=False`` can be used to force pandas to *not* use the first 

131 column as the index, e.g. when you have a malformed file with delimiters at 

132 the end of each line. 

133usecols : list-like or callable, optional 

134 Return a subset of the columns. If list-like, all elements must either 

135 be positional (i.e. integer indices into the document columns) or strings 

136 that correspond to column names provided either by the user in `names` or 

137 inferred from the document header row(s). If ``names`` are given, the document 

138 header row(s) are not taken into account. For example, a valid list-like 

139 `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``. 

140 Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. 

141 To instantiate a DataFrame from ``data`` with element order preserved use 

142 ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` for columns 

143 in ``['foo', 'bar']`` order or 

144 ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]`` 

145 for ``['bar', 'foo']`` order. 

146 

147 If callable, the callable function will be evaluated against the column 

148 names, returning names where the callable function evaluates to True. An 

149 example of a valid callable argument would be ``lambda x: x.upper() in 

150 ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster 

151 parsing time and lower memory usage. 

152dtype : Type name or dict of column -> type, optional 

153 Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32, 

154 'c': 'Int64'}} 

155 Use `str` or `object` together with suitable `na_values` settings 

156 to preserve and not interpret dtype. 

157 If converters are specified, they will be applied INSTEAD 

158 of dtype conversion. 

159 

160 .. versionadded:: 1.5.0 

161 

162 Support for defaultdict was added. Specify a defaultdict as input where 

163 the default determines the dtype of the columns which are not explicitly 

164 listed. 

165engine : {{'c', 'python', 'pyarrow'}}, optional 

166 Parser engine to use. The C and pyarrow engines are faster, while the python engine 

167 is currently more feature-complete. Multithreading is currently only supported by 

168 the pyarrow engine. 

169 

170 .. versionadded:: 1.4.0 

171 

172 The "pyarrow" engine was added as an *experimental* engine, and some features 

173 are unsupported, or may not work correctly, with this engine. 

174converters : dict, optional 

175 Dict of functions for converting values in certain columns. Keys can either 

176 be integers or column labels. 

177true_values : list, optional 

178 Values to consider as True in addition to case-insensitive variants of "True". 

179false_values : list, optional 

180 Values to consider as False in addition to case-insensitive variants of "False". 

181skipinitialspace : bool, default False 

182 Skip spaces after delimiter. 

183skiprows : list-like, int or callable, optional 

184 Line numbers to skip (0-indexed) or number of lines to skip (int) 

185 at the start of the file. 

186 

187 If callable, the callable function will be evaluated against the row 

188 indices, returning True if the row should be skipped and False otherwise. 

189 An example of a valid callable argument would be ``lambda x: x in [0, 2]``. 

190skipfooter : int, default 0 

191 Number of lines at bottom of file to skip (Unsupported with engine='c'). 

192nrows : int, optional 

193 Number of rows of file to read. Useful for reading pieces of large files. 

194na_values : scalar, str, list-like, or dict, optional 

195 Additional strings to recognize as NA/NaN. If dict passed, specific 

196 per-column NA values. By default the following values are interpreted as 

197 NaN: '""" 

198 + fill("', '".join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" ") 

199 + """'. 

200keep_default_na : bool, default True 

201 Whether or not to include the default NaN values when parsing the data. 

202 Depending on whether `na_values` is passed in, the behavior is as follows: 

203 

204 * If `keep_default_na` is True, and `na_values` are specified, `na_values` 

205 is appended to the default NaN values used for parsing. 

206 * If `keep_default_na` is True, and `na_values` are not specified, only 

207 the default NaN values are used for parsing. 

208 * If `keep_default_na` is False, and `na_values` are specified, only 

209 the NaN values specified `na_values` are used for parsing. 

210 * If `keep_default_na` is False, and `na_values` are not specified, no 

211 strings will be parsed as NaN. 

212 

213 Note that if `na_filter` is passed in as False, the `keep_default_na` and 

214 `na_values` parameters will be ignored. 

215na_filter : bool, default True 

216 Detect missing value markers (empty strings and the value of na_values). In 

217 data without any NAs, passing na_filter=False can improve the performance 

218 of reading a large file. 

219verbose : bool, default False 

220 Indicate number of NA values placed in non-numeric columns. 

221skip_blank_lines : bool, default True 

222 If True, skip over blank lines rather than interpreting as NaN values. 

223parse_dates : bool or list of int or names or list of lists or dict, \ 

224default False 

225 The behavior is as follows: 

226 

227 * boolean. If True -> try parsing the index. 

228 * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 

229 each as a separate date column. 

230 * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as 

231 a single date column. 

232 * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call 

233 result 'foo' 

234 

235 If a column or index cannot be represented as an array of datetimes, 

236 say because of an unparsable value or a mixture of timezones, the column 

237 or index will be returned unaltered as an object data type. For 

238 non-standard datetime parsing, use ``pd.to_datetime`` after 

239 ``pd.read_csv``. 

240 

241 Note: A fast-path exists for iso8601-formatted dates. 

242infer_datetime_format : bool, default False 

243 If True and `parse_dates` is enabled, pandas will attempt to infer the 

244 format of the datetime strings in the columns, and if it can be inferred, 

245 switch to a faster method of parsing them. In some cases this can increase 

246 the parsing speed by 5-10x. 

247 

248 .. deprecated:: 2.0.0 

249 A strict version of this argument is now the default, passing it has no effect. 

250 

251keep_date_col : bool, default False 

252 If True and `parse_dates` specifies combining multiple columns then 

253 keep the original columns. 

254date_parser : function, optional 

255 Function to use for converting a sequence of string columns to an array of 

256 datetime instances. The default uses ``dateutil.parser.parser`` to do the 

257 conversion. Pandas will try to call `date_parser` in three different ways, 

258 advancing to the next if an exception occurs: 1) Pass one or more arrays 

259 (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the 

260 string values from the columns defined by `parse_dates` into a single array 

261 and pass that; and 3) call `date_parser` once for each row using one or 

262 more strings (corresponding to the columns defined by `parse_dates`) as 

263 arguments. 

264 

265 .. deprecated:: 2.0.0 

266 Use ``date_format`` instead, or read in as ``object`` and then apply 

267 :func:`to_datetime` as-needed. 

268date_format : str or dict of column -> format, default ``None`` 

269 If used in conjunction with ``parse_dates``, will parse dates according to this 

270 format. For anything more complex, 

271 please read in as ``object`` and then apply :func:`to_datetime` as-needed. 

272 

273 .. versionadded:: 2.0.0 

274dayfirst : bool, default False 

275 DD/MM format dates, international and European format. 

276cache_dates : bool, default True 

277 If True, use a cache of unique, converted dates to apply the datetime 

278 conversion. May produce significant speed-up when parsing duplicate 

279 date strings, especially ones with timezone offsets. 

280 

281iterator : bool, default False 

282 Return TextFileReader object for iteration or getting chunks with 

283 ``get_chunk()``. 

284 

285 .. versionchanged:: 1.2 

286 

287 ``TextFileReader`` is a context manager. 

288chunksize : int, optional 

289 Return TextFileReader object for iteration. 

290 See the `IO Tools docs 

291 <https://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking>`_ 

292 for more information on ``iterator`` and ``chunksize``. 

293 

294 .. versionchanged:: 1.2 

295 

296 ``TextFileReader`` is a context manager. 

297{decompression_options} 

298 

299 .. versionchanged:: 1.4.0 Zstandard support. 

300 

301thousands : str, optional 

302 Thousands separator. 

303decimal : str, default '.' 

304 Character to recognize as decimal point (e.g. use ',' for European data). 

305lineterminator : str (length 1), optional 

306 Character to break file into lines. Only valid with C parser. 

307quotechar : str (length 1), optional 

308 The character used to denote the start and end of a quoted item. Quoted 

309 items can include the delimiter and it will be ignored. 

310quoting : int or csv.QUOTE_* instance, default 0 

311 Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of 

312 QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3). 

313doublequote : bool, default ``True`` 

314 When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate 

315 whether or not to interpret two consecutive quotechar elements INSIDE a 

316 field as a single ``quotechar`` element. 

317escapechar : str (length 1), optional 

318 One-character string used to escape other characters. 

319comment : str, optional 

320 Indicates remainder of line should not be parsed. If found at the beginning 

321 of a line, the line will be ignored altogether. This parameter must be a 

322 single character. Like empty lines (as long as ``skip_blank_lines=True``), 

323 fully commented lines are ignored by the parameter `header` but not by 

324 `skiprows`. For example, if ``comment='#'``, parsing 

325 ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in 'a,b,c' being 

326 treated as the header. 

327encoding : str, optional, default "utf-8" 

328 Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python 

329 standard encodings 

330 <https://docs.python.org/3/library/codecs.html#standard-encodings>`_ . 

331 

332 .. versionchanged:: 1.2 

333 

334 When ``encoding`` is ``None``, ``errors="replace"`` is passed to 

335 ``open()``. Otherwise, ``errors="strict"`` is passed to ``open()``. 

336 This behavior was previously only the case for ``engine="python"``. 

337 

338 .. versionchanged:: 1.3.0 

339 

340 ``encoding_errors`` is a new argument. ``encoding`` has no longer an 

341 influence on how encoding errors are handled. 

342 

343encoding_errors : str, optional, default "strict" 

344 How encoding errors are treated. `List of possible values 

345 <https://docs.python.org/3/library/codecs.html#error-handlers>`_ . 

346 

347 .. versionadded:: 1.3.0 

348 

349dialect : str or csv.Dialect, optional 

350 If provided, this parameter will override values (default or not) for the 

351 following parameters: `delimiter`, `doublequote`, `escapechar`, 

352 `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to 

353 override values, a ParserWarning will be issued. See csv.Dialect 

354 documentation for more details. 

355on_bad_lines : {{'error', 'warn', 'skip'}} or callable, default 'error' 

356 Specifies what to do upon encountering a bad line (a line with too many fields). 

357 Allowed values are : 

358 

359 - 'error', raise an Exception when a bad line is encountered. 

360 - 'warn', raise a warning when a bad line is encountered and skip that line. 

361 - 'skip', skip bad lines without raising or warning when they are encountered. 

362 

363 .. versionadded:: 1.3.0 

364 

365 .. versionadded:: 1.4.0 

366 

367 - callable, function with signature 

368 ``(bad_line: list[str]) -> list[str] | None`` that will process a single 

369 bad line. ``bad_line`` is a list of strings split by the ``sep``. 

370 If the function returns ``None``, the bad line will be ignored. 

371 If the function returns a new list of strings with more elements than 

372 expected, a ``ParserWarning`` will be emitted while dropping extra elements. 

373 Only supported when ``engine="python"`` 

374 

375delim_whitespace : bool, default False 

376 Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be 

377 used as the sep. Equivalent to setting ``sep='\\s+'``. If this option 

378 is set to True, nothing should be passed in for the ``delimiter`` 

379 parameter. 

380low_memory : bool, default True 

381 Internally process the file in chunks, resulting in lower memory use 

382 while parsing, but possibly mixed type inference. To ensure no mixed 

383 types either set False, or specify the type with the `dtype` parameter. 

384 Note that the entire file is read into a single DataFrame regardless, 

385 use the `chunksize` or `iterator` parameter to return the data in chunks. 

386 (Only valid with C parser). 

387memory_map : bool, default False 

388 If a filepath is provided for `filepath_or_buffer`, map the file object 

389 directly onto memory and access the data directly from there. Using this 

390 option can improve performance because there is no longer any I/O overhead. 

391float_precision : str, optional 

392 Specifies which converter the C engine should use for floating-point 

393 values. The options are ``None`` or 'high' for the ordinary converter, 

394 'legacy' for the original lower precision pandas converter, and 

395 'round_trip' for the round-trip converter. 

396 

397 .. versionchanged:: 1.2 

398 

399{storage_options} 

400 

401 .. versionadded:: 1.2 

402 

403dtype_backend : {{"numpy_nullable", "pyarrow"}}, defaults to NumPy backed DataFrames 

404 Which dtype_backend to use, e.g. whether a DataFrame should have NumPy 

405 arrays, nullable dtypes are used for all dtypes that have a nullable 

406 implementation when "numpy_nullable" is set, pyarrow is used for all 

407 dtypes if "pyarrow" is set. 

408 

 409 The dtype_backends are still experimental. 

410 

411 .. versionadded:: 2.0 

412 

413Returns 

414------- 

415DataFrame or TextFileReader 

416 A comma-separated values (csv) file is returned as two-dimensional 

417 data structure with labeled axes. 

418 

419See Also 

420-------- 

421DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file. 

422read_csv : Read a comma-separated values (csv) file into DataFrame. 

423read_fwf : Read a table of fixed-width formatted lines into DataFrame. 

424 

425Examples 

426-------- 

427>>> pd.{func_name}('data.csv') # doctest: +SKIP 

428""" 

429) 

430 

431 

# Defaults for options that only the C engine implements.  Presumably merged
# into the keyword dict so non-C engines can detect when a caller changed a
# C-only option from its default — confirm against TextFileReader.
_c_parser_defaults = {
    "delim_whitespace": False,
    "na_filter": True,
    "low_memory": True,
    "memory_map": False,
    "float_precision": None,
}

# Defaults specific to read_fwf (fixed-width field parsing).
_fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None}

# Option names each engine cannot handle.
_c_unsupported = {"skipfooter"}
_python_unsupported = {"low_memory", "float_precision"}
# The pyarrow engine accepts the smallest option surface of the three engines.
_pyarrow_unsupported = {
    "skipfooter",
    "float_precision",
    "chunksize",
    "comment",
    "nrows",
    "thousands",
    "memory_map",
    "dialect",
    "on_bad_lines",
    "delim_whitespace",
    "quoting",
    "lineterminator",
    "converters",
    "iterator",
    "dayfirst",
    "verbose",
    "skipinitialspace",
    "low_memory",
}

464 

465 

class _DeprecationConfig(NamedTuple):
    """Default value and associated message for a deprecated keyword."""

    # Value used when the caller does not supply the deprecated keyword.
    default_value: Any
    # Message shown in the deprecation warning, if any.
    msg: str | None

469 

470 

# Typing overloads for validate_integer: ``None`` passes straight through,
# while any accepted numeric value is normalized to ``int``.
@overload
def validate_integer(name, val: None, min_val: int = ...) -> None:
    ...


@overload
def validate_integer(name, val: float, min_val: int = ...) -> int:
    ...


@overload
def validate_integer(name, val: int | None, min_val: int = ...) -> int | None:
    ...

484 

485 

def validate_integer(name, val: int | float | None, min_val: int = 0) -> int | None:
    """
    Checks whether the 'name' parameter for parsing is either
    an integer OR float that can SAFELY be cast to an integer
    without losing accuracy. Raises a ValueError if that is
    not the case.

    Parameters
    ----------
    name : str
        Parameter name (used for error reporting)
    val : int or float
        The value to check
    min_val : int
        Minimum allowed value (val < min_val will result in a ValueError)

    Returns
    -------
    int or None
        ``val`` cast to int, or None when ``val`` is None.

    Raises
    ------
    ValueError
        If val is a non-integral float, not an integer, or below min_val.
    """
    if val is None:
        return val

    msg = f"'{name:s}' must be an integer >={min_val:d}"
    if is_float(val):
        if int(val) != val:
            # Non-integral float (e.g. 2.5) cannot be cast safely.
            raise ValueError(msg)
        val = int(val)
        # BUG FIX: floats previously bypassed the lower-bound check, so e.g.
        # chunksize=-1.0 was accepted while chunksize=-1 raised.  Enforce the
        # same min_val constraint on the float path.
        if val < min_val:
            raise ValueError(msg)
    elif not (is_integer(val) and val >= min_val):
        raise ValueError(msg)

    return int(val)

514 

515 

516def _validate_names(names: Sequence[Hashable] | None) -> None: 

517 """ 

518 Raise ValueError if the `names` parameter contains duplicates or has an 

519 invalid data type. 

520 

521 Parameters 

522 ---------- 

523 names : array-like or None 

524 An array containing a list of the names used for the output DataFrame. 

525 

526 Raises 

527 ------ 

528 ValueError 

529 If names are not unique or are not ordered (e.g. set). 

530 """ 

531 if names is not None: 

532 if len(names) != len(set(names)): 

533 raise ValueError("Duplicate names are not allowed.") 

534 if not ( 

535 is_list_like(names, allow_sets=False) or isinstance(names, abc.KeysView) 

536 ): 

537 raise ValueError("Names should be an ordered collection.") 

538 

539 

def _read(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], kwds
) -> DataFrame | TextFileReader:
    """Generic reader of line files."""
    # if we pass a date_parser and parse_dates=False, we should not parse the
    # dates GH#44366
    if kwds.get("parse_dates", None) is None:
        if (
            kwds.get("date_parser", lib.no_default) is lib.no_default
            and kwds.get("date_format", None) is None
        ):
            # No parsing hints at all -> do not parse dates.
            kwds["parse_dates"] = False
        else:
            # A date_parser/date_format was supplied, so default to parsing.
            kwds["parse_dates"] = True

    # Extract some of the arguments (pass chunksize on).
    iterator = kwds.get("iterator", False)
    chunksize = kwds.get("chunksize", None)
    if kwds.get("engine") == "pyarrow":
        # pyarrow has no chunked/iterative access; reject both options.
        if iterator:
            raise ValueError(
                "The 'iterator' option is not supported with the 'pyarrow' engine"
            )

        if chunksize is not None:
            raise ValueError(
                "The 'chunksize' option is not supported with the 'pyarrow' engine"
            )
    else:
        # Normalize chunksize to a positive int (floats like 2.0 accepted).
        chunksize = validate_integer("chunksize", chunksize, 1)

    nrows = kwds.get("nrows", None)

    # Check for duplicates in names.
    _validate_names(kwds.get("names", None))

    # Create the parser.
    parser = TextFileReader(filepath_or_buffer, **kwds)

    if chunksize or iterator:
        # Lazy access requested: hand the open reader back to the caller,
        # who becomes responsible for closing it.
        return parser

    # Eager read: consume (up to nrows of) the file and close handles.
    with parser:
        return parser.read(nrows)

584 

585 

# iterator=True -> TextFileReader
# (an explicit ``iterator=True`` alone is enough for a lazy reader,
# regardless of chunksize)
@overload
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] | None = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    date_format: str | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: Literal[True],
    chunksize: int | None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: Literal["high", "legacy"] | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> TextFileReader:
    ...

641 

642 

# chunksize=int -> TextFileReader
# (any explicit integer chunksize forces a lazy reader, even if iterator
# is left at its default)
@overload
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] | None = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    date_format: str | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: bool = ...,
    chunksize: int,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: Literal["high", "legacy"] | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> TextFileReader:
    ...

698 

699 

# default case -> DataFrame
# (no iteration requested: iterator is False and chunksize is None, so the
# whole file is read eagerly)
@overload
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] | None = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    date_format: str | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: Literal[False] = ...,
    chunksize: None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: Literal["high", "legacy"] | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> DataFrame:
    ...

755 

756 

# Unions -> DataFrame | TextFileReader
# (catch-all overload for calls where iterator/chunksize are not statically
# known, so the return type cannot be narrowed)
@overload
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] | None = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    date_format: str | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: bool = ...,
    chunksize: int | None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: Literal["high", "legacy"] | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> DataFrame | TextFileReader:
    ...

812 

813 

# The public docstring is assembled by @Appender from the shared
# _doc_read_csv_and_table template, parameterized for read_csv.
@Appender(
    _doc_read_csv_and_table.format(
        func_name="read_csv",
        summary="Read a comma-separated values (csv) file into DataFrame.",
        _default_sep="','",
        storage_options=_shared_docs["storage_options"],
        decompression_options=_shared_docs["decompression_options"]
        % "filepath_or_buffer",
    )
)
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = lib.no_default,
    delimiter: str | None | lib.NoDefault = None,
    # Column and Index Locations and Names
    header: int | Sequence[int] | None | Literal["infer"] = "infer",
    names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default,
    index_col: IndexLabel | Literal[False] | None = None,
    usecols=None,
    # General Parsing Configuration
    dtype: DtypeArg | None = None,
    engine: CSVEngine | None = None,
    converters=None,
    true_values=None,
    false_values=None,
    skipinitialspace: bool = False,
    skiprows=None,
    skipfooter: int = 0,
    nrows: int | None = None,
    # NA and Missing Data Handling
    na_values=None,
    keep_default_na: bool = True,
    na_filter: bool = True,
    verbose: bool = False,
    skip_blank_lines: bool = True,
    # Datetime Handling
    parse_dates: bool | Sequence[Hashable] | None = None,
    infer_datetime_format: bool | lib.NoDefault = lib.no_default,
    keep_date_col: bool = False,
    date_parser=lib.no_default,
    date_format: str | None = None,
    dayfirst: bool = False,
    cache_dates: bool = True,
    # Iteration
    iterator: bool = False,
    chunksize: int | None = None,
    # Quoting, Compression, and File Format
    compression: CompressionOptions = "infer",
    thousands: str | None = None,
    decimal: str = ".",
    lineterminator: str | None = None,
    quotechar: str = '"',
    quoting: int = csv.QUOTE_MINIMAL,
    doublequote: bool = True,
    escapechar: str | None = None,
    comment: str | None = None,
    encoding: str | None = None,
    encoding_errors: str | None = "strict",
    dialect: str | csv.Dialect | None = None,
    # Error Handling
    on_bad_lines: str = "error",
    # Internal
    delim_whitespace: bool = False,
    low_memory=_c_parser_defaults["low_memory"],
    memory_map: bool = False,
    float_precision: Literal["high", "legacy"] | None = None,
    storage_options: StorageOptions = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> DataFrame | TextFileReader:
    # 'infer_datetime_format' is deprecated: a strict version of the
    # inference is now always on, so passing any value only warns.
    if infer_datetime_format is not lib.no_default:
        warnings.warn(
            "The argument 'infer_datetime_format' is deprecated and will "
            "be removed in a future version. "
            "A strict version of it is now the default, see "
            "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. "
            "You can safely remove this argument.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    # locals() should never be modified
    # NOTE: every keyword parameter is forwarded to the parser via this
    # locals() snapshot, so no other local name may be bound before here.
    kwds = locals().copy()
    del kwds["filepath_or_buffer"]
    del kwds["sep"]

    # Resolve the interactions between sep/delimiter/dialect/engine and
    # normalize them into the final parser keyword set (',' is the default).
    kwds_defaults = _refine_defaults_read(
        dialect,
        delimiter,
        delim_whitespace,
        engine,
        sep,
        on_bad_lines,
        names,
        defaults={"delimiter": ","},
        dtype_backend=dtype_backend,
    )
    kwds.update(kwds_defaults)

    return _read(filepath_or_buffer, kwds)

913 

914 

915# iterator=True -> TextFileReader 

# Overload: iterator=True always yields a TextFileReader.
@overload
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    date_format: str | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: Literal[True],
    chunksize: int | None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: str | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> TextFileReader:
    # Overload stub only; the implementation follows the final overload.
    ...

970 

971 

972# chunksize=int -> TextFileReader 

# Overload: an explicit int chunksize always yields a TextFileReader.
@overload
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    date_format: str | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: bool = ...,
    chunksize: int,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: str | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> TextFileReader:
    # Overload stub only; the implementation follows the final overload.
    ...

1027 

1028 

1029# default -> DataFrame 

# Overload: default call (no iteration requested) yields a DataFrame.
@overload
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    date_format: str | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: Literal[False] = ...,
    chunksize: None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: str | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> DataFrame:
    # Overload stub only; the implementation follows the final overload.
    ...

1084 

1085 

1086# Unions -> DataFrame | TextFileReader 

# Catch-all overload: when ``iterator``/``chunksize`` are not statically
# known, the return type is the union ``DataFrame | TextFileReader``.
@overload
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    date_format: str | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: bool = ...,
    chunksize: int | None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: str | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> DataFrame | TextFileReader:
    # Overload stub only; the implementation follows the final overload.
    ...

1141 

1142 

# The public docstring is assembled by @Appender from the shared
# _doc_read_csv_and_table template, parameterized for read_table.
@Appender(
    _doc_read_csv_and_table.format(
        func_name="read_table",
        summary="Read general delimited file into DataFrame.",
        _default_sep=r"'\\t' (tab-stop)",
        storage_options=_shared_docs["storage_options"],
        decompression_options=_shared_docs["decompression_options"]
        % "filepath_or_buffer",
    )
)
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = lib.no_default,
    delimiter: str | None | lib.NoDefault = None,
    # Column and Index Locations and Names
    header: int | Sequence[int] | None | Literal["infer"] = "infer",
    names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default,
    index_col: IndexLabel | Literal[False] | None = None,
    usecols=None,
    # General Parsing Configuration
    dtype: DtypeArg | None = None,
    engine: CSVEngine | None = None,
    converters=None,
    true_values=None,
    false_values=None,
    skipinitialspace: bool = False,
    skiprows=None,
    skipfooter: int = 0,
    nrows: int | None = None,
    # NA and Missing Data Handling
    na_values=None,
    keep_default_na: bool = True,
    na_filter: bool = True,
    verbose: bool = False,
    skip_blank_lines: bool = True,
    # Datetime Handling
    parse_dates: bool | Sequence[Hashable] = False,
    infer_datetime_format: bool | lib.NoDefault = lib.no_default,
    keep_date_col: bool = False,
    date_parser=lib.no_default,
    date_format: str | None = None,
    dayfirst: bool = False,
    cache_dates: bool = True,
    # Iteration
    iterator: bool = False,
    chunksize: int | None = None,
    # Quoting, Compression, and File Format
    compression: CompressionOptions = "infer",
    thousands: str | None = None,
    decimal: str = ".",
    lineterminator: str | None = None,
    quotechar: str = '"',
    quoting: int = csv.QUOTE_MINIMAL,
    doublequote: bool = True,
    escapechar: str | None = None,
    comment: str | None = None,
    encoding: str | None = None,
    encoding_errors: str | None = "strict",
    dialect: str | csv.Dialect | None = None,
    # Error Handling
    on_bad_lines: str = "error",
    # Internal
    delim_whitespace: bool = False,
    low_memory=_c_parser_defaults["low_memory"],
    memory_map: bool = False,
    float_precision: str | None = None,
    storage_options: StorageOptions = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> DataFrame | TextFileReader:
    # 'infer_datetime_format' is deprecated: a strict version of the
    # inference is now always on, so passing any value only warns.
    if infer_datetime_format is not lib.no_default:
        warnings.warn(
            "The argument 'infer_datetime_format' is deprecated and will "
            "be removed in a future version. "
            "A strict version of it is now the default, see "
            "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. "
            "You can safely remove this argument.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

    # locals() should never be modified
    # NOTE: every keyword parameter is forwarded to the parser via this
    # locals() snapshot, so no other local name may be bound before here.
    kwds = locals().copy()
    del kwds["filepath_or_buffer"]
    del kwds["sep"]

    # Resolve the interactions between sep/delimiter/dialect/engine and
    # normalize them into the final parser keyword set ('\t' is the default).
    kwds_defaults = _refine_defaults_read(
        dialect,
        delimiter,
        delim_whitespace,
        engine,
        sep,
        on_bad_lines,
        names,
        defaults={"delimiter": "\t"},
        dtype_backend=dtype_backend,
    )
    kwds.update(kwds_defaults)

    return _read(filepath_or_buffer, kwds)

1243 

1244 

def read_fwf(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    colspecs: Sequence[tuple[int, int]] | str | None = "infer",
    widths: Sequence[int] | None = None,
    infer_nrows: int = 100,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
    **kwds,
) -> DataFrame | TextFileReader:
    r"""
    Read a table of fixed-width formatted lines into DataFrame.

    Also supports optionally iterating or breaking of the file
    into chunks.

    Additional help can be found in the `online docs for IO Tools
    <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.

    Parameters
    ----------
    filepath_or_buffer : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a text ``read()`` function. The string could be a URL.
        Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.csv``.
    colspecs : list of tuple (int, int) or 'infer', optional
        A list of tuples giving the extents of the fixed-width
        fields of each line as half-open intervals (i.e., [from, to[ ).
        String value 'infer' can be used to instruct the parser to try
        detecting the column specifications from the first 100 rows of
        the data which are not being skipped via skiprows (default='infer').
    widths : list of int, optional
        A list of field widths which can be used instead of 'colspecs' if
        the intervals are contiguous.
    infer_nrows : int, default 100
        The number of rows to consider when letting the parser determine the
        `colspecs`.
    dtype_backend : {"numpy_nullable", "pyarrow"}, defaults to NumPy backed DataFrames
        Which dtype_backend to use, e.g. whether a DataFrame should have NumPy
        arrays, nullable dtypes are used for all dtypes that have a nullable
        implementation when "numpy_nullable" is set, pyarrow is used for all
        dtypes if "pyarrow" is set.

        The dtype_backends are still experimental.

        .. versionadded:: 2.0

    **kwds : optional
        Optional keyword arguments can be passed to ``TextFileReader``.

    Returns
    -------
    DataFrame or TextFileReader
        A fixed-width file is returned as two-dimensional
        data structure with labeled axes.

    See Also
    --------
    DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
    read_csv : Read a comma-separated values (csv) file into DataFrame.

    Examples
    --------
    >>> pd.read_fwf('data.csv')  # doctest: +SKIP
    """
    # Check input arguments: exactly one of 'colspecs'/'widths' must be
    # supplied (the default colspecs='infer' may coexist with widths).
    if colspecs is None and widths is None:
        raise ValueError("Must specify either colspecs or widths")
    if colspecs not in (None, "infer") and widths is not None:
        raise ValueError("You must specify only one of 'widths' and 'colspecs'")

    # Compute 'colspecs' from 'widths', if specified: contiguous half-open
    # intervals starting at column 0.
    if widths is not None:
        colspecs, col = [], 0
        for w in widths:
            colspecs.append((col, col + w))
            col += w

    # for mypy
    assert colspecs is not None

    # GH#40830
    # Ensure length of `colspecs` matches length of `names`
    names = kwds.get("names")
    if names is not None:
        if len(names) != len(colspecs) and colspecs != "infer":
            # need to check len(index_col) as it might contain
            # unnamed indices, in which case its name is not required
            len_index = 0
            if kwds.get("index_col") is not None:
                index_col: Any = kwds.get("index_col")
                if index_col is not False:
                    if not is_list_like(index_col):
                        len_index = 1
                    else:
                        len_index = len(index_col)
            if kwds.get("usecols") is None and len(names) + len_index != len(colspecs):
                # If usecols is used colspec may be longer than names
                raise ValueError("Length of colspecs must match length of names")

    # Forward everything to the shared reader with the fixed-width engine.
    kwds["colspecs"] = colspecs
    kwds["infer_nrows"] = infer_nrows
    kwds["engine"] = "python-fwf"

    check_dtype_backend(dtype_backend)
    kwds["dtype_backend"] = dtype_backend
    return _read(filepath_or_buffer, kwds)

1353 

1354 

class TextFileReader(abc.Iterator):
    """
    Iterator over chunks of a parsed delimited/fixed-width file.

    Passed dialect overrides any of the related parser options.
    """

    def __init__(
        self,
        f: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str] | list,
        engine: CSVEngine | None = None,
        **kwds,
    ) -> None:
        # Remember whether the caller explicitly chose an engine: an
        # explicit choice turns later silent fallbacks into hard errors.
        if engine is not None:
            engine_specified = True
        else:
            engine = "python"
            engine_specified = False
        self.engine = engine
        self._engine_specified = kwds.get("engine_specified", engine_specified)

        _validate_skipfooter(kwds)

        dialect = _extract_dialect(kwds)
        if dialect is not None:
            if engine == "pyarrow":
                raise ValueError(
                    "The 'dialect' option is not supported with the 'pyarrow' engine"
                )
            # Dialect attributes override the corresponding keyword options.
            kwds = _merge_with_dialect_properties(dialect, kwds)

        # header='infer' means: first row is the header unless explicit
        # column names were supplied.
        if kwds.get("header", "infer") == "infer":
            kwds["header"] = 0 if kwds.get("names") is None else None

        self.orig_options = kwds

        # miscellanea
        self._currow = 0

        options = self._get_options_with_defaults(engine)
        options["storage_options"] = kwds.get("storage_options", None)

        self.chunksize = options.pop("chunksize", None)
        self.nrows = options.pop("nrows", None)

        self._check_file_or_buffer(f, engine)
        # _clean_options may fall back to a different engine.
        self.options, self.engine = self._clean_options(options, engine)

        if "has_index_names" in kwds:
            self.options["has_index_names"] = kwds["has_index_names"]

        self.handles: IOHandles | None = None
        self._engine = self._make_engine(f, self.engine)

    def close(self) -> None:
        """Close any file handles we opened, then the underlying engine."""
        if self.handles is not None:
            self.handles.close()
        self._engine.close()

    def _get_options_with_defaults(self, engine: CSVEngine) -> dict[str, Any]:
        """
        Merge user-supplied options with per-engine defaults, raising for
        options the selected engine does not support.
        """
        kwds = self.orig_options

        options = {}
        default: object | None

        for argname, default in parser_defaults.items():
            value = kwds.get(argname, default)

            # see gh-12935
            if (
                engine == "pyarrow"
                and argname in _pyarrow_unsupported
                and value != default
                and value != getattr(value, "value", default)
            ):
                raise ValueError(
                    f"The {repr(argname)} option is not supported with the "
                    f"'pyarrow' engine"
                )
            options[argname] = value

        for argname, default in _c_parser_defaults.items():
            if argname in kwds:
                value = kwds[argname]

                if engine != "c" and value != default:
                    # c-engine-only options are tolerated for the python
                    # engine only when it happens to support them too.
                    if "python" in engine and argname not in _python_unsupported:
                        pass
                    else:
                        raise ValueError(
                            f"The {repr(argname)} option is not supported with the "
                            f"{repr(engine)} engine"
                        )
            else:
                value = default
            options[argname] = value

        if engine == "python-fwf":
            for argname, default in _fwf_defaults.items():
                options[argname] = kwds.get(argname, default)

        return options

    def _check_file_or_buffer(self, f, engine: CSVEngine) -> None:
        # see gh-16530
        if is_file_like(f) and engine != "c" and not hasattr(f, "__iter__"):
            # The C engine doesn't need the file-like to have the "__iter__"
            # attribute. However, the Python engine needs "__iter__(...)"
            # when iterating through such an object, meaning it
            # needs to have that attribute
            raise ValueError(
                "The 'python' engine cannot iterate through this file buffer."
            )

    def _clean_options(
        self, options: dict[str, Any], engine: CSVEngine
    ) -> tuple[dict[str, Any], CSVEngine]:
        """
        Validate options against the requested engine, falling back to the
        'python' engine (or raising, if the engine was explicitly chosen)
        for unsupported combinations. Returns the cleaned options and the
        engine actually selected.
        """
        result = options.copy()

        fallback_reason = None

        # C engine not supported yet
        if engine == "c":
            if options["skipfooter"] > 0:
                fallback_reason = "the 'c' engine does not support skipfooter"
                engine = "python"

        sep = options["delimiter"]
        delim_whitespace = options["delim_whitespace"]

        if sep is None and not delim_whitespace:
            if engine in ("c", "pyarrow"):
                fallback_reason = (
                    f"the '{engine}' engine does not support "
                    "sep=None with delim_whitespace=False"
                )
                engine = "python"
        elif sep is not None and len(sep) > 1:
            if engine == "c" and sep == r"\s+":
                # The c engine handles whitespace-splitting natively.
                result["delim_whitespace"] = True
                del result["delimiter"]
            elif engine not in ("python", "python-fwf"):
                # wait until regex engine integrated
                fallback_reason = (
                    f"the '{engine}' engine does not support "
                    "regex separators (separators > 1 char and "
                    r"different from '\s+' are interpreted as regex)"
                )
                engine = "python"
        elif delim_whitespace:
            if "python" in engine:
                result["delimiter"] = r"\s+"
        elif sep is not None:
            # Multi-byte (when encoded) single-char separators are only
            # supported by the python engines.
            encodeable = True
            encoding = sys.getfilesystemencoding() or "utf-8"
            try:
                if len(sep.encode(encoding)) > 1:
                    encodeable = False
            # NOTE(review): str.encode raises UnicodeEncodeError, not
            # UnicodeDecodeError — confirm this except clause is reachable.
            except UnicodeDecodeError:
                encodeable = False
            if not encodeable and engine not in ("python", "python-fwf"):
                fallback_reason = (
                    f"the separator encoded in {encoding} "
                    f"is > 1 char long, and the '{engine}' engine "
                    "does not support such separators"
                )
                engine = "python"

        quotechar = options["quotechar"]
        if quotechar is not None and isinstance(quotechar, (str, bytes)):
            if (
                len(quotechar) == 1
                and ord(quotechar) > 127
                and engine not in ("python", "python-fwf")
            ):
                fallback_reason = (
                    "ord(quotechar) > 127, meaning the "
                    "quotechar is larger than one byte, "
                    f"and the '{engine}' engine does not support such quotechars"
                )
                engine = "python"

        # An explicitly requested engine is never silently swapped out.
        if fallback_reason and self._engine_specified:
            raise ValueError(fallback_reason)

        if engine == "c":
            for arg in _c_unsupported:
                del result[arg]

        if "python" in engine:
            for arg in _python_unsupported:
                if fallback_reason and result[arg] != _c_parser_defaults[arg]:
                    raise ValueError(
                        "Falling back to the 'python' engine because "
                        f"{fallback_reason}, but this causes {repr(arg)} to be "
                        "ignored as it is not supported by the 'python' engine."
                    )
                del result[arg]

        if fallback_reason:
            warnings.warn(
                (
                    "Falling back to the 'python' engine because "
                    f"{fallback_reason}; you can avoid this warning by specifying "
                    "engine='python'."
                ),
                ParserWarning,
                stacklevel=find_stack_level(),
            )

        index_col = options["index_col"]
        names = options["names"]
        converters = options["converters"]
        na_values = options["na_values"]
        skiprows = options["skiprows"]

        validate_header_arg(options["header"])

        if index_col is True:
            raise ValueError("The value of index_col couldn't be 'True'")
        if is_index_col(index_col):
            # Normalize a scalar index_col to a one-element list.
            if not isinstance(index_col, (list, tuple, np.ndarray)):
                index_col = [index_col]
        result["index_col"] = index_col

        names = list(names) if names is not None else names

        # type conversion-related
        if converters is not None:
            if not isinstance(converters, dict):
                raise TypeError(
                    "Type converters must be a dict or subclass, "
                    f"input was a {type(converters).__name__}"
                )
        else:
            converters = {}

        # Converting values to NA
        keep_default_na = options["keep_default_na"]
        na_values, na_fvalues = _clean_na_values(na_values, keep_default_na)

        # handle skiprows; this is internally handled by the
        # c-engine, so only need for python and pyarrow parsers
        if engine == "pyarrow":
            if not is_integer(skiprows) and skiprows is not None:
                # pyarrow expects skiprows to be passed as an integer
                raise ValueError(
                    "skiprows argument must be an integer when using "
                    "engine='pyarrow'"
                )
        else:
            if is_integer(skiprows):
                skiprows = list(range(skiprows))
            if skiprows is None:
                skiprows = set()
            elif not callable(skiprows):
                skiprows = set(skiprows)

        # put stuff back
        result["names"] = names
        result["converters"] = converters
        result["na_values"] = na_values
        result["na_fvalues"] = na_fvalues
        result["skiprows"] = skiprows

        return result, engine

    def __next__(self) -> DataFrame:
        # Close resources eagerly when iteration is exhausted.
        try:
            return self.get_chunk()
        except StopIteration:
            self.close()
            raise

    def _make_engine(
        self,
        f: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str] | list | IO,
        engine: CSVEngine = "c",
    ) -> ParserBase:
        """
        Open the input (unless it is an in-memory list) and instantiate the
        parser class for the selected engine. Handles opened here are
        recorded on ``self.handles`` and closed on failure.
        """
        mapping: dict[str, type[ParserBase]] = {
            "c": CParserWrapper,
            "python": PythonParser,
            "pyarrow": ArrowParserWrapper,
            "python-fwf": FixedWidthFieldParser,
        }
        if engine not in mapping:
            raise ValueError(
                f"Unknown engine: {engine} (valid options are {mapping.keys()})"
            )
        if not isinstance(f, list):
            # open file here
            is_text = True
            mode = "r"
            if engine == "pyarrow":
                is_text = False
                mode = "rb"
            elif (
                engine == "c"
                and self.options.get("encoding", "utf-8") == "utf-8"
                and isinstance(stringify_path(f), str)
            ):
                # c engine can decode utf-8 bytes, adding TextIOWrapper makes
                # the c-engine especially for memory_map=True far slower
                is_text = False
                if "b" not in mode:
                    mode += "b"
            self.handles = get_handle(
                f,
                mode,
                encoding=self.options.get("encoding", None),
                compression=self.options.get("compression", None),
                memory_map=self.options.get("memory_map", False),
                is_text=is_text,
                errors=self.options.get("encoding_errors", "strict"),
                storage_options=self.options.get("storage_options", None),
            )
            assert self.handles is not None
            f = self.handles.handle

        elif engine != "python":
            msg = f"Invalid file path or buffer object type: {type(f)}"
            raise ValueError(msg)

        try:
            return mapping[engine](f, **self.options)
        except Exception:
            if self.handles is not None:
                self.handles.close()
            raise

    def _failover_to_python(self) -> None:
        raise AbstractMethodError(self)

    def read(self, nrows: int | None = None) -> DataFrame:
        """Read up to ``nrows`` rows (all remaining rows if ``None``)."""
        if self.engine == "pyarrow":
            try:
                # error: "ParserBase" has no attribute "read"
                df = self._engine.read()  # type: ignore[attr-defined]
            except Exception:
                self.close()
                raise
        else:
            nrows = validate_integer("nrows", nrows)
            try:
                # error: "ParserBase" has no attribute "read"
                (
                    index,
                    columns,
                    col_dict,
                ) = self._engine.read(  # type: ignore[attr-defined]
                    nrows
                )
            except Exception:
                self.close()
                raise

            if index is None:
                if col_dict:
                    # Any column is actually fine:
                    new_rows = len(next(iter(col_dict.values())))
                    index = RangeIndex(self._currow, self._currow + new_rows)
                else:
                    new_rows = 0
            else:
                new_rows = len(index)

            df = DataFrame(col_dict, columns=columns, index=index)

            # Track the current position for RangeIndex continuation and
            # for honoring a caller-supplied overall nrows limit.
            self._currow += new_rows
        return df

    def get_chunk(self, size: int | None = None) -> DataFrame:
        """Read the next chunk (``size`` rows; defaults to ``chunksize``)."""
        if size is None:
            size = self.chunksize
            if self.nrows is not None:
                if self._currow >= self.nrows:
                    raise StopIteration
                # Never read past the overall nrows limit.
                size = min(size, self.nrows - self._currow)
        return self.read(nrows=size)

    def __enter__(self) -> TextFileReader:
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        traceback: TracebackType | None,
    ) -> None:
        self.close()

1745 

1746 

def TextParser(*args, **kwds) -> TextFileReader:
    """
    Converts lists of lists/tuples into DataFrames with proper type inference
    and optional (e.g. string to datetime) conversion. Also enables iterating
    lazily over chunks of large files

    Parameters
    ----------
    data : file-like object or list
    delimiter : separator character to use
    dialect : str or csv.Dialect instance, optional
        Ignored if delimiter is longer than 1 character
    names : sequence, default
    header : int, default 0
        Row to use to parse column labels. Defaults to the first row. Prior
        rows will be discarded
    index_col : int or list, optional
        Column or columns to use as the (possibly hierarchical) index
    has_index_names: bool, default False
        True if the cols defined in index_col have an index name and are
        not in the header.
    na_values : scalar, str, list-like, or dict, optional
        Additional strings to recognize as NA/NaN.
    keep_default_na : bool, default True
    thousands : str, optional
        Thousands separator
    comment : str, optional
        Comment out remainder of line
    parse_dates : bool, default False
    keep_date_col : bool, default False
    date_parser : function, optional

        .. deprecated:: 2.0.0
    date_format : str or dict of column -> format, default ``None``

        .. versionadded:: 2.0.0
    skiprows : list of integers
        Row numbers to skip
    skipfooter : int
        Number of line at bottom of file to skip
    converters : dict, optional
        Dict of functions for converting values in certain columns. Keys can
        either be integers or column labels, values are functions that take one
        input argument, the cell (not column) content, and return the
        transformed content.
    encoding : str, optional
        Encoding to use for UTF when reading/writing (ex. 'utf-8')
    float_precision : str, optional
        Specifies which converter the C engine should use for floating-point
        values. The options are `None` or `high` for the ordinary converter,
        `legacy` for the original lower precision pandas converter, and
        `round_trip` for the round-trip converter.

        .. versionchanged:: 1.2
    """
    # Force the python engine: TextParser exists for pre-tokenized
    # (list-of-lists) input, which the C parser cannot consume.
    return TextFileReader(*args, **{**kwds, "engine": "python"})

1804 

1805 

def _clean_na_values(na_values, keep_default_na: bool = True):
    """
    Normalize user-supplied ``na_values`` into string and float sentinel sets.

    Parameters
    ----------
    na_values : scalar, str, list-like, or dict, optional
        Additional values to recognize as NA/NaN. A dict maps column names
        to per-column NA values.
    keep_default_na : bool, default True
        Whether the default ``STR_NA_VALUES`` are included as well.

    Returns
    -------
    tuple
        ``(na_values, na_fvalues)`` where ``na_fvalues`` holds the float
        equivalents (per column when a dict was passed).
    """
    na_fvalues: set | dict
    if na_values is None:
        na_values = STR_NA_VALUES if keep_default_na else set()
        na_fvalues = set()
    elif isinstance(na_values, dict):
        # Build a fresh dict (never alias the caller's) whose values are
        # list-likes, optionally unioned with the default NA strings.
        cleaned: dict = {}
        for col, vals in na_values.items():
            if not is_list_like(vals):
                vals = [vals]
            if keep_default_na:
                vals = set(vals) | STR_NA_VALUES
            cleaned[col] = vals
        na_values = cleaned
        na_fvalues = {col: _floatify_na_values(vals) for col, vals in cleaned.items()}
    else:
        if not is_list_like(na_values):
            na_values = [na_values]
        na_values = _stringify_na_values(na_values)
        if keep_default_na:
            na_values = na_values | STR_NA_VALUES
        na_fvalues = _floatify_na_values(na_values)

    return na_values, na_fvalues

1841 

1842 

1843def _floatify_na_values(na_values): 

1844 # create float versions of the na_values 

1845 result = set() 

1846 for v in na_values: 

1847 try: 

1848 v = float(v) 

1849 if not np.isnan(v): 

1850 result.add(v) 

1851 except (TypeError, ValueError, OverflowError): 

1852 pass 

1853 return result 

1854 

1855 

1856def _stringify_na_values(na_values): 

1857 """return a stringified and numeric for these values""" 

1858 result: list[str | float] = [] 

1859 for x in na_values: 

1860 result.append(str(x)) 

1861 result.append(x) 

1862 try: 

1863 v = float(x) 

1864 

1865 # we are like 999 here 

1866 if v == int(v): 

1867 v = int(v) 

1868 result.append(f"{v}.0") 

1869 result.append(str(v)) 

1870 

1871 result.append(v) 

1872 except (TypeError, ValueError, OverflowError): 

1873 pass 

1874 try: 

1875 result.append(int(x)) 

1876 except (TypeError, ValueError, OverflowError): 

1877 pass 

1878 return set(result) 

1879 

1880 

1881def _refine_defaults_read( 

1882 dialect: str | csv.Dialect | None, 

1883 delimiter: str | None | lib.NoDefault, 

1884 delim_whitespace: bool, 

1885 engine: CSVEngine | None, 

1886 sep: str | None | lib.NoDefault, 

1887 on_bad_lines: str | Callable, 

1888 names: Sequence[Hashable] | None | lib.NoDefault, 

1889 defaults: dict[str, Any], 

1890 dtype_backend: DtypeBackend | lib.NoDefault, 

1891): 

1892 """Validate/refine default values of input parameters of read_csv, read_table. 

1893 

1894 Parameters 

1895 ---------- 

1896 dialect : str or csv.Dialect 

1897 If provided, this parameter will override values (default or not) for the 

1898 following parameters: `delimiter`, `doublequote`, `escapechar`, 

1899 `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to 

1900 override values, a ParserWarning will be issued. See csv.Dialect 

1901 documentation for more details. 

1902 delimiter : str or object 

1903 Alias for sep. 

1904 delim_whitespace : bool 

1905 Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be 

1906 used as the sep. Equivalent to setting ``sep='\\s+'``. If this option 

1907 is set to True, nothing should be passed in for the ``delimiter`` 

1908 parameter. 

1909 engine : {{'c', 'python'}} 

1910 Parser engine to use. The C engine is faster while the python engine is 

1911 currently more feature-complete. 

1912 sep : str or object 

1913 A delimiter provided by the user (str) or a sentinel value, i.e. 

1914 pandas._libs.lib.no_default. 

1915 on_bad_lines : str, callable 

1916 An option for handling bad lines or a sentinel value(None). 

1917 names : array-like, optional 

1918 List of column names to use. If the file contains a header row, 

1919 then you should explicitly pass ``header=0`` to override the column names. 

1920 Duplicates in this list are not allowed. 

1921 defaults: dict 

1922 Default values of input parameters. 

1923 

1924 Returns 

1925 ------- 

1926 kwds : dict 

1927 Input parameters with correct values. 

1928 

1929 Raises 

1930 ------ 

1931 ValueError : 

1932 If a delimiter was specified with ``sep`` (or ``delimiter``) and 

1933 ``delim_whitespace=True``. 

1934 """ 

1935 # fix types for sep, delimiter to Union(str, Any) 

1936 delim_default = defaults["delimiter"] 

1937 kwds: dict[str, Any] = {} 

1938 # gh-23761 

1939 # 

1940 # When a dialect is passed, it overrides any of the overlapping 

1941 # parameters passed in directly. We don't want to warn if the 

1942 # default parameters were passed in (since it probably means 

1943 # that the user didn't pass them in explicitly in the first place). 

1944 # 

1945 # "delimiter" is the annoying corner case because we alias it to 

1946 # "sep" before doing comparison to the dialect values later on. 

1947 # Thus, we need a flag to indicate that we need to "override" 

1948 # the comparison to dialect values by checking if default values 

1949 # for BOTH "delimiter" and "sep" were provided. 

1950 if dialect is not None: 

1951 kwds["sep_override"] = delimiter is None and ( 

1952 sep is lib.no_default or sep == delim_default 

1953 ) 

1954 

1955 if delimiter and (sep is not lib.no_default): 

1956 raise ValueError("Specified a sep and a delimiter; you can only specify one.") 

1957 

1958 kwds["names"] = None if names is lib.no_default else names 

1959 

1960 # Alias sep -> delimiter. 

1961 if delimiter is None: 

1962 delimiter = sep 

1963 

1964 if delim_whitespace and (delimiter is not lib.no_default): 

1965 raise ValueError( 

1966 "Specified a delimiter with both sep and " 

1967 "delim_whitespace=True; you can only specify one." 

1968 ) 

1969 

1970 if delimiter == "\n": 

1971 raise ValueError( 

1972 r"Specified \n as separator or delimiter. This forces the python engine " 

1973 "which does not accept a line terminator. Hence it is not allowed to use " 

1974 "the line terminator as separator.", 

1975 ) 

1976 

1977 if delimiter is lib.no_default: 

1978 # assign default separator value 

1979 kwds["delimiter"] = delim_default 

1980 else: 

1981 kwds["delimiter"] = delimiter 

1982 

1983 if engine is not None: 

1984 kwds["engine_specified"] = True 

1985 else: 

1986 kwds["engine"] = "c" 

1987 kwds["engine_specified"] = False 

1988 

1989 if on_bad_lines == "error": 

1990 kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.ERROR 

1991 elif on_bad_lines == "warn": 

1992 kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.WARN 

1993 elif on_bad_lines == "skip": 

1994 kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.SKIP 

1995 elif callable(on_bad_lines): 

1996 if engine != "python": 

1997 raise ValueError( 

1998 "on_bad_line can only be a callable function if engine='python'" 

1999 ) 

2000 kwds["on_bad_lines"] = on_bad_lines 

2001 else: 

2002 raise ValueError(f"Argument {on_bad_lines} is invalid for on_bad_lines") 

2003 

2004 check_dtype_backend(dtype_backend) 

2005 

2006 kwds["dtype_backend"] = dtype_backend 

2007 

2008 return kwds 

2009 

2010 

2011def _extract_dialect(kwds: dict[str, Any]) -> csv.Dialect | None: 

2012 """ 

2013 Extract concrete csv dialect instance. 

2014 

2015 Returns 

2016 ------- 

2017 csv.Dialect or None 

2018 """ 

2019 if kwds.get("dialect") is None: 

2020 return None 

2021 

2022 dialect = kwds["dialect"] 

2023 if dialect in csv.list_dialects(): 

2024 dialect = csv.get_dialect(dialect) 

2025 

2026 _validate_dialect(dialect) 

2027 

2028 return dialect 

2029 

2030 

2031MANDATORY_DIALECT_ATTRS = ( 

2032 "delimiter", 

2033 "doublequote", 

2034 "escapechar", 

2035 "skipinitialspace", 

2036 "quotechar", 

2037 "quoting", 

2038) 

2039 

2040 

2041def _validate_dialect(dialect: csv.Dialect) -> None: 

2042 """ 

2043 Validate csv dialect instance. 

2044 

2045 Raises 

2046 ------ 

2047 ValueError 

2048 If incorrect dialect is provided. 

2049 """ 

2050 for param in MANDATORY_DIALECT_ATTRS: 

2051 if not hasattr(dialect, param): 

2052 raise ValueError(f"Invalid dialect {dialect} provided") 

2053 

2054 

2055def _merge_with_dialect_properties( 

2056 dialect: csv.Dialect, 

2057 defaults: dict[str, Any], 

2058) -> dict[str, Any]: 

2059 """ 

2060 Merge default kwargs in TextFileReader with dialect parameters. 

2061 

2062 Parameters 

2063 ---------- 

2064 dialect : csv.Dialect 

2065 Concrete csv dialect. See csv.Dialect documentation for more details. 

2066 defaults : dict 

2067 Keyword arguments passed to TextFileReader. 

2068 

2069 Returns 

2070 ------- 

2071 kwds : dict 

2072 Updated keyword arguments, merged with dialect parameters. 

2073 """ 

2074 kwds = defaults.copy() 

2075 

2076 for param in MANDATORY_DIALECT_ATTRS: 

2077 dialect_val = getattr(dialect, param) 

2078 

2079 parser_default = parser_defaults[param] 

2080 provided = kwds.get(param, parser_default) 

2081 

2082 # Messages for conflicting values between the dialect 

2083 # instance and the actual parameters provided. 

2084 conflict_msgs = [] 

2085 

2086 # Don't warn if the default parameter was passed in, 

2087 # even if it conflicts with the dialect (gh-23761). 

2088 if provided not in (parser_default, dialect_val): 

2089 msg = ( 

2090 f"Conflicting values for '{param}': '{provided}' was " 

2091 f"provided, but the dialect specifies '{dialect_val}'. " 

2092 "Using the dialect-specified value." 

2093 ) 

2094 

2095 # Annoying corner case for not warning about 

2096 # conflicts between dialect and delimiter parameter. 

2097 # Refer to the outer "_read_" function for more info. 

2098 if not (param == "delimiter" and kwds.pop("sep_override", False)): 

2099 conflict_msgs.append(msg) 

2100 

2101 if conflict_msgs: 

2102 warnings.warn( 

2103 "\n\n".join(conflict_msgs), ParserWarning, stacklevel=find_stack_level() 

2104 ) 

2105 kwds[param] = dialect_val 

2106 return kwds 

2107 

2108 

2109def _validate_skipfooter(kwds: dict[str, Any]) -> None: 

2110 """ 

2111 Check whether skipfooter is compatible with other kwargs in TextFileReader. 

2112 

2113 Parameters 

2114 ---------- 

2115 kwds : dict 

2116 Keyword arguments passed to TextFileReader. 

2117 

2118 Raises 

2119 ------ 

2120 ValueError 

2121 If skipfooter is not compatible with other parameters. 

2122 """ 

2123 if kwds.get("skipfooter"): 

2124 if kwds.get("iterator") or kwds.get("chunksize"): 

2125 raise ValueError("'skipfooter' not supported for iteration") 

2126 if kwds.get("nrows"): 

2127 raise ValueError("'skipfooter' not supported with 'nrows'")