Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pandas/io/parsers/readers.py: 52%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

559 statements  

1""" 

2Module contains tools for processing files into DataFrames or other objects 

3 

4GH#48849 provides a convenient way of deprecating keyword arguments 

5""" 

6from __future__ import annotations 

7 

8from collections import ( 

9 abc, 

10 defaultdict, 

11) 

12import csv 

13import sys 

14from textwrap import fill 

15from typing import ( 

16 IO, 

17 TYPE_CHECKING, 

18 Any, 

19 Callable, 

20 Literal, 

21 NamedTuple, 

22 TypedDict, 

23 overload, 

24) 

25import warnings 

26 

27import numpy as np 

28 

29from pandas._config import using_copy_on_write 

30 

31from pandas._libs import lib 

32from pandas._libs.parsers import STR_NA_VALUES 

33from pandas.errors import ( 

34 AbstractMethodError, 

35 ParserWarning, 

36) 

37from pandas.util._decorators import Appender 

38from pandas.util._exceptions import find_stack_level 

39from pandas.util._validators import check_dtype_backend 

40 

41from pandas.core.dtypes.common import ( 

42 is_file_like, 

43 is_float, 

44 is_hashable, 

45 is_integer, 

46 is_list_like, 

47 pandas_dtype, 

48) 

49 

50from pandas import Series 

51from pandas.core.frame import DataFrame 

52from pandas.core.indexes.api import RangeIndex 

53from pandas.core.shared_docs import _shared_docs 

54 

55from pandas.io.common import ( 

56 IOHandles, 

57 get_handle, 

58 stringify_path, 

59 validate_header_arg, 

60) 

61from pandas.io.parsers.arrow_parser_wrapper import ArrowParserWrapper 

62from pandas.io.parsers.base_parser import ( 

63 ParserBase, 

64 is_index_col, 

65 parser_defaults, 

66) 

67from pandas.io.parsers.c_parser_wrapper import CParserWrapper 

68from pandas.io.parsers.python_parser import ( 

69 FixedWidthFieldParser, 

70 PythonParser, 

71) 

72 

73if TYPE_CHECKING: 

74 from collections.abc import ( 

75 Hashable, 

76 Iterable, 

77 Mapping, 

78 Sequence, 

79 ) 

80 from types import TracebackType 

81 

82 from pandas._typing import ( 

83 CompressionOptions, 

84 CSVEngine, 

85 DtypeArg, 

86 DtypeBackend, 

87 FilePath, 

88 IndexLabel, 

89 ReadCsvBuffer, 

90 Self, 

91 StorageOptions, 

92 UsecolsArgType, 

93 ) 

94_doc_read_csv_and_table = ( 

95 r""" 

96{summary} 

97 

98Also supports optionally iterating or breaking of the file 

99into chunks. 

100 

101Additional help can be found in the online docs for 

102`IO Tools <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_. 

103 

104Parameters 

105---------- 

106filepath_or_buffer : str, path object or file-like object 

107 Any valid string path is acceptable. The string could be a URL. Valid 

108 URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is 

109 expected. A local file could be: file://localhost/path/to/table.csv. 

110 

111 If you want to pass in a path object, pandas accepts any ``os.PathLike``. 

112 

113 By file-like object, we refer to objects with a ``read()`` method, such as 

114 a file handle (e.g. via builtin ``open`` function) or ``StringIO``. 

115sep : str, default {_default_sep} 

116 Character or regex pattern to treat as the delimiter. If ``sep=None``, the 

117 C engine cannot automatically detect 

118 the separator, but the Python parsing engine can, meaning the latter will 

119 be used and automatically detect the separator from only the first valid 

120 row of the file by Python's builtin sniffer tool, ``csv.Sniffer``. 

121 In addition, separators longer than 1 character and different from 

122 ``'\s+'`` will be interpreted as regular expressions and will also force 

123 the use of the Python parsing engine. Note that regex delimiters are prone 

124 to ignoring quoted data. Regex example: ``'\r\t'``. 

125delimiter : str, optional 

126 Alias for ``sep``. 

127header : int, Sequence of int, 'infer' or None, default 'infer' 

128 Row number(s) containing column labels and marking the start of the 

129 data (zero-indexed). Default behavior is to infer the column names: if no ``names`` 

130 are passed the behavior is identical to ``header=0`` and column 

131 names are inferred from the first line of the file, if column 

132 names are passed explicitly to ``names`` then the behavior is identical to 

133 ``header=None``. Explicitly pass ``header=0`` to be able to 

134 replace existing names. The header can be a list of integers that 

135 specify row locations for a :class:`~pandas.MultiIndex` on the columns 

136 e.g. ``[0, 1, 3]``. Intervening rows that are not specified will be 

137 skipped (e.g. 2 in this example is skipped). Note that this 

138 parameter ignores commented lines and empty lines if 

139 ``skip_blank_lines=True``, so ``header=0`` denotes the first line of 

140 data rather than the first line of the file. 

141names : Sequence of Hashable, optional 

142 Sequence of column labels to apply. If the file contains a header row, 

143 then you should explicitly pass ``header=0`` to override the column names. 

144 Duplicates in this list are not allowed. 

145index_col : Hashable, Sequence of Hashable or False, optional 

146 Column(s) to use as row label(s), denoted either by column labels or column 

147 indices. If a sequence of labels or indices is given, :class:`~pandas.MultiIndex` 

148 will be formed for the row labels. 

149 

150 Note: ``index_col=False`` can be used to force pandas to *not* use the first 

151 column as the index, e.g., when you have a malformed file with delimiters at 

152 the end of each line. 

153usecols : Sequence of Hashable or Callable, optional 

154 Subset of columns to select, denoted either by column labels or column indices. 

155 If list-like, all elements must either 

156 be positional (i.e. integer indices into the document columns) or strings 

157 that correspond to column names provided either by the user in ``names`` or 

158 inferred from the document header row(s). If ``names`` are given, the document 

159 header row(s) are not taken into account. For example, a valid list-like 

160 ``usecols`` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``. 

161 Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. 

162 To instantiate a :class:`~pandas.DataFrame` from ``data`` with element order 

163 preserved use ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` 

164 for columns in ``['foo', 'bar']`` order or 

165 ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]`` 

166 for ``['bar', 'foo']`` order. 

167 

168 If callable, the callable function will be evaluated against the column 

169 names, returning names where the callable function evaluates to ``True``. An 

170 example of a valid callable argument would be ``lambda x: x.upper() in 

171 ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster 

172 parsing time and lower memory usage. 

173dtype : dtype or dict of {{Hashable : dtype}}, optional 

174 Data type(s) to apply to either the whole dataset or individual columns. 

175 E.g., ``{{'a': np.float64, 'b': np.int32, 'c': 'Int64'}}`` 

176 Use ``str`` or ``object`` together with suitable ``na_values`` settings 

177 to preserve and not interpret ``dtype``. 

178 If ``converters`` are specified, they will be applied INSTEAD 

179 of ``dtype`` conversion. 

180 

181 .. versionadded:: 1.5.0 

182 

183 Support for ``defaultdict`` was added. Specify a ``defaultdict`` as input where 

184 the default determines the ``dtype`` of the columns which are not explicitly 

185 listed. 

186engine : {{'c', 'python', 'pyarrow'}}, optional 

187 Parser engine to use. The C and pyarrow engines are faster, while the python engine 

188 is currently more feature-complete. Multithreading is currently only supported by 

189 the pyarrow engine. 

190 

191 .. versionadded:: 1.4.0 

192 

193 The 'pyarrow' engine was added as an *experimental* engine, and some features 

194 are unsupported, or may not work correctly, with this engine. 

195converters : dict of {{Hashable : Callable}}, optional 

196 Functions for converting values in specified columns. Keys can either 

197 be column labels or column indices. 

198true_values : list, optional 

199 Values to consider as ``True`` in addition to case-insensitive variants of 'True'. 

200false_values : list, optional 

201 Values to consider as ``False`` in addition to case-insensitive variants of 'False'. 

202skipinitialspace : bool, default False 

203 Skip spaces after delimiter. 

204skiprows : int, list of int or Callable, optional 

205 Line numbers to skip (0-indexed) or number of lines to skip (``int``) 

206 at the start of the file. 

207 

208 If callable, the callable function will be evaluated against the row 

209 indices, returning ``True`` if the row should be skipped and ``False`` otherwise. 

210 An example of a valid callable argument would be ``lambda x: x in [0, 2]``. 

211skipfooter : int, default 0 

212 Number of lines at bottom of file to skip (Unsupported with ``engine='c'``). 

213nrows : int, optional 

214 Number of rows of file to read. Useful for reading pieces of large files. 

215na_values : Hashable, Iterable of Hashable or dict of {{Hashable : Iterable}}, optional 

216 Additional strings to recognize as ``NA``/``NaN``. If ``dict`` passed, specific 

217 per-column ``NA`` values. By default the following values are interpreted as 

218 ``NaN``: " """ 

219 + fill('", "'.join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" ") 

220 + """ ". 

221 

222keep_default_na : bool, default True 

223 Whether or not to include the default ``NaN`` values when parsing the data. 

224 Depending on whether ``na_values`` is passed in, the behavior is as follows: 

225 

226 * If ``keep_default_na`` is ``True``, and ``na_values`` are specified, ``na_values`` 

227 is appended to the default ``NaN`` values used for parsing. 

228 * If ``keep_default_na`` is ``True``, and ``na_values`` are not specified, only 

229 the default ``NaN`` values are used for parsing. 

230 * If ``keep_default_na`` is ``False``, and ``na_values`` are specified, only 

231 the ``NaN`` values specified ``na_values`` are used for parsing. 

232 * If ``keep_default_na`` is ``False``, and ``na_values`` are not specified, no 

233 strings will be parsed as ``NaN``. 

234 

235 Note that if ``na_filter`` is passed in as ``False``, the ``keep_default_na`` and 

236 ``na_values`` parameters will be ignored. 

237na_filter : bool, default True 

238 Detect missing value markers (empty strings and the value of ``na_values``). In 

239 data without any ``NA`` values, passing ``na_filter=False`` can improve the 

240 performance of reading a large file. 

241verbose : bool, default False 

242 Indicate number of ``NA`` values placed in non-numeric columns. 

243 

244 .. deprecated:: 2.2.0 

245skip_blank_lines : bool, default True 

246 If ``True``, skip over blank lines rather than interpreting as ``NaN`` values. 

247parse_dates : bool, list of Hashable, list of lists or dict of {{Hashable : list}}, \ 

248default False 

249 The behavior is as follows: 

250 

251 * ``bool``. If ``True`` -> try parsing the index. Note: Automatically set to 

252 ``True`` if ``date_format`` or ``date_parser`` arguments have been passed. 

253 * ``list`` of ``int`` or names. e.g. If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3 

254 each as a separate date column. 

255 * ``list`` of ``list``. e.g. If ``[[1, 3]]`` -> combine columns 1 and 3 and parse 

256 as a single date column. Values are joined with a space before parsing. 

257 * ``dict``, e.g. ``{{'foo' : [1, 3]}}`` -> parse columns 1, 3 as date and call 

258 result 'foo'. Values are joined with a space before parsing. 

259 

260 If a column or index cannot be represented as an array of ``datetime``, 

261 say because of an unparsable value or a mixture of timezones, the column 

262 or index will be returned unaltered as an ``object`` data type. For 

263 non-standard ``datetime`` parsing, use :func:`~pandas.to_datetime` after 

264 :func:`~pandas.read_csv`. 

265 

266 Note: A fast-path exists for iso8601-formatted dates. 

267infer_datetime_format : bool, default False 

268 If ``True`` and ``parse_dates`` is enabled, pandas will attempt to infer the 

269 format of the ``datetime`` strings in the columns, and if it can be inferred, 

270 switch to a faster method of parsing them. In some cases this can increase 

271 the parsing speed by 5-10x. 

272 

273 .. deprecated:: 2.0.0 

274 A strict version of this argument is now the default, passing it has no effect. 

275 

276keep_date_col : bool, default False 

277 If ``True`` and ``parse_dates`` specifies combining multiple columns then 

278 keep the original columns. 

279date_parser : Callable, optional 

280 Function to use for converting a sequence of string columns to an array of 

281 ``datetime`` instances. The default uses ``dateutil.parser.parser`` to do the 

282 conversion. pandas will try to call ``date_parser`` in three different ways, 

283 advancing to the next if an exception occurs: 1) Pass one or more arrays 

284 (as defined by ``parse_dates``) as arguments; 2) concatenate (row-wise) the 

285 string values from the columns defined by ``parse_dates`` into a single array 

286 and pass that; and 3) call ``date_parser`` once for each row using one or 

287 more strings (corresponding to the columns defined by ``parse_dates``) as 

288 arguments. 

289 

290 .. deprecated:: 2.0.0 

291 Use ``date_format`` instead, or read in as ``object`` and then apply 

292 :func:`~pandas.to_datetime` as-needed. 

293date_format : str or dict of column -> format, optional 

294 Format to use for parsing dates when used in conjunction with ``parse_dates``. 

295 The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. See 

296 `strftime documentation 

297 <https://docs.python.org/3/library/datetime.html 

298 #strftime-and-strptime-behavior>`_ for more information on choices, though 

299 note that :const:`"%f"` will parse all the way up to nanoseconds. 

300 You can also pass: 

301 

302 - "ISO8601", to parse any `ISO8601 <https://en.wikipedia.org/wiki/ISO_8601>`_ 

303 time string (not necessarily in exactly the same format); 

304 - "mixed", to infer the format for each element individually. This is risky, 

305 and you should probably use it along with `dayfirst`. 

306 

307 .. versionadded:: 2.0.0 

308dayfirst : bool, default False 

309 DD/MM format dates, international and European format. 

310cache_dates : bool, default True 

311 If ``True``, use a cache of unique, converted dates to apply the ``datetime`` 

312 conversion. May produce significant speed-up when parsing duplicate 

313 date strings, especially ones with timezone offsets. 

314 

315iterator : bool, default False 

316 Return ``TextFileReader`` object for iteration or getting chunks with 

317 ``get_chunk()``. 

318chunksize : int, optional 

319 Number of lines to read from the file per chunk. Passing a value will cause the 

320 function to return a ``TextFileReader`` object for iteration. 

321 See the `IO Tools docs 

322 <https://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking>`_ 

323 for more information on ``iterator`` and ``chunksize``. 

324 

325{decompression_options} 

326 

327 .. versionchanged:: 1.4.0 Zstandard support. 

328 

329thousands : str (length 1), optional 

330 Character acting as the thousands separator in numerical values. 

331decimal : str (length 1), default '.' 

332 Character to recognize as decimal point (e.g., use ',' for European data). 

333lineterminator : str (length 1), optional 

334 Character used to denote a line break. Only valid with C parser. 

335quotechar : str (length 1), optional 

336 Character used to denote the start and end of a quoted item. Quoted 

337 items can include the ``delimiter`` and it will be ignored. 

338quoting : {{0 or csv.QUOTE_MINIMAL, 1 or csv.QUOTE_ALL, 2 or csv.QUOTE_NONNUMERIC, \ 

3393 or csv.QUOTE_NONE}}, default csv.QUOTE_MINIMAL 

340 Control field quoting behavior per ``csv.QUOTE_*`` constants. Default is 

341 ``csv.QUOTE_MINIMAL`` (i.e., 0) which implies that only fields containing special 

342 characters are quoted (e.g., characters defined in ``quotechar``, ``delimiter``, 

343 or ``lineterminator``. 

344doublequote : bool, default True 

345 When ``quotechar`` is specified and ``quoting`` is not ``QUOTE_NONE``, indicate 

346 whether or not to interpret two consecutive ``quotechar`` elements INSIDE a 

347 field as a single ``quotechar`` element. 

348escapechar : str (length 1), optional 

349 Character used to escape other characters. 

350comment : str (length 1), optional 

351 Character indicating that the remainder of line should not be parsed. 

352 If found at the beginning 

353 of a line, the line will be ignored altogether. This parameter must be a 

354 single character. Like empty lines (as long as ``skip_blank_lines=True``), 

355 fully commented lines are ignored by the parameter ``header`` but not by 

356 ``skiprows``. For example, if ``comment='#'``, parsing 

357 ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in ``'a,b,c'`` being 

358 treated as the header. 

359encoding : str, optional, default 'utf-8' 

360 Encoding to use for UTF when reading/writing (ex. ``'utf-8'``). `List of Python 

361 standard encodings 

362 <https://docs.python.org/3/library/codecs.html#standard-encodings>`_ . 

363 

364encoding_errors : str, optional, default 'strict' 

365 How encoding errors are treated. `List of possible values 

366 <https://docs.python.org/3/library/codecs.html#error-handlers>`_ . 

367 

368 .. versionadded:: 1.3.0 

369 

370dialect : str or csv.Dialect, optional 

371 If provided, this parameter will override values (default or not) for the 

372 following parameters: ``delimiter``, ``doublequote``, ``escapechar``, 

373 ``skipinitialspace``, ``quotechar``, and ``quoting``. If it is necessary to 

374 override values, a ``ParserWarning`` will be issued. See ``csv.Dialect`` 

375 documentation for more details. 

376on_bad_lines : {{'error', 'warn', 'skip'}} or Callable, default 'error' 

377 Specifies what to do upon encountering a bad line (a line with too many fields). 

378 Allowed values are : 

379 

380 - ``'error'``, raise an Exception when a bad line is encountered. 

381 - ``'warn'``, raise a warning when a bad line is encountered and skip that line. 

382 - ``'skip'``, skip bad lines without raising or warning when they are encountered. 

383 

384 .. versionadded:: 1.3.0 

385 

386 .. versionadded:: 1.4.0 

387 

388 - Callable, function with signature 

389 ``(bad_line: list[str]) -> list[str] | None`` that will process a single 

390 bad line. ``bad_line`` is a list of strings split by the ``sep``. 

391 If the function returns ``None``, the bad line will be ignored. 

392 If the function returns a new ``list`` of strings with more elements than 

393 expected, a ``ParserWarning`` will be emitted while dropping extra elements. 

394 Only supported when ``engine='python'`` 

395 

396 .. versionchanged:: 2.2.0 

397 

398 - Callable, function with signature 

399 as described in `pyarrow documentation 

400 <https://arrow.apache.org/docs/python/generated/pyarrow.csv.ParseOptions.html 

401 #pyarrow.csv.ParseOptions.invalid_row_handler>`_ when ``engine='pyarrow'`` 

402 

403delim_whitespace : bool, default False 

404 Specifies whether or not whitespace (e.g. ``' '`` or ``'\\t'``) will be 

405 used as the ``sep`` delimiter. Equivalent to setting ``sep='\\s+'``. If this option 

406 is set to ``True``, nothing should be passed in for the ``delimiter`` 

407 parameter. 

408 

409 .. deprecated:: 2.2.0 

410 Use ``sep="\\s+"`` instead. 

411low_memory : bool, default True 

412 Internally process the file in chunks, resulting in lower memory use 

413 while parsing, but possibly mixed type inference. To ensure no mixed 

414 types either set ``False``, or specify the type with the ``dtype`` parameter. 

415 Note that the entire file is read into a single :class:`~pandas.DataFrame` 

416 regardless, use the ``chunksize`` or ``iterator`` parameter to return the data in 

417 chunks. (Only valid with C parser). 

418memory_map : bool, default False 

419 If a filepath is provided for ``filepath_or_buffer``, map the file object 

420 directly onto memory and access the data directly from there. Using this 

421 option can improve performance because there is no longer any I/O overhead. 

422float_precision : {{'high', 'legacy', 'round_trip'}}, optional 

423 Specifies which converter the C engine should use for floating-point 

424 values. The options are ``None`` or ``'high'`` for the ordinary converter, 

425 ``'legacy'`` for the original lower precision pandas converter, and 

426 ``'round_trip'`` for the round-trip converter. 

427 

428{storage_options} 

429 

430dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable' 

431 Back-end data type applied to the resultant :class:`DataFrame` 

432 (still experimental). Behaviour is as follows: 

433 

434 * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` 

435 (default). 

436 * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` 

437 DataFrame. 

438 

439 .. versionadded:: 2.0 

440 

441Returns 

442------- 

443DataFrame or TextFileReader 

444 A comma-separated values (csv) file is returned as two-dimensional 

445 data structure with labeled axes. 

446 

447See Also 

448-------- 

449DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file. 

450{see_also_func_name} : {see_also_func_summary} 

451read_fwf : Read a table of fixed-width formatted lines into DataFrame. 

452 

453Examples 

454-------- 

455>>> pd.{func_name}('data.csv') # doctest: +SKIP 

456""" 

457) 

458 

459 

class _C_Parser_Defaults(TypedDict):
    """Typed shape of the option defaults specific to the C parser engine."""

    # ``Literal`` annotations pin each key to the exact default value stored
    # in ``_c_parser_defaults`` below.
    delim_whitespace: Literal[False]
    na_filter: Literal[True]
    low_memory: Literal[True]
    memory_map: Literal[False]
    float_precision: None


# Default values handed to the C engine when the caller does not override them.
_c_parser_defaults: _C_Parser_Defaults = {
    "delim_whitespace": False,
    "na_filter": True,
    "low_memory": True,
    "memory_map": False,
    "float_precision": None,
}

475 

476 

class _Fwf_Defaults(TypedDict):
    """Typed shape of the default options for the fixed-width reader."""

    colspecs: Literal["infer"]
    infer_nrows: Literal[100]
    widths: None


# Defaults for fixed-width-file parsing (see ``FixedWidthFieldParser``).
_fwf_defaults: _Fwf_Defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None}
# Options not supported by the C engine.
_c_unsupported = {"skipfooter"}
# Options not supported by the python engine.
_python_unsupported = {"low_memory", "float_precision"}
# Options not supported by the pyarrow engine.
_pyarrow_unsupported = {
    "skipfooter",
    "float_precision",
    "chunksize",
    "comment",
    "nrows",
    "thousands",
    "memory_map",
    "dialect",
    "delim_whitespace",
    "quoting",
    "lineterminator",
    "converters",
    "iterator",
    "dayfirst",
    "verbose",
    "skipinitialspace",
    "low_memory",
}

505 

506 

class _DeprecationConfig(NamedTuple):
    """Pairs a deprecated option's default value with its deprecation message."""

    # Value used when the caller does not supply the deprecated option.
    default_value: Any
    # Message to emit for the deprecation, or None when no message applies.
    msg: str | None

510 

511 

@overload
def validate_integer(name: str, val: None, min_val: int = ...) -> None:
    ...


@overload
def validate_integer(name: str, val: float, min_val: int = ...) -> int:
    ...


@overload
def validate_integer(name: str, val: int | None, min_val: int = ...) -> int | None:
    ...


def validate_integer(
    name: str, val: int | float | None, min_val: int = 0
) -> int | None:
    """
    Checks whether the 'name' parameter for parsing is either
    an integer OR float that can SAFELY be cast to an integer
    without losing accuracy. Raises a ValueError if that is
    not the case.

    Parameters
    ----------
    name : str
        Parameter name (used for error reporting)
    val : int or float
        The value to check
    min_val : int
        Minimum allowed value (val < min_val will result in a ValueError)

    Returns
    -------
    int or None
        ``val`` cast to ``int``, or ``None`` when ``val`` is ``None``.

    Raises
    ------
    ValueError
        If ``val`` is not integer-valued or is below ``min_val``.
    """
    if val is None:
        return val

    msg = f"'{name:s}' must be an integer >={min_val:d}"
    if is_float(val):
        if int(val) != val:
            raise ValueError(msg)
        val = int(val)
    elif not is_integer(val):
        raise ValueError(msg)
    # Enforce the documented lower bound uniformly. Previously only the
    # integer branch checked ``min_val``, so a float such as 0.0 slipped
    # past ``min_val=1`` after truncation.
    if val < min_val:
        raise ValueError(msg)

    return int(val)

557 

558 

def _validate_names(names: Sequence[Hashable] | None) -> None:
    """
    Validate the ``names`` argument used for output DataFrame columns.

    Parameters
    ----------
    names : array-like or None
        Candidate column labels; ``None`` means there is nothing to check.

    Raises
    ------
    ValueError
        If names are not unique or are not ordered (e.g. set).
    """
    if names is None:
        return

    if len(set(names)) != len(names):
        raise ValueError("Duplicate names are not allowed.")

    # Unordered collections (e.g. sets) would assign labels to columns in an
    # arbitrary order; dict KeysViews are explicitly accepted as ordered.
    is_ordered = is_list_like(names, allow_sets=False) or isinstance(
        names, abc.KeysView
    )
    if not is_ordered:
        raise ValueError("Names should be an ordered collection.")

581 

582 

def _read(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], kwds
) -> DataFrame | TextFileReader:
    """
    Generic reader of line files.

    Resolves the ``parse_dates`` default, validates iterator/chunksize
    options against the selected engine, then either returns an open
    TextFileReader (iterator/chunksize mode) or reads everything and
    returns the result.
    """
    # GH#44366: a date_parser together with an unset parse_dates should not
    # silently disable date parsing; only default to False when neither
    # date_parser nor date_format was supplied.
    if kwds.get("parse_dates", None) is None:
        has_parser = kwds.get("date_parser", lib.no_default) is not lib.no_default
        has_format = kwds.get("date_format", None) is not None
        kwds["parse_dates"] = has_parser or has_format

    # Extract some of the arguments (pass chunksize on).
    iterator = kwds.get("iterator", False)
    chunksize = kwds.get("chunksize", None)
    if kwds.get("engine") == "pyarrow":
        # pyarrow reads eagerly; neither chunked nor iterator access works.
        if iterator:
            raise ValueError(
                "The 'iterator' option is not supported with the 'pyarrow' engine"
            )

        if chunksize is not None:
            raise ValueError(
                "The 'chunksize' option is not supported with the 'pyarrow' engine"
            )
    else:
        chunksize = validate_integer("chunksize", chunksize, 1)

    nrows = kwds.get("nrows", None)

    # Reject duplicate or unordered column names up front.
    _validate_names(kwds.get("names", None))

    parser = TextFileReader(filepath_or_buffer, **kwds)

    if chunksize or iterator:
        # Caller drives the iteration; hand back the open reader.
        return parser

    with parser:
        return parser.read(nrows)

628 

# iterator=True -> TextFileReader
@overload
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols: UsecolsArgType = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters: Mapping[Hashable, Callable] | None = ...,
    true_values: list | None = ...,
    false_values: list | None = ...,
    skipinitialspace: bool = ...,
    skiprows: list[int] | int | Callable[[Hashable], bool] | None = ...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values: Hashable
    | Iterable[Hashable]
    | Mapping[Hashable, Iterable[Hashable]]
    | None = ...,
    # Restored for consistency with the other read_csv overloads, which all
    # declare ``keep_default_na``; without it, iterator=True calls passing
    # this keyword failed static overload resolution.
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool | lib.NoDefault = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] | None = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool | lib.NoDefault = ...,
    date_parser: Callable | lib.NoDefault = ...,
    date_format: str | dict[Hashable, str] | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: Literal[True],
    chunksize: int | None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool | lib.NoDefault = ...,
    low_memory: bool = ...,
    memory_map: bool = ...,
    float_precision: Literal["high", "legacy"] | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> TextFileReader:
    ...

686 

687 

# chunksize=int -> TextFileReader
# Overload: an explicit ``chunksize`` always yields a TextFileReader for
# chunked iteration rather than an eagerly-read result.
@overload
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols: UsecolsArgType = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters: Mapping[Hashable, Callable] | None = ...,
    true_values: list | None = ...,
    false_values: list | None = ...,
    skipinitialspace: bool = ...,
    skiprows: list[int] | int | Callable[[Hashable], bool] | None = ...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values: Hashable
    | Iterable[Hashable]
    | Mapping[Hashable, Iterable[Hashable]]
    | None = ...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool | lib.NoDefault = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] | None = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool | lib.NoDefault = ...,
    date_parser: Callable | lib.NoDefault = ...,
    date_format: str | dict[Hashable, str] | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: bool = ...,
    chunksize: int,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool | lib.NoDefault = ...,
    low_memory: bool = ...,
    memory_map: bool = ...,
    float_precision: Literal["high", "legacy"] | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> TextFileReader:
    ...

746 

747 

# default case -> DataFrame
# Overload: with iterator=False and no chunksize the input is read eagerly
# and a DataFrame is returned.
@overload
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols: UsecolsArgType = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters: Mapping[Hashable, Callable] | None = ...,
    true_values: list | None = ...,
    false_values: list | None = ...,
    skipinitialspace: bool = ...,
    skiprows: list[int] | int | Callable[[Hashable], bool] | None = ...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values: Hashable
    | Iterable[Hashable]
    | Mapping[Hashable, Iterable[Hashable]]
    | None = ...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool | lib.NoDefault = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] | None = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool | lib.NoDefault = ...,
    date_parser: Callable | lib.NoDefault = ...,
    date_format: str | dict[Hashable, str] | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: Literal[False] = ...,
    chunksize: None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool | lib.NoDefault = ...,
    low_memory: bool = ...,
    memory_map: bool = ...,
    float_precision: Literal["high", "legacy"] | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> DataFrame:
    ...

806 

807 

# Unions -> DataFrame | TextFileReader
# Fallback overload: when iterator/chunksize are not statically known
# (plain bool / int | None), the return type cannot be narrowed.
@overload
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols: UsecolsArgType = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters: Mapping[Hashable, Callable] | None = ...,
    true_values: list | None = ...,
    false_values: list | None = ...,
    skipinitialspace: bool = ...,
    skiprows: list[int] | int | Callable[[Hashable], bool] | None = ...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values: Hashable
    | Iterable[Hashable]
    | Mapping[Hashable, Iterable[Hashable]]
    | None = ...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool | lib.NoDefault = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] | None = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool | lib.NoDefault = ...,
    date_parser: Callable | lib.NoDefault = ...,
    date_format: str | dict[Hashable, str] | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: bool = ...,
    chunksize: int | None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool | lib.NoDefault = ...,
    low_memory: bool = ...,
    memory_map: bool = ...,
    float_precision: Literal["high", "legacy"] | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> DataFrame | TextFileReader:
    ...

866 

867 

@Appender(
    _doc_read_csv_and_table.format(
        func_name="read_csv",
        summary="Read a comma-separated values (csv) file into DataFrame.",
        see_also_func_name="read_table",
        see_also_func_summary="Read general delimited file into DataFrame.",
        _default_sep="','",
        storage_options=_shared_docs["storage_options"],
        decompression_options=_shared_docs["decompression_options"]
        % "filepath_or_buffer",
    )
)
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = lib.no_default,
    delimiter: str | None | lib.NoDefault = None,
    # Column and Index Locations and Names
    header: int | Sequence[int] | None | Literal["infer"] = "infer",
    names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default,
    index_col: IndexLabel | Literal[False] | None = None,
    usecols: UsecolsArgType = None,
    # General Parsing Configuration
    dtype: DtypeArg | None = None,
    engine: CSVEngine | None = None,
    converters: Mapping[Hashable, Callable] | None = None,
    true_values: list | None = None,
    false_values: list | None = None,
    skipinitialspace: bool = False,
    skiprows: list[int] | int | Callable[[Hashable], bool] | None = None,
    skipfooter: int = 0,
    nrows: int | None = None,
    # NA and Missing Data Handling
    na_values: Hashable
    | Iterable[Hashable]
    | Mapping[Hashable, Iterable[Hashable]]
    | None = None,
    keep_default_na: bool = True,
    na_filter: bool = True,
    verbose: bool | lib.NoDefault = lib.no_default,
    skip_blank_lines: bool = True,
    # Datetime Handling
    parse_dates: bool | Sequence[Hashable] | None = None,
    infer_datetime_format: bool | lib.NoDefault = lib.no_default,
    keep_date_col: bool | lib.NoDefault = lib.no_default,
    date_parser: Callable | lib.NoDefault = lib.no_default,
    date_format: str | dict[Hashable, str] | None = None,
    dayfirst: bool = False,
    cache_dates: bool = True,
    # Iteration
    iterator: bool = False,
    chunksize: int | None = None,
    # Quoting, Compression, and File Format
    compression: CompressionOptions = "infer",
    thousands: str | None = None,
    decimal: str = ".",
    lineterminator: str | None = None,
    quotechar: str = '"',
    quoting: int = csv.QUOTE_MINIMAL,
    doublequote: bool = True,
    escapechar: str | None = None,
    comment: str | None = None,
    encoding: str | None = None,
    encoding_errors: str | None = "strict",
    dialect: str | csv.Dialect | None = None,
    # Error Handling
    on_bad_lines: str = "error",
    # Internal
    delim_whitespace: bool | lib.NoDefault = lib.no_default,
    low_memory: bool = _c_parser_defaults["low_memory"],
    memory_map: bool = False,
    float_precision: Literal["high", "legacy"] | None = None,
    storage_options: StorageOptions | None = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> DataFrame | TextFileReader:
    # Deprecation shims: each block below warns when a deprecated keyword was
    # passed explicitly, and otherwise replaces its no_default sentinel with
    # the effective default so only concrete values reach the parser kwds.
    if keep_date_col is not lib.no_default:
        # GH#55569
        warnings.warn(
            "The 'keep_date_col' keyword in pd.read_csv is deprecated and "
            "will be removed in a future version. Explicitly remove unwanted "
            "columns after parsing instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    else:
        keep_date_col = False

    if lib.is_list_like(parse_dates):
        # GH#55569: nested sequences in parse_dates (column-combining) are
        # deprecated; both the list-of-lists and dict-of-lists forms warn.
        depr = False
        # error: Item "bool" of "bool | Sequence[Hashable] | None" has no
        # attribute "__iter__" (not iterable)
        if not all(is_hashable(x) for x in parse_dates):  # type: ignore[union-attr]
            depr = True
        elif isinstance(parse_dates, dict) and any(
            lib.is_list_like(x) for x in parse_dates.values()
        ):
            depr = True
        if depr:
            warnings.warn(
                "Support for nested sequences for 'parse_dates' in pd.read_csv "
                "is deprecated. Combine the desired columns with pd.to_datetime "
                "after parsing instead.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )

    if infer_datetime_format is not lib.no_default:
        warnings.warn(
            "The argument 'infer_datetime_format' is deprecated and will "
            "be removed in a future version. "
            "A strict version of it is now the default, see "
            "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. "
            "You can safely remove this argument.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

    if delim_whitespace is not lib.no_default:
        # GH#55569
        warnings.warn(
            "The 'delim_whitespace' keyword in pd.read_csv is deprecated and "
            "will be removed in a future version. Use ``sep='\\s+'`` instead",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    else:
        delim_whitespace = False

    if verbose is not lib.no_default:
        # GH#55569
        warnings.warn(
            "The 'verbose' keyword in pd.read_csv is deprecated and "
            "will be removed in a future version.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    else:
        verbose = False

    # locals() should never be modified
    # Snapshot every keyword argument (post-normalization) into the parser
    # kwds; the path and sep are handled separately below. This is why no
    # extra locals may be introduced above this line.
    # NOTE(review): when parse_dates is list-like, the helper flag 'depr' is
    # also captured here; it appears to be ignored downstream -- confirm.
    kwds = locals().copy()
    del kwds["filepath_or_buffer"]
    del kwds["sep"]

    # Resolve the interplay of dialect/sep/delimiter/delim_whitespace/engine
    # into a single consistent set of parser options.
    kwds_defaults = _refine_defaults_read(
        dialect,
        delimiter,
        delim_whitespace,
        engine,
        sep,
        on_bad_lines,
        names,
        defaults={"delimiter": ","},
        dtype_backend=dtype_backend,
    )
    kwds.update(kwds_defaults)

    return _read(filepath_or_buffer, kwds)

1027 

1028 

# iterator=True -> TextFileReader
# Overload: an explicit iterator=True always yields a TextFileReader.
# NOTE(review): unlike the implementation, these read_table stubs type
# delim_whitespace as plain bool and float_precision as str | None
# (read_csv's stubs use lib.NoDefault / Literal) -- confirm intended.
@overload
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols: UsecolsArgType = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters: Mapping[Hashable, Callable] | None = ...,
    true_values: list | None = ...,
    false_values: list | None = ...,
    skipinitialspace: bool = ...,
    skiprows: list[int] | int | Callable[[Hashable], bool] | None = ...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool | lib.NoDefault = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool | lib.NoDefault = ...,
    date_parser: Callable | lib.NoDefault = ...,
    date_format: str | dict[Hashable, str] | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: Literal[True],
    chunksize: int | None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory: bool = ...,
    memory_map: bool = ...,
    float_precision: str | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> TextFileReader:
    ...

1084 

1085 

# chunksize=int -> TextFileReader
# Overload: a concrete int chunksize always yields a TextFileReader,
# regardless of the iterator flag.
@overload
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols: UsecolsArgType = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters: Mapping[Hashable, Callable] | None = ...,
    true_values: list | None = ...,
    false_values: list | None = ...,
    skipinitialspace: bool = ...,
    skiprows: list[int] | int | Callable[[Hashable], bool] | None = ...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool | lib.NoDefault = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool | lib.NoDefault = ...,
    date_parser: Callable | lib.NoDefault = ...,
    date_format: str | dict[Hashable, str] | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: bool = ...,
    chunksize: int,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory: bool = ...,
    memory_map: bool = ...,
    float_precision: str | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> TextFileReader:
    ...

1141 

1142 

# default -> DataFrame
# Overload for the common call pattern: with iterator left as False and
# chunksize left as None, read_table returns a fully-materialized DataFrame.
@overload
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols: UsecolsArgType = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters: Mapping[Hashable, Callable] | None = ...,
    true_values: list | None = ...,
    false_values: list | None = ...,
    skipinitialspace: bool = ...,
    skiprows: list[int] | int | Callable[[Hashable], bool] | None = ...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool | lib.NoDefault = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool | lib.NoDefault = ...,
    date_parser: Callable | lib.NoDefault = ...,
    date_format: str | dict[Hashable, str] | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: Literal[False] = ...,
    chunksize: None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory: bool = ...,
    memory_map: bool = ...,
    float_precision: str | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> DataFrame:
    ...

1198 

1199 

# Unions -> DataFrame | TextFileReader
# Fallback overload: when iterator/chunksize are not statically known
# (plain bool / int | None), the return type cannot be narrowed.
@overload
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols: UsecolsArgType = ...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters: Mapping[Hashable, Callable] | None = ...,
    true_values: list | None = ...,
    false_values: list | None = ...,
    skipinitialspace: bool = ...,
    skiprows: list[int] | int | Callable[[Hashable], bool] | None = ...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool | lib.NoDefault = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool | lib.NoDefault = ...,
    date_parser: Callable | lib.NoDefault = ...,
    date_format: str | dict[Hashable, str] | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: bool = ...,
    chunksize: int | None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory: bool = ...,
    memory_map: bool = ...,
    float_precision: str | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> DataFrame | TextFileReader:
    ...

1255 

1256 

@Appender(
    _doc_read_csv_and_table.format(
        func_name="read_table",
        summary="Read general delimited file into DataFrame.",
        see_also_func_name="read_csv",
        see_also_func_summary=(
            "Read a comma-separated values (csv) file into DataFrame."
        ),
        _default_sep=r"'\\t' (tab-stop)",
        storage_options=_shared_docs["storage_options"],
        decompression_options=_shared_docs["decompression_options"]
        % "filepath_or_buffer",
    )
)
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = lib.no_default,
    delimiter: str | None | lib.NoDefault = None,
    # Column and Index Locations and Names
    header: int | Sequence[int] | None | Literal["infer"] = "infer",
    names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default,
    index_col: IndexLabel | Literal[False] | None = None,
    usecols: UsecolsArgType = None,
    # General Parsing Configuration
    dtype: DtypeArg | None = None,
    engine: CSVEngine | None = None,
    converters: Mapping[Hashable, Callable] | None = None,
    true_values: list | None = None,
    false_values: list | None = None,
    skipinitialspace: bool = False,
    skiprows: list[int] | int | Callable[[Hashable], bool] | None = None,
    skipfooter: int = 0,
    nrows: int | None = None,
    # NA and Missing Data Handling
    na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = None,
    keep_default_na: bool = True,
    na_filter: bool = True,
    verbose: bool | lib.NoDefault = lib.no_default,
    skip_blank_lines: bool = True,
    # Datetime Handling
    parse_dates: bool | Sequence[Hashable] = False,
    infer_datetime_format: bool | lib.NoDefault = lib.no_default,
    keep_date_col: bool | lib.NoDefault = lib.no_default,
    date_parser: Callable | lib.NoDefault = lib.no_default,
    date_format: str | dict[Hashable, str] | None = None,
    dayfirst: bool = False,
    cache_dates: bool = True,
    # Iteration
    iterator: bool = False,
    chunksize: int | None = None,
    # Quoting, Compression, and File Format
    compression: CompressionOptions = "infer",
    thousands: str | None = None,
    decimal: str = ".",
    lineterminator: str | None = None,
    quotechar: str = '"',
    quoting: int = csv.QUOTE_MINIMAL,
    doublequote: bool = True,
    escapechar: str | None = None,
    comment: str | None = None,
    encoding: str | None = None,
    encoding_errors: str | None = "strict",
    dialect: str | csv.Dialect | None = None,
    # Error Handling
    on_bad_lines: str = "error",
    # Internal
    delim_whitespace: bool | lib.NoDefault = lib.no_default,
    low_memory: bool = _c_parser_defaults["low_memory"],
    memory_map: bool = False,
    float_precision: str | None = None,
    storage_options: StorageOptions | None = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> DataFrame | TextFileReader:
    # Deprecation shims (mirrors read_csv): each block below warns when a
    # deprecated keyword was passed explicitly, and otherwise replaces its
    # no_default sentinel with the effective default so only concrete values
    # reach the parser kwds.
    if keep_date_col is not lib.no_default:
        # GH#55569
        warnings.warn(
            "The 'keep_date_col' keyword in pd.read_table is deprecated and "
            "will be removed in a future version. Explicitly remove unwanted "
            "columns after parsing instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    else:
        keep_date_col = False

    if lib.is_list_like(parse_dates):
        # GH#55569: match read_csv -- warn for both the list-of-lists and the
        # dict-of-lists (column-combining) forms. Previously the dict form
        # slipped through here because iterating a dict only yields its
        # (hashable) keys.
        depr = False
        # error: Item "bool" of "bool | Sequence[Hashable]" has no attribute
        # "__iter__" (not iterable)
        if not all(is_hashable(x) for x in parse_dates):  # type: ignore[union-attr]
            depr = True
        elif isinstance(parse_dates, dict) and any(
            lib.is_list_like(x) for x in parse_dates.values()
        ):
            depr = True
        if depr:
            warnings.warn(
                "Support for nested sequences for 'parse_dates' in pd.read_table "
                "is deprecated. Combine the desired columns with pd.to_datetime "
                "after parsing instead.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )

    if infer_datetime_format is not lib.no_default:
        warnings.warn(
            "The argument 'infer_datetime_format' is deprecated and will "
            "be removed in a future version. "
            "A strict version of it is now the default, see "
            "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. "
            "You can safely remove this argument.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

    if delim_whitespace is not lib.no_default:
        # GH#55569
        warnings.warn(
            "The 'delim_whitespace' keyword in pd.read_table is deprecated and "
            "will be removed in a future version. Use ``sep='\\s+'`` instead",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    else:
        delim_whitespace = False

    if verbose is not lib.no_default:
        # GH#55569
        warnings.warn(
            "The 'verbose' keyword in pd.read_table is deprecated and "
            "will be removed in a future version.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    else:
        verbose = False

    # locals() should never be modified
    # Snapshot every keyword argument (post-normalization) into the parser
    # kwds; the path and sep are handled separately below. This is why no
    # extra locals may be introduced above this line (the helper flag 'depr'
    # is intentionally tolerated, exactly as in read_csv, and is ignored
    # downstream).
    kwds = locals().copy()
    del kwds["filepath_or_buffer"]
    del kwds["sep"]

    # Resolve the interplay of dialect/sep/delimiter/delim_whitespace/engine
    # into a single consistent set of parser options; tab is the default
    # delimiter for read_table.
    kwds_defaults = _refine_defaults_read(
        dialect,
        delimiter,
        delim_whitespace,
        engine,
        sep,
        on_bad_lines,
        names,
        defaults={"delimiter": "\t"},
        dtype_backend=dtype_backend,
    )
    kwds.update(kwds_defaults)

    return _read(filepath_or_buffer, kwds)

1406 

1407 

# iterator=True -> TextFileReader
@overload
def read_fwf(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    colspecs: Sequence[tuple[int, int]] | str | None = ...,
    widths: Sequence[int] | None = ...,
    infer_nrows: int = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
    iterator: Literal[True],
    chunksize: int | None = ...,
    **kwds,
) -> TextFileReader:
    ...

1421 

1422 

# chunksize=int -> TextFileReader
@overload
def read_fwf(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    colspecs: Sequence[tuple[int, int]] | str | None = ...,
    widths: Sequence[int] | None = ...,
    infer_nrows: int = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
    iterator: bool = ...,
    chunksize: int,
    **kwds,
) -> TextFileReader:
    ...

1436 

1437 

# default case (iterator=False, chunksize=None) -> DataFrame
@overload
def read_fwf(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    colspecs: Sequence[tuple[int, int]] | str | None = ...,
    widths: Sequence[int] | None = ...,
    infer_nrows: int = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
    iterator: Literal[False] = ...,
    chunksize: None = ...,
    **kwds,
) -> DataFrame:
    ...

1451 

1452 

def read_fwf(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    colspecs: Sequence[tuple[int, int]] | str | None = "infer",
    widths: Sequence[int] | None = None,
    infer_nrows: int = 100,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
    iterator: bool = False,
    chunksize: int | None = None,
    **kwds,
) -> DataFrame | TextFileReader:
    r"""
    Read a table of fixed-width formatted lines into DataFrame.

    Also supports optionally iterating or breaking of the file
    into chunks.

    Additional help can be found in the `online docs for IO Tools
    <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.

    Parameters
    ----------
    filepath_or_buffer : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a text ``read()`` function. The string could be a URL.
        Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.csv``.
    colspecs : list of tuple (int, int) or 'infer'. optional
        A list of tuples giving the extents of the fixed-width
        fields of each line as half-open intervals (i.e., [from, to[ ).
        String value 'infer' can be used to instruct the parser to try
        detecting the column specifications from the first 100 rows of
        the data which are not being skipped via skiprows (default='infer').
    widths : list of int, optional
        A list of field widths which can be used instead of 'colspecs' if
        the intervals are contiguous.
    infer_nrows : int, default 100
        The number of rows to consider when letting the parser determine the
        `colspecs`.
    dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable'
        Back-end data type applied to the resultant :class:`DataFrame`
        (still experimental). Behaviour is as follows:

        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
          (default).
        * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
          DataFrame.

        .. versionadded:: 2.0

    **kwds : optional
        Optional keyword arguments can be passed to ``TextFileReader``.

    Returns
    -------
    DataFrame or TextFileReader
        A comma-separated values (csv) file is returned as two-dimensional
        data structure with labeled axes.

    See Also
    --------
    DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
    read_csv : Read a comma-separated values (csv) file into DataFrame.

    Examples
    --------
    >>> pd.read_fwf('data.csv')  # doctest: +SKIP
    """
    # Validate the colspecs/widths combination: exactly one of the two may be
    # supplied (colspecs defaults to the sentinel "infer").
    if colspecs is None and widths is None:
        raise ValueError("Must specify either colspecs or widths")
    if colspecs not in (None, "infer") and widths is not None:
        raise ValueError("You must specify only one of 'widths' and 'colspecs'")

    # Translate contiguous field widths into half-open [start, stop) column
    # intervals via their cumulative edges.
    if widths is not None:
        edges = [0]
        for width in widths:
            edges.append(edges[-1] + width)
        colspecs = list(zip(edges[:-1], edges[1:]))

    # for mypy
    assert colspecs is not None

    # GH#40830
    # When explicit names are given, their count (plus any index columns)
    # has to line up with the number of column specifications.
    names = kwds.get("names")
    if names is not None and colspecs != "infer" and len(names) != len(colspecs):
        # index_col may hold unnamed indices whose names are not required,
        # so account for its length before comparing.
        index_col: Any = kwds.get("index_col")
        len_index = 0
        if index_col is not None and index_col is not False:
            len_index = len(index_col) if is_list_like(index_col) else 1
        if kwds.get("usecols") is None and len(names) + len_index != len(colspecs):
            # If usecols is used colspec may be longer than names
            raise ValueError("Length of colspecs must match length of names")

    # Route everything through the fixed-width flavour of the python engine.
    kwds.update(
        {
            "colspecs": colspecs,
            "infer_nrows": infer_nrows,
            "engine": "python-fwf",
            "iterator": iterator,
            "chunksize": chunksize,
        }
    )

    check_dtype_backend(dtype_backend)
    kwds["dtype_backend"] = dtype_backend
    return _read(filepath_or_buffer, kwds)

1566 

1567 

1568class TextFileReader(abc.Iterator): 

1569 """ 

1570 

1571 Passed dialect overrides any of the related parser options 

1572 

1573 """ 

1574 

    def __init__(
        self,
        f: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str] | list,
        engine: CSVEngine | None = None,
        **kwds,
    ) -> None:
        """
        Set up parser options and instantiate the underlying engine.

        Parameters
        ----------
        f : path, readable buffer, or list
            The data source handed to the chosen parser engine.
        engine : CSVEngine or None
            Requested engine; when None, "python" is used and the choice is
            recorded as unspecified so it may be overridden during cleaning.
        **kwds
            Raw reader options (pre-validation); stored as ``orig_options``.
        """
        if engine is not None:
            engine_specified = True
        else:
            # No engine requested: fall back to the python engine but remember
            # that the caller did not pin it down.
            engine = "python"
            engine_specified = False
        self.engine = engine
        self._engine_specified = kwds.get("engine_specified", engine_specified)

        _validate_skipfooter(kwds)

        # A csv.Dialect (or registered dialect name) overrides the individual
        # parser options it covers; merge it in before anything else reads kwds.
        dialect = _extract_dialect(kwds)
        if dialect is not None:
            if engine == "pyarrow":
                raise ValueError(
                    "The 'dialect' option is not supported with the 'pyarrow' engine"
                )
            kwds = _merge_with_dialect_properties(dialect, kwds)

        # "infer" resolves to: first row is the header unless explicit names
        # were provided, in which case there is no header row.
        if kwds.get("header", "infer") == "infer":
            kwds["header"] = 0 if kwds.get("names") is None else None

        self.orig_options = kwds

        # miscellanea
        self._currow = 0

        options = self._get_options_with_defaults(engine)
        options["storage_options"] = kwds.get("storage_options", None)

        # chunksize/nrows drive iteration at this level, not inside the engine.
        self.chunksize = options.pop("chunksize", None)
        self.nrows = options.pop("nrows", None)

        self._check_file_or_buffer(f, engine)
        # _clean_options may also swap the engine (e.g. c -> python fallback).
        self.options, self.engine = self._clean_options(options, engine)

        if "has_index_names" in kwds:
            self.options["has_index_names"] = kwds["has_index_names"]

        self.handles: IOHandles | None = None
        self._engine = self._make_engine(f, self.engine)

1621 

1622 def close(self) -> None: 

1623 if self.handles is not None: 

1624 self.handles.close() 

1625 self._engine.close() 

1626 

1627 def _get_options_with_defaults(self, engine: CSVEngine) -> dict[str, Any]: 

1628 kwds = self.orig_options 

1629 

1630 options = {} 

1631 default: object | None 

1632 

1633 for argname, default in parser_defaults.items(): 

1634 value = kwds.get(argname, default) 

1635 

1636 # see gh-12935 

1637 if ( 

1638 engine == "pyarrow" 

1639 and argname in _pyarrow_unsupported 

1640 and value != default 

1641 and value != getattr(value, "value", default) 

1642 ): 

1643 raise ValueError( 

1644 f"The {repr(argname)} option is not supported with the " 

1645 f"'pyarrow' engine" 

1646 ) 

1647 options[argname] = value 

1648 

1649 for argname, default in _c_parser_defaults.items(): 

1650 if argname in kwds: 

1651 value = kwds[argname] 

1652 

1653 if engine != "c" and value != default: 

1654 # TODO: Refactor this logic, its pretty convoluted 

1655 if "python" in engine and argname not in _python_unsupported: 

1656 pass 

1657 elif "pyarrow" in engine and argname not in _pyarrow_unsupported: 

1658 pass 

1659 else: 

1660 raise ValueError( 

1661 f"The {repr(argname)} option is not supported with the " 

1662 f"{repr(engine)} engine" 

1663 ) 

1664 else: 

1665 value = default 

1666 options[argname] = value 

1667 

1668 if engine == "python-fwf": 

1669 for argname, default in _fwf_defaults.items(): 

1670 options[argname] = kwds.get(argname, default) 

1671 

1672 return options 

1673 

1674 def _check_file_or_buffer(self, f, engine: CSVEngine) -> None: 

1675 # see gh-16530 

1676 if is_file_like(f) and engine != "c" and not hasattr(f, "__iter__"): 

1677 # The C engine doesn't need the file-like to have the "__iter__" 

1678 # attribute. However, the Python engine needs "__iter__(...)" 

1679 # when iterating through such an object, meaning it 

1680 # needs to have that attribute 

1681 raise ValueError( 

1682 "The 'python' engine cannot iterate through this file buffer." 

1683 ) 

1684 

1685 def _clean_options( 

1686 self, options: dict[str, Any], engine: CSVEngine 

1687 ) -> tuple[dict[str, Any], CSVEngine]: 

1688 result = options.copy() 

1689 

1690 fallback_reason = None 

1691 

1692 # C engine not supported yet 

1693 if engine == "c": 

1694 if options["skipfooter"] > 0: 

1695 fallback_reason = "the 'c' engine does not support skipfooter" 

1696 engine = "python" 

1697 

1698 sep = options["delimiter"] 

1699 delim_whitespace = options["delim_whitespace"] 

1700 

1701 if sep is None and not delim_whitespace: 

1702 if engine in ("c", "pyarrow"): 

1703 fallback_reason = ( 

1704 f"the '{engine}' engine does not support " 

1705 "sep=None with delim_whitespace=False" 

1706 ) 

1707 engine = "python" 

1708 elif sep is not None and len(sep) > 1: 

1709 if engine == "c" and sep == r"\s+": 

1710 result["delim_whitespace"] = True 

1711 del result["delimiter"] 

1712 elif engine not in ("python", "python-fwf"): 

1713 # wait until regex engine integrated 

1714 fallback_reason = ( 

1715 f"the '{engine}' engine does not support " 

1716 "regex separators (separators > 1 char and " 

1717 r"different from '\s+' are interpreted as regex)" 

1718 ) 

1719 engine = "python" 

1720 elif delim_whitespace: 

1721 if "python" in engine: 

1722 result["delimiter"] = r"\s+" 

1723 elif sep is not None: 

1724 encodeable = True 

1725 encoding = sys.getfilesystemencoding() or "utf-8" 

1726 try: 

1727 if len(sep.encode(encoding)) > 1: 

1728 encodeable = False 

1729 except UnicodeDecodeError: 

1730 encodeable = False 

1731 if not encodeable and engine not in ("python", "python-fwf"): 

1732 fallback_reason = ( 

1733 f"the separator encoded in {encoding} " 

1734 f"is > 1 char long, and the '{engine}' engine " 

1735 "does not support such separators" 

1736 ) 

1737 engine = "python" 

1738 

1739 quotechar = options["quotechar"] 

1740 if quotechar is not None and isinstance(quotechar, (str, bytes)): 

1741 if ( 

1742 len(quotechar) == 1 

1743 and ord(quotechar) > 127 

1744 and engine not in ("python", "python-fwf") 

1745 ): 

1746 fallback_reason = ( 

1747 "ord(quotechar) > 127, meaning the " 

1748 "quotechar is larger than one byte, " 

1749 f"and the '{engine}' engine does not support such quotechars" 

1750 ) 

1751 engine = "python" 

1752 

1753 if fallback_reason and self._engine_specified: 

1754 raise ValueError(fallback_reason) 

1755 

1756 if engine == "c": 

1757 for arg in _c_unsupported: 

1758 del result[arg] 

1759 

1760 if "python" in engine: 

1761 for arg in _python_unsupported: 

1762 if fallback_reason and result[arg] != _c_parser_defaults.get(arg): 

1763 raise ValueError( 

1764 "Falling back to the 'python' engine because " 

1765 f"{fallback_reason}, but this causes {repr(arg)} to be " 

1766 "ignored as it is not supported by the 'python' engine." 

1767 ) 

1768 del result[arg] 

1769 

1770 if fallback_reason: 

1771 warnings.warn( 

1772 ( 

1773 "Falling back to the 'python' engine because " 

1774 f"{fallback_reason}; you can avoid this warning by specifying " 

1775 "engine='python'." 

1776 ), 

1777 ParserWarning, 

1778 stacklevel=find_stack_level(), 

1779 ) 

1780 

1781 index_col = options["index_col"] 

1782 names = options["names"] 

1783 converters = options["converters"] 

1784 na_values = options["na_values"] 

1785 skiprows = options["skiprows"] 

1786 

1787 validate_header_arg(options["header"]) 

1788 

1789 if index_col is True: 

1790 raise ValueError("The value of index_col couldn't be 'True'") 

1791 if is_index_col(index_col): 

1792 if not isinstance(index_col, (list, tuple, np.ndarray)): 

1793 index_col = [index_col] 

1794 result["index_col"] = index_col 

1795 

1796 names = list(names) if names is not None else names 

1797 

1798 # type conversion-related 

1799 if converters is not None: 

1800 if not isinstance(converters, dict): 

1801 raise TypeError( 

1802 "Type converters must be a dict or subclass, " 

1803 f"input was a {type(converters).__name__}" 

1804 ) 

1805 else: 

1806 converters = {} 

1807 

1808 # Converting values to NA 

1809 keep_default_na = options["keep_default_na"] 

1810 floatify = engine != "pyarrow" 

1811 na_values, na_fvalues = _clean_na_values( 

1812 na_values, keep_default_na, floatify=floatify 

1813 ) 

1814 

1815 # handle skiprows; this is internally handled by the 

1816 # c-engine, so only need for python and pyarrow parsers 

1817 if engine == "pyarrow": 

1818 if not is_integer(skiprows) and skiprows is not None: 

1819 # pyarrow expects skiprows to be passed as an integer 

1820 raise ValueError( 

1821 "skiprows argument must be an integer when using " 

1822 "engine='pyarrow'" 

1823 ) 

1824 else: 

1825 if is_integer(skiprows): 

1826 skiprows = list(range(skiprows)) 

1827 if skiprows is None: 

1828 skiprows = set() 

1829 elif not callable(skiprows): 

1830 skiprows = set(skiprows) 

1831 

1832 # put stuff back 

1833 result["names"] = names 

1834 result["converters"] = converters 

1835 result["na_values"] = na_values 

1836 result["na_fvalues"] = na_fvalues 

1837 result["skiprows"] = skiprows 

1838 

1839 return result, engine 

1840 

1841 def __next__(self) -> DataFrame: 

1842 try: 

1843 return self.get_chunk() 

1844 except StopIteration: 

1845 self.close() 

1846 raise 

1847 

1848 def _make_engine( 

1849 self, 

1850 f: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str] | list | IO, 

1851 engine: CSVEngine = "c", 

1852 ) -> ParserBase: 

1853 mapping: dict[str, type[ParserBase]] = { 

1854 "c": CParserWrapper, 

1855 "python": PythonParser, 

1856 "pyarrow": ArrowParserWrapper, 

1857 "python-fwf": FixedWidthFieldParser, 

1858 } 

1859 if engine not in mapping: 

1860 raise ValueError( 

1861 f"Unknown engine: {engine} (valid options are {mapping.keys()})" 

1862 ) 

1863 if not isinstance(f, list): 

1864 # open file here 

1865 is_text = True 

1866 mode = "r" 

1867 if engine == "pyarrow": 

1868 is_text = False 

1869 mode = "rb" 

1870 elif ( 

1871 engine == "c" 

1872 and self.options.get("encoding", "utf-8") == "utf-8" 

1873 and isinstance(stringify_path(f), str) 

1874 ): 

1875 # c engine can decode utf-8 bytes, adding TextIOWrapper makes 

1876 # the c-engine especially for memory_map=True far slower 

1877 is_text = False 

1878 if "b" not in mode: 

1879 mode += "b" 

1880 self.handles = get_handle( 

1881 f, 

1882 mode, 

1883 encoding=self.options.get("encoding", None), 

1884 compression=self.options.get("compression", None), 

1885 memory_map=self.options.get("memory_map", False), 

1886 is_text=is_text, 

1887 errors=self.options.get("encoding_errors", "strict"), 

1888 storage_options=self.options.get("storage_options", None), 

1889 ) 

1890 assert self.handles is not None 

1891 f = self.handles.handle 

1892 

1893 elif engine != "python": 

1894 msg = f"Invalid file path or buffer object type: {type(f)}" 

1895 raise ValueError(msg) 

1896 

1897 try: 

1898 return mapping[engine](f, **self.options) 

1899 except Exception: 

1900 if self.handles is not None: 

1901 self.handles.close() 

1902 raise 

1903 

1904 def _failover_to_python(self) -> None: 

1905 raise AbstractMethodError(self) 

1906 

1907 def read(self, nrows: int | None = None) -> DataFrame: 

1908 if self.engine == "pyarrow": 

1909 try: 

1910 # error: "ParserBase" has no attribute "read" 

1911 df = self._engine.read() # type: ignore[attr-defined] 

1912 except Exception: 

1913 self.close() 

1914 raise 

1915 else: 

1916 nrows = validate_integer("nrows", nrows) 

1917 try: 

1918 # error: "ParserBase" has no attribute "read" 

1919 ( 

1920 index, 

1921 columns, 

1922 col_dict, 

1923 ) = self._engine.read( # type: ignore[attr-defined] 

1924 nrows 

1925 ) 

1926 except Exception: 

1927 self.close() 

1928 raise 

1929 

1930 if index is None: 

1931 if col_dict: 

1932 # Any column is actually fine: 

1933 new_rows = len(next(iter(col_dict.values()))) 

1934 index = RangeIndex(self._currow, self._currow + new_rows) 

1935 else: 

1936 new_rows = 0 

1937 else: 

1938 new_rows = len(index) 

1939 

1940 if hasattr(self, "orig_options"): 

1941 dtype_arg = self.orig_options.get("dtype", None) 

1942 else: 

1943 dtype_arg = None 

1944 

1945 if isinstance(dtype_arg, dict): 

1946 dtype = defaultdict(lambda: None) # type: ignore[var-annotated] 

1947 dtype.update(dtype_arg) 

1948 elif dtype_arg is not None and pandas_dtype(dtype_arg) in ( 

1949 np.str_, 

1950 np.object_, 

1951 ): 

1952 dtype = defaultdict(lambda: dtype_arg) 

1953 else: 

1954 dtype = None 

1955 

1956 if dtype is not None: 

1957 new_col_dict = {} 

1958 for k, v in col_dict.items(): 

1959 d = ( 

1960 dtype[k] 

1961 if pandas_dtype(dtype[k]) in (np.str_, np.object_) 

1962 else None 

1963 ) 

1964 new_col_dict[k] = Series(v, index=index, dtype=d, copy=False) 

1965 else: 

1966 new_col_dict = col_dict 

1967 

1968 df = DataFrame( 

1969 new_col_dict, 

1970 columns=columns, 

1971 index=index, 

1972 copy=not using_copy_on_write(), 

1973 ) 

1974 

1975 self._currow += new_rows 

1976 return df 

1977 

1978 def get_chunk(self, size: int | None = None) -> DataFrame: 

1979 if size is None: 

1980 size = self.chunksize 

1981 if self.nrows is not None: 

1982 if self._currow >= self.nrows: 

1983 raise StopIteration 

1984 size = min(size, self.nrows - self._currow) 

1985 return self.read(nrows=size) 

1986 

1987 def __enter__(self) -> Self: 

1988 return self 

1989 

1990 def __exit__( 

1991 self, 

1992 exc_type: type[BaseException] | None, 

1993 exc_value: BaseException | None, 

1994 traceback: TracebackType | None, 

1995 ) -> None: 

1996 self.close() 

1997 

1998 

def TextParser(*args, **kwds) -> TextFileReader:
    """
    Convert lists of lists/tuples into DataFrames with proper type inference
    and optional (e.g. string to datetime) conversion. Also enables iterating
    lazily over chunks of large files.

    Parameters
    ----------
    data : file-like object or list
    delimiter : separator character to use
    dialect : str or csv.Dialect instance, optional
        Ignored if delimiter is longer than 1 character
    names : sequence, default
    header : int, default 0
        Row to use to parse column labels. Defaults to the first row. Prior
        rows will be discarded
    index_col : int or list, optional
        Column or columns to use as the (possibly hierarchical) index
    has_index_names: bool, default False
        True if the cols defined in index_col have an index name and are
        not in the header.
    na_values : scalar, str, list-like, or dict, optional
        Additional strings to recognize as NA/NaN.
    keep_default_na : bool, default True
    thousands : str, optional
        Thousands separator
    comment : str, optional
        Comment out remainder of line
    parse_dates : bool, default False
    keep_date_col : bool, default False
    date_parser : function, optional

        .. deprecated:: 2.0.0
    date_format : str or dict of column -> format, default ``None``

        .. versionadded:: 2.0.0
    skiprows : list of integers
        Row numbers to skip
    skipfooter : int
        Number of line at bottom of file to skip
    converters : dict, optional
        Dict of functions for converting values in certain columns. Keys can
        either be integers or column labels, values are functions that take one
        input argument, the cell (not column) content, and return the
        transformed content.
    encoding : str, optional
        Encoding to use for UTF when reading/writing (ex. 'utf-8')
    float_precision : str, optional
        Specifies which converter the C engine should use for floating-point
        values. The options are `None` or `high` for the ordinary converter,
        `legacy` for the original lower precision pandas converter, and
        `round_trip` for the round-trip converter.
    """
    # Always parse with the python engine; any caller-supplied value is
    # overridden, matching the historical behavior of this helper.
    return TextFileReader(*args, **{**kwds, "engine": "python"})

2054 

2055 

def _clean_na_values(na_values, keep_default_na: bool = True, floatify: bool = True):
    """
    Normalize user-supplied ``na_values`` into a (string, float) pair.

    Parameters
    ----------
    na_values : None, dict, scalar or list-like
        Extra NA sentinels supplied by the user; a dict maps column -> values.
    keep_default_na : bool, default True
        Whether the default string NA sentinels are unioned in.
    floatify : bool, default True
        Whether numeric spellings are generated for scalar/list input.

    Returns
    -------
    tuple
        ``(na_values, na_fvalues)`` where each is a set, or a dict of sets
        keyed by column when a dict was passed in.
    """
    na_fvalues: set | dict

    if na_values is None:
        na_values = STR_NA_VALUES if keep_default_na else set()
        na_fvalues = set()
    elif isinstance(na_values, dict):
        # Build a fresh dict so the caller's mapping is never mutated, and
        # normalize each column's values to an array-like (plus the default
        # NaN strings when keep_default_na=True).
        cleaned: dict = {}
        for col, vals in na_values.items():
            if not is_list_like(vals):
                vals = [vals]
            if keep_default_na:
                vals = set(vals) | STR_NA_VALUES
            cleaned[col] = vals
        na_values = cleaned
        na_fvalues = {col: _floatify_na_values(vals) for col, vals in cleaned.items()}
    else:
        seq = na_values if is_list_like(na_values) else [na_values]
        na_values = _stringify_na_values(seq, floatify)
        if keep_default_na:
            na_values = na_values | STR_NA_VALUES
        na_fvalues = _floatify_na_values(na_values)

    return na_values, na_fvalues

2091 

2092 

2093def _floatify_na_values(na_values): 

2094 # create float versions of the na_values 

2095 result = set() 

2096 for v in na_values: 

2097 try: 

2098 v = float(v) 

2099 if not np.isnan(v): 

2100 result.add(v) 

2101 except (TypeError, ValueError, OverflowError): 

2102 pass 

2103 return result 

2104 

2105 

2106def _stringify_na_values(na_values, floatify: bool): 

2107 """return a stringified and numeric for these values""" 

2108 result: list[str | float] = [] 

2109 for x in na_values: 

2110 result.append(str(x)) 

2111 result.append(x) 

2112 try: 

2113 v = float(x) 

2114 

2115 # we are like 999 here 

2116 if v == int(v): 

2117 v = int(v) 

2118 result.append(f"{v}.0") 

2119 result.append(str(v)) 

2120 

2121 if floatify: 

2122 result.append(v) 

2123 except (TypeError, ValueError, OverflowError): 

2124 pass 

2125 if floatify: 

2126 try: 

2127 result.append(int(x)) 

2128 except (TypeError, ValueError, OverflowError): 

2129 pass 

2130 return set(result) 

2131 

2132 

def _refine_defaults_read(
    dialect: str | csv.Dialect | None,
    delimiter: str | None | lib.NoDefault,
    delim_whitespace: bool,
    engine: CSVEngine | None,
    sep: str | None | lib.NoDefault,
    on_bad_lines: str | Callable,
    names: Sequence[Hashable] | None | lib.NoDefault,
    defaults: dict[str, Any],
    dtype_backend: DtypeBackend | lib.NoDefault,
):
    """Validate/refine default values of input parameters of read_csv, read_table.

    Parameters
    ----------
    dialect : str or csv.Dialect
        If provided, this parameter will override values (default or not) for the
        following parameters: `delimiter`, `doublequote`, `escapechar`,
        `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
        override values, a ParserWarning will be issued. See csv.Dialect
        documentation for more details.
    delimiter : str or object
        Alias for sep.
    delim_whitespace : bool
        Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be
        used as the sep. Equivalent to setting ``sep='\\s+'``. If this option
        is set to True, nothing should be passed in for the ``delimiter``
        parameter.

        .. deprecated:: 2.2.0
            Use ``sep="\\s+"`` instead.
    engine : {{'c', 'python'}}
        Parser engine to use. The C engine is faster while the python engine is
        currently more feature-complete.
    sep : str or object
        A delimiter provided by the user (str) or a sentinel value, i.e.
        pandas._libs.lib.no_default.
    on_bad_lines : str, callable
        An option for handling bad lines or a sentinel value(None).
    names : array-like, optional
        List of column names to use. If the file contains a header row,
        then you should explicitly pass ``header=0`` to override the column names.
        Duplicates in this list are not allowed.
    defaults: dict
        Default values of input parameters.
    dtype_backend : DtypeBackend or lib.no_default
        Backend for nullable dtypes; validated via ``check_dtype_backend``.

    Returns
    -------
    kwds : dict
        Input parameters with correct values.

    Raises
    ------
    ValueError :
        If a delimiter was specified with ``sep`` (or ``delimiter``) and
        ``delim_whitespace=True``.
    """
    # fix types for sep, delimiter to Union(str, Any)
    delim_default = defaults["delimiter"]
    kwds: dict[str, Any] = {}
    # gh-23761
    #
    # When a dialect is passed, it overrides any of the overlapping
    # parameters passed in directly. We don't want to warn if the
    # default parameters were passed in (since it probably means
    # that the user didn't pass them in explicitly in the first place).
    #
    # "delimiter" is the annoying corner case because we alias it to
    # "sep" before doing comparison to the dialect values later on.
    # Thus, we need a flag to indicate that we need to "override"
    # the comparison to dialect values by checking if default values
    # for BOTH "delimiter" and "sep" were provided.
    if dialect is not None:
        kwds["sep_override"] = delimiter is None and (
            sep is lib.no_default or sep == delim_default
        )

    if delimiter and (sep is not lib.no_default):
        raise ValueError("Specified a sep and a delimiter; you can only specify one.")

    kwds["names"] = None if names is lib.no_default else names

    # Alias sep -> delimiter.
    if delimiter is None:
        delimiter = sep

    if delim_whitespace and (delimiter is not lib.no_default):
        raise ValueError(
            "Specified a delimiter with both sep and "
            "delim_whitespace=True; you can only specify one."
        )

    if delimiter == "\n":
        raise ValueError(
            r"Specified \n as separator or delimiter. This forces the python engine "
            "which does not accept a line terminator. Hence it is not allowed to use "
            "the line terminator as separator.",
        )

    if delimiter is lib.no_default:
        # assign default separator value
        kwds["delimiter"] = delim_default
    else:
        kwds["delimiter"] = delimiter

    if engine is not None:
        kwds["engine_specified"] = True
    else:
        kwds["engine"] = "c"
        kwds["engine_specified"] = False

    if on_bad_lines == "error":
        kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.ERROR
    elif on_bad_lines == "warn":
        kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.WARN
    elif on_bad_lines == "skip":
        kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.SKIP
    elif callable(on_bad_lines):
        if engine not in ["python", "pyarrow"]:
            # Fixed: the message previously said "on_bad_line", but the
            # actual keyword argument is named "on_bad_lines".
            raise ValueError(
                "on_bad_lines can only be a callable function "
                "if engine='python' or 'pyarrow'"
            )
        kwds["on_bad_lines"] = on_bad_lines
    else:
        raise ValueError(f"Argument {on_bad_lines} is invalid for on_bad_lines")

    check_dtype_backend(dtype_backend)

    kwds["dtype_backend"] = dtype_backend

    return kwds

2265 

2266 

def _extract_dialect(kwds: dict[str, Any]) -> csv.Dialect | None:
    """
    Extract a concrete csv dialect instance from the reader kwargs.

    A dialect given by name (one registered with the csv module) is resolved
    to its concrete instance; the result is validated before being returned.

    Returns
    -------
    csv.Dialect or None
        None when no dialect was requested.
    """
    dialect = kwds.get("dialect")
    if dialect is None:
        return None

    # Resolve registered dialect names ("excel", "unix", ...) to instances.
    if dialect in csv.list_dialects():
        dialect = csv.get_dialect(dialect)

    _validate_dialect(dialect)
    return dialect

2285 

2286 

# Attributes every dialect (or dialect-like object) must define to be usable
# by TextFileReader; _validate_dialect checks for them and
# _merge_with_dialect_properties copies them into the parser kwargs.
MANDATORY_DIALECT_ATTRS = (
    "delimiter",
    "doublequote",
    "escapechar",
    "skipinitialspace",
    "quotechar",
    "quoting",
)

2295 

2296 

def _validate_dialect(dialect: csv.Dialect) -> None:
    """
    Validate that a csv dialect instance defines all mandatory attributes.

    Raises
    ------
    ValueError
        If any attribute in MANDATORY_DIALECT_ATTRS is missing.
    """
    missing = [attr for attr in MANDATORY_DIALECT_ATTRS if not hasattr(dialect, attr)]
    if missing:
        raise ValueError(f"Invalid dialect {dialect} provided")

2309 

2310 

def _merge_with_dialect_properties(
    dialect: csv.Dialect,
    defaults: dict[str, Any],
) -> dict[str, Any]:
    """
    Merge default kwargs in TextFileReader with dialect parameters.

    The dialect value always wins; a ParserWarning is emitted only when a
    non-default, conflicting value was explicitly provided by the caller.

    Parameters
    ----------
    dialect : csv.Dialect
        Concrete csv dialect. See csv.Dialect documentation for more details.
    defaults : dict
        Keyword arguments passed to TextFileReader.

    Returns
    -------
    kwds : dict
        Updated keyword arguments, merged with dialect parameters.
    """
    kwds = defaults.copy()

    for param in MANDATORY_DIALECT_ATTRS:
        dialect_val = getattr(dialect, param)

        parser_default = parser_defaults[param]
        provided = kwds.get(param, parser_default)

        # Messages for conflicting values between the dialect
        # instance and the actual parameters provided.
        conflict_msgs = []

        # Don't warn if the default parameter was passed in,
        # even if it conflicts with the dialect (gh-23761).
        if provided not in (parser_default, dialect_val):
            msg = (
                f"Conflicting values for '{param}': '{provided}' was "
                f"provided, but the dialect specifies '{dialect_val}'. "
                "Using the dialect-specified value."
            )

            # Annoying corner case for not warning about
            # conflicts between dialect and delimiter parameter.
            # Refer to the outer "_read_" function for more info.
            # NOTE: the pop() both consumes the sep_override flag and
            # suppresses the warning for the delimiter parameter.
            if not (param == "delimiter" and kwds.pop("sep_override", False)):
                conflict_msgs.append(msg)

        if conflict_msgs:
            warnings.warn(
                "\n\n".join(conflict_msgs), ParserWarning, stacklevel=find_stack_level()
            )
        # Dialect value takes precedence regardless of any conflict.
        kwds[param] = dialect_val
    return kwds

2363 

2364 

2365def _validate_skipfooter(kwds: dict[str, Any]) -> None: 

2366 """ 

2367 Check whether skipfooter is compatible with other kwargs in TextFileReader. 

2368 

2369 Parameters 

2370 ---------- 

2371 kwds : dict 

2372 Keyword arguments passed to TextFileReader. 

2373 

2374 Raises 

2375 ------ 

2376 ValueError 

2377 If skipfooter is not compatible with other parameters. 

2378 """ 

2379 if kwds.get("skipfooter"): 

2380 if kwds.get("iterator") or kwds.get("chunksize"): 

2381 raise ValueError("'skipfooter' not supported for iteration") 

2382 if kwds.get("nrows"): 

2383 raise ValueError("'skipfooter' not supported with 'nrows'")