Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/io/json/_json.py: 23%


501 statements  

from __future__ import annotations

from abc import (
    ABC,
    abstractmethod,
)
from collections import abc
from io import StringIO
from itertools import islice
from types import TracebackType
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Generic,
    Literal,
    Mapping,
    TypeVar,
    overload,
)

import numpy as np

from pandas._libs import lib
from pandas._libs.json import (
    dumps,
    loads,
)
from pandas._libs.tslibs import iNaT
from pandas._typing import (
    CompressionOptions,
    DtypeArg,
    DtypeBackend,
    FilePath,
    IndexLabel,
    JSONEngine,
    JSONSerializable,
    ReadBuffer,
    StorageOptions,
    WriteBuffer,
)
from pandas.compat._optional import import_optional_dependency
from pandas.errors import AbstractMethodError
from pandas.util._decorators import doc
from pandas.util._validators import check_dtype_backend

from pandas.core.dtypes.common import (
    ensure_str,
    is_period_dtype,
)
from pandas.core.dtypes.generic import ABCIndex

from pandas import (
    ArrowDtype,
    DataFrame,
    MultiIndex,
    Series,
    isna,
    notna,
    to_datetime,
)
from pandas.core.reshape.concat import concat
from pandas.core.shared_docs import _shared_docs

from pandas.io.common import (
    IOHandles,
    dedup_names,
    extension_to_compression,
    file_exists,
    get_handle,
    is_fsspec_url,
    is_potential_multi_index,
    is_url,
    stringify_path,
)
from pandas.io.json._normalize import convert_to_line_delimits
from pandas.io.json._table_schema import (
    build_table_schema,
    parse_table_schema,
)
from pandas.io.parsers.readers import validate_integer

if TYPE_CHECKING:
    from pandas.core.generic import NDFrame

FrameSeriesStrT = TypeVar("FrameSeriesStrT", bound=Literal["frame", "series"])


# interface to/from
@overload
def to_json(
    path_or_buf: FilePath | WriteBuffer[str] | WriteBuffer[bytes],
    obj: NDFrame,
    orient: str | None = ...,
    date_format: str = ...,
    double_precision: int = ...,
    force_ascii: bool = ...,
    date_unit: str = ...,
    default_handler: Callable[[Any], JSONSerializable] | None = ...,
    lines: bool = ...,
    compression: CompressionOptions = ...,
    index: bool = ...,
    indent: int = ...,
    storage_options: StorageOptions = ...,
    mode: Literal["a", "w"] = ...,
) -> None:
    ...


@overload
def to_json(
    path_or_buf: None,
    obj: NDFrame,
    orient: str | None = ...,
    date_format: str = ...,
    double_precision: int = ...,
    force_ascii: bool = ...,
    date_unit: str = ...,
    default_handler: Callable[[Any], JSONSerializable] | None = ...,
    lines: bool = ...,
    compression: CompressionOptions = ...,
    index: bool = ...,
    indent: int = ...,
    storage_options: StorageOptions = ...,
    mode: Literal["a", "w"] = ...,
) -> str:
    ...


def to_json(
    path_or_buf: FilePath | WriteBuffer[str] | WriteBuffer[bytes] | None,
    obj: NDFrame,
    orient: str | None = None,
    date_format: str = "epoch",
    double_precision: int = 10,
    force_ascii: bool = True,
    date_unit: str = "ms",
    default_handler: Callable[[Any], JSONSerializable] | None = None,
    lines: bool = False,
    compression: CompressionOptions = "infer",
    index: bool = True,
    indent: int = 0,
    storage_options: StorageOptions = None,
    mode: Literal["a", "w"] = "w",
) -> str | None:
    if not index and orient not in ["split", "table"]:
        raise ValueError(
            "'index=False' is only valid when 'orient' is 'split' or 'table'"
        )

    if lines and orient != "records":
        raise ValueError("'lines' keyword is only valid when 'orient' is 'records'")

    if mode not in ["a", "w"]:
        msg = (
            f"mode={mode} is not a valid option. "
            "Only 'w' and 'a' are currently supported."
        )
        raise ValueError(msg)

    if mode == "a" and (not lines or orient != "records"):
        msg = (
            "mode='a' (append) is only supported when "
            "lines is True and orient is 'records'"
        )
        raise ValueError(msg)

    if orient == "table" and isinstance(obj, Series):
        obj = obj.to_frame(name=obj.name or "values")

    writer: type[Writer]
    if orient == "table" and isinstance(obj, DataFrame):
        writer = JSONTableWriter
    elif isinstance(obj, Series):
        writer = SeriesWriter
    elif isinstance(obj, DataFrame):
        writer = FrameWriter
    else:
        raise NotImplementedError("'obj' should be a Series or a DataFrame")

    s = writer(
        obj,
        orient=orient,
        date_format=date_format,
        double_precision=double_precision,
        ensure_ascii=force_ascii,
        date_unit=date_unit,
        default_handler=default_handler,
        index=index,
        indent=indent,
    ).write()

    if lines:
        s = convert_to_line_delimits(s)

    if path_or_buf is not None:
        # apply compression and byte/text conversion
        with get_handle(
            path_or_buf, mode, compression=compression, storage_options=storage_options
        ) as handles:
            handles.handle.write(s)
    else:
        return s
    return None
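

# Editor's sketch (not part of pandas): a quick illustration of the rules
# enforced by to_json() above, exercised through the public DataFrame.to_json
# API, which forwards here. The _demo_* name is hypothetical.
def _demo_to_json_validation() -> None:
    import pandas as pd

    df = pd.DataFrame({"a": [1, 2]})
    # lines=True requires orient="records"; output is newline-delimited.
    print(df.to_json(orient="records", lines=True))  # {"a":1}\n{"a":2}
    try:
        # default orient for a DataFrame is "columns", so this raises
        df.to_json(lines=True)
    except ValueError as err:
        print(err)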


class Writer(ABC):
    _default_orient: str

    def __init__(
        self,
        obj: NDFrame,
        orient: str | None,
        date_format: str,
        double_precision: int,
        ensure_ascii: bool,
        date_unit: str,
        index: bool,
        default_handler: Callable[[Any], JSONSerializable] | None = None,
        indent: int = 0,
    ) -> None:
        self.obj = obj

        if orient is None:
            orient = self._default_orient

        self.orient = orient
        self.date_format = date_format
        self.double_precision = double_precision
        self.ensure_ascii = ensure_ascii
        self.date_unit = date_unit
        self.default_handler = default_handler
        self.index = index
        self.indent = indent

        self.is_copy = None
        self._format_axes()

    def _format_axes(self):
        raise AbstractMethodError(self)

    def write(self) -> str:
        iso_dates = self.date_format == "iso"
        return dumps(
            self.obj_to_write,
            orient=self.orient,
            double_precision=self.double_precision,
            ensure_ascii=self.ensure_ascii,
            date_unit=self.date_unit,
            iso_dates=iso_dates,
            default_handler=self.default_handler,
            indent=self.indent,
        )

    @property
    @abstractmethod
    def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]:
        """Object to write in JSON format."""


class SeriesWriter(Writer):
    _default_orient = "index"

    @property
    def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]:
        if not self.index and self.orient == "split":
            return {"name": self.obj.name, "data": self.obj.values}
        else:
            return self.obj

    def _format_axes(self):
        if not self.obj.index.is_unique and self.orient == "index":
            raise ValueError(f"Series index must be unique for orient='{self.orient}'")


class FrameWriter(Writer):
    _default_orient = "columns"

    @property
    def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]:
        if not self.index and self.orient == "split":
            obj_to_write = self.obj.to_dict(orient="split")
            del obj_to_write["index"]
        else:
            obj_to_write = self.obj
        return obj_to_write

    def _format_axes(self):
        """
        Validate that the axes are unique for the requested orient.
        """
        if not self.obj.index.is_unique and self.orient in ("index", "columns"):
            raise ValueError(
                f"DataFrame index must be unique for orient='{self.orient}'."
            )
        if not self.obj.columns.is_unique and self.orient in (
            "index",
            "columns",
            "records",
        ):
            raise ValueError(
                f"DataFrame columns must be unique for orient='{self.orient}'."
            )
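

# Editor's sketch (not part of pandas): why the uniqueness checks above
# exist. With orient="columns" the index labels become JSON object keys, so
# duplicates would silently collide; orient="split" stores labels in a list
# and tolerates them. The _demo_* name is hypothetical.
def _demo_unique_index_requirement() -> None:
    import pandas as pd

    df = pd.DataFrame({"x": [1, 2]}, index=["a", "a"])
    try:
        df.to_json(orient="columns")
    except ValueError as err:
        print(err)  # DataFrame index must be unique for orient='columns'.
    df.to_json(orient="split")  # fine: labels are kept as a list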

305 

306class JSONTableWriter(FrameWriter): 

307 _default_orient = "records" 

308 

309 def __init__( 

310 self, 

311 obj, 

312 orient: str | None, 

313 date_format: str, 

314 double_precision: int, 

315 ensure_ascii: bool, 

316 date_unit: str, 

317 index: bool, 

318 default_handler: Callable[[Any], JSONSerializable] | None = None, 

319 indent: int = 0, 

320 ) -> None: 

321 """ 

322 Adds a `schema` attribute with the Table Schema, resets 

323 the index (can't do in caller, because the schema inference needs 

324 to know what the index is, forces orient to records, and forces 

325 date_format to 'iso'. 

326 """ 

327 super().__init__( 

328 obj, 

329 orient, 

330 date_format, 

331 double_precision, 

332 ensure_ascii, 

333 date_unit, 

334 index, 

335 default_handler=default_handler, 

336 indent=indent, 

337 ) 

338 

339 if date_format != "iso": 

340 msg = ( 

341 "Trying to write with `orient='table'` and " 

342 f"`date_format='{date_format}'`. Table Schema requires dates " 

343 "to be formatted with `date_format='iso'`" 

344 ) 

345 raise ValueError(msg) 

346 

347 self.schema = build_table_schema(obj, index=self.index) 

348 

349 # NotImplemented on a column MultiIndex 

350 if obj.ndim == 2 and isinstance(obj.columns, MultiIndex): 

351 raise NotImplementedError( 

352 "orient='table' is not supported for MultiIndex columns" 

353 ) 

354 

355 # TODO: Do this timedelta properly in objToJSON.c See GH #15137 

356 if ( 

357 (obj.ndim == 1) 

358 and (obj.name in set(obj.index.names)) 

359 or len(obj.columns.intersection(obj.index.names)) 

360 ): 

361 msg = "Overlapping names between the index and columns" 

362 raise ValueError(msg) 

363 

364 obj = obj.copy() 

365 timedeltas = obj.select_dtypes(include=["timedelta"]).columns 

366 if len(timedeltas): 

367 obj[timedeltas] = obj[timedeltas].applymap(lambda x: x.isoformat()) 

368 # Convert PeriodIndex to datetimes before serializing 

369 if is_period_dtype(obj.index.dtype): 

370 obj.index = obj.index.to_timestamp() 

371 

372 # exclude index from obj if index=False 

373 if not self.index: 

374 self.obj = obj.reset_index(drop=True) 

375 else: 

376 self.obj = obj.reset_index(drop=False) 

377 self.date_format = "iso" 

378 self.orient = "records" 

379 self.index = index 

380 

381 @property 

382 def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]: 

383 return {"schema": self.schema, "data": self.obj} 

384 

385 
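

# Editor's sketch (not part of pandas): what JSONTableWriter produces.
# orient="table" emits a {"schema": ..., "data": ...} document where the
# schema follows the Table Schema spec and the index has been reset into an
# ordinary column. The _demo_* name is hypothetical.
def _demo_table_orient() -> None:
    import pandas as pd

    df = pd.DataFrame({"a": [1]}, index=pd.Index(["r0"], name="key"))
    out = df.to_json(orient="table")
    # "schema" carries fields/primaryKey/pandas_version; "data" carries the
    # records, including the former index as a "key" column.
    assert '"schema"' in out and '"data"' in out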


@overload
def read_json(
    path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
    *,
    orient: str | None = ...,
    typ: Literal["frame"] = ...,
    dtype: DtypeArg | None = ...,
    convert_axes=...,
    convert_dates: bool | list[str] = ...,
    keep_default_dates: bool = ...,
    precise_float: bool = ...,
    date_unit: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    lines: bool = ...,
    chunksize: int,
    compression: CompressionOptions = ...,
    nrows: int | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
    engine: JSONEngine = ...,
) -> JsonReader[Literal["frame"]]:
    ...


@overload
def read_json(
    path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
    *,
    orient: str | None = ...,
    typ: Literal["series"],
    dtype: DtypeArg | None = ...,
    convert_axes=...,
    convert_dates: bool | list[str] = ...,
    keep_default_dates: bool = ...,
    precise_float: bool = ...,
    date_unit: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    lines: bool = ...,
    chunksize: int,
    compression: CompressionOptions = ...,
    nrows: int | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
    engine: JSONEngine = ...,
) -> JsonReader[Literal["series"]]:
    ...


@overload
def read_json(
    path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
    *,
    orient: str | None = ...,
    typ: Literal["series"],
    dtype: DtypeArg | None = ...,
    convert_axes=...,
    convert_dates: bool | list[str] = ...,
    keep_default_dates: bool = ...,
    precise_float: bool = ...,
    date_unit: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    lines: bool = ...,
    chunksize: None = ...,
    compression: CompressionOptions = ...,
    nrows: int | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
    engine: JSONEngine = ...,
) -> Series:
    ...


@overload
def read_json(
    path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
    *,
    orient: str | None = ...,
    typ: Literal["frame"] = ...,
    dtype: DtypeArg | None = ...,
    convert_axes=...,
    convert_dates: bool | list[str] = ...,
    keep_default_dates: bool = ...,
    precise_float: bool = ...,
    date_unit: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    lines: bool = ...,
    chunksize: None = ...,
    compression: CompressionOptions = ...,
    nrows: int | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
    engine: JSONEngine = ...,
) -> DataFrame:
    ...


@doc(
    storage_options=_shared_docs["storage_options"],
    decompression_options=_shared_docs["decompression_options"] % "path_or_buf",
)
def read_json(
    path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
    *,
    orient: str | None = None,
    typ: Literal["frame", "series"] = "frame",
    dtype: DtypeArg | None = None,
    convert_axes=None,
    convert_dates: bool | list[str] = True,
    keep_default_dates: bool = True,
    precise_float: bool = False,
    date_unit: str | None = None,
    encoding: str | None = None,
    encoding_errors: str | None = "strict",
    lines: bool = False,
    chunksize: int | None = None,
    compression: CompressionOptions = "infer",
    nrows: int | None = None,
    storage_options: StorageOptions = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
    engine: JSONEngine = "ujson",
) -> DataFrame | Series | JsonReader:

511 """ 

512 Convert a JSON string to pandas object. 

513 

514 Parameters 

515 ---------- 

516 path_or_buf : a valid JSON str, path object or file-like object 

517 Any valid string path is acceptable. The string could be a URL. Valid 

518 URL schemes include http, ftp, s3, and file. For file URLs, a host is 

519 expected. A local file could be: 

520 ``file://localhost/path/to/table.json``. 

521 

522 If you want to pass in a path object, pandas accepts any 

523 ``os.PathLike``. 

524 

525 By file-like object, we refer to objects with a ``read()`` method, 

526 such as a file handle (e.g. via builtin ``open`` function) 

527 or ``StringIO``. 

528 orient : str, optional 

529 Indication of expected JSON string format. 

530 Compatible JSON strings can be produced by ``to_json()`` with a 

531 corresponding orient value. 

532 The set of possible orients is: 

533 

534 - ``'split'`` : dict like 

535 ``{{index -> [index], columns -> [columns], data -> [values]}}`` 

536 - ``'records'`` : list like 

537 ``[{{column -> value}}, ... , {{column -> value}}]`` 

538 - ``'index'`` : dict like ``{{index -> {{column -> value}}}}`` 

539 - ``'columns'`` : dict like ``{{column -> {{index -> value}}}}`` 

540 - ``'values'`` : just the values array 

541 

542 The allowed and default values depend on the value 

543 of the `typ` parameter. 

544 

545 * when ``typ == 'series'``, 

546 

547 - allowed orients are ``{{'split','records','index'}}`` 

548 - default is ``'index'`` 

549 - The Series index must be unique for orient ``'index'``. 

550 

551 * when ``typ == 'frame'``, 

552 

553 - allowed orients are ``{{'split','records','index', 

554 'columns','values', 'table'}}`` 

555 - default is ``'columns'`` 

556 - The DataFrame index must be unique for orients ``'index'`` and 

557 ``'columns'``. 

558 - The DataFrame columns must be unique for orients ``'index'``, 

559 ``'columns'``, and ``'records'``. 

560 

561 typ : {{'frame', 'series'}}, default 'frame' 

562 The type of object to recover. 

563 

564 dtype : bool or dict, default None 

565 If True, infer dtypes; if a dict of column to dtype, then use those; 

566 if False, then don't infer dtypes at all, applies only to the data. 

567 

568 For all ``orient`` values except ``'table'``, default is True. 

569 

570 convert_axes : bool, default None 

571 Try to convert the axes to the proper dtypes. 

572 

573 For all ``orient`` values except ``'table'``, default is True. 

574 

575 convert_dates : bool or list of str, default True 

576 If True then default datelike columns may be converted (depending on 

577 keep_default_dates). 

578 If False, no dates will be converted. 

579 If a list of column names, then those columns will be converted and 

580 default datelike columns may also be converted (depending on 

581 keep_default_dates). 

582 

583 keep_default_dates : bool, default True 

584 If parsing dates (convert_dates is not False), then try to parse the 

585 default datelike columns. 

586 A column label is datelike if 

587 

588 * it ends with ``'_at'``, 

589 

590 * it ends with ``'_time'``, 

591 

592 * it begins with ``'timestamp'``, 

593 

594 * it is ``'modified'``, or 

595 

596 * it is ``'date'``. 

597 

598 precise_float : bool, default False 

599 Set to enable usage of higher precision (strtod) function when 

600 decoding string to double values. Default (False) is to use fast but 

601 less precise builtin functionality. 

602 

603 date_unit : str, default None 

604 The timestamp unit to detect if converting dates. The default behaviour 

605 is to try and detect the correct precision, but if this is not desired 

606 then pass one of 's', 'ms', 'us' or 'ns' to force parsing only seconds, 

607 milliseconds, microseconds or nanoseconds respectively. 

608 

609 encoding : str, default is 'utf-8' 

610 The encoding to use to decode py3 bytes. 

611 

612 encoding_errors : str, optional, default "strict" 

613 How encoding errors are treated. `List of possible values 

614 <https://docs.python.org/3/library/codecs.html#error-handlers>`_ . 

615 

616 .. versionadded:: 1.3.0 

617 

618 lines : bool, default False 

619 Read the file as a json object per line. 

620 

621 chunksize : int, optional 

622 Return JsonReader object for iteration. 

623 See the `line-delimited json docs 

624 <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#line-delimited-json>`_ 

625 for more information on ``chunksize``. 

626 This can only be passed if `lines=True`. 

627 If this is None, the file will be read into memory all at once. 

628 

629 .. versionchanged:: 1.2 

630 

631 ``JsonReader`` is a context manager. 

632 

633 {decompression_options} 

634 

635 .. versionchanged:: 1.4.0 Zstandard support. 

636 

637 nrows : int, optional 

638 The number of lines from the line-delimited jsonfile that has to be read. 

639 This can only be passed if `lines=True`. 

640 If this is None, all the rows will be returned. 

641 

642 .. versionadded:: 1.1 

643 

644 {storage_options} 

645 

646 .. versionadded:: 1.2.0 

647 

648 dtype_backend : {{"numpy_nullable", "pyarrow"}}, defaults to NumPy backed DataFrames 

649 Which dtype_backend to use, e.g. whether a DataFrame should have NumPy 

650 arrays, nullable dtypes are used for all dtypes that have a nullable 

651 implementation when "numpy_nullable" is set, pyarrow is used for all 

652 dtypes if "pyarrow" is set. 

653 

654 The dtype_backends are still experimential. 

655 

656 .. versionadded:: 2.0 

657 

658 engine : {{"ujson", "pyarrow"}}, default "ujson" 

659 Parser engine to use. The ``"pyarrow"`` engine is only available when 

660 ``lines=True``. 

661 

662 .. versionadded:: 2.0 

663 

664 Returns 

665 ------- 

666 Series or DataFrame 

667 The type returned depends on the value of `typ`. 

668 

669 See Also 

670 -------- 

671 DataFrame.to_json : Convert a DataFrame to a JSON string. 

672 Series.to_json : Convert a Series to a JSON string. 

673 json_normalize : Normalize semi-structured JSON data into a flat table. 

674 

675 Notes 

676 ----- 

677 Specific to ``orient='table'``, if a :class:`DataFrame` with a literal 

678 :class:`Index` name of `index` gets written with :func:`to_json`, the 

679 subsequent read operation will incorrectly set the :class:`Index` name to 

680 ``None``. This is because `index` is also used by :func:`DataFrame.to_json` 

681 to denote a missing :class:`Index` name, and the subsequent 

682 :func:`read_json` operation cannot distinguish between the two. The same 

683 limitation is encountered with a :class:`MultiIndex` and any names 

684 beginning with ``'level_'``. 

685 

686 Examples 

687 -------- 

688 >>> df = pd.DataFrame([['a', 'b'], ['c', 'd']], 

689 ... index=['row 1', 'row 2'], 

690 ... columns=['col 1', 'col 2']) 

691 

692 Encoding/decoding a Dataframe using ``'split'`` formatted JSON: 

693 

694 >>> df.to_json(orient='split') 

695 '\ 

696{{\ 

697"columns":["col 1","col 2"],\ 

698"index":["row 1","row 2"],\ 

699"data":[["a","b"],["c","d"]]\ 

700}}\ 

701' 

702 >>> pd.read_json(_, orient='split') 

703 col 1 col 2 

704 row 1 a b 

705 row 2 c d 

706 

707 Encoding/decoding a Dataframe using ``'index'`` formatted JSON: 

708 

709 >>> df.to_json(orient='index') 

710 '{{"row 1":{{"col 1":"a","col 2":"b"}},"row 2":{{"col 1":"c","col 2":"d"}}}}' 

711 

712 >>> pd.read_json(_, orient='index') 

713 col 1 col 2 

714 row 1 a b 

715 row 2 c d 

716 

717 Encoding/decoding a Dataframe using ``'records'`` formatted JSON. 

718 Note that index labels are not preserved with this encoding. 

719 

720 >>> df.to_json(orient='records') 

721 '[{{"col 1":"a","col 2":"b"}},{{"col 1":"c","col 2":"d"}}]' 

722 >>> pd.read_json(_, orient='records') 

723 col 1 col 2 

724 0 a b 

725 1 c d 

726 

727 Encoding with Table Schema 

728 

729 >>> df.to_json(orient='table') 

730 '\ 

731{{"schema":{{"fields":[\ 

732{{"name":"index","type":"string"}},\ 

733{{"name":"col 1","type":"string"}},\ 

734{{"name":"col 2","type":"string"}}],\ 

735"primaryKey":["index"],\ 

736"pandas_version":"1.4.0"}},\ 

737"data":[\ 

738{{"index":"row 1","col 1":"a","col 2":"b"}},\ 

739{{"index":"row 2","col 1":"c","col 2":"d"}}]\ 

740}}\ 

741' 

742 """ 

    if orient == "table" and dtype:
        raise ValueError("cannot pass both dtype and orient='table'")
    if orient == "table" and convert_axes:
        raise ValueError("cannot pass both convert_axes and orient='table'")

    check_dtype_backend(dtype_backend)

    if dtype is None and orient != "table":
        # error: Incompatible types in assignment (expression has type "bool", variable
        # has type "Union[ExtensionDtype, str, dtype[Any], Type[str], Type[float],
        # Type[int], Type[complex], Type[bool], Type[object], Dict[Hashable,
        # Union[ExtensionDtype, Union[str, dtype[Any]], Type[str], Type[float],
        # Type[int], Type[complex], Type[bool], Type[object]]], None]")
        dtype = True  # type: ignore[assignment]
    if convert_axes is None and orient != "table":
        convert_axes = True

    json_reader = JsonReader(
        path_or_buf,
        orient=orient,
        typ=typ,
        dtype=dtype,
        convert_axes=convert_axes,
        convert_dates=convert_dates,
        keep_default_dates=keep_default_dates,
        precise_float=precise_float,
        date_unit=date_unit,
        encoding=encoding,
        lines=lines,
        chunksize=chunksize,
        compression=compression,
        nrows=nrows,
        storage_options=storage_options,
        encoding_errors=encoding_errors,
        dtype_backend=dtype_backend,
        engine=engine,
    )

    if chunksize:
        return json_reader
    else:
        return json_reader.read()
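

# Editor's sketch (not part of pandas): the two return paths above. With
# chunksize, read_json returns the JsonReader itself for iteration;
# otherwise it parses eagerly. The _demo_* name is hypothetical.
def _demo_read_json_chunked() -> None:
    from io import StringIO

    import pandas as pd

    payload = '{"a": 1}\n{"a": 2}\n{"a": 3}\n'
    with pd.read_json(StringIO(payload), lines=True, chunksize=2) as reader:
        for chunk in reader:
            print(len(chunk))  # 2, then 1
    # Without chunksize the whole document is parsed at once:
    assert len(pd.read_json(StringIO(payload), lines=True)) == 3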


class JsonReader(abc.Iterator, Generic[FrameSeriesStrT]):
    """
    JsonReader provides an interface for reading in a JSON file.

    If initialized with ``lines=True`` and ``chunksize``, can be iterated over
    ``chunksize`` lines at a time. Otherwise, calling ``read`` reads in the
    whole document.
    """

    def __init__(
        self,
        filepath_or_buffer,
        orient,
        typ: FrameSeriesStrT,
        dtype,
        convert_axes,
        convert_dates,
        keep_default_dates: bool,
        precise_float: bool,
        date_unit,
        encoding,
        lines: bool,
        chunksize: int | None,
        compression: CompressionOptions,
        nrows: int | None,
        storage_options: StorageOptions = None,
        encoding_errors: str | None = "strict",
        dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
        engine: JSONEngine = "ujson",
    ) -> None:
        self.orient = orient
        self.typ = typ
        self.dtype = dtype
        self.convert_axes = convert_axes
        self.convert_dates = convert_dates
        self.keep_default_dates = keep_default_dates
        self.precise_float = precise_float
        self.date_unit = date_unit
        self.encoding = encoding
        self.engine = engine
        self.compression = compression
        self.storage_options = storage_options
        self.lines = lines
        self.chunksize = chunksize
        self.nrows_seen = 0
        self.nrows = nrows
        self.encoding_errors = encoding_errors
        self.handles: IOHandles[str] | None = None
        self.dtype_backend = dtype_backend

        if self.engine not in {"pyarrow", "ujson"}:
            raise ValueError(
                f"The engine type {self.engine} is currently not supported."
            )
        if self.chunksize is not None:
            self.chunksize = validate_integer("chunksize", self.chunksize, 1)
            if not self.lines:
                raise ValueError("chunksize can only be passed if lines=True")
            if self.engine == "pyarrow":
                raise ValueError(
                    "currently pyarrow engine doesn't support chunksize parameter"
                )
        if self.nrows is not None:
            self.nrows = validate_integer("nrows", self.nrows, 0)
            if not self.lines:
                raise ValueError("nrows can only be passed if lines=True")
        if self.engine == "pyarrow":
            if not self.lines:
                raise ValueError(
                    "currently pyarrow engine only supports "
                    "the line-delimited JSON format"
                )
            self.data = filepath_or_buffer
        elif self.engine == "ujson":
            data = self._get_data_from_filepath(filepath_or_buffer)
            self.data = self._preprocess_data(data)

    def _preprocess_data(self, data):
        """
        At this point, the data either has a `read` attribute (e.g. a file
        object or a StringIO) or is a string that is a JSON document.

        If self.chunksize, we prepare the data for the `__next__` method.
        Otherwise, we read it into memory for the `read` method.
        """
        if hasattr(data, "read") and not (self.chunksize or self.nrows):
            with self:
                data = data.read()
        if not hasattr(data, "read") and (self.chunksize or self.nrows):
            data = StringIO(data)

        return data
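
    # Editor's note (illustrative, not pandas API): the branches above
    # normalize the input so later stages see the right shape. A one-shot
    # parse drains a file-like object into a string; a chunked read wraps a
    # plain string, e.g. StringIO('{"a": 1}\n{"a": 2}\n'), so that __next__
    # can pull one JSON document per line via islice.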

    def _get_data_from_filepath(self, filepath_or_buffer):
        """
        The function read_json accepts three input types:
            1. filepath (string-like)
            2. file-like object (e.g. open file object, StringIO)
            3. JSON string

        This method turns (1) into (2) to simplify the rest of the processing.
        It returns input types (2) and (3) unchanged.

        It raises FileNotFoundError if the input is a string ending in
        one of .json, .json.gz, .json.bz2, etc. but no such file exists.
        """
        # if it is a string but the file does not exist, it might be a JSON string
        filepath_or_buffer = stringify_path(filepath_or_buffer)
        if (
            not isinstance(filepath_or_buffer, str)
            or is_url(filepath_or_buffer)
            or is_fsspec_url(filepath_or_buffer)
            or file_exists(filepath_or_buffer)
        ):
            self.handles = get_handle(
                filepath_or_buffer,
                "r",
                encoding=self.encoding,
                compression=self.compression,
                storage_options=self.storage_options,
                errors=self.encoding_errors,
            )
            filepath_or_buffer = self.handles.handle
        elif (
            isinstance(filepath_or_buffer, str)
            and filepath_or_buffer.lower().endswith(
                (".json",) + tuple(f".json{c}" for c in extension_to_compression)
            )
            and not file_exists(filepath_or_buffer)
        ):
            raise FileNotFoundError(f"File {filepath_or_buffer} does not exist")

        return filepath_or_buffer
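
    # Editor's note (illustrative): per the branching above, a string that is
    # not a URL and does not exist on disk is treated as literal JSON, unless
    # it carries a .json(.gz/.bz2/...) suffix, in which case the missing file
    # raises FileNotFoundError instead of a confusing parse error:
    #
    #     pd.read_json('{"a": {"0": 1}}')   # parsed as a JSON string
    #     pd.read_json("missing.json")      # FileNotFoundError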

    def _combine_lines(self, lines) -> str:
        """
        Combines a list of JSON objects into one JSON object.
        """
        return (
            f'[{",".join([line for line in (line.strip() for line in lines) if line])}]'
        )
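
    # Editor's note (illustrative): _combine_lines turns newline-delimited
    # JSON into a single JSON array so a chunk can be parsed in one pass;
    # blank lines are dropped by the inner filter:
    #
    #     ['{"a":1}', '', '{"a":2}']  ->  '[{"a":1},{"a":2}]'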

    @overload
    def read(self: JsonReader[Literal["frame"]]) -> DataFrame:
        ...

    @overload
    def read(self: JsonReader[Literal["series"]]) -> Series:
        ...

    @overload
    def read(self: JsonReader[Literal["frame", "series"]]) -> DataFrame | Series:
        ...

    def read(self) -> DataFrame | Series:
        """
        Read the whole JSON input into a pandas object.
        """
        obj: DataFrame | Series
        with self:
            if self.engine == "pyarrow":
                pyarrow_json = import_optional_dependency("pyarrow.json")
                pa_table = pyarrow_json.read_json(self.data)

                mapping: type[ArrowDtype] | None | Callable
                if self.dtype_backend == "pyarrow":
                    mapping = ArrowDtype
                elif self.dtype_backend == "numpy_nullable":
                    from pandas.io._util import _arrow_dtype_mapping

                    mapping = _arrow_dtype_mapping().get
                else:
                    mapping = None

                return pa_table.to_pandas(types_mapper=mapping)
            elif self.engine == "ujson":
                if self.lines:
                    if self.chunksize:
                        obj = concat(self)
                    elif self.nrows:
                        lines = list(islice(self.data, self.nrows))
                        lines_json = self._combine_lines(lines)
                        obj = self._get_object_parser(lines_json)
                    else:
                        data = ensure_str(self.data)
                        data_lines = data.split("\n")
                        obj = self._get_object_parser(self._combine_lines(data_lines))
                else:
                    obj = self._get_object_parser(self.data)
                if self.dtype_backend is not lib.no_default:
                    return obj.convert_dtypes(
                        infer_objects=False, dtype_backend=self.dtype_backend
                    )
                else:
                    return obj
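
    # Editor's note (illustrative): read() is the eager path. With the ujson
    # engine and lines=True, nrows trims the input via islice before
    # combining, while chunksize makes concat(self) drive __next__ below. The
    # pyarrow engine bypasses all of this and lets pyarrow.json build the
    # table directly, honoring dtype_backend through the types_mapper above.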

    def _get_object_parser(self, json) -> DataFrame | Series:
        """
        Parses a json document into a pandas object.
        """
        typ = self.typ
        dtype = self.dtype
        kwargs = {
            "orient": self.orient,
            "dtype": self.dtype,
            "convert_axes": self.convert_axes,
            "convert_dates": self.convert_dates,
            "keep_default_dates": self.keep_default_dates,
            "precise_float": self.precise_float,
            "date_unit": self.date_unit,
            "dtype_backend": self.dtype_backend,
        }
        obj = None
        if typ == "frame":
            obj = FrameParser(json, **kwargs).parse()

        if typ == "series" or obj is None:
            if not isinstance(dtype, bool):
                kwargs["dtype"] = dtype
            obj = SeriesParser(json, **kwargs).parse()

        return obj

    def close(self) -> None:
        """
        If we opened a stream earlier, in _get_data_from_filepath, we should
        close it.

        If an open stream or file was passed, we leave it open.
        """
        if self.handles is not None:
            self.handles.close()

    def __iter__(self: JsonReader[FrameSeriesStrT]) -> JsonReader[FrameSeriesStrT]:
        return self

    @overload
    def __next__(self: JsonReader[Literal["frame"]]) -> DataFrame:
        ...

    @overload
    def __next__(self: JsonReader[Literal["series"]]) -> Series:
        ...

    @overload
    def __next__(self: JsonReader[Literal["frame", "series"]]) -> DataFrame | Series:
        ...

    def __next__(self) -> DataFrame | Series:
        if self.nrows and self.nrows_seen >= self.nrows:
            self.close()
            raise StopIteration

        lines = list(islice(self.data, self.chunksize))
        if not lines:
            self.close()
            raise StopIteration

        try:
            lines_json = self._combine_lines(lines)
            obj = self._get_object_parser(lines_json)

            # Make sure that the returned objects have the right index.
            obj.index = range(self.nrows_seen, self.nrows_seen + len(obj))
            self.nrows_seen += len(obj)
        except Exception as ex:
            self.close()
            raise ex

        if self.dtype_backend is not lib.no_default:
            return obj.convert_dtypes(
                infer_objects=False, dtype_backend=self.dtype_backend
            )
        else:
            return obj

    def __enter__(self) -> JsonReader[FrameSeriesStrT]:
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        traceback: TracebackType | None,
    ) -> None:
        self.close()
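

# Editor's sketch (not part of pandas): __next__ above re-labels each chunk
# so row labels continue across chunks (via nrows_seen), and close() only
# closes handles the reader opened itself. The _demo_* name is hypothetical.
def _demo_chunk_indexing() -> None:
    from io import StringIO

    import pandas as pd

    payload = "\n".join(f'{{"a": {i}}}' for i in range(5))
    with pd.read_json(StringIO(payload), lines=True, chunksize=2) as reader:
        starts = [chunk.index[0] for chunk in reader]
    assert starts == [0, 2, 4]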


class Parser:
    _split_keys: tuple[str, ...]
    _default_orient: str

    _STAMP_UNITS = ("s", "ms", "us", "ns")
    _MIN_STAMPS = {
        "s": 31536000,
        "ms": 31536000000,
        "us": 31536000000000,
        "ns": 31536000000000000,
    }

    def __init__(
        self,
        json,
        orient,
        dtype: DtypeArg | None = None,
        convert_axes: bool = True,
        convert_dates: bool | list[str] = True,
        keep_default_dates: bool = False,
        precise_float: bool = False,
        date_unit=None,
        dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
    ) -> None:
        self.json = json

        if orient is None:
            orient = self._default_orient

        self.orient = orient

        self.dtype = dtype

        if date_unit is not None:
            date_unit = date_unit.lower()
            if date_unit not in self._STAMP_UNITS:
                raise ValueError(f"date_unit must be one of {self._STAMP_UNITS}")
            self.min_stamp = self._MIN_STAMPS[date_unit]
        else:
            self.min_stamp = self._MIN_STAMPS["s"]

        self.precise_float = precise_float
        self.convert_axes = convert_axes
        self.convert_dates = convert_dates
        self.date_unit = date_unit
        self.keep_default_dates = keep_default_dates
        self.obj: DataFrame | Series | None = None
        self.dtype_backend = dtype_backend

    def check_keys_split(self, decoded) -> None:
        """
        Checks that dict has only the appropriate keys for orient='split'.
        """
        bad_keys = set(decoded.keys()).difference(set(self._split_keys))
        if bad_keys:
            bad_keys_joined = ", ".join(bad_keys)
            raise ValueError(f"JSON data had unexpected key(s): {bad_keys_joined}")

    def parse(self):
        self._parse()

        if self.obj is None:
            return None
        if self.convert_axes:
            self._convert_axes()
        self._try_convert_types()
        return self.obj

    def _parse(self):
        raise AbstractMethodError(self)

    def _convert_axes(self) -> None:
        """
        Try to convert axes.
        """
        obj = self.obj
        assert obj is not None  # for mypy
        for axis_name in obj._AXIS_ORDERS:
            new_axis, result = self._try_convert_data(
                name=axis_name,
                data=obj._get_axis(axis_name),
                use_dtypes=False,
                convert_dates=True,
            )
            if result:
                setattr(self.obj, axis_name, new_axis)

    def _try_convert_types(self):
        raise AbstractMethodError(self)

    def _try_convert_data(
        self,
        name,
        data,
        use_dtypes: bool = True,
        convert_dates: bool | list[str] = True,
    ):
        """
        Try to parse an ndarray-like into a column by inferring dtype.
        """
        # don't try to coerce, unless a force conversion
        if use_dtypes:
            if not self.dtype:
                if all(notna(data)):
                    return data, False
                return data.fillna(np.nan), True

            elif self.dtype is True:
                pass
            else:
                # dtype to force
                dtype = (
                    self.dtype.get(name) if isinstance(self.dtype, dict) else self.dtype
                )
                if dtype is not None:
                    try:
                        return data.astype(dtype), True
                    except (TypeError, ValueError):
                        return data, False

        if convert_dates:
            new_data, result = self._try_convert_to_date(data)
            if result:
                return new_data, True

        if self.dtype_backend is not lib.no_default and not isinstance(data, ABCIndex):
            # Fall through for conversion later on
            return data, True
        elif data.dtype == "object":
            # try float
            try:
                data = data.astype("float64")
            except (TypeError, ValueError):
                pass

        if data.dtype.kind == "f":
            if data.dtype != "float64":
                # coerce floats to 64
                try:
                    data = data.astype("float64")
                except (TypeError, ValueError):
                    pass

        # don't coerce 0-len data
        if len(data) and data.dtype in ("float", "object"):
            # coerce ints if we can
            try:
                new_data = data.astype("int64")
                if (new_data == data).all():
                    data = new_data
            except (TypeError, ValueError, OverflowError):
                pass

        # coerce ints to 64
        if data.dtype == "int":
            try:
                data = data.astype("int64")
            except (TypeError, ValueError):
                pass

        # if we have an index, we want to preserve dtypes
        if name == "index" and len(data):
            if self.orient == "split":
                return data, False

        return data, True

    def _try_convert_to_date(self, data):
        """
        Try to parse an ndarray-like into a date column.

        Try to coerce object in epoch/iso formats and integer/float in epoch
        formats. Return a boolean if parsing was successful.
        """
        # no conversion on empty
        if not len(data):
            return data, False

        new_data = data
        if new_data.dtype == "object":
            try:
                new_data = data.astype("int64")
            except OverflowError:
                return data, False
            except (TypeError, ValueError):
                pass

        # ignore numbers that are out of range
        if issubclass(new_data.dtype.type, np.number):
            in_range = (
                isna(new_data._values)
                | (new_data > self.min_stamp)
                | (new_data._values == iNaT)
            )
            if not in_range.all():
                return data, False

        date_units = (self.date_unit,) if self.date_unit else self._STAMP_UNITS
        for date_unit in date_units:
            try:
                new_data = to_datetime(new_data, errors="raise", unit=date_unit)
            except (ValueError, OverflowError, TypeError):
                continue
            return new_data, True
        return data, False

    def _try_convert_dates(self):
        raise AbstractMethodError(self)
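

# Editor's sketch (not part of pandas): the out-of-range guard in
# _try_convert_to_date above. _MIN_STAMPS holds one year past the epoch in
# each unit (31536000 seconds = 365 days), so small integers such as
# ordinary counts are not mistaken for timestamps, while genuine epoch
# values still convert. The _demo_* name is hypothetical.
def _demo_date_inference() -> None:
    from io import StringIO

    import pandas as pd

    small = pd.read_json(StringIO('{"date": {"0": 5}}'))
    assert small["date"].dtype == "int64"  # 5 is below min_stamp: not a date
    epoch = pd.read_json(StringIO('{"date": {"0": 1609459200000}}'))
    assert str(epoch["date"].dtype).startswith("datetime64")  # 2021-01-01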


class SeriesParser(Parser):
    _default_orient = "index"
    _split_keys = ("name", "index", "data")

    def _parse(self) -> None:
        data = loads(self.json, precise_float=self.precise_float)

        if self.orient == "split":
            decoded = {str(k): v for k, v in data.items()}
            self.check_keys_split(decoded)
            self.obj = Series(**decoded)
        else:
            self.obj = Series(data)

    def _try_convert_types(self) -> None:
        if self.obj is None:
            return
        obj, result = self._try_convert_data(
            "data", self.obj, convert_dates=self.convert_dates
        )
        if result:
            self.obj = obj


class FrameParser(Parser):
    _default_orient = "columns"
    _split_keys = ("columns", "index", "data")

    def _parse(self) -> None:
        json = self.json
        orient = self.orient

        if orient == "columns":
            self.obj = DataFrame(
                loads(json, precise_float=self.precise_float), dtype=None
            )
        elif orient == "split":
            decoded = {
                str(k): v
                for k, v in loads(json, precise_float=self.precise_float).items()
            }
            self.check_keys_split(decoded)
            orig_names = [
                (tuple(col) if isinstance(col, list) else col)
                for col in decoded["columns"]
            ]
            decoded["columns"] = dedup_names(
                orig_names,
                is_potential_multi_index(orig_names, None),
            )
            self.obj = DataFrame(dtype=None, **decoded)
        elif orient == "index":
            self.obj = DataFrame.from_dict(
                loads(json, precise_float=self.precise_float),
                dtype=None,
                orient="index",
            )
        elif orient == "table":
            self.obj = parse_table_schema(json, precise_float=self.precise_float)
        else:
            self.obj = DataFrame(
                loads(json, precise_float=self.precise_float), dtype=None
            )

    def _process_converter(self, f, filt=None) -> None:
        """
        Take a conversion function and possibly recreate the frame.
        """
        if filt is None:
            filt = lambda col, c: True

        obj = self.obj
        assert obj is not None  # for mypy

        needs_new_obj = False
        new_obj = {}
        for i, (col, c) in enumerate(obj.items()):
            if filt(col, c):
                new_data, result = f(col, c)
                if result:
                    c = new_data
                    needs_new_obj = True
            new_obj[i] = c

        if needs_new_obj:
            # possibly handle dup columns
            new_frame = DataFrame(new_obj, index=obj.index)
            new_frame.columns = obj.columns
            self.obj = new_frame

    def _try_convert_types(self) -> None:
        if self.obj is None:
            return
        if self.convert_dates:
            self._try_convert_dates()

        self._process_converter(
            lambda col, c: self._try_convert_data(col, c, convert_dates=False)
        )

    def _try_convert_dates(self) -> None:
        if self.obj is None:
            return

        # our columns to parse
        convert_dates_list_bool = self.convert_dates
        if isinstance(convert_dates_list_bool, bool):
            convert_dates_list_bool = []
        convert_dates = set(convert_dates_list_bool)

        def is_ok(col) -> bool:
            """
            Return if this col is ok to try for a date parse.
            """
            if not isinstance(col, str):
                return False

            col_lower = col.lower()
            if (
                col_lower.endswith("_at")
                or col_lower.endswith("_time")
                or col_lower == "modified"
                or col_lower == "date"
                or col_lower == "datetime"
                or col_lower.startswith("timestamp")
            ):
                return True
            return False

        self._process_converter(
            lambda col, c: self._try_convert_to_date(c),
            lambda col, c: (
                (self.keep_default_dates and is_ok(col)) or col in convert_dates
            ),
        )
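

# Editor's sketch (not part of pandas): the is_ok() heuristic above in
# action. Columns are auto-parsed as dates only when their names look
# datelike; anything else needs an explicit convert_dates list. The _demo_*
# name is hypothetical.
def _demo_default_date_columns() -> None:
    from io import StringIO

    import pandas as pd

    payload = '{"created_at": {"0": 1609459200000}, "n": {"0": 1609459200000}}'
    df = pd.read_json(StringIO(payload))
    assert str(df["created_at"].dtype).startswith("datetime64")
    assert df["n"].dtype == "int64"  # name is not datelike
    df2 = pd.read_json(StringIO(payload), convert_dates=["n"])
    assert str(df2["n"].dtype).startswith("datetime64")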