from __future__ import annotations

from abc import (
    ABC,
    abstractmethod,
)
from collections import abc
from io import StringIO
from itertools import islice
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Generic,
    Literal,
    TypeVar,
    final,
    overload,
)
import warnings

import numpy as np

from pandas._libs import lib
from pandas._libs.json import (
    ujson_dumps,
    ujson_loads,
)
from pandas._libs.tslibs import iNaT
from pandas.compat._optional import import_optional_dependency
from pandas.errors import AbstractMethodError
from pandas.util._decorators import doc
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import check_dtype_backend

from pandas.core.dtypes.common import (
    ensure_str,
    is_string_dtype,
)
from pandas.core.dtypes.dtypes import PeriodDtype

from pandas import (
    ArrowDtype,
    DataFrame,
    Index,
    MultiIndex,
    Series,
    isna,
    notna,
    to_datetime,
)
from pandas.core.reshape.concat import concat
from pandas.core.shared_docs import _shared_docs

from pandas.io.common import (
    IOHandles,
    dedup_names,
    extension_to_compression,
    file_exists,
    get_handle,
    is_fsspec_url,
    is_potential_multi_index,
    is_url,
    stringify_path,
)
from pandas.io.json._normalize import convert_to_line_delimits
from pandas.io.json._table_schema import (
    build_table_schema,
    parse_table_schema,
)
from pandas.io.parsers.readers import validate_integer

if TYPE_CHECKING:
    from collections.abc import (
        Hashable,
        Mapping,
    )
    from types import TracebackType

    from pandas._typing import (
        CompressionOptions,
        DtypeArg,
        DtypeBackend,
        FilePath,
        IndexLabel,
        JSONEngine,
        JSONSerializable,
        ReadBuffer,
        Self,
        StorageOptions,
        WriteBuffer,
    )

    from pandas.core.generic import NDFrame

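# Literal-valued TypeVar: ties a JsonReader's type parameter ("frame" or
# "series") to the matching return type in the read_json overloads below.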
FrameSeriesStrT = TypeVar("FrameSeriesStrT", bound=Literal["frame", "series"])


# interface to/from
@overload
def to_json(
    path_or_buf: FilePath | WriteBuffer[str] | WriteBuffer[bytes],
    obj: NDFrame,
    orient: str | None = ...,
    date_format: str = ...,
    double_precision: int = ...,
    force_ascii: bool = ...,
    date_unit: str = ...,
    default_handler: Callable[[Any], JSONSerializable] | None = ...,
    lines: bool = ...,
    compression: CompressionOptions = ...,
    index: bool | None = ...,
    indent: int = ...,
    storage_options: StorageOptions = ...,
    mode: Literal["a", "w"] = ...,
) -> None:
    ...


@overload
def to_json(
    path_or_buf: None,
    obj: NDFrame,
    orient: str | None = ...,
    date_format: str = ...,
    double_precision: int = ...,
    force_ascii: bool = ...,
    date_unit: str = ...,
    default_handler: Callable[[Any], JSONSerializable] | None = ...,
    lines: bool = ...,
    compression: CompressionOptions = ...,
    index: bool | None = ...,
    indent: int = ...,
    storage_options: StorageOptions = ...,
    mode: Literal["a", "w"] = ...,
) -> str:
    ...


def to_json(
    path_or_buf: FilePath | WriteBuffer[str] | WriteBuffer[bytes] | None,
    obj: NDFrame,
    orient: str | None = None,
    date_format: str = "epoch",
    double_precision: int = 10,
    force_ascii: bool = True,
    date_unit: str = "ms",
    default_handler: Callable[[Any], JSONSerializable] | None = None,
    lines: bool = False,
    compression: CompressionOptions = "infer",
    index: bool | None = None,
    indent: int = 0,
    storage_options: StorageOptions | None = None,
    mode: Literal["a", "w"] = "w",
) -> str | None:
    if orient in ["records", "values"] and index is True:
        raise ValueError(
            "'index=True' is only valid when 'orient' is 'split', 'table', "
            "'index', or 'columns'."
        )
    elif orient in ["index", "columns"] and index is False:
        raise ValueError(
            "'index=False' is only valid when 'orient' is 'split', 'table', "
            "'records', or 'values'."
        )
    elif index is None:
        # will be ignored for orient='records' and 'values'
        index = True
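    # Summary of the orient/index interaction enforced above:
    #   records, values -> index must be False or None (labels are dropped)
    #   index, columns  -> index must be True or None (labels are required)
    #   split, table    -> either setting is honored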

    if lines and orient != "records":
        raise ValueError("'lines' keyword only valid when 'orient' is records")

    if mode not in ["a", "w"]:
        msg = (
            f"mode={mode} is not a valid option. "
            "Only 'w' and 'a' are currently supported."
        )
        raise ValueError(msg)

    if mode == "a" and (not lines or orient != "records"):
        msg = (
            "mode='a' (append) is only supported when "
            "lines is True and orient is 'records'"
        )
        raise ValueError(msg)

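    # Table Schema cannot describe a bare Series, so promote it to a
    # one-column DataFrame before writing with orient="table".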
    if orient == "table" and isinstance(obj, Series):
        obj = obj.to_frame(name=obj.name or "values")

    writer: type[Writer]
    if orient == "table" and isinstance(obj, DataFrame):
        writer = JSONTableWriter
    elif isinstance(obj, Series):
        writer = SeriesWriter
    elif isinstance(obj, DataFrame):
        writer = FrameWriter
    else:
        raise NotImplementedError("'obj' should be a Series or a DataFrame")

    s = writer(
        obj,
        orient=orient,
        date_format=date_format,
        double_precision=double_precision,
        ensure_ascii=force_ascii,
        date_unit=date_unit,
        default_handler=default_handler,
        index=index,
        indent=indent,
    ).write()

    if lines:
        s = convert_to_line_delimits(s)

    if path_or_buf is not None:
        # apply compression and byte/text conversion
        with get_handle(
            path_or_buf, mode, compression=compression, storage_options=storage_options
        ) as handles:
            handles.handle.write(s)
    else:
        return s
    return None


class Writer(ABC):
    _default_orient: str

    def __init__(
        self,
        obj: NDFrame,
        orient: str | None,
        date_format: str,
        double_precision: int,
        ensure_ascii: bool,
        date_unit: str,
        index: bool,
        default_handler: Callable[[Any], JSONSerializable] | None = None,
        indent: int = 0,
    ) -> None:
        self.obj = obj

        if orient is None:
            orient = self._default_orient

        self.orient = orient
        self.date_format = date_format
        self.double_precision = double_precision
        self.ensure_ascii = ensure_ascii
        self.date_unit = date_unit
        self.default_handler = default_handler
        self.index = index
        self.indent = indent

        self.is_copy = None
        self._format_axes()

    def _format_axes(self) -> None:
        raise AbstractMethodError(self)

    def write(self) -> str:
        iso_dates = self.date_format == "iso"
        return ujson_dumps(
            self.obj_to_write,
            orient=self.orient,
            double_precision=self.double_precision,
            ensure_ascii=self.ensure_ascii,
            date_unit=self.date_unit,
            iso_dates=iso_dates,
            default_handler=self.default_handler,
            indent=self.indent,
        )

    @property
    @abstractmethod
    def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]:
        """Object to write in JSON format."""


class SeriesWriter(Writer):
    _default_orient = "index"

    @property
    def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]:
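        # for orient="split" with index=False, hand ujson a plain dict so the
        # dropped index never reaches the serializer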
        if not self.index and self.orient == "split":
            return {"name": self.obj.name, "data": self.obj.values}
        else:
            return self.obj

    def _format_axes(self) -> None:
        if not self.obj.index.is_unique and self.orient == "index":
            raise ValueError(f"Series index must be unique for orient='{self.orient}'")


class FrameWriter(Writer):
    _default_orient = "columns"

    @property
    def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]:
        if not self.index and self.orient == "split":
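            # round-trip through to_dict(orient="split") so the "index" entry
            # can simply be dropped from the payload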
            obj_to_write = self.obj.to_dict(orient="split")
            del obj_to_write["index"]
        else:
            obj_to_write = self.obj
        return obj_to_write

    def _format_axes(self) -> None:
        """
        Try to format axes if they are datelike.
        """
        if not self.obj.index.is_unique and self.orient in ("index", "columns"):
            raise ValueError(
                f"DataFrame index must be unique for orient='{self.orient}'."
            )
        if not self.obj.columns.is_unique and self.orient in (
            "index",
            "columns",
            "records",
        ):
            raise ValueError(
                f"DataFrame columns must be unique for orient='{self.orient}'."
            )


class JSONTableWriter(FrameWriter):
    _default_orient = "records"

    def __init__(
        self,
        obj,
        orient: str | None,
        date_format: str,
        double_precision: int,
        ensure_ascii: bool,
        date_unit: str,
        index: bool,
        default_handler: Callable[[Any], JSONSerializable] | None = None,
        indent: int = 0,
    ) -> None:
        """
        Adds a `schema` attribute with the Table Schema, resets the index
        (can't do in caller, because the schema inference needs to know what
        the index is), forces orient to records, and forces date_format
        to 'iso'.
        """
        super().__init__(
            obj,
            orient,
            date_format,
            double_precision,
            ensure_ascii,
            date_unit,
            index,
            default_handler=default_handler,
            indent=indent,
        )

        if date_format != "iso":
            msg = (
                "Trying to write with `orient='table'` and "
                f"`date_format='{date_format}'`. Table Schema requires dates "
                "to be formatted with `date_format='iso'`"
            )
            raise ValueError(msg)

        self.schema = build_table_schema(obj, index=self.index)

        # NotImplemented on a column MultiIndex
        if obj.ndim == 2 and isinstance(obj.columns, MultiIndex):
            raise NotImplementedError(
                "orient='table' is not supported for MultiIndex columns"
            )

        # TODO: Do this timedelta properly in objToJSON.c See GH #15137
        if (
            (obj.ndim == 1)
            and (obj.name in set(obj.index.names))
            or len(obj.columns.intersection(obj.index.names))
        ):
            msg = "Overlapping names between the index and columns"
            raise ValueError(msg)

        obj = obj.copy()
        timedeltas = obj.select_dtypes(include=["timedelta"]).columns
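        # the C serializer has no proper timedelta support here (see the TODO
        # above), so emit ISO-8601 duration strings instead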
        if len(timedeltas):
            obj[timedeltas] = obj[timedeltas].map(lambda x: x.isoformat())
        # Convert PeriodIndex to datetimes before serializing
        if isinstance(obj.index.dtype, PeriodDtype):
            obj.index = obj.index.to_timestamp()

        # exclude index from obj if index=False
        if not self.index:
            self.obj = obj.reset_index(drop=True)
        else:
            self.obj = obj.reset_index(drop=False)
        self.date_format = "iso"
        self.orient = "records"
        self.index = index

    @property
    def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]:
        return {"schema": self.schema, "data": self.obj}


@overload
def read_json(
    path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
    *,
    orient: str | None = ...,
    typ: Literal["frame"] = ...,
    dtype: DtypeArg | None = ...,
    convert_axes: bool | None = ...,
    convert_dates: bool | list[str] = ...,
    keep_default_dates: bool = ...,
    precise_float: bool = ...,
    date_unit: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    lines: bool = ...,
    chunksize: int,
    compression: CompressionOptions = ...,
    nrows: int | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
    engine: JSONEngine = ...,
) -> JsonReader[Literal["frame"]]:
    ...


@overload
def read_json(
    path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
    *,
    orient: str | None = ...,
    typ: Literal["series"],
    dtype: DtypeArg | None = ...,
    convert_axes: bool | None = ...,
    convert_dates: bool | list[str] = ...,
    keep_default_dates: bool = ...,
    precise_float: bool = ...,
    date_unit: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    lines: bool = ...,
    chunksize: int,
    compression: CompressionOptions = ...,
    nrows: int | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
    engine: JSONEngine = ...,
) -> JsonReader[Literal["series"]]:
    ...


@overload
def read_json(
    path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
    *,
    orient: str | None = ...,
    typ: Literal["series"],
    dtype: DtypeArg | None = ...,
    convert_axes: bool | None = ...,
    convert_dates: bool | list[str] = ...,
    keep_default_dates: bool = ...,
    precise_float: bool = ...,
    date_unit: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    lines: bool = ...,
    chunksize: None = ...,
    compression: CompressionOptions = ...,
    nrows: int | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
    engine: JSONEngine = ...,
) -> Series:
    ...


@overload
def read_json(
    path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
    *,
    orient: str | None = ...,
    typ: Literal["frame"] = ...,
    dtype: DtypeArg | None = ...,
    convert_axes: bool | None = ...,
    convert_dates: bool | list[str] = ...,
    keep_default_dates: bool = ...,
    precise_float: bool = ...,
    date_unit: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    lines: bool = ...,
    chunksize: None = ...,
    compression: CompressionOptions = ...,
    nrows: int | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
    engine: JSONEngine = ...,
) -> DataFrame:
    ...


@doc(
    storage_options=_shared_docs["storage_options"],
    decompression_options=_shared_docs["decompression_options"] % "path_or_buf",
)
def read_json(
    path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
    *,
    orient: str | None = None,
    typ: Literal["frame", "series"] = "frame",
    dtype: DtypeArg | None = None,
    convert_axes: bool | None = None,
    convert_dates: bool | list[str] = True,
    keep_default_dates: bool = True,
    precise_float: bool = False,
    date_unit: str | None = None,
    encoding: str | None = None,
    encoding_errors: str | None = "strict",
    lines: bool = False,
    chunksize: int | None = None,
    compression: CompressionOptions = "infer",
    nrows: int | None = None,
    storage_options: StorageOptions | None = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
    engine: JSONEngine = "ujson",
) -> DataFrame | Series | JsonReader:
    """
    Convert a JSON string to a pandas object.

    Parameters
    ----------
    path_or_buf : a valid JSON str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.json``.

        If you want to pass in a path object, pandas accepts any
        ``os.PathLike``.

        By file-like object, we refer to objects with a ``read()`` method,
        such as a file handle (e.g. via builtin ``open`` function)
        or ``StringIO``.

        .. deprecated:: 2.1.0
            Passing json literal strings is deprecated.

    orient : str, optional
        Indication of expected JSON string format.
        Compatible JSON strings can be produced by ``to_json()`` with a
        corresponding orient value.
        The set of possible orients is:

        - ``'split'`` : dict like
          ``{{index -> [index], columns -> [columns], data -> [values]}}``
        - ``'records'`` : list like
          ``[{{column -> value}}, ... , {{column -> value}}]``
        - ``'index'`` : dict like ``{{index -> {{column -> value}}}}``
        - ``'columns'`` : dict like ``{{column -> {{index -> value}}}}``
        - ``'values'`` : just the values array
        - ``'table'`` : dict like ``{{'schema': {{schema}}, 'data': {{data}}}}``

        The allowed and default values depend on the value
        of the `typ` parameter.

        * when ``typ == 'series'``,

          - allowed orients are ``{{'split','records','index'}}``
          - default is ``'index'``
          - The Series index must be unique for orient ``'index'``.

        * when ``typ == 'frame'``,

          - allowed orients are ``{{'split','records','index',
            'columns','values', 'table'}}``
          - default is ``'columns'``
          - The DataFrame index must be unique for orients ``'index'`` and
            ``'columns'``.
          - The DataFrame columns must be unique for orients ``'index'``,
            ``'columns'``, and ``'records'``.

    typ : {{'frame', 'series'}}, default 'frame'
        The type of object to recover.

    dtype : bool or dict, default None
        If True, infer dtypes; if a dict of column to dtype, then use those;
        if False, then don't infer dtypes at all. Applies only to the data.

        For all ``orient`` values except ``'table'``, default is True.

    convert_axes : bool, default None
        Try to convert the axes to the proper dtypes.

        For all ``orient`` values except ``'table'``, default is True.

    convert_dates : bool or list of str, default True
        If True then default datelike columns may be converted (depending on
        keep_default_dates).
        If False, no dates will be converted.
        If a list of column names, then those columns will be converted and
        default datelike columns may also be converted (depending on
        keep_default_dates).

    keep_default_dates : bool, default True
        If parsing dates (convert_dates is not False), then try to parse the
        default datelike columns.
        A column label is datelike if

        * it ends with ``'_at'``,

        * it ends with ``'_time'``,

        * it begins with ``'timestamp'``,

        * it is ``'modified'``,

        * it is ``'date'``, or

        * it is ``'datetime'``.

    precise_float : bool, default False
        Set to enable usage of higher precision (strtod) function when
        decoding string to double values. Default (False) is to use fast but
        less precise builtin functionality.

    date_unit : str, default None
        The timestamp unit to detect if converting dates. The default behaviour
        is to try and detect the correct precision, but if this is not desired
        then pass one of 's', 'ms', 'us' or 'ns' to force parsing only seconds,
        milliseconds, microseconds or nanoseconds respectively.

    encoding : str, default is 'utf-8'
        The encoding to use to decode bytes.

    encoding_errors : str, optional, default "strict"
        How encoding errors are treated. `List of possible values
        <https://docs.python.org/3/library/codecs.html#error-handlers>`_ .

        .. versionadded:: 1.3.0

    lines : bool, default False
        Read the file as a json object per line.

    chunksize : int, optional
        Return JsonReader object for iteration.
        See the `line-delimited json docs
        <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#line-delimited-json>`_
        for more information on ``chunksize``.
        This can only be passed if `lines=True`.
        If this is None, the file will be read into memory all at once.
    {decompression_options}

        .. versionchanged:: 1.4.0 Zstandard support.

    nrows : int, optional
        The number of lines to read from the line-delimited JSON file.
        This can only be passed if `lines=True`.
        If this is None, all the rows will be returned.

    {storage_options}

    dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable'
        Back-end data type applied to the resultant :class:`DataFrame`
        (still experimental). Behaviour is as follows:

        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
          (default).
        * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
          DataFrame.

        .. versionadded:: 2.0

    engine : {{"ujson", "pyarrow"}}, default "ujson"
        Parser engine to use. The ``"pyarrow"`` engine is only available when
        ``lines=True``.

        .. versionadded:: 2.0

    Returns
    -------
    Series, DataFrame, or pandas.api.typing.JsonReader
        A JsonReader is returned when ``chunksize`` is not ``0`` or ``None``.
        Otherwise, the type returned depends on the value of ``typ``.

    See Also
    --------
    DataFrame.to_json : Convert a DataFrame to a JSON string.
    Series.to_json : Convert a Series to a JSON string.
    json_normalize : Normalize semi-structured JSON data into a flat table.

    Notes
    -----
    Specific to ``orient='table'``, if a :class:`DataFrame` with a literal
    :class:`Index` name of `index` gets written with :func:`to_json`, the
    subsequent read operation will incorrectly set the :class:`Index` name to
    ``None``. This is because `index` is also used by :func:`DataFrame.to_json`
    to denote a missing :class:`Index` name, and the subsequent
    :func:`read_json` operation cannot distinguish between the two. The same
    limitation is encountered with a :class:`MultiIndex` and any names
    beginning with ``'level_'``.

    Examples
    --------
    >>> from io import StringIO
    >>> df = pd.DataFrame([['a', 'b'], ['c', 'd']],
    ...                   index=['row 1', 'row 2'],
    ...                   columns=['col 1', 'col 2'])

    Encoding/decoding a DataFrame using ``'split'`` formatted JSON:

    >>> df.to_json(orient='split')
    '\
{{\
"columns":["col 1","col 2"],\
"index":["row 1","row 2"],\
"data":[["a","b"],["c","d"]]\
}}\
'
    >>> pd.read_json(StringIO(_), orient='split')
          col 1 col 2
    row 1     a     b
    row 2     c     d

    Encoding/decoding a DataFrame using ``'index'`` formatted JSON:

    >>> df.to_json(orient='index')
    '{{"row 1":{{"col 1":"a","col 2":"b"}},"row 2":{{"col 1":"c","col 2":"d"}}}}'

    >>> pd.read_json(StringIO(_), orient='index')
          col 1 col 2
    row 1     a     b
    row 2     c     d

    Encoding/decoding a DataFrame using ``'records'`` formatted JSON.
    Note that index labels are not preserved with this encoding.

    >>> df.to_json(orient='records')
    '[{{"col 1":"a","col 2":"b"}},{{"col 1":"c","col 2":"d"}}]'
    >>> pd.read_json(StringIO(_), orient='records')
      col 1 col 2
    0     a     b
    1     c     d

    Encoding with Table Schema:

    >>> df.to_json(orient='table')
    '\
{{"schema":{{"fields":[\
{{"name":"index","type":"string"}},\
{{"name":"col 1","type":"string"}},\
{{"name":"col 2","type":"string"}}],\
"primaryKey":["index"],\
"pandas_version":"1.4.0"}},\
"data":[\
{{"index":"row 1","col 1":"a","col 2":"b"}},\
{{"index":"row 2","col 1":"c","col 2":"d"}}]\
}}\
'

    The following example uses ``dtype_backend="numpy_nullable"``

    >>> data = '''{{"index": {{"0": 0, "1": 1}},
    ...        "a": {{"0": 1, "1": null}},
    ...        "b": {{"0": 2.5, "1": 4.5}},
    ...        "c": {{"0": true, "1": false}},
    ...        "d": {{"0": "a", "1": "b"}},
    ...        "e": {{"0": 1577.2, "1": 1577.1}}}}'''
    >>> pd.read_json(StringIO(data), dtype_backend="numpy_nullable")
       index     a    b      c  d       e
    0      0     1  2.5   True  a  1577.2
    1      1  <NA>  4.5  False  b  1577.1
    """
    if orient == "table" and dtype:
        raise ValueError("cannot pass both dtype and orient='table'")
    if orient == "table" and convert_axes:
        raise ValueError("cannot pass both convert_axes and orient='table'")

    check_dtype_backend(dtype_backend)

    if dtype is None and orient != "table":
        # error: Incompatible types in assignment (expression has type "bool", variable
        # has type "Union[ExtensionDtype, str, dtype[Any], Type[str], Type[float],
        # Type[int], Type[complex], Type[bool], Type[object], Dict[Hashable,
        # Union[ExtensionDtype, Union[str, dtype[Any]], Type[str], Type[float],
        # Type[int], Type[complex], Type[bool], Type[object]]], None]")
        dtype = True  # type: ignore[assignment]
    if convert_axes is None and orient != "table":
        convert_axes = True

    json_reader = JsonReader(
        path_or_buf,
        orient=orient,
        typ=typ,
        dtype=dtype,
        convert_axes=convert_axes,
        convert_dates=convert_dates,
        keep_default_dates=keep_default_dates,
        precise_float=precise_float,
        date_unit=date_unit,
        encoding=encoding,
        lines=lines,
        chunksize=chunksize,
        compression=compression,
        nrows=nrows,
        storage_options=storage_options,
        encoding_errors=encoding_errors,
        dtype_backend=dtype_backend,
        engine=engine,
    )

    if chunksize:
        return json_reader
    else:
        return json_reader.read()


class JsonReader(abc.Iterator, Generic[FrameSeriesStrT]):
    """
    JsonReader provides an interface for reading in a JSON file.

    If initialized with ``lines=True`` and ``chunksize``, can be iterated over
    ``chunksize`` lines at a time. Otherwise, calling ``read`` reads in the
    whole document.
    """
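    # Hedged usage sketch (assumes ``data`` holds newline-delimited JSON):
    #
    #     with read_json(StringIO(data), lines=True, chunksize=100) as reader:
    #         for chunk in reader:
    #             ...  # each chunk is a DataFrame of up to ``chunksize`` rows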

    def __init__(
        self,
        filepath_or_buffer,
        orient,
        typ: FrameSeriesStrT,
        dtype,
        convert_axes: bool | None,
        convert_dates,
        keep_default_dates: bool,
        precise_float: bool,
        date_unit,
        encoding,
        lines: bool,
        chunksize: int | None,
        compression: CompressionOptions,
        nrows: int | None,
        storage_options: StorageOptions | None = None,
        encoding_errors: str | None = "strict",
        dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
        engine: JSONEngine = "ujson",
    ) -> None:
        self.orient = orient
        self.typ = typ
        self.dtype = dtype
        self.convert_axes = convert_axes
        self.convert_dates = convert_dates
        self.keep_default_dates = keep_default_dates
        self.precise_float = precise_float
        self.date_unit = date_unit
        self.encoding = encoding
        self.engine = engine
        self.compression = compression
        self.storage_options = storage_options
        self.lines = lines
        self.chunksize = chunksize
        self.nrows_seen = 0
        self.nrows = nrows
        self.encoding_errors = encoding_errors
        self.handles: IOHandles[str] | None = None
        self.dtype_backend = dtype_backend

        if self.engine not in {"pyarrow", "ujson"}:
            raise ValueError(
                f"The engine type {self.engine} is currently not supported."
            )
        if self.chunksize is not None:
            self.chunksize = validate_integer("chunksize", self.chunksize, 1)
            if not self.lines:
                raise ValueError("chunksize can only be passed if lines=True")
            if self.engine == "pyarrow":
                raise ValueError(
                    "currently pyarrow engine doesn't support chunksize parameter"
                )
        if self.nrows is not None:
            self.nrows = validate_integer("nrows", self.nrows, 0)
            if not self.lines:
                raise ValueError("nrows can only be passed if lines=True")
        if (
            isinstance(filepath_or_buffer, str)
            and not self.lines
            and "\n" in filepath_or_buffer
        ):
            warnings.warn(
                "Passing literal json to 'read_json' is deprecated and "
                "will be removed in a future version. To read from a "
                "literal string, wrap it in a 'StringIO' object.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
        if self.engine == "pyarrow":
            if not self.lines:
                raise ValueError(
                    "currently pyarrow engine only supports "
                    "the line-delimited JSON format"
                )
            self.data = filepath_or_buffer
        elif self.engine == "ujson":
            data = self._get_data_from_filepath(filepath_or_buffer)
            self.data = self._preprocess_data(data)

    def _preprocess_data(self, data):
        """
        At this point, the data either has a `read` attribute (e.g. a file
        object or a StringIO) or is a string that is a JSON document.

        If self.chunksize, we prepare the data for the `__next__` method.
        Otherwise, we read it into memory for the `read` method.
        """
        if hasattr(data, "read") and not (self.chunksize or self.nrows):
            with self:
                data = data.read()
        if not hasattr(data, "read") and (self.chunksize or self.nrows):
            data = StringIO(data)

        return data

    def _get_data_from_filepath(self, filepath_or_buffer):
        """
        The function read_json accepts three input types:
            1. filepath (string-like)
            2. file-like object (e.g. open file object, StringIO)
            3. JSON string

        This method turns (1) into (2) to simplify the rest of the processing.
        It returns input types (2) and (3) unchanged.

        It raises FileNotFoundError if the input is a string ending in
        one of .json, .json.gz, .json.bz2, etc. but no such file exists.
        """
        # if it is a string but the file does not exist, it might be a JSON string
        filepath_or_buffer = stringify_path(filepath_or_buffer)
        if (
            not isinstance(filepath_or_buffer, str)
            or is_url(filepath_or_buffer)
            or is_fsspec_url(filepath_or_buffer)
            or file_exists(filepath_or_buffer)
        ):
            self.handles = get_handle(
                filepath_or_buffer,
                "r",
                encoding=self.encoding,
                compression=self.compression,
                storage_options=self.storage_options,
                errors=self.encoding_errors,
            )
            filepath_or_buffer = self.handles.handle
        elif (
            isinstance(filepath_or_buffer, str)
            and filepath_or_buffer.lower().endswith(
                (".json",) + tuple(f".json{c}" for c in extension_to_compression)
            )
            and not file_exists(filepath_or_buffer)
        ):
            raise FileNotFoundError(f"File {filepath_or_buffer} does not exist")
        else:
            warnings.warn(
                "Passing literal json to 'read_json' is deprecated and "
                "will be removed in a future version. To read from a "
                "literal string, wrap it in a 'StringIO' object.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
        return filepath_or_buffer

    def _combine_lines(self, lines) -> str:
        """
        Combines a list of JSON objects into one JSON array.
        """
        return (
            f'[{",".join([line for line in (line.strip() for line in lines) if line])}]'
        )

    @overload
    def read(self: JsonReader[Literal["frame"]]) -> DataFrame:
        ...

    @overload
    def read(self: JsonReader[Literal["series"]]) -> Series:
        ...

    @overload
    def read(self: JsonReader[Literal["frame", "series"]]) -> DataFrame | Series:
        ...

    def read(self) -> DataFrame | Series:
        """
        Read the whole JSON input into a pandas object.
        """
        obj: DataFrame | Series
        with self:
            if self.engine == "pyarrow":
                pyarrow_json = import_optional_dependency("pyarrow.json")
                pa_table = pyarrow_json.read_json(self.data)

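                # choose how pyarrow types map onto pandas dtypes: ArrowDtype
                # for the pyarrow backend, the nullable-dtype mapping for
                # numpy_nullable, or None for plain NumPy-backed dtypes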
                mapping: type[ArrowDtype] | None | Callable
                if self.dtype_backend == "pyarrow":
                    mapping = ArrowDtype
                elif self.dtype_backend == "numpy_nullable":
                    from pandas.io._util import _arrow_dtype_mapping

                    mapping = _arrow_dtype_mapping().get
                else:
                    mapping = None

                return pa_table.to_pandas(types_mapper=mapping)
            elif self.engine == "ujson":
                if self.lines:
                    if self.chunksize:
                        obj = concat(self)
                    elif self.nrows:
                        lines = list(islice(self.data, self.nrows))
                        lines_json = self._combine_lines(lines)
                        obj = self._get_object_parser(lines_json)
                    else:
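                        # no chunksize/nrows: stitch every line back into a
                        # single JSON array and parse it in one shot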
                        data = ensure_str(self.data)
                        data_lines = data.split("\n")
                        obj = self._get_object_parser(self._combine_lines(data_lines))
                else:
                    obj = self._get_object_parser(self.data)
            if self.dtype_backend is not lib.no_default:
                return obj.convert_dtypes(
                    infer_objects=False, dtype_backend=self.dtype_backend
                )
            else:
                return obj

    def _get_object_parser(self, json) -> DataFrame | Series:
        """
        Parses a json document into a pandas object.
        """
        typ = self.typ
        dtype = self.dtype
        kwargs = {
            "orient": self.orient,
            "dtype": self.dtype,
            "convert_axes": self.convert_axes,
            "convert_dates": self.convert_dates,
            "keep_default_dates": self.keep_default_dates,
            "precise_float": self.precise_float,
            "date_unit": self.date_unit,
            "dtype_backend": self.dtype_backend,
        }
        obj = None
        if typ == "frame":
            obj = FrameParser(json, **kwargs).parse()

        if typ == "series" or obj is None:
            if not isinstance(dtype, bool):
                kwargs["dtype"] = dtype
            obj = SeriesParser(json, **kwargs).parse()

        return obj

    def close(self) -> None:
        """
        If we opened a stream earlier, in _get_data_from_filepath, we should
        close it.

        If an open stream or file was passed, we leave it open.
        """
        if self.handles is not None:
            self.handles.close()

    def __iter__(self) -> Self:
        return self

    @overload
    def __next__(self: JsonReader[Literal["frame"]]) -> DataFrame:
        ...

    @overload
    def __next__(self: JsonReader[Literal["series"]]) -> Series:
        ...

    @overload
    def __next__(self: JsonReader[Literal["frame", "series"]]) -> DataFrame | Series:
        ...

    def __next__(self) -> DataFrame | Series:
        if self.nrows and self.nrows_seen >= self.nrows:
            self.close()
            raise StopIteration

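        # pull up to ``chunksize`` raw lines; an empty slice means the
        # underlying buffer is exhausted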
        lines = list(islice(self.data, self.chunksize))
        if not lines:
            self.close()
            raise StopIteration

        try:
            lines_json = self._combine_lines(lines)
            obj = self._get_object_parser(lines_json)

            # Make sure that the returned objects have the right index.
            obj.index = range(self.nrows_seen, self.nrows_seen + len(obj))
            self.nrows_seen += len(obj)
        except Exception as ex:
            self.close()
            raise ex

        if self.dtype_backend is not lib.no_default:
            return obj.convert_dtypes(
                infer_objects=False, dtype_backend=self.dtype_backend
            )
        else:
            return obj

    def __enter__(self) -> Self:
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        traceback: TracebackType | None,
    ) -> None:
        self.close()


class Parser:
    _split_keys: tuple[str, ...]
    _default_orient: str

    _STAMP_UNITS = ("s", "ms", "us", "ns")
    _MIN_STAMPS = {
        "s": 31536000,
        "ms": 31536000000,
        "us": 31536000000000,
        "ns": 31536000000000000,
    }
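    # each cutoff above is one 365-day year expressed in the given unit;
    # numbers at or below it are treated as out of range for epoch dates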
    json: str

    def __init__(
        self,
        json: str,
        orient,
        dtype: DtypeArg | None = None,
        convert_axes: bool = True,
        convert_dates: bool | list[str] = True,
        keep_default_dates: bool = False,
        precise_float: bool = False,
        date_unit=None,
        dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
    ) -> None:
        self.json = json

        if orient is None:
            orient = self._default_orient

        self.orient = orient

        self.dtype = dtype

        if date_unit is not None:
            date_unit = date_unit.lower()
            if date_unit not in self._STAMP_UNITS:
                raise ValueError(f"date_unit must be one of {self._STAMP_UNITS}")
            self.min_stamp = self._MIN_STAMPS[date_unit]
        else:
            self.min_stamp = self._MIN_STAMPS["s"]

        self.precise_float = precise_float
        self.convert_axes = convert_axes
        self.convert_dates = convert_dates
        self.date_unit = date_unit
        self.keep_default_dates = keep_default_dates
        self.obj: DataFrame | Series | None = None
        self.dtype_backend = dtype_backend

    @final
    def check_keys_split(self, decoded: dict) -> None:
        """
        Checks that dict has only the appropriate keys for orient='split'.
        """
        bad_keys = set(decoded.keys()).difference(set(self._split_keys))
        if bad_keys:
            bad_keys_joined = ", ".join(bad_keys)
            raise ValueError(f"JSON data had unexpected key(s): {bad_keys_joined}")

    @final
    def parse(self):
        self._parse()

        if self.obj is None:
            return None
        if self.convert_axes:
            self._convert_axes()
        self._try_convert_types()
        return self.obj

    def _parse(self) -> None:
        raise AbstractMethodError(self)

    @final
    def _convert_axes(self) -> None:
        """
        Try to convert axes.
        """
        obj = self.obj
        assert obj is not None  # for mypy
        for axis_name in obj._AXIS_ORDERS:
            ax = obj._get_axis(axis_name)
            ser = Series(ax, dtype=ax.dtype, copy=False)
            new_ser, result = self._try_convert_data(
                name=axis_name,
                data=ser,
                use_dtypes=False,
                convert_dates=True,
                is_axis=True,
            )
            if result:
                new_axis = Index(new_ser, dtype=new_ser.dtype, copy=False)
                setattr(self.obj, axis_name, new_axis)

    def _try_convert_types(self) -> None:
        raise AbstractMethodError(self)

    @final
    def _try_convert_data(
        self,
        name: Hashable,
        data: Series,
        use_dtypes: bool = True,
        convert_dates: bool | list[str] = True,
        is_axis: bool = False,
    ) -> tuple[Series, bool]:
        """
        Try to parse a Series into a column by inferring dtype.
        """
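        # coercion order: a forced dtype wins, then date parsing, then the
        # numeric fallbacks below (float64, int64), each failing silently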
        # don't try to coerce, unless a force conversion
        if use_dtypes:
            if not self.dtype:
                if all(notna(data)):
                    return data, False

                with warnings.catch_warnings():
                    warnings.filterwarnings(
                        "ignore",
                        "Downcasting object dtype arrays",
                        category=FutureWarning,
                    )
                    filled = data.fillna(np.nan)

                return filled, True

            elif self.dtype is True:
                pass
            else:
                # dtype to force
                dtype = (
                    self.dtype.get(name) if isinstance(self.dtype, dict) else self.dtype
                )
                if dtype is not None:
                    try:
                        return data.astype(dtype), True
                    except (TypeError, ValueError):
                        return data, False

        if convert_dates:
            new_data, result = self._try_convert_to_date(data)
            if result:
                return new_data, True

        converted = False
        if self.dtype_backend is not lib.no_default and not is_axis:
            # Fall through for conversion later on
            return data, True
        elif is_string_dtype(data.dtype):
            # try float
            try:
                data = data.astype("float64")
                converted = True
            except (TypeError, ValueError):
                pass

        if data.dtype.kind == "f" and data.dtype != "float64":
            # coerce floats to 64
            try:
                data = data.astype("float64")
                converted = True
            except (TypeError, ValueError):
                pass

        # don't coerce 0-len data
        if len(data) and data.dtype in ("float", "object"):
            # coerce ints if we can
            try:
                new_data = data.astype("int64")
                if (new_data == data).all():
                    data = new_data
                    converted = True
            except (TypeError, ValueError, OverflowError):
                pass

        if data.dtype == "int" and data.dtype != "int64":
            # coerce ints to 64
            try:
                data = data.astype("int64")
                converted = True
            except (TypeError, ValueError):
                pass

        # if we have an index, we want to preserve dtypes
        if name == "index" and len(data):
            if self.orient == "split":
                return data, False

        return data, converted

    @final
    def _try_convert_to_date(self, data: Series) -> tuple[Series, bool]:
        """
        Try to parse an ndarray-like into a date column.

        Try to coerce objects in epoch/iso formats and integers/floats in
        epoch formats. Return a boolean indicating whether parsing was
        successful.
        """
        # no conversion on empty
        if not len(data):
            return data, False

        new_data = data

        if new_data.dtype == "string":
            new_data = new_data.astype(object)

        if new_data.dtype == "object":
            try:
                new_data = data.astype("int64")
            except OverflowError:
                return data, False
            except (TypeError, ValueError):
                pass

        # ignore numbers that are out of range
        if issubclass(new_data.dtype.type, np.number):
            in_range = (
                isna(new_data._values)
                | (new_data > self.min_stamp)
                | (new_data._values == iNaT)
            )
            if not in_range.all():
                return data, False

        date_units = (self.date_unit,) if self.date_unit else self._STAMP_UNITS
        for date_unit in date_units:
            try:
                with warnings.catch_warnings():
                    warnings.filterwarnings(
                        "ignore",
                        ".*parsing datetimes with mixed time "
                        "zones will raise an error",
                        category=FutureWarning,
                    )
                    new_data = to_datetime(new_data, errors="raise", unit=date_unit)
            except (ValueError, OverflowError, TypeError):
                continue
            return new_data, True
        return data, False


class SeriesParser(Parser):
    _default_orient = "index"
    _split_keys = ("name", "index", "data")
    obj: Series | None

    def _parse(self) -> None:
        data = ujson_loads(self.json, precise_float=self.precise_float)

        if self.orient == "split":
            decoded = {str(k): v for k, v in data.items()}
            self.check_keys_split(decoded)
            self.obj = Series(**decoded)
        else:
            self.obj = Series(data)

    def _try_convert_types(self) -> None:
        if self.obj is None:
            return
        obj, result = self._try_convert_data(
            "data", self.obj, convert_dates=self.convert_dates
        )
        if result:
            self.obj = obj


class FrameParser(Parser):
    _default_orient = "columns"
    _split_keys = ("columns", "index", "data")
    obj: DataFrame | None

    def _parse(self) -> None:
        json = self.json
        orient = self.orient

        if orient == "columns":
            self.obj = DataFrame(
                ujson_loads(json, precise_float=self.precise_float), dtype=None
            )
        elif orient == "split":
            decoded = {
                str(k): v
                for k, v in ujson_loads(json, precise_float=self.precise_float).items()
            }
            self.check_keys_split(decoded)
            orig_names = [
                (tuple(col) if isinstance(col, list) else col)
                for col in decoded["columns"]
            ]
            decoded["columns"] = dedup_names(
                orig_names,
                is_potential_multi_index(orig_names, None),
            )
            self.obj = DataFrame(dtype=None, **decoded)
        elif orient == "index":
            self.obj = DataFrame.from_dict(
                ujson_loads(json, precise_float=self.precise_float),
                dtype=None,
                orient="index",
            )
        elif orient == "table":
            self.obj = parse_table_schema(json, precise_float=self.precise_float)
        else:
            self.obj = DataFrame(
                ujson_loads(json, precise_float=self.precise_float), dtype=None
            )

    def _process_converter(
        self,
        f: Callable[[Hashable, Series], tuple[Series, bool]],
        filt: Callable[[Hashable], bool] | None = None,
    ) -> None:
        """
        Take a conversion function and possibly recreate the frame.
        """
        if filt is None:
            filt = lambda col: True

        obj = self.obj
        assert obj is not None  # for mypy

        needs_new_obj = False
        new_obj = {}
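        # key the rebuilt columns by position rather than label so duplicate
        # column names survive the DataFrame reconstruction below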
        for i, (col, c) in enumerate(obj.items()):
            if filt(col):
                new_data, result = f(col, c)
                if result:
                    c = new_data
                    needs_new_obj = True
            new_obj[i] = c

        if needs_new_obj:
            # possibly handle dup columns
            new_frame = DataFrame(new_obj, index=obj.index)
            new_frame.columns = obj.columns
            self.obj = new_frame

    def _try_convert_types(self) -> None:
        if self.obj is None:
            return
        if self.convert_dates:
            self._try_convert_dates()

        self._process_converter(
            lambda col, c: self._try_convert_data(col, c, convert_dates=False)
        )

    def _try_convert_dates(self) -> None:
        if self.obj is None:
            return

        # our columns to parse
        convert_dates_list_bool = self.convert_dates
        if isinstance(convert_dates_list_bool, bool):
            convert_dates_list_bool = []
        convert_dates = set(convert_dates_list_bool)

        def is_ok(col) -> bool:
            """
            Return if this col is ok to try for a date parse.
            """
            if col in convert_dates:
                return True
            if not self.keep_default_dates:
                return False
            if not isinstance(col, str):
                return False

            col_lower = col.lower()
            if (
                col_lower.endswith(("_at", "_time"))
                or col_lower == "modified"
                or col_lower == "date"
                or col_lower == "datetime"
                or col_lower.startswith("timestamp")
            ):
                return True
            return False

        self._process_converter(lambda col, c: self._try_convert_to_date(c), filt=is_ok)