Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/io/json/_json.py: 23%


501 statements  

from __future__ import annotations

from abc import (
    ABC,
    abstractmethod,
)
from collections import abc
from io import StringIO
from itertools import islice
from types import TracebackType
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Generic,
    Literal,
    Mapping,
    TypeVar,
    overload,
)

import numpy as np

from pandas._libs import lib
from pandas._libs.json import (
    dumps,
    loads,
)
from pandas._libs.tslibs import iNaT
from pandas._typing import (
    CompressionOptions,
    DtypeArg,
    DtypeBackend,
    FilePath,
    IndexLabel,
    JSONEngine,
    JSONSerializable,
    ReadBuffer,
    StorageOptions,
    WriteBuffer,
)
from pandas.compat._optional import import_optional_dependency
from pandas.errors import AbstractMethodError
from pandas.util._decorators import doc
from pandas.util._validators import check_dtype_backend

from pandas.core.dtypes.common import (
    ensure_str,
    is_period_dtype,
)
from pandas.core.dtypes.generic import ABCIndex

from pandas import (
    ArrowDtype,
    DataFrame,
    MultiIndex,
    Series,
    isna,
    notna,
    to_datetime,
)
from pandas.core.reshape.concat import concat
from pandas.core.shared_docs import _shared_docs

from pandas.io.common import (
    IOHandles,
    dedup_names,
    extension_to_compression,
    file_exists,
    get_handle,
    is_fsspec_url,
    is_potential_multi_index,
    is_url,
    stringify_path,
)
from pandas.io.json._normalize import convert_to_line_delimits
from pandas.io.json._table_schema import (
    build_table_schema,
    parse_table_schema,
)
from pandas.io.parsers.readers import validate_integer

if TYPE_CHECKING:
    from pandas.core.generic import NDFrame

FrameSeriesStrT = TypeVar("FrameSeriesStrT", bound=Literal["frame", "series"])


# interface to/from
@overload
def to_json(
    path_or_buf: FilePath | WriteBuffer[str] | WriteBuffer[bytes],
    obj: NDFrame,
    orient: str | None = ...,
    date_format: str = ...,
    double_precision: int = ...,
    force_ascii: bool = ...,
    date_unit: str = ...,
    default_handler: Callable[[Any], JSONSerializable] | None = ...,
    lines: bool = ...,
    compression: CompressionOptions = ...,
    index: bool = ...,
    indent: int = ...,
    storage_options: StorageOptions = ...,
    mode: Literal["a", "w"] = ...,
) -> None:
    ...


@overload
def to_json(
    path_or_buf: None,
    obj: NDFrame,
    orient: str | None = ...,
    date_format: str = ...,
    double_precision: int = ...,
    force_ascii: bool = ...,
    date_unit: str = ...,
    default_handler: Callable[[Any], JSONSerializable] | None = ...,
    lines: bool = ...,
    compression: CompressionOptions = ...,
    index: bool = ...,
    indent: int = ...,
    storage_options: StorageOptions = ...,
    mode: Literal["a", "w"] = ...,
) -> str:
    ...


def to_json(
    path_or_buf: FilePath | WriteBuffer[str] | WriteBuffer[bytes] | None,
    obj: NDFrame,
    orient: str | None = None,
    date_format: str = "epoch",
    double_precision: int = 10,
    force_ascii: bool = True,
    date_unit: str = "ms",
    default_handler: Callable[[Any], JSONSerializable] | None = None,
    lines: bool = False,
    compression: CompressionOptions = "infer",
    index: bool = True,
    indent: int = 0,
    storage_options: StorageOptions = None,
    mode: Literal["a", "w"] = "w",
) -> str | None:
    if not index and orient not in ["split", "table"]:
        raise ValueError(
            "'index=False' is only valid when 'orient' is 'split' or 'table'"
        )

    if lines and orient != "records":
        raise ValueError("'lines' keyword is only valid when 'orient' is 'records'")

    if mode not in ["a", "w"]:
        msg = (
            f"mode={mode} is not a valid option. "
            "Only 'w' and 'a' are currently supported."
        )
        raise ValueError(msg)

    if mode == "a" and (not lines or orient != "records"):
        msg = (
            "mode='a' (append) is only supported when "
            "lines is True and orient is 'records'"
        )
        raise ValueError(msg)

    if orient == "table" and isinstance(obj, Series):
        obj = obj.to_frame(name=obj.name or "values")

    writer: type[Writer]
    if orient == "table" and isinstance(obj, DataFrame):
        writer = JSONTableWriter
    elif isinstance(obj, Series):
        writer = SeriesWriter
    elif isinstance(obj, DataFrame):
        writer = FrameWriter
    else:
        raise NotImplementedError("'obj' should be a Series or a DataFrame")

    s = writer(
        obj,
        orient=orient,
        date_format=date_format,
        double_precision=double_precision,
        ensure_ascii=force_ascii,
        date_unit=date_unit,
        default_handler=default_handler,
        index=index,
        indent=indent,
    ).write()

    if lines:
        s = convert_to_line_delimits(s)

    if path_or_buf is not None:
        # apply compression and byte/text conversion
        with get_handle(
            path_or_buf, mode, compression=compression, storage_options=storage_options
        ) as handles:
            handles.handle.write(s)
    else:
        return s
    return None
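

# Editor's sketch (not part of pandas): a quick illustration of the rules
# enforced by to_json() above, exercised through the public DataFrame.to_json
# API, which forwards here. The _demo_* name is hypothetical.
def _demo_to_json_validation() -> None:
    import pandas as pd

    df = pd.DataFrame({"a": [1, 2]})
    # lines=True requires orient="records"; output is newline-delimited.
    print(df.to_json(orient="records", lines=True))  # {"a":1}\n{"a":2}
    try:
        # default orient for a DataFrame is "columns", so this raises
        df.to_json(lines=True)
    except ValueError as err:
        print(err)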


class Writer(ABC):
    _default_orient: str

    def __init__(
        self,
        obj: NDFrame,
        orient: str | None,
        date_format: str,
        double_precision: int,
        ensure_ascii: bool,
        date_unit: str,
        index: bool,
        default_handler: Callable[[Any], JSONSerializable] | None = None,
        indent: int = 0,
    ) -> None:
        self.obj = obj

        if orient is None:
            orient = self._default_orient

        self.orient = orient
        self.date_format = date_format
        self.double_precision = double_precision
        self.ensure_ascii = ensure_ascii
        self.date_unit = date_unit
        self.default_handler = default_handler
        self.index = index
        self.indent = indent

        self.is_copy = None
        self._format_axes()

    def _format_axes(self):
        raise AbstractMethodError(self)

    def write(self) -> str:
        iso_dates = self.date_format == "iso"
        return dumps(
            self.obj_to_write,
            orient=self.orient,
            double_precision=self.double_precision,
            ensure_ascii=self.ensure_ascii,
            date_unit=self.date_unit,
            iso_dates=iso_dates,
            default_handler=self.default_handler,
            indent=self.indent,
        )

    @property
    @abstractmethod
    def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]:
        """Object to write in JSON format."""


class SeriesWriter(Writer):
    _default_orient = "index"

    @property
    def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]:
        if not self.index and self.orient == "split":
            return {"name": self.obj.name, "data": self.obj.values}
        else:
            return self.obj

    def _format_axes(self):
        if not self.obj.index.is_unique and self.orient == "index":
            raise ValueError(f"Series index must be unique for orient='{self.orient}'")


class FrameWriter(Writer):
    _default_orient = "columns"

    @property
    def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]:
        if not self.index and self.orient == "split":
            obj_to_write = self.obj.to_dict(orient="split")
            del obj_to_write["index"]
        else:
            obj_to_write = self.obj
        return obj_to_write

    def _format_axes(self):
        """
        Validate that the axes are unique for the requested orient.
        """
        if not self.obj.index.is_unique and self.orient in ("index", "columns"):
            raise ValueError(
                f"DataFrame index must be unique for orient='{self.orient}'."
            )
        if not self.obj.columns.is_unique and self.orient in (
            "index",
            "columns",
            "records",
        ):
            raise ValueError(
                f"DataFrame columns must be unique for orient='{self.orient}'."
            )
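

# Editor's sketch (not part of pandas): why the uniqueness checks above
# exist. With orient="columns" the index labels become JSON object keys, so
# duplicates would silently collide; orient="split" stores labels in a list
# and tolerates them. The _demo_* name is hypothetical.
def _demo_unique_index_requirement() -> None:
    import pandas as pd

    df = pd.DataFrame({"x": [1, 2]}, index=["a", "a"])
    try:
        df.to_json(orient="columns")
    except ValueError as err:
        print(err)  # DataFrame index must be unique for orient='columns'.
    df.to_json(orient="split")  # fine: labels are kept as a list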

305 

306class JSONTableWriter(FrameWriter): 

307 _default_orient = "records" 

308 

309 def __init__( 

310 self, 

311 obj, 

312 orient: str | None, 

313 date_format: str, 

314 double_precision: int, 

315 ensure_ascii: bool, 

316 date_unit: str, 

317 index: bool, 

318 default_handler: Callable[[Any], JSONSerializable] | None = None, 

319 indent: int = 0, 

320 ) -> None: 

321 """ 

322 Adds a `schema` attribute with the Table Schema, resets 

323 the index (can't do in caller, because the schema inference needs 

324 to know what the index is, forces orient to records, and forces 

325 date_format to 'iso'. 

326 """ 

327 super().__init__( 

328 obj, 

329 orient, 

330 date_format, 

331 double_precision, 

332 ensure_ascii, 

333 date_unit, 

334 index, 

335 default_handler=default_handler, 

336 indent=indent, 

337 ) 

338 

339 if date_format != "iso": 

340 msg = ( 

341 "Trying to write with `orient='table'` and " 

342 f"`date_format='{date_format}'`. Table Schema requires dates " 

343 "to be formatted with `date_format='iso'`" 

344 ) 

345 raise ValueError(msg) 

346 

347 self.schema = build_table_schema(obj, index=self.index) 

348 

349 # NotImplemented on a column MultiIndex 

350 if obj.ndim == 2 and isinstance(obj.columns, MultiIndex): 

351 raise NotImplementedError( 

352 "orient='table' is not supported for MultiIndex columns" 

353 ) 

354 

355 # TODO: Do this timedelta properly in objToJSON.c See GH #15137 

356 if ( 

357 (obj.ndim == 1) 

358 and (obj.name in set(obj.index.names)) 

359 or len(obj.columns.intersection(obj.index.names)) 

360 ): 

361 msg = "Overlapping names between the index and columns" 

362 raise ValueError(msg) 

363 

364 obj = obj.copy() 

365 timedeltas = obj.select_dtypes(include=["timedelta"]).columns 

366 if len(timedeltas): 

367 obj[timedeltas] = obj[timedeltas].applymap(lambda x: x.isoformat()) 

368 # Convert PeriodIndex to datetimes before serializing 

369 if is_period_dtype(obj.index.dtype): 

370 obj.index = obj.index.to_timestamp() 

371 

372 # exclude index from obj if index=False 

373 if not self.index: 

374 self.obj = obj.reset_index(drop=True) 

375 else: 

376 self.obj = obj.reset_index(drop=False) 

377 self.date_format = "iso" 

378 self.orient = "records" 

379 self.index = index 

380 

381 @property 

382 def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]: 

383 return {"schema": self.schema, "data": self.obj} 

384 

385 
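

# Editor's sketch (not part of pandas): what JSONTableWriter produces.
# orient="table" emits a {"schema": ..., "data": ...} document where the
# schema follows the Table Schema spec and the index has been reset into an
# ordinary column. The _demo_* name is hypothetical.
def _demo_table_orient() -> None:
    import pandas as pd

    df = pd.DataFrame({"a": [1]}, index=pd.Index(["r0"], name="key"))
    out = df.to_json(orient="table")
    # "schema" carries fields/primaryKey/pandas_version; "data" carries the
    # records, including the former index as a "key" column.
    assert '"schema"' in out and '"data"' in out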


@overload
def read_json(
    path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
    *,
    orient: str | None = ...,
    typ: Literal["frame"] = ...,
    dtype: DtypeArg | None = ...,
    convert_axes=...,
    convert_dates: bool | list[str] = ...,
    keep_default_dates: bool = ...,
    precise_float: bool = ...,
    date_unit: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    lines: bool = ...,
    chunksize: int,
    compression: CompressionOptions = ...,
    nrows: int | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
    engine: JSONEngine = ...,
) -> JsonReader[Literal["frame"]]:
    ...


@overload
def read_json(
    path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
    *,
    orient: str | None = ...,
    typ: Literal["series"],
    dtype: DtypeArg | None = ...,
    convert_axes=...,
    convert_dates: bool | list[str] = ...,
    keep_default_dates: bool = ...,
    precise_float: bool = ...,
    date_unit: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    lines: bool = ...,
    chunksize: int,
    compression: CompressionOptions = ...,
    nrows: int | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
    engine: JSONEngine = ...,
) -> JsonReader[Literal["series"]]:
    ...


@overload
def read_json(
    path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
    *,
    orient: str | None = ...,
    typ: Literal["series"],
    dtype: DtypeArg | None = ...,
    convert_axes=...,
    convert_dates: bool | list[str] = ...,
    keep_default_dates: bool = ...,
    precise_float: bool = ...,
    date_unit: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    lines: bool = ...,
    chunksize: None = ...,
    compression: CompressionOptions = ...,
    nrows: int | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
    engine: JSONEngine = ...,
) -> Series:
    ...


@overload
def read_json(
    path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
    *,
    orient: str | None = ...,
    typ: Literal["frame"] = ...,
    dtype: DtypeArg | None = ...,
    convert_axes=...,
    convert_dates: bool | list[str] = ...,
    keep_default_dates: bool = ...,
    precise_float: bool = ...,
    date_unit: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    lines: bool = ...,
    chunksize: None = ...,
    compression: CompressionOptions = ...,
    nrows: int | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
    engine: JSONEngine = ...,
) -> DataFrame:
    ...


@doc(
    storage_options=_shared_docs["storage_options"],
    decompression_options=_shared_docs["decompression_options"] % "path_or_buf",
)
def read_json(
    path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
    *,
    orient: str | None = None,
    typ: Literal["frame", "series"] = "frame",
    dtype: DtypeArg | None = None,
    convert_axes=None,
    convert_dates: bool | list[str] = True,
    keep_default_dates: bool = True,
    precise_float: bool = False,
    date_unit: str | None = None,
    encoding: str | None = None,
    encoding_errors: str | None = "strict",
    lines: bool = False,
    chunksize: int | None = None,
    compression: CompressionOptions = "infer",
    nrows: int | None = None,
    storage_options: StorageOptions = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
    engine: JSONEngine = "ujson",
) -> DataFrame | Series | JsonReader:

511 """ 

512 Convert a JSON string to pandas object. 

513 

514 Parameters 

515 ---------- 

516 path_or_buf : a valid JSON str, path object or file-like object 

517 Any valid string path is acceptable. The string could be a URL. Valid 

518 URL schemes include http, ftp, s3, and file. For file URLs, a host is 

519 expected. A local file could be: 

520 ``file://localhost/path/to/table.json``. 

521 

522 If you want to pass in a path object, pandas accepts any 

523 ``os.PathLike``. 

524 

525 By file-like object, we refer to objects with a ``read()`` method, 

526 such as a file handle (e.g. via builtin ``open`` function) 

527 or ``StringIO``. 

528 orient : str, optional 

529 Indication of expected JSON string format. 

530 Compatible JSON strings can be produced by ``to_json()`` with a 

531 corresponding orient value. 

532 The set of possible orients is: 

533 

534 - ``'split'`` : dict like 

535 ``{{index -> [index], columns -> [columns], data -> [values]}}`` 

536 - ``'records'`` : list like 

537 ``[{{column -> value}}, ... , {{column -> value}}]`` 

538 - ``'index'`` : dict like ``{{index -> {{column -> value}}}}`` 

539 - ``'columns'`` : dict like ``{{column -> {{index -> value}}}}`` 

540 - ``'values'`` : just the values array 

541 

542 The allowed and default values depend on the value 

543 of the `typ` parameter. 

544 

545 * when ``typ == 'series'``, 

546 

547 - allowed orients are ``{{'split','records','index'}}`` 

548 - default is ``'index'`` 

549 - The Series index must be unique for orient ``'index'``. 

550 

551 * when ``typ == 'frame'``, 

552 

553 - allowed orients are ``{{'split','records','index', 

554 'columns','values', 'table'}}`` 

555 - default is ``'columns'`` 

556 - The DataFrame index must be unique for orients ``'index'`` and 

557 ``'columns'``. 

558 - The DataFrame columns must be unique for orients ``'index'``, 

559 ``'columns'``, and ``'records'``. 

560 

561 typ : {{'frame', 'series'}}, default 'frame' 

562 The type of object to recover. 

563 

564 dtype : bool or dict, default None 

565 If True, infer dtypes; if a dict of column to dtype, then use those; 

566 if False, then don't infer dtypes at all, applies only to the data. 

567 

568 For all ``orient`` values except ``'table'``, default is True. 

569 

570 convert_axes : bool, default None 

571 Try to convert the axes to the proper dtypes. 

572 

573 For all ``orient`` values except ``'table'``, default is True. 

574 

575 convert_dates : bool or list of str, default True 

576 If True then default datelike columns may be converted (depending on 

577 keep_default_dates). 

578 If False, no dates will be converted. 

579 If a list of column names, then those columns will be converted and 

580 default datelike columns may also be converted (depending on 

581 keep_default_dates). 

582 

583 keep_default_dates : bool, default True 

584 If parsing dates (convert_dates is not False), then try to parse the 

585 default datelike columns. 

586 A column label is datelike if 

587 

588 * it ends with ``'_at'``, 

589 

590 * it ends with ``'_time'``, 

591 

592 * it begins with ``'timestamp'``, 

593 

594 * it is ``'modified'``, or 

595 

596 * it is ``'date'``. 

597 

598 precise_float : bool, default False 

599 Set to enable usage of higher precision (strtod) function when 

600 decoding string to double values. Default (False) is to use fast but 

601 less precise builtin functionality. 

602 

603 date_unit : str, default None 

604 The timestamp unit to detect if converting dates. The default behaviour 

605 is to try and detect the correct precision, but if this is not desired 

606 then pass one of 's', 'ms', 'us' or 'ns' to force parsing only seconds, 

607 milliseconds, microseconds or nanoseconds respectively. 

608 

609 encoding : str, default is 'utf-8' 

610 The encoding to use to decode py3 bytes. 

611 

612 encoding_errors : str, optional, default "strict" 

613 How encoding errors are treated. `List of possible values 

614 <https://docs.python.org/3/library/codecs.html#error-handlers>`_ . 

615 

616 .. versionadded:: 1.3.0 

617 

618 lines : bool, default False 

619 Read the file as a json object per line. 

620 

621 chunksize : int, optional 

622 Return JsonReader object for iteration. 

623 See the `line-delimited json docs 

624 <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#line-delimited-json>`_ 

625 for more information on ``chunksize``. 

626 This can only be passed if `lines=True`. 

627 If this is None, the file will be read into memory all at once. 

628 

629 .. versionchanged:: 1.2 

630 

631 ``JsonReader`` is a context manager. 

632 

633 {decompression_options} 

634 

635 .. versionchanged:: 1.4.0 Zstandard support. 

636 

637 nrows : int, optional 

638 The number of lines from the line-delimited jsonfile that has to be read. 

639 This can only be passed if `lines=True`. 

640 If this is None, all the rows will be returned. 

641 

642 .. versionadded:: 1.1 

643 

644 {storage_options} 

645 

646 .. versionadded:: 1.2.0 

647 

648 dtype_backend : {{"numpy_nullable", "pyarrow"}}, defaults to NumPy backed DataFrames 

649 Which dtype_backend to use, e.g. whether a DataFrame should have NumPy 

650 arrays, nullable dtypes are used for all dtypes that have a nullable 

651 implementation when "numpy_nullable" is set, pyarrow is used for all 

652 dtypes if "pyarrow" is set. 

653 

654 The dtype_backends are still experimential. 

655 

656 .. versionadded:: 2.0 

657 

658 engine : {{"ujson", "pyarrow"}}, default "ujson" 

659 Parser engine to use. The ``"pyarrow"`` engine is only available when 

660 ``lines=True``. 

661 

662 .. versionadded:: 2.0 

663 

664 Returns 

665 ------- 

666 Series or DataFrame 

667 The type returned depends on the value of `typ`. 

668 

669 See Also 

670 -------- 

671 DataFrame.to_json : Convert a DataFrame to a JSON string. 

672 Series.to_json : Convert a Series to a JSON string. 

673 json_normalize : Normalize semi-structured JSON data into a flat table. 

674 

675 Notes 

676 ----- 

677 Specific to ``orient='table'``, if a :class:`DataFrame` with a literal 

678 :class:`Index` name of `index` gets written with :func:`to_json`, the 

679 subsequent read operation will incorrectly set the :class:`Index` name to 

680 ``None``. This is because `index` is also used by :func:`DataFrame.to_json` 

681 to denote a missing :class:`Index` name, and the subsequent 

682 :func:`read_json` operation cannot distinguish between the two. The same 

683 limitation is encountered with a :class:`MultiIndex` and any names 

684 beginning with ``'level_'``. 

685 

686 Examples 

687 -------- 

688 >>> df = pd.DataFrame([['a', 'b'], ['c', 'd']], 

689 ... index=['row 1', 'row 2'], 

690 ... columns=['col 1', 'col 2']) 

691 

692 Encoding/decoding a Dataframe using ``'split'`` formatted JSON: 

693 

694 >>> df.to_json(orient='split') 

695 '\ 

696{{\ 

697"columns":["col 1","col 2"],\ 

698"index":["row 1","row 2"],\ 

699"data":[["a","b"],["c","d"]]\ 

700}}\ 

701' 

702 >>> pd.read_json(_, orient='split') 

703 col 1 col 2 

704 row 1 a b 

705 row 2 c d 

706 

707 Encoding/decoding a Dataframe using ``'index'`` formatted JSON: 

708 

709 >>> df.to_json(orient='index') 

710 '{{"row 1":{{"col 1":"a","col 2":"b"}},"row 2":{{"col 1":"c","col 2":"d"}}}}' 

711 

712 >>> pd.read_json(_, orient='index') 

713 col 1 col 2 

714 row 1 a b 

715 row 2 c d 

716 

717 Encoding/decoding a Dataframe using ``'records'`` formatted JSON. 

718 Note that index labels are not preserved with this encoding. 

719 

720 >>> df.to_json(orient='records') 

721 '[{{"col 1":"a","col 2":"b"}},{{"col 1":"c","col 2":"d"}}]' 

722 >>> pd.read_json(_, orient='records') 

723 col 1 col 2 

724 0 a b 

725 1 c d 

726 

727 Encoding with Table Schema 

728 

729 >>> df.to_json(orient='table') 

730 '\ 

731{{"schema":{{"fields":[\ 

732{{"name":"index","type":"string"}},\ 

733{{"name":"col 1","type":"string"}},\ 

734{{"name":"col 2","type":"string"}}],\ 

735"primaryKey":["index"],\ 

736"pandas_version":"1.4.0"}},\ 

737"data":[\ 

738{{"index":"row 1","col 1":"a","col 2":"b"}},\ 

739{{"index":"row 2","col 1":"c","col 2":"d"}}]\ 

740}}\ 

741' 

742 """ 

    if orient == "table" and dtype:
        raise ValueError("cannot pass both dtype and orient='table'")
    if orient == "table" and convert_axes:
        raise ValueError("cannot pass both convert_axes and orient='table'")

    check_dtype_backend(dtype_backend)

    if dtype is None and orient != "table":
        # error: Incompatible types in assignment (expression has type "bool", variable
        # has type "Union[ExtensionDtype, str, dtype[Any], Type[str], Type[float],
        # Type[int], Type[complex], Type[bool], Type[object], Dict[Hashable,
        # Union[ExtensionDtype, Union[str, dtype[Any]], Type[str], Type[float],
        # Type[int], Type[complex], Type[bool], Type[object]]], None]")
        dtype = True  # type: ignore[assignment]
    if convert_axes is None and orient != "table":
        convert_axes = True

    json_reader = JsonReader(
        path_or_buf,
        orient=orient,
        typ=typ,
        dtype=dtype,
        convert_axes=convert_axes,
        convert_dates=convert_dates,
        keep_default_dates=keep_default_dates,
        precise_float=precise_float,
        date_unit=date_unit,
        encoding=encoding,
        lines=lines,
        chunksize=chunksize,
        compression=compression,
        nrows=nrows,
        storage_options=storage_options,
        encoding_errors=encoding_errors,
        dtype_backend=dtype_backend,
        engine=engine,
    )

    if chunksize:
        return json_reader
    else:
        return json_reader.read()
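

# Editor's sketch (not part of pandas): the two return paths above. With
# chunksize, read_json returns the JsonReader itself for iteration;
# otherwise it parses eagerly. The _demo_* name is hypothetical.
def _demo_read_json_chunked() -> None:
    from io import StringIO

    import pandas as pd

    payload = '{"a": 1}\n{"a": 2}\n{"a": 3}\n'
    with pd.read_json(StringIO(payload), lines=True, chunksize=2) as reader:
        for chunk in reader:
            print(len(chunk))  # 2, then 1
    # Without chunksize the whole document is parsed at once:
    assert len(pd.read_json(StringIO(payload), lines=True)) == 3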


class JsonReader(abc.Iterator, Generic[FrameSeriesStrT]):
    """
    JsonReader provides an interface for reading in a JSON file.

    If initialized with ``lines=True`` and ``chunksize``, can be iterated over
    ``chunksize`` lines at a time. Otherwise, calling ``read`` reads in the
    whole document.
    """

    def __init__(
        self,
        filepath_or_buffer,
        orient,
        typ: FrameSeriesStrT,
        dtype,
        convert_axes,
        convert_dates,
        keep_default_dates: bool,
        precise_float: bool,
        date_unit,
        encoding,
        lines: bool,
        chunksize: int | None,
        compression: CompressionOptions,
        nrows: int | None,
        storage_options: StorageOptions = None,
        encoding_errors: str | None = "strict",
        dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
        engine: JSONEngine = "ujson",
    ) -> None:
        self.orient = orient
        self.typ = typ
        self.dtype = dtype
        self.convert_axes = convert_axes
        self.convert_dates = convert_dates
        self.keep_default_dates = keep_default_dates
        self.precise_float = precise_float
        self.date_unit = date_unit
        self.encoding = encoding
        self.engine = engine
        self.compression = compression
        self.storage_options = storage_options
        self.lines = lines
        self.chunksize = chunksize
        self.nrows_seen = 0
        self.nrows = nrows
        self.encoding_errors = encoding_errors
        self.handles: IOHandles[str] | None = None
        self.dtype_backend = dtype_backend

        if self.engine not in {"pyarrow", "ujson"}:
            raise ValueError(
                f"The engine type {self.engine} is currently not supported."
            )
        if self.chunksize is not None:
            self.chunksize = validate_integer("chunksize", self.chunksize, 1)
            if not self.lines:
                raise ValueError("chunksize can only be passed if lines=True")
            if self.engine == "pyarrow":
                raise ValueError(
                    "currently pyarrow engine doesn't support chunksize parameter"
                )
        if self.nrows is not None:
            self.nrows = validate_integer("nrows", self.nrows, 0)
            if not self.lines:
                raise ValueError("nrows can only be passed if lines=True")
        if self.engine == "pyarrow":
            if not self.lines:
                raise ValueError(
                    "currently pyarrow engine only supports "
                    "the line-delimited JSON format"
                )
            self.data = filepath_or_buffer
        elif self.engine == "ujson":
            data = self._get_data_from_filepath(filepath_or_buffer)
            self.data = self._preprocess_data(data)

    def _preprocess_data(self, data):
        """
        At this point, the data either has a `read` attribute (e.g. a file
        object or a StringIO) or is a string that is a JSON document.

        If self.chunksize, we prepare the data for the `__next__` method.
        Otherwise, we read it into memory for the `read` method.
        """
        if hasattr(data, "read") and not (self.chunksize or self.nrows):
            with self:
                data = data.read()
        if not hasattr(data, "read") and (self.chunksize or self.nrows):
            data = StringIO(data)

        return data
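
    # Editor's note (illustrative, not pandas API): the branches above
    # normalize the input so later stages see the right shape. A one-shot
    # parse drains a file-like object into a string; a chunked read wraps a
    # plain string, e.g. StringIO('{"a": 1}\n{"a": 2}\n'), so that __next__
    # can pull one JSON document per line via islice.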

    def _get_data_from_filepath(self, filepath_or_buffer):
        """
        The function read_json accepts three input types:
            1. filepath (string-like)
            2. file-like object (e.g. open file object, StringIO)
            3. JSON string

        This method turns (1) into (2) to simplify the rest of the processing.
        It returns input types (2) and (3) unchanged.

        It raises FileNotFoundError if the input is a string ending in
        one of .json, .json.gz, .json.bz2, etc. but no such file exists.
        """
        # if it is a string but the file does not exist, it might be a JSON string
        filepath_or_buffer = stringify_path(filepath_or_buffer)
        if (
            not isinstance(filepath_or_buffer, str)
            or is_url(filepath_or_buffer)
            or is_fsspec_url(filepath_or_buffer)
            or file_exists(filepath_or_buffer)
        ):
            self.handles = get_handle(
                filepath_or_buffer,
                "r",
                encoding=self.encoding,
                compression=self.compression,
                storage_options=self.storage_options,
                errors=self.encoding_errors,
            )
            filepath_or_buffer = self.handles.handle
        elif (
            isinstance(filepath_or_buffer, str)
            and filepath_or_buffer.lower().endswith(
                (".json",) + tuple(f".json{c}" for c in extension_to_compression)
            )
            and not file_exists(filepath_or_buffer)
        ):
            raise FileNotFoundError(f"File {filepath_or_buffer} does not exist")

        return filepath_or_buffer
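
    # Editor's note (illustrative): per the branching above, a string that is
    # not a URL and does not exist on disk is treated as literal JSON, unless
    # it carries a .json(.gz/.bz2/...) suffix, in which case the missing file
    # raises FileNotFoundError instead of a confusing parse error:
    #
    #     pd.read_json('{"a": {"0": 1}}')   # parsed as a JSON string
    #     pd.read_json("missing.json")      # FileNotFoundError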

    def _combine_lines(self, lines) -> str:
        """
        Combines a list of JSON objects into one JSON object.
        """
        return (
            f'[{",".join([line for line in (line.strip() for line in lines) if line])}]'
        )
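
    # Editor's note (illustrative): _combine_lines turns newline-delimited
    # JSON into a single JSON array so a chunk can be parsed in one pass;
    # blank lines are dropped by the inner filter:
    #
    #     ['{"a":1}', '', '{"a":2}']  ->  '[{"a":1},{"a":2}]'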

    @overload
    def read(self: JsonReader[Literal["frame"]]) -> DataFrame:
        ...

    @overload
    def read(self: JsonReader[Literal["series"]]) -> Series:
        ...

    @overload
    def read(self: JsonReader[Literal["frame", "series"]]) -> DataFrame | Series:
        ...

    def read(self) -> DataFrame | Series:
        """
        Read the whole JSON input into a pandas object.
        """
        obj: DataFrame | Series
        with self:
            if self.engine == "pyarrow":
                pyarrow_json = import_optional_dependency("pyarrow.json")
                pa_table = pyarrow_json.read_json(self.data)

                mapping: type[ArrowDtype] | None | Callable
                if self.dtype_backend == "pyarrow":
                    mapping = ArrowDtype
                elif self.dtype_backend == "numpy_nullable":
                    from pandas.io._util import _arrow_dtype_mapping

                    mapping = _arrow_dtype_mapping().get
                else:
                    mapping = None

                return pa_table.to_pandas(types_mapper=mapping)
            elif self.engine == "ujson":
                if self.lines:
                    if self.chunksize:
                        obj = concat(self)
                    elif self.nrows:
                        lines = list(islice(self.data, self.nrows))
                        lines_json = self._combine_lines(lines)
                        obj = self._get_object_parser(lines_json)
                    else:
                        data = ensure_str(self.data)
                        data_lines = data.split("\n")
                        obj = self._get_object_parser(self._combine_lines(data_lines))
                else:
                    obj = self._get_object_parser(self.data)
                if self.dtype_backend is not lib.no_default:
                    return obj.convert_dtypes(
                        infer_objects=False, dtype_backend=self.dtype_backend
                    )
                else:
                    return obj
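
    # Editor's note (illustrative): read() is the eager path. With the ujson
    # engine and lines=True, nrows trims the input via islice before
    # combining, while chunksize makes concat(self) drive __next__ below. The
    # pyarrow engine bypasses all of this and lets pyarrow.json build the
    # table directly, honoring dtype_backend through the types_mapper above.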

    def _get_object_parser(self, json) -> DataFrame | Series:
        """
        Parses a json document into a pandas object.
        """
        typ = self.typ
        dtype = self.dtype
        kwargs = {
            "orient": self.orient,
            "dtype": self.dtype,
            "convert_axes": self.convert_axes,
            "convert_dates": self.convert_dates,
            "keep_default_dates": self.keep_default_dates,
            "precise_float": self.precise_float,
            "date_unit": self.date_unit,
            "dtype_backend": self.dtype_backend,
        }
        obj = None
        if typ == "frame":
            obj = FrameParser(json, **kwargs).parse()

        if typ == "series" or obj is None:
            if not isinstance(dtype, bool):
                kwargs["dtype"] = dtype
            obj = SeriesParser(json, **kwargs).parse()

        return obj

    def close(self) -> None:
        """
        If we opened a stream earlier, in _get_data_from_filepath, we should
        close it.

        If an open stream or file was passed, we leave it open.
        """
        if self.handles is not None:
            self.handles.close()

    def __iter__(self: JsonReader[FrameSeriesStrT]) -> JsonReader[FrameSeriesStrT]:
        return self

    @overload
    def __next__(self: JsonReader[Literal["frame"]]) -> DataFrame:
        ...

    @overload
    def __next__(self: JsonReader[Literal["series"]]) -> Series:
        ...

    @overload
    def __next__(self: JsonReader[Literal["frame", "series"]]) -> DataFrame | Series:
        ...

    def __next__(self) -> DataFrame | Series:
        if self.nrows and self.nrows_seen >= self.nrows:
            self.close()
            raise StopIteration

        lines = list(islice(self.data, self.chunksize))
        if not lines:
            self.close()
            raise StopIteration

        try:
            lines_json = self._combine_lines(lines)
            obj = self._get_object_parser(lines_json)

            # Make sure that the returned objects have the right index.
            obj.index = range(self.nrows_seen, self.nrows_seen + len(obj))
            self.nrows_seen += len(obj)
        except Exception as ex:
            self.close()
            raise ex

        if self.dtype_backend is not lib.no_default:
            return obj.convert_dtypes(
                infer_objects=False, dtype_backend=self.dtype_backend
            )
        else:
            return obj

    def __enter__(self) -> JsonReader[FrameSeriesStrT]:
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        traceback: TracebackType | None,
    ) -> None:
        self.close()
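

# Editor's sketch (not part of pandas): __next__ above re-labels each chunk
# so row labels continue across chunks (via nrows_seen), and close() only
# closes handles the reader opened itself. The _demo_* name is hypothetical.
def _demo_chunk_indexing() -> None:
    from io import StringIO

    import pandas as pd

    payload = "\n".join(f'{{"a": {i}}}' for i in range(5))
    with pd.read_json(StringIO(payload), lines=True, chunksize=2) as reader:
        starts = [chunk.index[0] for chunk in reader]
    assert starts == [0, 2, 4]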


class Parser:
    _split_keys: tuple[str, ...]
    _default_orient: str

    _STAMP_UNITS = ("s", "ms", "us", "ns")
    _MIN_STAMPS = {
        "s": 31536000,
        "ms": 31536000000,
        "us": 31536000000000,
        "ns": 31536000000000000,
    }

    def __init__(
        self,
        json,
        orient,
        dtype: DtypeArg | None = None,
        convert_axes: bool = True,
        convert_dates: bool | list[str] = True,
        keep_default_dates: bool = False,
        precise_float: bool = False,
        date_unit=None,
        dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
    ) -> None:
        self.json = json

        if orient is None:
            orient = self._default_orient

        self.orient = orient

        self.dtype = dtype

        if date_unit is not None:
            date_unit = date_unit.lower()
            if date_unit not in self._STAMP_UNITS:
                raise ValueError(f"date_unit must be one of {self._STAMP_UNITS}")
            self.min_stamp = self._MIN_STAMPS[date_unit]
        else:
            self.min_stamp = self._MIN_STAMPS["s"]

        self.precise_float = precise_float
        self.convert_axes = convert_axes
        self.convert_dates = convert_dates
        self.date_unit = date_unit
        self.keep_default_dates = keep_default_dates
        self.obj: DataFrame | Series | None = None
        self.dtype_backend = dtype_backend

    def check_keys_split(self, decoded) -> None:
        """
        Checks that dict has only the appropriate keys for orient='split'.
        """
        bad_keys = set(decoded.keys()).difference(set(self._split_keys))
        if bad_keys:
            bad_keys_joined = ", ".join(bad_keys)
            raise ValueError(f"JSON data had unexpected key(s): {bad_keys_joined}")

    def parse(self):
        self._parse()

        if self.obj is None:
            return None
        if self.convert_axes:
            self._convert_axes()
        self._try_convert_types()
        return self.obj

    def _parse(self):
        raise AbstractMethodError(self)

    def _convert_axes(self) -> None:
        """
        Try to convert axes.
        """
        obj = self.obj
        assert obj is not None  # for mypy
        for axis_name in obj._AXIS_ORDERS:
            new_axis, result = self._try_convert_data(
                name=axis_name,
                data=obj._get_axis(axis_name),
                use_dtypes=False,
                convert_dates=True,
            )
            if result:
                setattr(self.obj, axis_name, new_axis)

    def _try_convert_types(self):
        raise AbstractMethodError(self)

    def _try_convert_data(
        self,
        name,
        data,
        use_dtypes: bool = True,
        convert_dates: bool | list[str] = True,
    ):
        """
        Try to parse an ndarray-like into a column by inferring dtype.
        """
        # don't try to coerce, unless a force conversion
        if use_dtypes:
            if not self.dtype:
                if all(notna(data)):
                    return data, False
                return data.fillna(np.nan), True

            elif self.dtype is True:
                pass
            else:
                # dtype to force
                dtype = (
                    self.dtype.get(name) if isinstance(self.dtype, dict) else self.dtype
                )
                if dtype is not None:
                    try:
                        return data.astype(dtype), True
                    except (TypeError, ValueError):
                        return data, False

        if convert_dates:
            new_data, result = self._try_convert_to_date(data)
            if result:
                return new_data, True

        if self.dtype_backend is not lib.no_default and not isinstance(data, ABCIndex):
            # Fall through for conversion later on
            return data, True
        elif data.dtype == "object":
            # try float
            try:
                data = data.astype("float64")
            except (TypeError, ValueError):
                pass

        if data.dtype.kind == "f":
            if data.dtype != "float64":
                # coerce floats to 64
                try:
                    data = data.astype("float64")
                except (TypeError, ValueError):
                    pass

        # don't coerce 0-len data
        if len(data) and data.dtype in ("float", "object"):
            # coerce ints if we can
            try:
                new_data = data.astype("int64")
                if (new_data == data).all():
                    data = new_data
            except (TypeError, ValueError, OverflowError):
                pass

        # coerce ints to 64
        if data.dtype == "int":
            try:
                data = data.astype("int64")
            except (TypeError, ValueError):
                pass

        # if we have an index, we want to preserve dtypes
        if name == "index" and len(data):
            if self.orient == "split":
                return data, False

        return data, True

    def _try_convert_to_date(self, data):
        """
        Try to parse an ndarray-like into a date column.

        Try to coerce object in epoch/iso formats and integer/float in epoch
        formats. Return a boolean if parsing was successful.
        """
        # no conversion on empty
        if not len(data):
            return data, False

        new_data = data
        if new_data.dtype == "object":
            try:
                new_data = data.astype("int64")
            except OverflowError:
                return data, False
            except (TypeError, ValueError):
                pass

        # ignore numbers that are out of range
        if issubclass(new_data.dtype.type, np.number):
            in_range = (
                isna(new_data._values)
                | (new_data > self.min_stamp)
                | (new_data._values == iNaT)
            )
            if not in_range.all():
                return data, False

        date_units = (self.date_unit,) if self.date_unit else self._STAMP_UNITS
        for date_unit in date_units:
            try:
                new_data = to_datetime(new_data, errors="raise", unit=date_unit)
            except (ValueError, OverflowError, TypeError):
                continue
            return new_data, True
        return data, False

    def _try_convert_dates(self):
        raise AbstractMethodError(self)
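

# Editor's sketch (not part of pandas): the out-of-range guard in
# _try_convert_to_date above. _MIN_STAMPS holds one year past the epoch in
# each unit (31536000 seconds = 365 days), so small integers such as
# ordinary counts are not mistaken for timestamps, while genuine epoch
# values still convert. The _demo_* name is hypothetical.
def _demo_date_inference() -> None:
    from io import StringIO

    import pandas as pd

    small = pd.read_json(StringIO('{"date": {"0": 5}}'))
    assert small["date"].dtype == "int64"  # 5 is below min_stamp: not a date
    epoch = pd.read_json(StringIO('{"date": {"0": 1609459200000}}'))
    assert str(epoch["date"].dtype).startswith("datetime64")  # 2021-01-01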


class SeriesParser(Parser):
    _default_orient = "index"
    _split_keys = ("name", "index", "data")

    def _parse(self) -> None:
        data = loads(self.json, precise_float=self.precise_float)

        if self.orient == "split":
            decoded = {str(k): v for k, v in data.items()}
            self.check_keys_split(decoded)
            self.obj = Series(**decoded)
        else:
            self.obj = Series(data)

    def _try_convert_types(self) -> None:
        if self.obj is None:
            return
        obj, result = self._try_convert_data(
            "data", self.obj, convert_dates=self.convert_dates
        )
        if result:
            self.obj = obj


class FrameParser(Parser):
    _default_orient = "columns"
    _split_keys = ("columns", "index", "data")

    def _parse(self) -> None:
        json = self.json
        orient = self.orient

        if orient == "columns":
            self.obj = DataFrame(
                loads(json, precise_float=self.precise_float), dtype=None
            )
        elif orient == "split":
            decoded = {
                str(k): v
                for k, v in loads(json, precise_float=self.precise_float).items()
            }
            self.check_keys_split(decoded)
            orig_names = [
                (tuple(col) if isinstance(col, list) else col)
                for col in decoded["columns"]
            ]
            decoded["columns"] = dedup_names(
                orig_names,
                is_potential_multi_index(orig_names, None),
            )
            self.obj = DataFrame(dtype=None, **decoded)
        elif orient == "index":
            self.obj = DataFrame.from_dict(
                loads(json, precise_float=self.precise_float),
                dtype=None,
                orient="index",
            )
        elif orient == "table":
            self.obj = parse_table_schema(json, precise_float=self.precise_float)
        else:
            self.obj = DataFrame(
                loads(json, precise_float=self.precise_float), dtype=None
            )

    def _process_converter(self, f, filt=None) -> None:
        """
        Take a conversion function and possibly recreate the frame.
        """
        if filt is None:
            filt = lambda col, c: True

        obj = self.obj
        assert obj is not None  # for mypy

        needs_new_obj = False
        new_obj = {}
        for i, (col, c) in enumerate(obj.items()):
            if filt(col, c):
                new_data, result = f(col, c)
                if result:
                    c = new_data
                    needs_new_obj = True
            new_obj[i] = c

        if needs_new_obj:
            # possibly handle dup columns
            new_frame = DataFrame(new_obj, index=obj.index)
            new_frame.columns = obj.columns
            self.obj = new_frame

    def _try_convert_types(self) -> None:
        if self.obj is None:
            return
        if self.convert_dates:
            self._try_convert_dates()

        self._process_converter(
            lambda col, c: self._try_convert_data(col, c, convert_dates=False)
        )

    def _try_convert_dates(self) -> None:
        if self.obj is None:
            return

        # our columns to parse
        convert_dates_list_bool = self.convert_dates
        if isinstance(convert_dates_list_bool, bool):
            convert_dates_list_bool = []
        convert_dates = set(convert_dates_list_bool)

        def is_ok(col) -> bool:
            """
            Return if this col is ok to try for a date parse.
            """
            if not isinstance(col, str):
                return False

            col_lower = col.lower()
            if (
                col_lower.endswith("_at")
                or col_lower.endswith("_time")
                or col_lower == "modified"
                or col_lower == "date"
                or col_lower == "datetime"
                or col_lower.startswith("timestamp")
            ):
                return True
            return False

        self._process_converter(
            lambda col, c: self._try_convert_to_date(c),
            lambda col, c: (
                (self.keep_default_dates and is_ok(col)) or col in convert_dates
            ),
        )
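

# Editor's sketch (not part of pandas): the is_ok() heuristic above in
# action. Columns are auto-parsed as dates only when their names look
# datelike; anything else needs an explicit convert_dates list. The _demo_*
# name is hypothetical.
def _demo_default_date_columns() -> None:
    from io import StringIO

    import pandas as pd

    payload = '{"created_at": {"0": 1609459200000}, "n": {"0": 1609459200000}}'
    df = pd.read_json(StringIO(payload))
    assert str(df["created_at"].dtype).startswith("datetime64")
    assert df["n"].dtype == "int64"  # name is not datelike
    df2 = pd.read_json(StringIO(payload), convert_dates=["n"])
    assert str(df2["n"].dtype).startswith("datetime64")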