from __future__ import annotations

from abc import (
    ABC,
    abstractmethod,
)
from collections import abc
from io import StringIO
from itertools import islice
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Generic,
    Literal,
    TypeVar,
    final,
    overload,
)
import warnings

import numpy as np

from pandas._libs import lib
from pandas._libs.json import (
    ujson_dumps,
    ujson_loads,
)
from pandas._libs.tslibs import iNaT
from pandas.compat._optional import import_optional_dependency
from pandas.errors import AbstractMethodError
from pandas.util._decorators import doc
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import check_dtype_backend

from pandas.core.dtypes.common import (
    ensure_str,
    is_string_dtype,
)
from pandas.core.dtypes.dtypes import PeriodDtype

from pandas import (
    ArrowDtype,
    DataFrame,
    Index,
    MultiIndex,
    Series,
    isna,
    notna,
    to_datetime,
)
from pandas.core.reshape.concat import concat
from pandas.core.shared_docs import _shared_docs

from pandas.io.common import (
    IOHandles,
    dedup_names,
    extension_to_compression,
    file_exists,
    get_handle,
    is_fsspec_url,
    is_potential_multi_index,
    is_url,
    stringify_path,
)
from pandas.io.json._normalize import convert_to_line_delimits
from pandas.io.json._table_schema import (
    build_table_schema,
    parse_table_schema,
)
from pandas.io.parsers.readers import validate_integer

if TYPE_CHECKING:
    from collections.abc import (
        Hashable,
        Mapping,
    )
    from types import TracebackType

    from pandas._typing import (
        CompressionOptions,
        DtypeArg,
        DtypeBackend,
        FilePath,
        IndexLabel,
        JSONEngine,
        JSONSerializable,
        ReadBuffer,
        Self,
        StorageOptions,
        WriteBuffer,
    )

    from pandas.core.generic import NDFrame

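# Literal-valued TypeVar: ties a JsonReader's type parameter ("frame" or
# "series") to the matching return type in the read_json overloads below.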
FrameSeriesStrT = TypeVar("FrameSeriesStrT", bound=Literal["frame", "series"])


# interface to/from
@overload
def to_json(
    path_or_buf: FilePath | WriteBuffer[str] | WriteBuffer[bytes],
    obj: NDFrame,
    orient: str | None = ...,
    date_format: str = ...,
    double_precision: int = ...,
    force_ascii: bool = ...,
    date_unit: str = ...,
    default_handler: Callable[[Any], JSONSerializable] | None = ...,
    lines: bool = ...,
    compression: CompressionOptions = ...,
    index: bool | None = ...,
    indent: int = ...,
    storage_options: StorageOptions = ...,
    mode: Literal["a", "w"] = ...,
) -> None:
    ...


@overload
def to_json(
    path_or_buf: None,
    obj: NDFrame,
    orient: str | None = ...,
    date_format: str = ...,
    double_precision: int = ...,
    force_ascii: bool = ...,
    date_unit: str = ...,
    default_handler: Callable[[Any], JSONSerializable] | None = ...,
    lines: bool = ...,
    compression: CompressionOptions = ...,
    index: bool | None = ...,
    indent: int = ...,
    storage_options: StorageOptions = ...,
    mode: Literal["a", "w"] = ...,
) -> str:
    ...


def to_json(
    path_or_buf: FilePath | WriteBuffer[str] | WriteBuffer[bytes] | None,
    obj: NDFrame,
    orient: str | None = None,
    date_format: str = "epoch",
    double_precision: int = 10,
    force_ascii: bool = True,
    date_unit: str = "ms",
    default_handler: Callable[[Any], JSONSerializable] | None = None,
    lines: bool = False,
    compression: CompressionOptions = "infer",
    index: bool | None = None,
    indent: int = 0,
    storage_options: StorageOptions | None = None,
    mode: Literal["a", "w"] = "w",
) -> str | None:
    if orient in ["records", "values"] and index is True:
        raise ValueError(
            "'index=True' is only valid when 'orient' is 'split', 'table', "
            "'index', or 'columns'."
        )
    elif orient in ["index", "columns"] and index is False:
        raise ValueError(
            "'index=False' is only valid when 'orient' is 'split', 'table', "
            "'records', or 'values'."
        )
    elif index is None:
        # will be ignored for orient='records' and 'values'
        index = True
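    # Summary of the orient/index interaction enforced above:
    #   records, values -> index must be False or None (labels are dropped)
    #   index, columns  -> index must be True or None (labels are required)
    #   split, table    -> either setting is honored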

    if lines and orient != "records":
        raise ValueError("'lines' keyword only valid when 'orient' is records")

    if mode not in ["a", "w"]:
        msg = (
            f"mode={mode} is not a valid option. "
            "Only 'w' and 'a' are currently supported."
        )
        raise ValueError(msg)

    if mode == "a" and (not lines or orient != "records"):
        msg = (
            "mode='a' (append) is only supported when "
            "lines is True and orient is 'records'"
        )
        raise ValueError(msg)

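    # Table Schema cannot describe a bare Series, so promote it to a
    # one-column DataFrame before writing with orient="table".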
    if orient == "table" and isinstance(obj, Series):
        obj = obj.to_frame(name=obj.name or "values")

    writer: type[Writer]
    if orient == "table" and isinstance(obj, DataFrame):
        writer = JSONTableWriter
    elif isinstance(obj, Series):
        writer = SeriesWriter
    elif isinstance(obj, DataFrame):
        writer = FrameWriter
    else:
        raise NotImplementedError("'obj' should be a Series or a DataFrame")

    s = writer(
        obj,
        orient=orient,
        date_format=date_format,
        double_precision=double_precision,
        ensure_ascii=force_ascii,
        date_unit=date_unit,
        default_handler=default_handler,
        index=index,
        indent=indent,
    ).write()

    if lines:
        s = convert_to_line_delimits(s)

    if path_or_buf is not None:
        # apply compression and byte/text conversion
        with get_handle(
            path_or_buf, mode, compression=compression, storage_options=storage_options
        ) as handles:
            handles.handle.write(s)
    else:
        return s
    return None


class Writer(ABC):
    _default_orient: str

    def __init__(
        self,
        obj: NDFrame,
        orient: str | None,
        date_format: str,
        double_precision: int,
        ensure_ascii: bool,
        date_unit: str,
        index: bool,
        default_handler: Callable[[Any], JSONSerializable] | None = None,
        indent: int = 0,
    ) -> None:
        self.obj = obj

        if orient is None:
            orient = self._default_orient

        self.orient = orient
        self.date_format = date_format
        self.double_precision = double_precision
        self.ensure_ascii = ensure_ascii
        self.date_unit = date_unit
        self.default_handler = default_handler
        self.index = index
        self.indent = indent

        self.is_copy = None
        self._format_axes()

    def _format_axes(self) -> None:
        raise AbstractMethodError(self)

    def write(self) -> str:
        iso_dates = self.date_format == "iso"
        return ujson_dumps(
            self.obj_to_write,
            orient=self.orient,
            double_precision=self.double_precision,
            ensure_ascii=self.ensure_ascii,
            date_unit=self.date_unit,
            iso_dates=iso_dates,
            default_handler=self.default_handler,
            indent=self.indent,
        )

    @property
    @abstractmethod
    def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]:
        """Object to write in JSON format."""


class SeriesWriter(Writer):
    _default_orient = "index"

    @property
    def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]:
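        # for orient="split" with index=False, hand ujson a plain dict so the
        # dropped index never reaches the serializer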
        if not self.index and self.orient == "split":
            return {"name": self.obj.name, "data": self.obj.values}
        else:
            return self.obj

    def _format_axes(self) -> None:
        if not self.obj.index.is_unique and self.orient == "index":
            raise ValueError(f"Series index must be unique for orient='{self.orient}'")


class FrameWriter(Writer):
    _default_orient = "columns"

    @property
    def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]:
        if not self.index and self.orient == "split":
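            # round-trip through to_dict(orient="split") so the "index" entry
            # can simply be dropped from the payload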
            obj_to_write = self.obj.to_dict(orient="split")
            del obj_to_write["index"]
        else:
            obj_to_write = self.obj
        return obj_to_write

    def _format_axes(self) -> None:
        """
        Try to format axes if they are datelike.
        """
        if not self.obj.index.is_unique and self.orient in ("index", "columns"):
            raise ValueError(
                f"DataFrame index must be unique for orient='{self.orient}'."
            )
        if not self.obj.columns.is_unique and self.orient in (
            "index",
            "columns",
            "records",
        ):
            raise ValueError(
                f"DataFrame columns must be unique for orient='{self.orient}'."
            )


class JSONTableWriter(FrameWriter):
    _default_orient = "records"

    def __init__(
        self,
        obj,
        orient: str | None,
        date_format: str,
        double_precision: int,
        ensure_ascii: bool,
        date_unit: str,
        index: bool,
        default_handler: Callable[[Any], JSONSerializable] | None = None,
        indent: int = 0,
    ) -> None:
        """
        Adds a `schema` attribute with the Table Schema, resets the index
        (can't do in caller, because the schema inference needs to know what
        the index is), forces orient to records, and forces date_format
        to 'iso'.
        """
        super().__init__(
            obj,
            orient,
            date_format,
            double_precision,
            ensure_ascii,
            date_unit,
            index,
            default_handler=default_handler,
            indent=indent,
        )

        if date_format != "iso":
            msg = (
                "Trying to write with `orient='table'` and "
                f"`date_format='{date_format}'`. Table Schema requires dates "
                "to be formatted with `date_format='iso'`"
            )
            raise ValueError(msg)

        self.schema = build_table_schema(obj, index=self.index)

        # NotImplemented on a column MultiIndex
        if obj.ndim == 2 and isinstance(obj.columns, MultiIndex):
            raise NotImplementedError(
                "orient='table' is not supported for MultiIndex columns"
            )

        # TODO: Do this timedelta properly in objToJSON.c See GH #15137
        if (
            (obj.ndim == 1)
            and (obj.name in set(obj.index.names))
            or len(obj.columns.intersection(obj.index.names))
        ):
            msg = "Overlapping names between the index and columns"
            raise ValueError(msg)

        obj = obj.copy()
        timedeltas = obj.select_dtypes(include=["timedelta"]).columns
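        # the C serializer has no proper timedelta support here (see the TODO
        # above), so emit ISO-8601 duration strings instead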
        if len(timedeltas):
            obj[timedeltas] = obj[timedeltas].map(lambda x: x.isoformat())
        # Convert PeriodIndex to datetimes before serializing
        if isinstance(obj.index.dtype, PeriodDtype):
            obj.index = obj.index.to_timestamp()

        # exclude index from obj if index=False
        if not self.index:
            self.obj = obj.reset_index(drop=True)
        else:
            self.obj = obj.reset_index(drop=False)
        self.date_format = "iso"
        self.orient = "records"
        self.index = index

    @property
    def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]:
        return {"schema": self.schema, "data": self.obj}


@overload
def read_json(
    path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
    *,
    orient: str | None = ...,
    typ: Literal["frame"] = ...,
    dtype: DtypeArg | None = ...,
    convert_axes: bool | None = ...,
    convert_dates: bool | list[str] = ...,
    keep_default_dates: bool = ...,
    precise_float: bool = ...,
    date_unit: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    lines: bool = ...,
    chunksize: int,
    compression: CompressionOptions = ...,
    nrows: int | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
    engine: JSONEngine = ...,
) -> JsonReader[Literal["frame"]]:
    ...


@overload
def read_json(
    path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
    *,
    orient: str | None = ...,
    typ: Literal["series"],
    dtype: DtypeArg | None = ...,
    convert_axes: bool | None = ...,
    convert_dates: bool | list[str] = ...,
    keep_default_dates: bool = ...,
    precise_float: bool = ...,
    date_unit: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    lines: bool = ...,
    chunksize: int,
    compression: CompressionOptions = ...,
    nrows: int | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
    engine: JSONEngine = ...,
) -> JsonReader[Literal["series"]]:
    ...


@overload
def read_json(
    path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
    *,
    orient: str | None = ...,
    typ: Literal["series"],
    dtype: DtypeArg | None = ...,
    convert_axes: bool | None = ...,
    convert_dates: bool | list[str] = ...,
    keep_default_dates: bool = ...,
    precise_float: bool = ...,
    date_unit: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    lines: bool = ...,
    chunksize: None = ...,
    compression: CompressionOptions = ...,
    nrows: int | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
    engine: JSONEngine = ...,
) -> Series:
    ...


@overload
def read_json(
    path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
    *,
    orient: str | None = ...,
    typ: Literal["frame"] = ...,
    dtype: DtypeArg | None = ...,
    convert_axes: bool | None = ...,
    convert_dates: bool | list[str] = ...,
    keep_default_dates: bool = ...,
    precise_float: bool = ...,
    date_unit: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    lines: bool = ...,
    chunksize: None = ...,
    compression: CompressionOptions = ...,
    nrows: int | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
    engine: JSONEngine = ...,
) -> DataFrame:
    ...


@doc(
    storage_options=_shared_docs["storage_options"],
    decompression_options=_shared_docs["decompression_options"] % "path_or_buf",
)
def read_json(
    path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
    *,
    orient: str | None = None,
    typ: Literal["frame", "series"] = "frame",
    dtype: DtypeArg | None = None,
    convert_axes: bool | None = None,
    convert_dates: bool | list[str] = True,
    keep_default_dates: bool = True,
    precise_float: bool = False,
    date_unit: str | None = None,
    encoding: str | None = None,
    encoding_errors: str | None = "strict",
    lines: bool = False,
    chunksize: int | None = None,
    compression: CompressionOptions = "infer",
    nrows: int | None = None,
    storage_options: StorageOptions | None = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
    engine: JSONEngine = "ujson",
) -> DataFrame | Series | JsonReader:
    """
    Convert a JSON string to a pandas object.

    Parameters
    ----------
    path_or_buf : a valid JSON str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.json``.

        If you want to pass in a path object, pandas accepts any
        ``os.PathLike``.

        By file-like object, we refer to objects with a ``read()`` method,
        such as a file handle (e.g. via builtin ``open`` function)
        or ``StringIO``.

        .. deprecated:: 2.1.0
            Passing json literal strings is deprecated.

    orient : str, optional
        Indication of expected JSON string format.
        Compatible JSON strings can be produced by ``to_json()`` with a
        corresponding orient value.
        The set of possible orients is:

        - ``'split'`` : dict like
          ``{{index -> [index], columns -> [columns], data -> [values]}}``
        - ``'records'`` : list like
          ``[{{column -> value}}, ... , {{column -> value}}]``
        - ``'index'`` : dict like ``{{index -> {{column -> value}}}}``
        - ``'columns'`` : dict like ``{{column -> {{index -> value}}}}``
        - ``'values'`` : just the values array
        - ``'table'`` : dict like ``{{'schema': {{schema}}, 'data': {{data}}}}``

        The allowed and default values depend on the value
        of the `typ` parameter.

        * when ``typ == 'series'``,

          - allowed orients are ``{{'split','records','index'}}``
          - default is ``'index'``
          - The Series index must be unique for orient ``'index'``.

        * when ``typ == 'frame'``,

          - allowed orients are ``{{'split','records','index',
            'columns','values', 'table'}}``
          - default is ``'columns'``
          - The DataFrame index must be unique for orients ``'index'`` and
            ``'columns'``.
          - The DataFrame columns must be unique for orients ``'index'``,
            ``'columns'``, and ``'records'``.

    typ : {{'frame', 'series'}}, default 'frame'
        The type of object to recover.

    dtype : bool or dict, default None
        If True, infer dtypes; if a dict of column to dtype, then use those;
        if False, then don't infer dtypes at all. Applies only to the data.

        For all ``orient`` values except ``'table'``, default is True.

    convert_axes : bool, default None
        Try to convert the axes to the proper dtypes.

        For all ``orient`` values except ``'table'``, default is True.

    convert_dates : bool or list of str, default True
        If True then default datelike columns may be converted (depending on
        keep_default_dates).
        If False, no dates will be converted.
        If a list of column names, then those columns will be converted and
        default datelike columns may also be converted (depending on
        keep_default_dates).

    keep_default_dates : bool, default True
        If parsing dates (convert_dates is not False), then try to parse the
        default datelike columns.
        A column label is datelike if

        * it ends with ``'_at'``,

        * it ends with ``'_time'``,

        * it begins with ``'timestamp'``,

        * it is ``'modified'``,

        * it is ``'date'``, or

        * it is ``'datetime'``.

    precise_float : bool, default False
        Set to enable usage of higher precision (strtod) function when
        decoding string to double values. Default (False) is to use fast but
        less precise builtin functionality.

    date_unit : str, default None
        The timestamp unit to detect if converting dates. The default behaviour
        is to try and detect the correct precision, but if this is not desired
        then pass one of 's', 'ms', 'us' or 'ns' to force parsing only seconds,
        milliseconds, microseconds or nanoseconds respectively.

    encoding : str, default is 'utf-8'
        The encoding to use to decode bytes.

    encoding_errors : str, optional, default "strict"
        How encoding errors are treated. `List of possible values
        <https://docs.python.org/3/library/codecs.html#error-handlers>`_ .

        .. versionadded:: 1.3.0

    lines : bool, default False
        Read the file as a json object per line.

    chunksize : int, optional
        Return JsonReader object for iteration.
        See the `line-delimited json docs
        <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#line-delimited-json>`_
        for more information on ``chunksize``.
        This can only be passed if `lines=True`.
        If this is None, the file will be read into memory all at once.
    {decompression_options}

        .. versionchanged:: 1.4.0 Zstandard support.

    nrows : int, optional
        The number of lines to read from the line-delimited JSON file.
        This can only be passed if `lines=True`.
        If this is None, all the rows will be returned.

    {storage_options}

    dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable'
        Back-end data type applied to the resultant :class:`DataFrame`
        (still experimental). Behaviour is as follows:

        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
          (default).
        * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
          DataFrame.

        .. versionadded:: 2.0

    engine : {{"ujson", "pyarrow"}}, default "ujson"
        Parser engine to use. The ``"pyarrow"`` engine is only available when
        ``lines=True``.

        .. versionadded:: 2.0

    Returns
    -------
    Series, DataFrame, or pandas.api.typing.JsonReader
        A JsonReader is returned when ``chunksize`` is not ``0`` or ``None``.
        Otherwise, the type returned depends on the value of ``typ``.

    See Also
    --------
    DataFrame.to_json : Convert a DataFrame to a JSON string.
    Series.to_json : Convert a Series to a JSON string.
    json_normalize : Normalize semi-structured JSON data into a flat table.

    Notes
    -----
    Specific to ``orient='table'``, if a :class:`DataFrame` with a literal
    :class:`Index` name of `index` gets written with :func:`to_json`, the
    subsequent read operation will incorrectly set the :class:`Index` name to
    ``None``. This is because `index` is also used by :func:`DataFrame.to_json`
    to denote a missing :class:`Index` name, and the subsequent
    :func:`read_json` operation cannot distinguish between the two. The same
    limitation is encountered with a :class:`MultiIndex` and any names
    beginning with ``'level_'``.

    Examples
    --------
    >>> from io import StringIO
    >>> df = pd.DataFrame([['a', 'b'], ['c', 'd']],
    ...                   index=['row 1', 'row 2'],
    ...                   columns=['col 1', 'col 2'])

    Encoding/decoding a DataFrame using ``'split'`` formatted JSON:

    >>> df.to_json(orient='split')
    '\
{{\
"columns":["col 1","col 2"],\
"index":["row 1","row 2"],\
"data":[["a","b"],["c","d"]]\
}}\
'
    >>> pd.read_json(StringIO(_), orient='split')
          col 1 col 2
    row 1     a     b
    row 2     c     d

    Encoding/decoding a DataFrame using ``'index'`` formatted JSON:

    >>> df.to_json(orient='index')
    '{{"row 1":{{"col 1":"a","col 2":"b"}},"row 2":{{"col 1":"c","col 2":"d"}}}}'

    >>> pd.read_json(StringIO(_), orient='index')
          col 1 col 2
    row 1     a     b
    row 2     c     d

    Encoding/decoding a DataFrame using ``'records'`` formatted JSON.
    Note that index labels are not preserved with this encoding.

    >>> df.to_json(orient='records')
    '[{{"col 1":"a","col 2":"b"}},{{"col 1":"c","col 2":"d"}}]'
    >>> pd.read_json(StringIO(_), orient='records')
      col 1 col 2
    0     a     b
    1     c     d

    Encoding with Table Schema:

    >>> df.to_json(orient='table')
    '\
{{"schema":{{"fields":[\
{{"name":"index","type":"string"}},\
{{"name":"col 1","type":"string"}},\
{{"name":"col 2","type":"string"}}],\
"primaryKey":["index"],\
"pandas_version":"1.4.0"}},\
"data":[\
{{"index":"row 1","col 1":"a","col 2":"b"}},\
{{"index":"row 2","col 1":"c","col 2":"d"}}]\
}}\
'

    The following example uses ``dtype_backend="numpy_nullable"``

    >>> data = '''{{"index": {{"0": 0, "1": 1}},
    ...        "a": {{"0": 1, "1": null}},
    ...        "b": {{"0": 2.5, "1": 4.5}},
    ...        "c": {{"0": true, "1": false}},
    ...        "d": {{"0": "a", "1": "b"}},
    ...        "e": {{"0": 1577.2, "1": 1577.1}}}}'''
    >>> pd.read_json(StringIO(data), dtype_backend="numpy_nullable")
       index     a    b      c  d       e
    0      0     1  2.5   True  a  1577.2
    1      1  <NA>  4.5  False  b  1577.1
    """
    if orient == "table" and dtype:
        raise ValueError("cannot pass both dtype and orient='table'")
    if orient == "table" and convert_axes:
        raise ValueError("cannot pass both convert_axes and orient='table'")

    check_dtype_backend(dtype_backend)

    if dtype is None and orient != "table":
        # error: Incompatible types in assignment (expression has type "bool", variable
        # has type "Union[ExtensionDtype, str, dtype[Any], Type[str], Type[float],
        # Type[int], Type[complex], Type[bool], Type[object], Dict[Hashable,
        # Union[ExtensionDtype, Union[str, dtype[Any]], Type[str], Type[float],
        # Type[int], Type[complex], Type[bool], Type[object]]], None]")
        dtype = True  # type: ignore[assignment]
    if convert_axes is None and orient != "table":
        convert_axes = True

    json_reader = JsonReader(
        path_or_buf,
        orient=orient,
        typ=typ,
        dtype=dtype,
        convert_axes=convert_axes,
        convert_dates=convert_dates,
        keep_default_dates=keep_default_dates,
        precise_float=precise_float,
        date_unit=date_unit,
        encoding=encoding,
        lines=lines,
        chunksize=chunksize,
        compression=compression,
        nrows=nrows,
        storage_options=storage_options,
        encoding_errors=encoding_errors,
        dtype_backend=dtype_backend,
        engine=engine,
    )

    if chunksize:
        return json_reader
    else:
        return json_reader.read()


class JsonReader(abc.Iterator, Generic[FrameSeriesStrT]):
    """
    JsonReader provides an interface for reading in a JSON file.

    If initialized with ``lines=True`` and ``chunksize``, can be iterated over
    ``chunksize`` lines at a time. Otherwise, calling ``read`` reads in the
    whole document.
    """
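    # Hedged usage sketch (assumes ``data`` holds newline-delimited JSON):
    #
    #     with read_json(StringIO(data), lines=True, chunksize=100) as reader:
    #         for chunk in reader:
    #             ...  # each chunk is a DataFrame of up to ``chunksize`` rows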

    def __init__(
        self,
        filepath_or_buffer,
        orient,
        typ: FrameSeriesStrT,
        dtype,
        convert_axes: bool | None,
        convert_dates,
        keep_default_dates: bool,
        precise_float: bool,
        date_unit,
        encoding,
        lines: bool,
        chunksize: int | None,
        compression: CompressionOptions,
        nrows: int | None,
        storage_options: StorageOptions | None = None,
        encoding_errors: str | None = "strict",
        dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
        engine: JSONEngine = "ujson",
    ) -> None:
        self.orient = orient
        self.typ = typ
        self.dtype = dtype
        self.convert_axes = convert_axes
        self.convert_dates = convert_dates
        self.keep_default_dates = keep_default_dates
        self.precise_float = precise_float
        self.date_unit = date_unit
        self.encoding = encoding
        self.engine = engine
        self.compression = compression
        self.storage_options = storage_options
        self.lines = lines
        self.chunksize = chunksize
        self.nrows_seen = 0
        self.nrows = nrows
        self.encoding_errors = encoding_errors
        self.handles: IOHandles[str] | None = None
        self.dtype_backend = dtype_backend

        if self.engine not in {"pyarrow", "ujson"}:
            raise ValueError(
                f"The engine type {self.engine} is currently not supported."
            )
        if self.chunksize is not None:
            self.chunksize = validate_integer("chunksize", self.chunksize, 1)
            if not self.lines:
                raise ValueError("chunksize can only be passed if lines=True")
            if self.engine == "pyarrow":
                raise ValueError(
                    "currently pyarrow engine doesn't support chunksize parameter"
                )
        if self.nrows is not None:
            self.nrows = validate_integer("nrows", self.nrows, 0)
            if not self.lines:
                raise ValueError("nrows can only be passed if lines=True")
        if (
            isinstance(filepath_or_buffer, str)
            and not self.lines
            and "\n" in filepath_or_buffer
        ):
            warnings.warn(
                "Passing literal json to 'read_json' is deprecated and "
                "will be removed in a future version. To read from a "
                "literal string, wrap it in a 'StringIO' object.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
        if self.engine == "pyarrow":
            if not self.lines:
                raise ValueError(
                    "currently pyarrow engine only supports "
                    "the line-delimited JSON format"
                )
            self.data = filepath_or_buffer
        elif self.engine == "ujson":
            data = self._get_data_from_filepath(filepath_or_buffer)
            self.data = self._preprocess_data(data)

    def _preprocess_data(self, data):
        """
        At this point, the data either has a `read` attribute (e.g. a file
        object or a StringIO) or is a string that is a JSON document.

        If self.chunksize, we prepare the data for the `__next__` method.
        Otherwise, we read it into memory for the `read` method.
        """
        if hasattr(data, "read") and not (self.chunksize or self.nrows):
            with self:
                data = data.read()
        if not hasattr(data, "read") and (self.chunksize or self.nrows):
            data = StringIO(data)

        return data

    def _get_data_from_filepath(self, filepath_or_buffer):
        """
        The function read_json accepts three input types:
            1. filepath (string-like)
            2. file-like object (e.g. open file object, StringIO)
            3. JSON string

        This method turns (1) into (2) to simplify the rest of the processing.
        It returns input types (2) and (3) unchanged.

        It raises FileNotFoundError if the input is a string ending in
        one of .json, .json.gz, .json.bz2, etc. but no such file exists.
        """
        # if it is a string but the file does not exist, it might be a JSON string
        filepath_or_buffer = stringify_path(filepath_or_buffer)
        if (
            not isinstance(filepath_or_buffer, str)
            or is_url(filepath_or_buffer)
            or is_fsspec_url(filepath_or_buffer)
            or file_exists(filepath_or_buffer)
        ):
            self.handles = get_handle(
                filepath_or_buffer,
                "r",
                encoding=self.encoding,
                compression=self.compression,
                storage_options=self.storage_options,
                errors=self.encoding_errors,
            )
            filepath_or_buffer = self.handles.handle
        elif (
            isinstance(filepath_or_buffer, str)
            and filepath_or_buffer.lower().endswith(
                (".json",) + tuple(f".json{c}" for c in extension_to_compression)
            )
            and not file_exists(filepath_or_buffer)
        ):
            raise FileNotFoundError(f"File {filepath_or_buffer} does not exist")
        else:
            warnings.warn(
                "Passing literal json to 'read_json' is deprecated and "
                "will be removed in a future version. To read from a "
                "literal string, wrap it in a 'StringIO' object.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
        return filepath_or_buffer

    def _combine_lines(self, lines) -> str:
        """
        Combines a list of JSON objects into one JSON array.
        """
        return (
            f'[{",".join([line for line in (line.strip() for line in lines) if line])}]'
        )

    @overload
    def read(self: JsonReader[Literal["frame"]]) -> DataFrame:
        ...

    @overload
    def read(self: JsonReader[Literal["series"]]) -> Series:
        ...

    @overload
    def read(self: JsonReader[Literal["frame", "series"]]) -> DataFrame | Series:
        ...

    def read(self) -> DataFrame | Series:
        """
        Read the whole JSON input into a pandas object.
        """
        obj: DataFrame | Series
        with self:
            if self.engine == "pyarrow":
                pyarrow_json = import_optional_dependency("pyarrow.json")
                pa_table = pyarrow_json.read_json(self.data)

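                # choose how pyarrow types map onto pandas dtypes: ArrowDtype
                # for the pyarrow backend, the nullable-dtype mapping for
                # numpy_nullable, or None for plain NumPy-backed dtypes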
                mapping: type[ArrowDtype] | None | Callable
                if self.dtype_backend == "pyarrow":
                    mapping = ArrowDtype
                elif self.dtype_backend == "numpy_nullable":
                    from pandas.io._util import _arrow_dtype_mapping

                    mapping = _arrow_dtype_mapping().get
                else:
                    mapping = None

                return pa_table.to_pandas(types_mapper=mapping)
            elif self.engine == "ujson":
                if self.lines:
                    if self.chunksize:
                        obj = concat(self)
                    elif self.nrows:
                        lines = list(islice(self.data, self.nrows))
                        lines_json = self._combine_lines(lines)
                        obj = self._get_object_parser(lines_json)
                    else:
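                        # no chunksize/nrows: stitch every line back into a
                        # single JSON array and parse it in one shot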
                        data = ensure_str(self.data)
                        data_lines = data.split("\n")
                        obj = self._get_object_parser(self._combine_lines(data_lines))
                else:
                    obj = self._get_object_parser(self.data)
            if self.dtype_backend is not lib.no_default:
                return obj.convert_dtypes(
                    infer_objects=False, dtype_backend=self.dtype_backend
                )
            else:
                return obj

    def _get_object_parser(self, json) -> DataFrame | Series:
        """
        Parses a json document into a pandas object.
        """
        typ = self.typ
        dtype = self.dtype
        kwargs = {
            "orient": self.orient,
            "dtype": self.dtype,
            "convert_axes": self.convert_axes,
            "convert_dates": self.convert_dates,
            "keep_default_dates": self.keep_default_dates,
            "precise_float": self.precise_float,
            "date_unit": self.date_unit,
            "dtype_backend": self.dtype_backend,
        }
        obj = None
        if typ == "frame":
            obj = FrameParser(json, **kwargs).parse()

        if typ == "series" or obj is None:
            if not isinstance(dtype, bool):
                kwargs["dtype"] = dtype
            obj = SeriesParser(json, **kwargs).parse()

        return obj

    def close(self) -> None:
        """
        If we opened a stream earlier, in _get_data_from_filepath, we should
        close it.

        If an open stream or file was passed, we leave it open.
        """
        if self.handles is not None:
            self.handles.close()

    def __iter__(self) -> Self:
        return self

    @overload
    def __next__(self: JsonReader[Literal["frame"]]) -> DataFrame:
        ...

    @overload
    def __next__(self: JsonReader[Literal["series"]]) -> Series:
        ...

    @overload
    def __next__(self: JsonReader[Literal["frame", "series"]]) -> DataFrame | Series:
        ...

    def __next__(self) -> DataFrame | Series:
        if self.nrows and self.nrows_seen >= self.nrows:
            self.close()
            raise StopIteration

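        # pull up to ``chunksize`` raw lines; an empty slice means the
        # underlying buffer is exhausted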
        lines = list(islice(self.data, self.chunksize))
        if not lines:
            self.close()
            raise StopIteration

        try:
            lines_json = self._combine_lines(lines)
            obj = self._get_object_parser(lines_json)

            # Make sure that the returned objects have the right index.
            obj.index = range(self.nrows_seen, self.nrows_seen + len(obj))
            self.nrows_seen += len(obj)
        except Exception as ex:
            self.close()
            raise ex

        if self.dtype_backend is not lib.no_default:
            return obj.convert_dtypes(
                infer_objects=False, dtype_backend=self.dtype_backend
            )
        else:
            return obj

    def __enter__(self) -> Self:
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        traceback: TracebackType | None,
    ) -> None:
        self.close()


class Parser:
    _split_keys: tuple[str, ...]
    _default_orient: str

    _STAMP_UNITS = ("s", "ms", "us", "ns")
    _MIN_STAMPS = {
        "s": 31536000,
        "ms": 31536000000,
        "us": 31536000000000,
        "ns": 31536000000000000,
    }
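    # each cutoff above is one 365-day year expressed in the given unit;
    # numbers at or below it are treated as out of range for epoch dates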
    json: str

    def __init__(
        self,
        json: str,
        orient,
        dtype: DtypeArg | None = None,
        convert_axes: bool = True,
        convert_dates: bool | list[str] = True,
        keep_default_dates: bool = False,
        precise_float: bool = False,
        date_unit=None,
        dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
    ) -> None:
        self.json = json

        if orient is None:
            orient = self._default_orient

        self.orient = orient

        self.dtype = dtype

        if date_unit is not None:
            date_unit = date_unit.lower()
            if date_unit not in self._STAMP_UNITS:
                raise ValueError(f"date_unit must be one of {self._STAMP_UNITS}")
            self.min_stamp = self._MIN_STAMPS[date_unit]
        else:
            self.min_stamp = self._MIN_STAMPS["s"]

        self.precise_float = precise_float
        self.convert_axes = convert_axes
        self.convert_dates = convert_dates
        self.date_unit = date_unit
        self.keep_default_dates = keep_default_dates
        self.obj: DataFrame | Series | None = None
        self.dtype_backend = dtype_backend

    @final
    def check_keys_split(self, decoded: dict) -> None:
        """
        Checks that dict has only the appropriate keys for orient='split'.
        """
        bad_keys = set(decoded.keys()).difference(set(self._split_keys))
        if bad_keys:
            bad_keys_joined = ", ".join(bad_keys)
            raise ValueError(f"JSON data had unexpected key(s): {bad_keys_joined}")

    @final
    def parse(self):
        self._parse()

        if self.obj is None:
            return None
        if self.convert_axes:
            self._convert_axes()
        self._try_convert_types()
        return self.obj

    def _parse(self) -> None:
        raise AbstractMethodError(self)

    @final
    def _convert_axes(self) -> None:
        """
        Try to convert axes.
        """
        obj = self.obj
        assert obj is not None  # for mypy
        for axis_name in obj._AXIS_ORDERS:
            ax = obj._get_axis(axis_name)
            ser = Series(ax, dtype=ax.dtype, copy=False)
            new_ser, result = self._try_convert_data(
                name=axis_name,
                data=ser,
                use_dtypes=False,
                convert_dates=True,
                is_axis=True,
            )
            if result:
                new_axis = Index(new_ser, dtype=new_ser.dtype, copy=False)
                setattr(self.obj, axis_name, new_axis)

    def _try_convert_types(self) -> None:
        raise AbstractMethodError(self)

    @final
    def _try_convert_data(
        self,
        name: Hashable,
        data: Series,
        use_dtypes: bool = True,
        convert_dates: bool | list[str] = True,
        is_axis: bool = False,
    ) -> tuple[Series, bool]:
        """
        Try to parse a Series into a column by inferring dtype.
        """
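        # coercion order: a forced dtype wins, then date parsing, then the
        # numeric fallbacks below (float64, int64), each failing silently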
        # don't try to coerce, unless a force conversion
        if use_dtypes:
            if not self.dtype:
                if all(notna(data)):
                    return data, False

                with warnings.catch_warnings():
                    warnings.filterwarnings(
                        "ignore",
                        "Downcasting object dtype arrays",
                        category=FutureWarning,
                    )
                    filled = data.fillna(np.nan)

                return filled, True

            elif self.dtype is True:
                pass
            else:
                # dtype to force
                dtype = (
                    self.dtype.get(name) if isinstance(self.dtype, dict) else self.dtype
                )
                if dtype is not None:
                    try:
                        return data.astype(dtype), True
                    except (TypeError, ValueError):
                        return data, False

        if convert_dates:
            new_data, result = self._try_convert_to_date(data)
            if result:
                return new_data, True

        converted = False
        if self.dtype_backend is not lib.no_default and not is_axis:
            # Fall through for conversion later on
            return data, True
        elif is_string_dtype(data.dtype):
            # try float
            try:
                data = data.astype("float64")
                converted = True
            except (TypeError, ValueError):
                pass

        if data.dtype.kind == "f" and data.dtype != "float64":
            # coerce floats to 64
            try:
                data = data.astype("float64")
                converted = True
            except (TypeError, ValueError):
                pass

        # don't coerce 0-len data
        if len(data) and data.dtype in ("float", "object"):
            # coerce ints if we can
            try:
                new_data = data.astype("int64")
                if (new_data == data).all():
                    data = new_data
                    converted = True
            except (TypeError, ValueError, OverflowError):
                pass

        if data.dtype == "int" and data.dtype != "int64":
            # coerce ints to 64
            try:
                data = data.astype("int64")
                converted = True
            except (TypeError, ValueError):
                pass

        # if we have an index, we want to preserve dtypes
        if name == "index" and len(data):
            if self.orient == "split":
                return data, False

        return data, converted

    @final
    def _try_convert_to_date(self, data: Series) -> tuple[Series, bool]:
        """
        Try to parse an ndarray-like into a date column.

        Try to coerce objects in epoch/iso formats and integers/floats in
        epoch formats. Return a boolean indicating whether parsing was
        successful.
        """
        # no conversion on empty
        if not len(data):
            return data, False

        new_data = data

        if new_data.dtype == "string":
            new_data = new_data.astype(object)

        if new_data.dtype == "object":
            try:
                new_data = data.astype("int64")
            except OverflowError:
                return data, False
            except (TypeError, ValueError):
                pass

        # ignore numbers that are out of range
        if issubclass(new_data.dtype.type, np.number):
            in_range = (
                isna(new_data._values)
                | (new_data > self.min_stamp)
                | (new_data._values == iNaT)
            )
            if not in_range.all():
                return data, False

        date_units = (self.date_unit,) if self.date_unit else self._STAMP_UNITS
        for date_unit in date_units:
            try:
                with warnings.catch_warnings():
                    warnings.filterwarnings(
                        "ignore",
                        ".*parsing datetimes with mixed time "
                        "zones will raise an error",
                        category=FutureWarning,
                    )
                    new_data = to_datetime(new_data, errors="raise", unit=date_unit)
            except (ValueError, OverflowError, TypeError):
                continue
            return new_data, True
        return data, False


class SeriesParser(Parser):
    _default_orient = "index"
    _split_keys = ("name", "index", "data")
    obj: Series | None

    def _parse(self) -> None:
        data = ujson_loads(self.json, precise_float=self.precise_float)

        if self.orient == "split":
            decoded = {str(k): v for k, v in data.items()}
            self.check_keys_split(decoded)
            self.obj = Series(**decoded)
        else:
            self.obj = Series(data)

    def _try_convert_types(self) -> None:
        if self.obj is None:
            return
        obj, result = self._try_convert_data(
            "data", self.obj, convert_dates=self.convert_dates
        )
        if result:
            self.obj = obj


class FrameParser(Parser):
    _default_orient = "columns"
    _split_keys = ("columns", "index", "data")
    obj: DataFrame | None

    def _parse(self) -> None:
        json = self.json
        orient = self.orient

        if orient == "columns":
            self.obj = DataFrame(
                ujson_loads(json, precise_float=self.precise_float), dtype=None
            )
        elif orient == "split":
            decoded = {
                str(k): v
                for k, v in ujson_loads(json, precise_float=self.precise_float).items()
            }
            self.check_keys_split(decoded)
            orig_names = [
                (tuple(col) if isinstance(col, list) else col)
                for col in decoded["columns"]
            ]
            decoded["columns"] = dedup_names(
                orig_names,
                is_potential_multi_index(orig_names, None),
            )
            self.obj = DataFrame(dtype=None, **decoded)
        elif orient == "index":
            self.obj = DataFrame.from_dict(
                ujson_loads(json, precise_float=self.precise_float),
                dtype=None,
                orient="index",
            )
        elif orient == "table":
            self.obj = parse_table_schema(json, precise_float=self.precise_float)
        else:
            self.obj = DataFrame(
                ujson_loads(json, precise_float=self.precise_float), dtype=None
            )

    def _process_converter(
        self,
        f: Callable[[Hashable, Series], tuple[Series, bool]],
        filt: Callable[[Hashable], bool] | None = None,
    ) -> None:
        """
        Take a conversion function and possibly recreate the frame.
        """
        if filt is None:
            filt = lambda col: True

        obj = self.obj
        assert obj is not None  # for mypy

        needs_new_obj = False
        new_obj = {}
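        # key the rebuilt columns by position rather than label so duplicate
        # column names survive the DataFrame reconstruction below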
        for i, (col, c) in enumerate(obj.items()):
            if filt(col):
                new_data, result = f(col, c)
                if result:
                    c = new_data
                    needs_new_obj = True
            new_obj[i] = c

        if needs_new_obj:
            # possibly handle dup columns
            new_frame = DataFrame(new_obj, index=obj.index)
            new_frame.columns = obj.columns
            self.obj = new_frame

    def _try_convert_types(self) -> None:
        if self.obj is None:
            return
        if self.convert_dates:
            self._try_convert_dates()

        self._process_converter(
            lambda col, c: self._try_convert_data(col, c, convert_dates=False)
        )

    def _try_convert_dates(self) -> None:
        if self.obj is None:
            return

        # our columns to parse
        convert_dates_list_bool = self.convert_dates
        if isinstance(convert_dates_list_bool, bool):
            convert_dates_list_bool = []
        convert_dates = set(convert_dates_list_bool)

        def is_ok(col) -> bool:
            """
            Return if this col is ok to try for a date parse.
            """
            if col in convert_dates:
                return True
            if not self.keep_default_dates:
                return False
            if not isinstance(col, str):
                return False

            col_lower = col.lower()
            if (
                col_lower.endswith(("_at", "_time"))
                or col_lower == "modified"
                or col_lower == "date"
                or col_lower == "datetime"
                or col_lower.startswith("timestamp")
            ):
                return True
            return False

        self._process_converter(lambda col, c: self._try_convert_to_date(c), filt=is_ok)