from __future__ import annotations

from abc import (
    ABC,
    abstractmethod,
)
from collections import abc
from io import StringIO
from itertools import islice
from types import TracebackType
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Generic,
    Literal,
    Mapping,
    TypeVar,
    overload,
)

import numpy as np

from pandas._libs import lib
from pandas._libs.json import (
    dumps,
    loads,
)
from pandas._libs.tslibs import iNaT
from pandas._typing import (
    CompressionOptions,
    DtypeArg,
    DtypeBackend,
    FilePath,
    IndexLabel,
    JSONEngine,
    JSONSerializable,
    ReadBuffer,
    StorageOptions,
    WriteBuffer,
)
from pandas.compat._optional import import_optional_dependency
from pandas.errors import AbstractMethodError
from pandas.util._decorators import doc
from pandas.util._validators import check_dtype_backend

from pandas.core.dtypes.common import (
    ensure_str,
    is_period_dtype,
)
from pandas.core.dtypes.generic import ABCIndex

from pandas import (
    ArrowDtype,
    DataFrame,
    MultiIndex,
    Series,
    isna,
    notna,
    to_datetime,
)
from pandas.core.reshape.concat import concat
from pandas.core.shared_docs import _shared_docs

from pandas.io.common import (
    IOHandles,
    dedup_names,
    extension_to_compression,
    file_exists,
    get_handle,
    is_fsspec_url,
    is_potential_multi_index,
    is_url,
    stringify_path,
)
from pandas.io.json._normalize import convert_to_line_delimits
from pandas.io.json._table_schema import (
    build_table_schema,
    parse_table_schema,
)
from pandas.io.parsers.readers import validate_integer

if TYPE_CHECKING:
    from pandas.core.generic import NDFrame

FrameSeriesStrT = TypeVar("FrameSeriesStrT", bound=Literal["frame", "series"])


# interface to/from
@overload
def to_json(
    path_or_buf: FilePath | WriteBuffer[str] | WriteBuffer[bytes],
    obj: NDFrame,
    orient: str | None = ...,
    date_format: str = ...,
    double_precision: int = ...,
    force_ascii: bool = ...,
    date_unit: str = ...,
    default_handler: Callable[[Any], JSONSerializable] | None = ...,
    lines: bool = ...,
    compression: CompressionOptions = ...,
    index: bool = ...,
    indent: int = ...,
    storage_options: StorageOptions = ...,
    mode: Literal["a", "w"] = ...,
) -> None:
    ...


@overload
def to_json(
    path_or_buf: None,
    obj: NDFrame,
    orient: str | None = ...,
    date_format: str = ...,
    double_precision: int = ...,
    force_ascii: bool = ...,
    date_unit: str = ...,
    default_handler: Callable[[Any], JSONSerializable] | None = ...,
    lines: bool = ...,
    compression: CompressionOptions = ...,
    index: bool = ...,
    indent: int = ...,
    storage_options: StorageOptions = ...,
    mode: Literal["a", "w"] = ...,
) -> str:
    ...


def to_json(
    path_or_buf: FilePath | WriteBuffer[str] | WriteBuffer[bytes] | None,
    obj: NDFrame,
    orient: str | None = None,
    date_format: str = "epoch",
    double_precision: int = 10,
    force_ascii: bool = True,
    date_unit: str = "ms",
    default_handler: Callable[[Any], JSONSerializable] | None = None,
    lines: bool = False,
    compression: CompressionOptions = "infer",
    index: bool = True,
    indent: int = 0,
    storage_options: StorageOptions = None,
    mode: Literal["a", "w"] = "w",
) -> str | None:
    if not index and orient not in ["split", "table"]:
        raise ValueError(
            "'index=False' is only valid when 'orient' is 'split' or 'table'"
        )

    if lines and orient != "records":
        raise ValueError("'lines' keyword is only valid when 'orient' is 'records'")

    if mode not in ["a", "w"]:
        msg = (
            f"mode={mode} is not a valid option. "
            "Only 'w' and 'a' are currently supported."
        )
        raise ValueError(msg)

    if mode == "a" and (not lines or orient != "records"):
        msg = (
            "mode='a' (append) is only supported when "
            "lines is True and orient is 'records'"
        )
        raise ValueError(msg)

    if orient == "table" and isinstance(obj, Series):
        obj = obj.to_frame(name=obj.name or "values")

    writer: type[Writer]
    if orient == "table" and isinstance(obj, DataFrame):
        writer = JSONTableWriter
    elif isinstance(obj, Series):
        writer = SeriesWriter
    elif isinstance(obj, DataFrame):
        writer = FrameWriter
    else:
        raise NotImplementedError("'obj' should be a Series or a DataFrame")

    s = writer(
        obj,
        orient=orient,
        date_format=date_format,
        double_precision=double_precision,
        ensure_ascii=force_ascii,
        date_unit=date_unit,
        default_handler=default_handler,
        index=index,
        indent=indent,
    ).write()

    if lines:
        s = convert_to_line_delimits(s)

    if path_or_buf is not None:
        # apply compression and byte/text conversion
        with get_handle(
            path_or_buf, mode, compression=compression, storage_options=storage_options
        ) as handles:
            handles.handle.write(s)
    else:
        return s
    return None
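

# A sketch of how this helper is reached through the public API (illustrative
# only; the frame below is made up):
#
#     >>> df = DataFrame({"a": [1, 2]})
#     >>> df.to_json(orient="records", lines=True)
#     '{"a":1}\n{"a":2}'
#
# DataFrame.to_json forwards its arguments here; the writer class is picked
# from orient and the type of obj, and convert_to_line_delimits reshapes the
# records array into line-delimited JSON when lines=True.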


class Writer(ABC):
    _default_orient: str

    def __init__(
        self,
        obj: NDFrame,
        orient: str | None,
        date_format: str,
        double_precision: int,
        ensure_ascii: bool,
        date_unit: str,
        index: bool,
        default_handler: Callable[[Any], JSONSerializable] | None = None,
        indent: int = 0,
    ) -> None:
        self.obj = obj

        if orient is None:
            orient = self._default_orient

        self.orient = orient
        self.date_format = date_format
        self.double_precision = double_precision
        self.ensure_ascii = ensure_ascii
        self.date_unit = date_unit
        self.default_handler = default_handler
        self.index = index
        self.indent = indent

        self.is_copy = None
        self._format_axes()

    def _format_axes(self):
        raise AbstractMethodError(self)

    def write(self) -> str:
        iso_dates = self.date_format == "iso"
        return dumps(
            self.obj_to_write,
            orient=self.orient,
            double_precision=self.double_precision,
            ensure_ascii=self.ensure_ascii,
            date_unit=self.date_unit,
            iso_dates=iso_dates,
            default_handler=self.default_handler,
            indent=self.indent,
        )

    @property
    @abstractmethod
    def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]:
        """Object to write in JSON format."""


class SeriesWriter(Writer):
    _default_orient = "index"

    @property
    def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]:
        if not self.index and self.orient == "split":
            return {"name": self.obj.name, "data": self.obj.values}
        else:
            return self.obj

    def _format_axes(self):
        if not self.obj.index.is_unique and self.orient == "index":
            raise ValueError(f"Series index must be unique for orient='{self.orient}'")


class FrameWriter(Writer):
    _default_orient = "columns"

    @property
    def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]:
        if not self.index and self.orient == "split":
            obj_to_write = self.obj.to_dict(orient="split")
            del obj_to_write["index"]
        else:
            obj_to_write = self.obj
        return obj_to_write

    def _format_axes(self):
        """
        Try to format axes if they are datelike.
        """
        if not self.obj.index.is_unique and self.orient in ("index", "columns"):
            raise ValueError(
                f"DataFrame index must be unique for orient='{self.orient}'."
            )
        if not self.obj.columns.is_unique and self.orient in (
            "index",
            "columns",
            "records",
        ):
            raise ValueError(
                f"DataFrame columns must be unique for orient='{self.orient}'."
            )


class JSONTableWriter(FrameWriter):
    _default_orient = "records"

    def __init__(
        self,
        obj,
        orient: str | None,
        date_format: str,
        double_precision: int,
        ensure_ascii: bool,
        date_unit: str,
        index: bool,
        default_handler: Callable[[Any], JSONSerializable] | None = None,
        indent: int = 0,
    ) -> None:
        """
        Adds a `schema` attribute with the Table Schema, resets
        the index (can't do in caller, because the schema inference needs
        to know what the index is), forces orient to 'records', and forces
        date_format to 'iso'.
        """
        super().__init__(
            obj,
            orient,
            date_format,
            double_precision,
            ensure_ascii,
            date_unit,
            index,
            default_handler=default_handler,
            indent=indent,
        )

        if date_format != "iso":
            msg = (
                "Trying to write with `orient='table'` and "
                f"`date_format='{date_format}'`. Table Schema requires dates "
                "to be formatted with `date_format='iso'`"
            )
            raise ValueError(msg)

        self.schema = build_table_schema(obj, index=self.index)

        # NotImplemented on a column MultiIndex
        if obj.ndim == 2 and isinstance(obj.columns, MultiIndex):
            raise NotImplementedError(
                "orient='table' is not supported for MultiIndex columns"
            )

        # TODO: Do this timedelta properly in objToJSON.c See GH #15137
        if (
            (obj.ndim == 1)
            and (obj.name in set(obj.index.names))
            or len(obj.columns.intersection(obj.index.names))
        ):
            msg = "Overlapping names between the index and columns"
            raise ValueError(msg)

        obj = obj.copy()
        timedeltas = obj.select_dtypes(include=["timedelta"]).columns
        if len(timedeltas):
            obj[timedeltas] = obj[timedeltas].applymap(lambda x: x.isoformat())
        # Convert PeriodIndex to datetimes before serializing
        if is_period_dtype(obj.index.dtype):
            obj.index = obj.index.to_timestamp()

        # exclude index from obj if index=False
        if not self.index:
            self.obj = obj.reset_index(drop=True)
        else:
            self.obj = obj.reset_index(drop=False)
        self.date_format = "iso"
        self.orient = "records"
        self.index = index

    @property
    def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]:
        return {"schema": self.schema, "data": self.obj}
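

# For reference, the payload produced by JSONTableWriter.obj_to_write has the
# shape below (a sketch; the fields and primaryKey depend on the actual frame):
#
#     {"schema": {"fields": [...], "primaryKey": [...], "pandas_version": ...},
#      "data": [{"index": ..., "col 1": ..., ...}, ...]}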


@overload
def read_json(
    path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
    *,
    orient: str | None = ...,
    typ: Literal["frame"] = ...,
    dtype: DtypeArg | None = ...,
    convert_axes=...,
    convert_dates: bool | list[str] = ...,
    keep_default_dates: bool = ...,
    precise_float: bool = ...,
    date_unit: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    lines: bool = ...,
    chunksize: int,
    compression: CompressionOptions = ...,
    nrows: int | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
    engine: JSONEngine = ...,
) -> JsonReader[Literal["frame"]]:
    ...


@overload
def read_json(
    path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
    *,
    orient: str | None = ...,
    typ: Literal["series"],
    dtype: DtypeArg | None = ...,
    convert_axes=...,
    convert_dates: bool | list[str] = ...,
    keep_default_dates: bool = ...,
    precise_float: bool = ...,
    date_unit: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    lines: bool = ...,
    chunksize: int,
    compression: CompressionOptions = ...,
    nrows: int | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
    engine: JSONEngine = ...,
) -> JsonReader[Literal["series"]]:
    ...


@overload
def read_json(
    path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
    *,
    orient: str | None = ...,
    typ: Literal["series"],
    dtype: DtypeArg | None = ...,
    convert_axes=...,
    convert_dates: bool | list[str] = ...,
    keep_default_dates: bool = ...,
    precise_float: bool = ...,
    date_unit: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    lines: bool = ...,
    chunksize: None = ...,
    compression: CompressionOptions = ...,
    nrows: int | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
    engine: JSONEngine = ...,
) -> Series:
    ...


@overload
def read_json(
    path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
    *,
    orient: str | None = ...,
    typ: Literal["frame"] = ...,
    dtype: DtypeArg | None = ...,
    convert_axes=...,
    convert_dates: bool | list[str] = ...,
    keep_default_dates: bool = ...,
    precise_float: bool = ...,
    date_unit: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    lines: bool = ...,
    chunksize: None = ...,
    compression: CompressionOptions = ...,
    nrows: int | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
    engine: JSONEngine = ...,
) -> DataFrame:
    ...


@doc(
    storage_options=_shared_docs["storage_options"],
    decompression_options=_shared_docs["decompression_options"] % "path_or_buf",
)
def read_json(
    path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
    *,
    orient: str | None = None,
    typ: Literal["frame", "series"] = "frame",
    dtype: DtypeArg | None = None,
    convert_axes=None,
    convert_dates: bool | list[str] = True,
    keep_default_dates: bool = True,
    precise_float: bool = False,
    date_unit: str | None = None,
    encoding: str | None = None,
    encoding_errors: str | None = "strict",
    lines: bool = False,
    chunksize: int | None = None,
    compression: CompressionOptions = "infer",
    nrows: int | None = None,
    storage_options: StorageOptions = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
    engine: JSONEngine = "ujson",
) -> DataFrame | Series | JsonReader:
    """
    Convert a JSON string to a pandas object.

    Parameters
    ----------
    path_or_buf : a valid JSON str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.json``.

        If you want to pass in a path object, pandas accepts any
        ``os.PathLike``.

        By file-like object, we refer to objects with a ``read()`` method,
        such as a file handle (e.g. via builtin ``open`` function)
        or ``StringIO``.
    orient : str, optional
        Indication of expected JSON string format.
        Compatible JSON strings can be produced by ``to_json()`` with a
        corresponding orient value.
        The set of possible orients is:

        - ``'split'`` : dict like
          ``{{index -> [index], columns -> [columns], data -> [values]}}``
        - ``'records'`` : list like
          ``[{{column -> value}}, ... , {{column -> value}}]``
        - ``'index'`` : dict like ``{{index -> {{column -> value}}}}``
        - ``'columns'`` : dict like ``{{column -> {{index -> value}}}}``
        - ``'values'`` : just the values array

        The allowed and default values depend on the value
        of the `typ` parameter.

        * when ``typ == 'series'``,

          - allowed orients are ``{{'split','records','index'}}``
          - default is ``'index'``
          - The Series index must be unique for orient ``'index'``.

        * when ``typ == 'frame'``,

          - allowed orients are ``{{'split','records','index',
            'columns','values', 'table'}}``
          - default is ``'columns'``
          - The DataFrame index must be unique for orients ``'index'`` and
            ``'columns'``.
          - The DataFrame columns must be unique for orients ``'index'``,
            ``'columns'``, and ``'records'``.

    typ : {{'frame', 'series'}}, default 'frame'
        The type of object to recover.

    dtype : bool or dict, default None
        If True, infer dtypes; if a dict of column to dtype, then use those;
        if False, then don't infer dtypes at all. Applies only to the data.

        For all ``orient`` values except ``'table'``, default is True.

    convert_axes : bool, default None
        Try to convert the axes to the proper dtypes.

        For all ``orient`` values except ``'table'``, default is True.

    convert_dates : bool or list of str, default True
        If True then default datelike columns may be converted (depending on
        keep_default_dates).
        If False, no dates will be converted.
        If a list of column names, then those columns will be converted and
        default datelike columns may also be converted (depending on
        keep_default_dates).

    keep_default_dates : bool, default True
        If parsing dates (convert_dates is not False), then try to parse the
        default datelike columns.
        A column label is datelike if

        * it ends with ``'_at'``,

        * it ends with ``'_time'``,

        * it begins with ``'timestamp'``,

        * it is ``'modified'``,

        * it is ``'date'``, or

        * it is ``'datetime'``.

    precise_float : bool, default False
        Set to enable usage of higher precision (strtod) function when
        decoding string to double values. Default (False) is to use fast but
        less precise builtin functionality.

    date_unit : str, default None
        The timestamp unit to detect if converting dates. The default behaviour
        is to try and detect the correct precision, but if this is not desired
        then pass one of 's', 'ms', 'us' or 'ns' to force parsing only seconds,
        milliseconds, microseconds or nanoseconds respectively.

    encoding : str, default is 'utf-8'
        The encoding to use to decode Python 3 bytes.

    encoding_errors : str, optional, default "strict"
        How encoding errors are treated. `List of possible values
        <https://docs.python.org/3/library/codecs.html#error-handlers>`_ .

        .. versionadded:: 1.3.0

    lines : bool, default False
        Read the file as a json object per line.

    chunksize : int, optional
        Return JsonReader object for iteration.
        See the `line-delimited json docs
        <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#line-delimited-json>`_
        for more information on ``chunksize``.
        This can only be passed if `lines=True`.
        If this is None, the file will be read into memory all at once.

        .. versionchanged:: 1.2

           ``JsonReader`` is a context manager.

    {decompression_options}

        .. versionchanged:: 1.4.0 Zstandard support.

    nrows : int, optional
        The number of lines from the line-delimited JSON file that have to
        be read.
        This can only be passed if `lines=True`.
        If this is None, all the rows will be returned.

        .. versionadded:: 1.1

    {storage_options}

        .. versionadded:: 1.2.0

    dtype_backend : {{"numpy_nullable", "pyarrow"}}, defaults to NumPy backed DataFrames
        Which dtype_backend to use: when "numpy_nullable" is set, nullable
        dtypes are used for all dtypes that have a nullable implementation;
        when "pyarrow" is set, pyarrow is used for all dtypes.

        The dtype_backends are still experimental.

        .. versionadded:: 2.0

    engine : {{"ujson", "pyarrow"}}, default "ujson"
        Parser engine to use. The ``"pyarrow"`` engine is only available when
        ``lines=True``.

        .. versionadded:: 2.0

    Returns
    -------
    Series or DataFrame
        The type returned depends on the value of `typ`.

    See Also
    --------
    DataFrame.to_json : Convert a DataFrame to a JSON string.
    Series.to_json : Convert a Series to a JSON string.
    json_normalize : Normalize semi-structured JSON data into a flat table.

    Notes
    -----
    Specific to ``orient='table'``, if a :class:`DataFrame` with a literal
    :class:`Index` name of `index` gets written with :func:`to_json`, the
    subsequent read operation will incorrectly set the :class:`Index` name to
    ``None``. This is because `index` is also used by :func:`DataFrame.to_json`
    to denote a missing :class:`Index` name, and the subsequent
    :func:`read_json` operation cannot distinguish between the two. The same
    limitation is encountered with a :class:`MultiIndex` and any names
    beginning with ``'level_'``.

    Examples
    --------
    >>> df = pd.DataFrame([['a', 'b'], ['c', 'd']],
    ...                   index=['row 1', 'row 2'],
    ...                   columns=['col 1', 'col 2'])

    Encoding/decoding a DataFrame using ``'split'`` formatted JSON:

    >>> df.to_json(orient='split')
        '\
{{\
"columns":["col 1","col 2"],\
"index":["row 1","row 2"],\
"data":[["a","b"],["c","d"]]\
}}\
'
    >>> pd.read_json(_, orient='split')
          col 1 col 2
    row 1     a     b
    row 2     c     d

    Encoding/decoding a DataFrame using ``'index'`` formatted JSON:

    >>> df.to_json(orient='index')
    '{{"row 1":{{"col 1":"a","col 2":"b"}},"row 2":{{"col 1":"c","col 2":"d"}}}}'

    >>> pd.read_json(_, orient='index')
          col 1 col 2
    row 1     a     b
    row 2     c     d

    Encoding/decoding a DataFrame using ``'records'`` formatted JSON.
    Note that index labels are not preserved with this encoding.

    >>> df.to_json(orient='records')
    '[{{"col 1":"a","col 2":"b"}},{{"col 1":"c","col 2":"d"}}]'
    >>> pd.read_json(_, orient='records')
      col 1 col 2
    0     a     b
    1     c     d

    Encoding with Table Schema:

    >>> df.to_json(orient='table')
        '\
{{"schema":{{"fields":[\
{{"name":"index","type":"string"}},\
{{"name":"col 1","type":"string"}},\
{{"name":"col 2","type":"string"}}],\
"primaryKey":["index"],\
"pandas_version":"1.4.0"}},\
"data":[\
{{"index":"row 1","col 1":"a","col 2":"b"}},\
{{"index":"row 2","col 1":"c","col 2":"d"}}]\
}}\
'
    """
    if orient == "table" and dtype:
        raise ValueError("cannot pass both dtype and orient='table'")
    if orient == "table" and convert_axes:
        raise ValueError("cannot pass both convert_axes and orient='table'")

    check_dtype_backend(dtype_backend)

    if dtype is None and orient != "table":
        # error: Incompatible types in assignment (expression has type "bool", variable
        # has type "Union[ExtensionDtype, str, dtype[Any], Type[str], Type[float],
        # Type[int], Type[complex], Type[bool], Type[object], Dict[Hashable,
        # Union[ExtensionDtype, Union[str, dtype[Any]], Type[str], Type[float],
        # Type[int], Type[complex], Type[bool], Type[object]]], None]")
        dtype = True  # type: ignore[assignment]
    if convert_axes is None and orient != "table":
        convert_axes = True

    json_reader = JsonReader(
        path_or_buf,
        orient=orient,
        typ=typ,
        dtype=dtype,
        convert_axes=convert_axes,
        convert_dates=convert_dates,
        keep_default_dates=keep_default_dates,
        precise_float=precise_float,
        date_unit=date_unit,
        encoding=encoding,
        lines=lines,
        chunksize=chunksize,
        compression=compression,
        nrows=nrows,
        storage_options=storage_options,
        encoding_errors=encoding_errors,
        dtype_backend=dtype_backend,
        engine=engine,
    )

    if chunksize:
        return json_reader
    else:
        return json_reader.read()
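

# Sketch of the chunked path (illustrative; the file name and process() are
# made up):
#
#     >>> with read_json("data.jsonl", lines=True, chunksize=1000) as reader:
#     ...     for chunk in reader:  # each chunk is a DataFrame
#     ...         process(chunk)
#
# With chunksize set (which requires lines=True), read_json returns the
# JsonReader below instead of parsing eagerly.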


class JsonReader(abc.Iterator, Generic[FrameSeriesStrT]):
    """
    JsonReader provides an interface for reading in a JSON file.

    If initialized with ``lines=True`` and ``chunksize``, can be iterated over
    ``chunksize`` lines at a time. Otherwise, calling ``read`` reads in the
    whole document.
    """

    def __init__(
        self,
        filepath_or_buffer,
        orient,
        typ: FrameSeriesStrT,
        dtype,
        convert_axes,
        convert_dates,
        keep_default_dates: bool,
        precise_float: bool,
        date_unit,
        encoding,
        lines: bool,
        chunksize: int | None,
        compression: CompressionOptions,
        nrows: int | None,
        storage_options: StorageOptions = None,
        encoding_errors: str | None = "strict",
        dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
        engine: JSONEngine = "ujson",
    ) -> None:
        self.orient = orient
        self.typ = typ
        self.dtype = dtype
        self.convert_axes = convert_axes
        self.convert_dates = convert_dates
        self.keep_default_dates = keep_default_dates
        self.precise_float = precise_float
        self.date_unit = date_unit
        self.encoding = encoding
        self.engine = engine
        self.compression = compression
        self.storage_options = storage_options
        self.lines = lines
        self.chunksize = chunksize
        self.nrows_seen = 0
        self.nrows = nrows
        self.encoding_errors = encoding_errors
        self.handles: IOHandles[str] | None = None
        self.dtype_backend = dtype_backend

        if self.engine not in {"pyarrow", "ujson"}:
            raise ValueError(
                f"The engine type {self.engine} is currently not supported."
            )
        if self.chunksize is not None:
            self.chunksize = validate_integer("chunksize", self.chunksize, 1)
            if not self.lines:
                raise ValueError("chunksize can only be passed if lines=True")
            if self.engine == "pyarrow":
                raise ValueError(
                    "currently pyarrow engine doesn't support chunksize parameter"
                )
        if self.nrows is not None:
            self.nrows = validate_integer("nrows", self.nrows, 0)
            if not self.lines:
                raise ValueError("nrows can only be passed if lines=True")
        if self.engine == "pyarrow":
            if not self.lines:
                raise ValueError(
                    "currently pyarrow engine only supports "
                    "the line-delimited JSON format"
                )
            self.data = filepath_or_buffer
        elif self.engine == "ujson":
            data = self._get_data_from_filepath(filepath_or_buffer)
            self.data = self._preprocess_data(data)

    def _preprocess_data(self, data):
        """
        At this point, the data either has a `read` attribute (e.g. a file
        object or a StringIO) or is a string that is a JSON document.

        If self.chunksize, we prepare the data for the `__next__` method.
        Otherwise, we read it into memory for the `read` method.
        """
        if hasattr(data, "read") and not (self.chunksize or self.nrows):
            with self:
                data = data.read()
        if not hasattr(data, "read") and (self.chunksize or self.nrows):
            data = StringIO(data)

        return data

    def _get_data_from_filepath(self, filepath_or_buffer):
        """
        The function read_json accepts three input types:
            1. filepath (string-like)
            2. file-like object (e.g. open file object, StringIO)
            3. JSON string

        This method turns (1) into (2) to simplify the rest of the processing.
        It returns input types (2) and (3) unchanged.

        It raises FileNotFoundError if the input is a string ending in
        one of .json, .json.gz, .json.bz2, etc. but no such file exists.
        """
        # if it is a string but the file does not exist, it might be a JSON string
        filepath_or_buffer = stringify_path(filepath_or_buffer)
        if (
            not isinstance(filepath_or_buffer, str)
            or is_url(filepath_or_buffer)
            or is_fsspec_url(filepath_or_buffer)
            or file_exists(filepath_or_buffer)
        ):
            self.handles = get_handle(
                filepath_or_buffer,
                "r",
                encoding=self.encoding,
                compression=self.compression,
                storage_options=self.storage_options,
                errors=self.encoding_errors,
            )
            filepath_or_buffer = self.handles.handle
        elif (
            isinstance(filepath_or_buffer, str)
            and filepath_or_buffer.lower().endswith(
                (".json",) + tuple(f".json{c}" for c in extension_to_compression)
            )
            and not file_exists(filepath_or_buffer)
        ):
            raise FileNotFoundError(f"File {filepath_or_buffer} does not exist")

        return filepath_or_buffer

    def _combine_lines(self, lines) -> str:
        """
        Combines a list of JSON objects into one JSON array.
        """
        return (
            f'[{",".join([line for line in (line.strip() for line in lines) if line])}]'
        )
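
    # e.g. _combine_lines(['{"a": 1}', '', '{"a": 2}\n']) -> '[{"a": 1},{"a": 2}]'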

    @overload
    def read(self: JsonReader[Literal["frame"]]) -> DataFrame:
        ...

    @overload
    def read(self: JsonReader[Literal["series"]]) -> Series:
        ...

    @overload
    def read(self: JsonReader[Literal["frame", "series"]]) -> DataFrame | Series:
        ...

    def read(self) -> DataFrame | Series:
        """
        Read the whole JSON input into a pandas object.
        """
        obj: DataFrame | Series
        with self:
            if self.engine == "pyarrow":
                pyarrow_json = import_optional_dependency("pyarrow.json")
                pa_table = pyarrow_json.read_json(self.data)

                mapping: type[ArrowDtype] | None | Callable
                if self.dtype_backend == "pyarrow":
                    mapping = ArrowDtype
                elif self.dtype_backend == "numpy_nullable":
                    from pandas.io._util import _arrow_dtype_mapping

                    mapping = _arrow_dtype_mapping().get
                else:
                    mapping = None

                return pa_table.to_pandas(types_mapper=mapping)
            elif self.engine == "ujson":
                if self.lines:
                    if self.chunksize:
                        obj = concat(self)
                    elif self.nrows:
                        lines = list(islice(self.data, self.nrows))
                        lines_json = self._combine_lines(lines)
                        obj = self._get_object_parser(lines_json)
                    else:
                        data = ensure_str(self.data)
                        data_lines = data.split("\n")
                        obj = self._get_object_parser(self._combine_lines(data_lines))
                else:
                    obj = self._get_object_parser(self.data)
                if self.dtype_backend is not lib.no_default:
                    return obj.convert_dtypes(
                        infer_objects=False, dtype_backend=self.dtype_backend
                    )
                else:
                    return obj

    def _get_object_parser(self, json) -> DataFrame | Series:
        """
        Parses a json document into a pandas object.
        """
        typ = self.typ
        dtype = self.dtype
        kwargs = {
            "orient": self.orient,
            "dtype": self.dtype,
            "convert_axes": self.convert_axes,
            "convert_dates": self.convert_dates,
            "keep_default_dates": self.keep_default_dates,
            "precise_float": self.precise_float,
            "date_unit": self.date_unit,
            "dtype_backend": self.dtype_backend,
        }
        obj = None
        if typ == "frame":
            obj = FrameParser(json, **kwargs).parse()

        if typ == "series" or obj is None:
            if not isinstance(dtype, bool):
                kwargs["dtype"] = dtype
            obj = SeriesParser(json, **kwargs).parse()

        return obj

    def close(self) -> None:
        """
        If we opened a stream earlier, in _get_data_from_filepath, we should
        close it.

        If an open stream or file was passed, we leave it open.
        """
        if self.handles is not None:
            self.handles.close()

    def __iter__(self: JsonReader[FrameSeriesStrT]) -> JsonReader[FrameSeriesStrT]:
        return self

    @overload
    def __next__(self: JsonReader[Literal["frame"]]) -> DataFrame:
        ...

    @overload
    def __next__(self: JsonReader[Literal["series"]]) -> Series:
        ...

    @overload
    def __next__(self: JsonReader[Literal["frame", "series"]]) -> DataFrame | Series:
        ...

    def __next__(self) -> DataFrame | Series:
        if self.nrows and self.nrows_seen >= self.nrows:
            self.close()
            raise StopIteration

        lines = list(islice(self.data, self.chunksize))
        if not lines:
            self.close()
            raise StopIteration

        try:
            lines_json = self._combine_lines(lines)
            obj = self._get_object_parser(lines_json)

            # Make sure that the returned objects have the right index.
            obj.index = range(self.nrows_seen, self.nrows_seen + len(obj))
            self.nrows_seen += len(obj)
        except Exception as ex:
            self.close()
            raise ex

        if self.dtype_backend is not lib.no_default:
            return obj.convert_dtypes(
                infer_objects=False, dtype_backend=self.dtype_backend
            )
        else:
            return obj

    def __enter__(self) -> JsonReader[FrameSeriesStrT]:
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        traceback: TracebackType | None,
    ) -> None:
        self.close()


class Parser:
    _split_keys: tuple[str, ...]
    _default_orient: str

    _STAMP_UNITS = ("s", "ms", "us", "ns")
    _MIN_STAMPS = {
        "s": 31536000,
        "ms": 31536000000,
        "us": 31536000000000,
        "ns": 31536000000000000,
    }
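    # Each _MIN_STAMPS value is one year past the epoch (1971-01-01) in the
    # given unit; _try_convert_to_date refuses to treat numbers at or below
    # this threshold as timestamps.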

    def __init__(
        self,
        json,
        orient,
        dtype: DtypeArg | None = None,
        convert_axes: bool = True,
        convert_dates: bool | list[str] = True,
        keep_default_dates: bool = False,
        precise_float: bool = False,
        date_unit=None,
        dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
    ) -> None:
        self.json = json

        if orient is None:
            orient = self._default_orient

        self.orient = orient

        self.dtype = dtype

        if date_unit is not None:
            date_unit = date_unit.lower()
            if date_unit not in self._STAMP_UNITS:
                raise ValueError(f"date_unit must be one of {self._STAMP_UNITS}")
            self.min_stamp = self._MIN_STAMPS[date_unit]
        else:
            self.min_stamp = self._MIN_STAMPS["s"]

        self.precise_float = precise_float
        self.convert_axes = convert_axes
        self.convert_dates = convert_dates
        self.date_unit = date_unit
        self.keep_default_dates = keep_default_dates
        self.obj: DataFrame | Series | None = None
        self.dtype_backend = dtype_backend

    def check_keys_split(self, decoded) -> None:
        """
        Checks that dict has only the appropriate keys for orient='split'.
        """
        bad_keys = set(decoded.keys()).difference(set(self._split_keys))
        if bad_keys:
            bad_keys_joined = ", ".join(bad_keys)
            raise ValueError(f"JSON data had unexpected key(s): {bad_keys_joined}")

    def parse(self):
        self._parse()

        if self.obj is None:
            return None
        if self.convert_axes:
            self._convert_axes()
        self._try_convert_types()
        return self.obj

    def _parse(self):
        raise AbstractMethodError(self)

    def _convert_axes(self) -> None:
        """
        Try to convert axes.
        """
        obj = self.obj
        assert obj is not None  # for mypy
        for axis_name in obj._AXIS_ORDERS:
            new_axis, result = self._try_convert_data(
                name=axis_name,
                data=obj._get_axis(axis_name),
                use_dtypes=False,
                convert_dates=True,
            )
            if result:
                setattr(self.obj, axis_name, new_axis)

    def _try_convert_types(self):
        raise AbstractMethodError(self)

    def _try_convert_data(
        self,
        name,
        data,
        use_dtypes: bool = True,
        convert_dates: bool | list[str] = True,
    ):
        """
        Try to parse an ndarray-like into a column by inferring dtype.
        """
        # don't try to coerce, unless we have a dtype to force
        if use_dtypes:
            if not self.dtype:
                if all(notna(data)):
                    return data, False
                return data.fillna(np.nan), True

            elif self.dtype is True:
                pass
            else:
                # dtype to force
                dtype = (
                    self.dtype.get(name) if isinstance(self.dtype, dict) else self.dtype
                )
                if dtype is not None:
                    try:
                        return data.astype(dtype), True
                    except (TypeError, ValueError):
                        return data, False

        if convert_dates:
            new_data, result = self._try_convert_to_date(data)
            if result:
                return new_data, True

        if self.dtype_backend is not lib.no_default and not isinstance(data, ABCIndex):
            # Fall through for conversion later on
            return data, True
        elif data.dtype == "object":
            # try float
            try:
                data = data.astype("float64")
            except (TypeError, ValueError):
                pass

        if data.dtype.kind == "f":
            if data.dtype != "float64":
                # coerce floats to 64
                try:
                    data = data.astype("float64")
                except (TypeError, ValueError):
                    pass

        # don't coerce 0-len data
        if len(data) and data.dtype in ("float", "object"):
            # coerce ints if we can
            try:
                new_data = data.astype("int64")
                if (new_data == data).all():
                    data = new_data
            except (TypeError, ValueError, OverflowError):
                pass

        # coerce ints to 64
        if data.dtype == "int":
            try:
                data = data.astype("int64")
            except (TypeError, ValueError):
                pass

        # if we have an index, we want to preserve dtypes
        if name == "index" and len(data):
            if self.orient == "split":
                return data, False

        return data, True
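
    # The inference chain above means e.g. that an object column of ["1", "2"]
    # comes back as int64 (object -> float64 -> int64), while genuinely mixed
    # data falls through each cast and is returned unchanged.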

    def _try_convert_to_date(self, data):
        """
        Try to parse an ndarray-like into a date column.

        Try to coerce object in epoch/iso formats and integer/float in epoch
        formats. Return a boolean indicating whether parsing was successful.
        """
        # no conversion on empty
        if not len(data):
            return data, False

        new_data = data
        if new_data.dtype == "object":
            try:
                new_data = data.astype("int64")
            except OverflowError:
                return data, False
            except (TypeError, ValueError):
                pass

        # ignore numbers that are out of range
        if issubclass(new_data.dtype.type, np.number):
            in_range = (
                isna(new_data._values)
                | (new_data > self.min_stamp)
                | (new_data._values == iNaT)
            )
            if not in_range.all():
                return data, False

        date_units = (self.date_unit,) if self.date_unit else self._STAMP_UNITS
        for date_unit in date_units:
            try:
                new_data = to_datetime(new_data, errors="raise", unit=date_unit)
            except (ValueError, OverflowError, TypeError):
                continue
            return new_data, True
        return data, False
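
    # e.g. an int64 column of [1609459200] with date_unit=None is tried
    # against each unit in _STAMP_UNITS and parses as seconds to 2021-01-01.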

    def _try_convert_dates(self):
        raise AbstractMethodError(self)


class SeriesParser(Parser):
    _default_orient = "index"
    _split_keys = ("name", "index", "data")

    def _parse(self) -> None:
        data = loads(self.json, precise_float=self.precise_float)

        if self.orient == "split":
            decoded = {str(k): v for k, v in data.items()}
            self.check_keys_split(decoded)
            self.obj = Series(**decoded)
        else:
            self.obj = Series(data)

    def _try_convert_types(self) -> None:
        if self.obj is None:
            return
        obj, result = self._try_convert_data(
            "data", self.obj, convert_dates=self.convert_dates
        )
        if result:
            self.obj = obj


class FrameParser(Parser):
    _default_orient = "columns"
    _split_keys = ("columns", "index", "data")

    def _parse(self) -> None:
        json = self.json
        orient = self.orient

        if orient == "columns":
            self.obj = DataFrame(
                loads(json, precise_float=self.precise_float), dtype=None
            )
        elif orient == "split":
            decoded = {
                str(k): v
                for k, v in loads(json, precise_float=self.precise_float).items()
            }
            self.check_keys_split(decoded)
            orig_names = [
                (tuple(col) if isinstance(col, list) else col)
                for col in decoded["columns"]
            ]
            decoded["columns"] = dedup_names(
                orig_names,
                is_potential_multi_index(orig_names, None),
            )
            self.obj = DataFrame(dtype=None, **decoded)
        elif orient == "index":
            self.obj = DataFrame.from_dict(
                loads(json, precise_float=self.precise_float),
                dtype=None,
                orient="index",
            )
        elif orient == "table":
            self.obj = parse_table_schema(json, precise_float=self.precise_float)
        else:
            self.obj = DataFrame(
                loads(json, precise_float=self.precise_float), dtype=None
            )

    def _process_converter(self, f, filt=None) -> None:
        """
        Take a conversion function and possibly recreate the frame.
        """
        if filt is None:
            filt = lambda col, c: True

        obj = self.obj
        assert obj is not None  # for mypy

        needs_new_obj = False
        new_obj = {}
        for i, (col, c) in enumerate(obj.items()):
            if filt(col, c):
                new_data, result = f(col, c)
                if result:
                    c = new_data
                    needs_new_obj = True
            new_obj[i] = c

        if needs_new_obj:
            # possibly handle dup columns
            new_frame = DataFrame(new_obj, index=obj.index)
            new_frame.columns = obj.columns
            self.obj = new_frame

    def _try_convert_types(self) -> None:
        if self.obj is None:
            return
        if self.convert_dates:
            self._try_convert_dates()

        self._process_converter(
            lambda col, c: self._try_convert_data(col, c, convert_dates=False)
        )

    def _try_convert_dates(self) -> None:
        if self.obj is None:
            return

        # our columns to parse
        convert_dates_list_bool = self.convert_dates
        if isinstance(convert_dates_list_bool, bool):
            convert_dates_list_bool = []
        convert_dates = set(convert_dates_list_bool)

        def is_ok(col) -> bool:
            """
            Return True if this column is OK to try parsing as a date.
            """
            if not isinstance(col, str):
                return False

            col_lower = col.lower()
            if (
                col_lower.endswith("_at")
                or col_lower.endswith("_time")
                or col_lower == "modified"
                or col_lower == "date"
                or col_lower == "datetime"
                or col_lower.startswith("timestamp")
            ):
                return True
            return False
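
        # e.g. "created_at", "updated_time", "timestamp_ms", "date" and
        # "datetime" are candidates; "latitude" is not.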

        self._process_converter(
            lambda col, c: self._try_convert_to_date(c),
            lambda col, c: (
                (self.keep_default_dates and is_ok(col)) or col in convert_dates
            ),
        )