1# ---------------------------------------------------------------------
2# JSON normalization routines
3from __future__ import annotations
4
5from collections import (
6 abc,
7 defaultdict,
8)
9import copy
10from typing import (
11 TYPE_CHECKING,
12 Any,
13 DefaultDict,
14)
15
16import numpy as np
17
18from pandas._libs.writers import convert_json_to_lines
19
20import pandas as pd
21from pandas import DataFrame
22
23if TYPE_CHECKING:
24 from collections.abc import Iterable
25
26 from pandas._typing import (
27 IgnoreRaise,
28 Scalar,
29 )
30
31
def convert_to_line_delimits(s: str) -> str:
    """
    Helper function that converts JSON lists to line delimited JSON.

    Parameters
    ----------
    s : str
        A serialized JSON document.  Only a top-level JSON array
        (``"[...]"``) is converted; any other document is returned unchanged.

    Returns
    -------
    str
        Line-delimited JSON for list input, otherwise ``s`` itself.
    """
    # Determine we have a JSON list to turn to lines otherwise just return the
    # json object, only lists can.  An empty string is passed through as well.
    # NOTE: the previous condition `not s[0] == "[" and s[-1] == "]"` parsed
    # as `(not s[0] == "[") and (s[-1] == "]")` due to operator precedence,
    # which let non-list input (e.g. '{"a": 1}') fall through and get its
    # outer braces stripped.
    if not s or not (s[0] == "[" and s[-1] == "]"):
        return s
    # Strip the enclosing brackets before handing off to the C helper, which
    # splits on top-level commas.
    s = s[1:-1]

    return convert_json_to_lines(s)
43
44
def nested_to_record(
    ds,
    prefix: str = "",
    sep: str = ".",
    level: int = 0,
    max_level: int | None = None,
):
    """
    A simplified json_normalize

    Converts a nested dict into a flat dict ("record"), unlike json_normalize,
    it does not attempt to extract a subset of the data.

    Parameters
    ----------
    ds : dict or list of dicts
    prefix: the prefix, optional, default: ""
    sep : str, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
    level: int, optional, default: 0
        The number of levels in the json string.

    max_level: int, optional, default: None
        The max depth to normalize.

    Returns
    -------
    d - dict or list of dicts, matching `ds`

    Examples
    --------
    >>> nested_to_record(
    ...     dict(flat1=1, dict1=dict(c=1, d=2), nested=dict(e=dict(c=1, d=2), d=2))
    ... )
    {\
'flat1': 1, \
'dict1.c': 1, \
'dict1.d': 2, \
'nested.e.c': 1, \
'nested.e.d': 2, \
'nested.d': 2\
}
    """
    is_single_record = isinstance(ds, dict)
    records = [ds] if is_single_record else ds

    flattened_records = []
    for record in records:
        flat = copy.deepcopy(record)
        for key, value in record.items():
            # Keys are coerced to str before being joined with the prefix.
            key_str = key if isinstance(key, str) else str(key)
            new_key = key_str if level == 0 else prefix + sep + key_str

            # Recurse only into dict values, and only while above the
            # requested max depth; everything else is kept as a leaf.
            reached_leaf = not isinstance(value, dict) or (
                max_level is not None and level >= max_level
            )
            if reached_leaf:
                # Top-level keys keep their original name, so the copied
                # record needs no rewrite here (common fast path).
                if level != 0:
                    flat[new_key] = flat.pop(key_str)
                continue

            # Replace the nested dict with its flattened key/value pairs.
            flat.update(
                nested_to_record(flat.pop(key_str), new_key, sep, level + 1, max_level)
            )
        flattened_records.append(flat)

    return flattened_records[0] if is_single_record else flattened_records
124
125
126def _normalise_json(
127 data: Any,
128 key_string: str,
129 normalized_dict: dict[str, Any],
130 separator: str,
131) -> dict[str, Any]:
132 """
133 Main recursive function
134 Designed for the most basic use case of pd.json_normalize(data)
135 intended as a performance improvement, see #15621
136
137 Parameters
138 ----------
139 data : Any
140 Type dependent on types contained within nested Json
141 key_string : str
142 New key (with separator(s) in) for data
143 normalized_dict : dict
144 The new normalized/flattened Json dict
145 separator : str, default '.'
146 Nested records will generate names separated by sep,
147 e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
148 """
149 if isinstance(data, dict):
150 for key, value in data.items():
151 new_key = f"{key_string}{separator}{key}"
152
153 if not key_string:
154 new_key = new_key.removeprefix(separator)
155
156 _normalise_json(
157 data=value,
158 key_string=new_key,
159 normalized_dict=normalized_dict,
160 separator=separator,
161 )
162 else:
163 normalized_dict[key_string] = data
164 return normalized_dict
165
166
def _normalise_json_ordered(data: dict[str, Any], separator: str) -> dict[str, Any]:
    """
    Order the top level keys and then recursively go to depth

    Parameters
    ----------
    data : dict or list of dicts
    separator : str, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar

    Returns
    -------
    dict or list of dicts, matching `normalised_json_object`
    """
    # Partition the record in one pass: scalar (non-dict) values keep their
    # keys untouched, while dict values are flattened recursively.
    scalar_items: dict[str, Any] = {}
    nested_items: dict[str, Any] = {}
    for key, value in data.items():
        if isinstance(value, dict):
            nested_items[key] = value
        else:
            scalar_items[key] = value

    flattened_nested = _normalise_json(
        data=nested_items,
        key_string="",
        normalized_dict={},
        separator=separator,
    )
    # Scalars first, then the flattened nested keys, mirroring the original
    # top-level ordering of the scalar keys.
    return {**scalar_items, **flattened_nested}
190
191
def _simple_json_normalize(
    ds: dict | list[dict],
    sep: str = ".",
) -> dict | list[dict] | Any:
    """
    A optimized basic json_normalize

    Converts a nested dict into a flat dict ("record"), unlike
    json_normalize and nested_to_record it doesn't do anything clever.
    But for the most basic use cases it enhances performance.
    E.g. pd.json_normalize(data)

    Parameters
    ----------
    ds : dict or list of dicts
    sep : str, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar

    Returns
    -------
    dict or list of dicts, matching `ds`; any other input type yields an
    empty dict.

    Examples
    --------
    >>> _simple_json_normalize(
    ...     {
    ...         "flat1": 1,
    ...         "dict1": {"c": 1, "d": 2},
    ...         "nested": {"e": {"c": 1, "d": 2}, "d": 2},
    ...     }
    ... )
    {\
'flat1': 1, \
'dict1.c': 1, \
'dict1.d': 2, \
'nested.e.c': 1, \
'nested.e.d': 2, \
'nested.d': 2\
}

    """
    # Expect a dictionary, as most jsons are.  However, lists are perfectly
    # valid too: normalize each row independently, preserving order.
    if isinstance(ds, dict):
        return _normalise_json_ordered(data=ds, separator=sep)
    if isinstance(ds, list):
        return [_simple_json_normalize(row, sep=sep) for row in ds]
    # Any other type falls through to an empty record, matching the
    # permissive behaviour of the original implementation.
    return {}
243
244
def json_normalize(
    data: dict | list[dict],
    record_path: str | list | None = None,
    meta: str | list[str | list[str]] | None = None,
    meta_prefix: str | None = None,
    record_prefix: str | None = None,
    errors: IgnoreRaise = "raise",
    sep: str = ".",
    max_level: int | None = None,
) -> DataFrame:
    """
    Normalize semi-structured JSON data into a flat table.

    Parameters
    ----------
    data : dict or list of dicts
        Unserialized JSON objects.
    record_path : str or list of str, default None
        Path in each object to list of records. If not passed, data will be
        assumed to be an array of records.
    meta : list of paths (str or list of str), default None
        Fields to use as metadata for each record in resulting table.
    meta_prefix : str, default None
        If True, prefix records with dotted (?) path, e.g. foo.bar.field if
        meta is ['foo', 'bar'].
    record_prefix : str, default None
        If True, prefix records with dotted (?) path, e.g. foo.bar.field if
        path to records is ['foo', 'bar'].
    errors : {'raise', 'ignore'}, default 'raise'
        Configures error handling.

        * 'ignore' : will ignore KeyError if keys listed in meta are not
          always present.
        * 'raise' : will raise KeyError if keys listed in meta are not
          always present.
    sep : str, default '.'
        Nested records will generate names separated by sep.
        e.g., for sep='.', {'foo': {'bar': 0}} -> foo.bar.
    max_level : int, default None
        Max number of levels(depth of dict) to normalize.
        if None, normalizes all levels.

    Returns
    -------
    frame : DataFrame
    Normalize semi-structured JSON data into a flat table.

    Examples
    --------
    >>> data = [
    ...     {"id": 1, "name": {"first": "Coleen", "last": "Volk"}},
    ...     {"name": {"given": "Mark", "family": "Regner"}},
    ...     {"id": 2, "name": "Faye Raker"},
    ... ]
    >>> pd.json_normalize(data)
        id name.first name.last name.given name.family        name
    0  1.0     Coleen      Volk        NaN         NaN         NaN
    1  NaN        NaN       NaN       Mark      Regner         NaN
    2  2.0        NaN       NaN        NaN         NaN  Faye Raker

    >>> data = [
    ...     {
    ...         "id": 1,
    ...         "name": "Cole Volk",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ...     {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
    ...     {
    ...         "id": 2,
    ...         "name": "Faye Raker",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ... ]
    >>> pd.json_normalize(data, max_level=0)
        id        name                        fitness
    0  1.0   Cole Volk  {'height': 130, 'weight': 60}
    1  NaN    Mark Reg  {'height': 130, 'weight': 60}
    2  2.0  Faye Raker  {'height': 130, 'weight': 60}

    Normalizes nested data up to level 1.

    >>> data = [
    ...     {
    ...         "id": 1,
    ...         "name": "Cole Volk",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ...     {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
    ...     {
    ...         "id": 2,
    ...         "name": "Faye Raker",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ... ]
    >>> pd.json_normalize(data, max_level=1)
        id        name  fitness.height  fitness.weight
    0  1.0   Cole Volk             130              60
    1  NaN    Mark Reg             130              60
    2  2.0  Faye Raker             130              60

    >>> data = [
    ...     {
    ...         "state": "Florida",
    ...         "shortname": "FL",
    ...         "info": {"governor": "Rick Scott"},
    ...         "counties": [
    ...             {"name": "Dade", "population": 12345},
    ...             {"name": "Broward", "population": 40000},
    ...             {"name": "Palm Beach", "population": 60000},
    ...         ],
    ...     },
    ...     {
    ...         "state": "Ohio",
    ...         "shortname": "OH",
    ...         "info": {"governor": "John Kasich"},
    ...         "counties": [
    ...             {"name": "Summit", "population": 1234},
    ...             {"name": "Cuyahoga", "population": 1337},
    ...         ],
    ...     },
    ... ]
    >>> result = pd.json_normalize(
    ...     data, "counties", ["state", "shortname", ["info", "governor"]]
    ... )
    >>> result
             name  population    state shortname info.governor
    0        Dade       12345  Florida        FL    Rick Scott
    1     Broward       40000  Florida        FL    Rick Scott
    2  Palm Beach       60000  Florida        FL    Rick Scott
    3      Summit        1234     Ohio        OH   John Kasich
    4    Cuyahoga        1337     Ohio        OH   John Kasich

    >>> data = {"A": [1, 2]}
    >>> pd.json_normalize(data, "A", record_prefix="Prefix.")
       Prefix.0
    0         1
    1         2

    Returns normalized data with columns prefixed with the given string.
    """

    def _pull_field(
        js: dict[str, Any], spec: list | str, extract_record: bool = False
    ) -> Scalar | Iterable:
        """
        Internal function to pull field.

        Walks ``spec`` (a single key, or a list of keys forming a path) into
        ``js``.  A missing key raises; when ``errors == "ignore"`` and the
        lookup is not for a record path, ``np.nan`` is returned instead.
        """
        result = js
        try:
            if isinstance(spec, list):
                # Descend one key at a time; a None intermediate value cannot
                # be indexed further, so surface it as a missing key.
                for field in spec:
                    if result is None:
                        raise KeyError(field)
                    result = result[field]
            else:
                result = result[spec]
        except KeyError as e:
            # record_path lookups must always succeed, regardless of `errors`.
            if extract_record:
                raise KeyError(
                    f"Key {e} not found. If specifying a record_path, all elements of "
                    f"data should have the path."
                ) from e
            if errors == "ignore":
                return np.nan
            else:
                raise KeyError(
                    f"Key {e} not found. To replace missing values of {e} with "
                    f"np.nan, pass in errors='ignore'"
                ) from e

        return result

    def _pull_records(js: dict[str, Any], spec: list | str) -> list:
        """
        Internal function to pull field for records, and similar to
        _pull_field, but require to return list. And will raise error
        if has non iterable value.
        """
        result = _pull_field(js, spec, extract_record=True)

        # GH 31507 GH 30145, GH 26284 if result is not list, raise TypeError if not
        # null, otherwise return an empty list
        if not isinstance(result, list):
            if pd.isnull(result):
                result = []
            else:
                raise TypeError(
                    f"{js} has non list value {result} for path {spec}. "
                    "Must be list or null."
                )
        return result

    # Coerce `data` to a list of dicts so the extraction below has one shape
    # to deal with.
    if isinstance(data, list) and not data:
        # Empty list of records -> empty frame.
        return DataFrame()
    elif isinstance(data, dict):
        # A bit of a hackjob
        data = [data]
    elif isinstance(data, abc.Iterable) and not isinstance(data, str):
        # GH35923 Fix pd.json_normalize to not skip the first element of a
        # generator input
        data = list(data)
    else:
        raise NotImplementedError

    # check to see if a simple recursive function is possible to
    # improve performance (see #15621) but only for cases such
    # as pd.Dataframe(data) or pd.Dataframe(data, sep)
    if (
        record_path is None
        and meta is None
        and meta_prefix is None
        and record_prefix is None
        and max_level is None
    ):
        return DataFrame(_simple_json_normalize(data, sep=sep))

    if record_path is None:
        # NOTE(review): the generator below yields *lists*, and `any` only
        # tests their truthiness — so this fires for any record with at least
        # one value, whether or not any value is actually a dict.  Harmless
        # here, since nested_to_record is idempotent for flat records, but the
        # isinstance check itself appears to have no effect; confirm before
        # relying on it.
        if any([isinstance(x, dict) for x in y.values()] for y in data):
            # naive normalization, this is idempotent for flat records
            # and potentially will inflate the data considerably for
            # deeply nested structures:
            #  {VeryLong: { b: 1,c:2}} -> {VeryLong.b:1 ,VeryLong.c:@}
            #
            # TODO: handle record value which are lists, at least error
            #       reasonably
            data = nested_to_record(data, sep=sep, max_level=max_level)
        return DataFrame(data)
    elif not isinstance(record_path, list):
        # Normalize a single key into the path-list form used below.
        record_path = [record_path]

    if meta is None:
        meta = []
    elif not isinstance(meta, list):
        meta = [meta]

    # Each meta entry becomes a list-path; single keys are wrapped.
    _meta = [m if isinstance(m, list) else [m] for m in meta]

    # Disastrously inefficient for now
    records: list = []
    # Number of records pulled from each object, used later to repeat the
    # per-object meta values so they line up row-for-row with `records`.
    lengths = []

    meta_vals: DefaultDict = defaultdict(list)
    meta_keys = [sep.join(val) for val in _meta]

    def _recursive_extract(data, path, seen_meta, level: int = 0) -> None:
        """
        Walk `path` into `data`, accumulating records at the deepest level.

        Mutates the enclosing `records`, `lengths` and `meta_vals`;
        `seen_meta` carries meta values already resolved at shallower levels.
        """
        if isinstance(data, dict):
            data = [data]
        if len(path) > 1:
            # Not at the record level yet: capture any meta fields that live
            # at this depth, then descend one path component.
            for obj in data:
                for val, key in zip(_meta, meta_keys):
                    if level + 1 == len(val):
                        seen_meta[key] = _pull_field(obj, val[-1])

                _recursive_extract(obj[path[0]], path[1:], seen_meta, level=level + 1)
        else:
            for obj in data:
                recs = _pull_records(obj, path[0])
                # Flatten each record dict; non-dict records pass through.
                recs = [
                    nested_to_record(r, sep=sep, max_level=max_level)
                    if isinstance(r, dict)
                    else r
                    for r in recs
                ]

                # For repeating the metadata later
                lengths.append(len(recs))
                for val, key in zip(_meta, meta_keys):
                    if level + 1 > len(val):
                        # Meta field lives above this level; use the value
                        # captured on the way down.
                        meta_val = seen_meta[key]
                    else:
                        meta_val = _pull_field(obj, val[level:])
                    meta_vals[key].append(meta_val)
                records.extend(recs)

    _recursive_extract(data, record_path, {}, level=0)

    result = DataFrame(records)

    if record_prefix is not None:
        # Prefix every record-derived column name.
        result = result.rename(columns=lambda x: f"{record_prefix}{x}")

    # Data types, a problem
    for k, v in meta_vals.items():
        if meta_prefix is not None:
            k = meta_prefix + k

        if k in result:
            raise ValueError(
                f"Conflicting metadata name {k}, need distinguishing prefix "
            )
        # GH 37782

        values = np.array(v, dtype=object)

        if values.ndim > 1:
            # GH 37782
            # np.array collapsed equal-length list values into a 2-D array;
            # rebuild a 1-D object array so each entry stays a list.
            values = np.empty((len(v),), dtype=object)
            for i, v in enumerate(v):
                values[i] = v

        # Repeat each object's meta value once per record pulled from it.
        result[k] = values.repeat(lengths)
    return result