1# ---------------------------------------------------------------------
2# JSON normalization routines
3from __future__ import annotations
4
5from collections import (
6 abc,
7 defaultdict,
8)
9import copy
10import sys
11from typing import (
12 Any,
13 DefaultDict,
14 Iterable,
15)
16
17import numpy as np
18
19from pandas._libs.writers import convert_json_to_lines
20from pandas._typing import (
21 IgnoreRaise,
22 Scalar,
23)
24
25import pandas as pd
26from pandas import DataFrame
27
28
def convert_to_line_delimits(s: str) -> str:
    """
    Helper function that converts JSON lists to line delimited JSON.

    Parameters
    ----------
    s : str
        A JSON document as a string.

    Returns
    -------
    str
        Line-delimited JSON if ``s`` is a JSON list (``[...]``), otherwise
        ``s`` unchanged.
    """
    # Determine we have a JSON list to turn to lines otherwise just return the
    # json object, only lists can be converted.
    # BUG FIX: the original condition `not s[0] == "[" and s[-1] == "]"`
    # parsed as `(not s[0] == "[") and (s[-1] == "]")` due to operator
    # precedence, so non-list inputs such as '{"a": 1}' fell through and had
    # their first/last characters stripped. Using startswith/endswith inside
    # an explicit group fixes that and also handles the empty string safely.
    if not (s.startswith("[") and s.endswith("]")):
        return s
    # Drop the surrounding brackets; the C helper emits one record per line.
    s = s[1:-1]

    return convert_json_to_lines(s)
40
41
def nested_to_record(
    ds,
    prefix: str = "",
    sep: str = ".",
    level: int = 0,
    max_level: int | None = None,
):
    """
    A simplified json_normalize

    Converts a nested dict into a flat dict ("record"), unlike json_normalize,
    it does not attempt to extract a subset of the data.

    Parameters
    ----------
    ds : dict or list of dicts
    prefix: the prefix, optional, default: ""
    sep : str, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
    level: int, optional, default: 0
        The number of levels in the json string.

    max_level: int, optional, default: None
        The max depth to normalize.

    Returns
    -------
    d - dict or list of dicts, matching `ds`

    Examples
    --------
    >>> nested_to_record(
    ...     dict(flat1=1, dict1=dict(c=1, d=2), nested=dict(e=dict(c=1, d=2), d=2))
    ... )
    {\
'flat1': 1, \
'dict1.c': 1, \
'dict1.d': 2, \
'nested.e.c': 1, \
'nested.e.d': 2, \
'nested.d': 2\
}
    """
    # A bare dict is processed as a one-element list and unwrapped at the end.
    singleton = isinstance(ds, dict)
    records = [ds] if singleton else ds

    flattened = []
    for record in records:
        flat = copy.deepcopy(record)
        for raw_key, value in record.items():
            # Keys are coerced to str so they can be joined with `sep`.
            key = raw_key if isinstance(raw_key, str) else str(raw_key)
            new_key = key if level == 0 else prefix + sep + key

            # Stop descending when the value is not a dict, or when the
            # caller-supplied max_level has been reached.
            if not isinstance(value, dict) or (
                max_level is not None and level >= max_level
            ):
                # Top-level keys keep their original name, so the common
                # flat case avoids any re-insertion work.
                if level != 0:
                    flat[new_key] = flat.pop(key)
                continue

            # Recurse into the nested dict and splice its flattened keys in.
            flat.update(
                nested_to_record(flat.pop(key), new_key, sep, level + 1, max_level)
            )
        flattened.append(flat)

    return flattened[0] if singleton else flattened
121
122
123def _normalise_json(
124 data: Any,
125 key_string: str,
126 normalized_dict: dict[str, Any],
127 separator: str,
128) -> dict[str, Any]:
129 """
130 Main recursive function
131 Designed for the most basic use case of pd.json_normalize(data)
132 intended as a performance improvement, see #15621
133
134 Parameters
135 ----------
136 data : Any
137 Type dependent on types contained within nested Json
138 key_string : str
139 New key (with separator(s) in) for data
140 normalized_dict : dict
141 The new normalized/flattened Json dict
142 separator : str, default '.'
143 Nested records will generate names separated by sep,
144 e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
145 """
146 if isinstance(data, dict):
147 for key, value in data.items():
148 new_key = f"{key_string}{separator}{key}"
149
150 if not key_string:
151 if sys.version_info < (3, 9):
152 from pandas.util._str_methods import removeprefix
153
154 new_key = removeprefix(new_key, separator)
155 else:
156 new_key = new_key.removeprefix(separator)
157
158 _normalise_json(
159 data=value,
160 key_string=new_key,
161 normalized_dict=normalized_dict,
162 separator=separator,
163 )
164 else:
165 normalized_dict[key_string] = data
166 return normalized_dict
167
168
def _normalise_json_ordered(data: dict[str, Any], separator: str) -> dict[str, Any]:
    """
    Order the top level keys and then recursively go to depth

    Parameters
    ----------
    data : dict or list of dicts
    separator : str, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar

    Returns
    -------
    dict or list of dicts, matching `normalised_json_object`
    """
    # Split the record once: scalar (non-dict) values keep their top-level
    # keys as-is, while dict values are flattened recursively.
    flat_items: dict[str, Any] = {}
    nested_items: dict[str, Any] = {}
    for key, value in data.items():
        target = nested_items if isinstance(value, dict) else flat_items
        target[key] = value

    normalized_nested = _normalise_json(
        data=nested_items,
        key_string="",
        normalized_dict={},
        separator=separator,
    )
    # Scalars first, then the flattened nested keys (same ordering as before).
    return {**flat_items, **normalized_nested}
192
193
def _simple_json_normalize(
    ds: dict | list[dict],
    sep: str = ".",
) -> dict | list[dict] | Any:
    """
    A optimized basic json_normalize

    Converts a nested dict into a flat dict ("record"), unlike
    json_normalize and nested_to_record it doesn't do anything clever.
    But for the most basic use cases it enhances performance.
    E.g. pd.json_normalize(data)

    Parameters
    ----------
    ds : dict or list of dicts
    sep : str, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar

    Returns
    -------
    frame : DataFrame
    d - dict or list of dicts, matching `normalised_json_object`

    Examples
    --------
    >>> _simple_json_normalize(
    ...     {
    ...         "flat1": 1,
    ...         "dict1": {"c": 1, "d": 2},
    ...         "nested": {"e": {"c": 1, "d": 2}, "d": 2},
    ...     }
    ... )
    {\
'flat1': 1, \
'dict1.c': 1, \
'dict1.d': 2, \
'nested.e.c': 1, \
'nested.e.d': 2, \
'nested.d': 2\
}

    """
    # Most JSON payloads are a single object: flatten it directly.
    if isinstance(ds, dict):
        return _normalise_json_ordered(data=ds, separator=sep)
    # A list of records is normalized element by element.
    if isinstance(ds, list):
        return [_simple_json_normalize(row, sep=sep) for row in ds]
    # Any other input yields an empty record, matching the original fallback.
    return {}
245
246
def json_normalize(
    data: dict | list[dict],
    record_path: str | list | None = None,
    meta: str | list[str | list[str]] | None = None,
    meta_prefix: str | None = None,
    record_prefix: str | None = None,
    errors: IgnoreRaise = "raise",
    sep: str = ".",
    max_level: int | None = None,
) -> DataFrame:
    """
    Normalize semi-structured JSON data into a flat table.

    Parameters
    ----------
    data : dict or list of dicts
        Unserialized JSON objects.
    record_path : str or list of str, default None
        Path in each object to list of records. If not passed, data will be
        assumed to be an array of records.
    meta : list of paths (str or list of str), default None
        Fields to use as metadata for each record in resulting table.
    meta_prefix : str, default None
        If True, prefix records with dotted (?) path, e.g. foo.bar.field if
        meta is ['foo', 'bar'].
    record_prefix : str, default None
        If True, prefix records with dotted (?) path, e.g. foo.bar.field if
        path to records is ['foo', 'bar'].
    errors : {'raise', 'ignore'}, default 'raise'
        Configures error handling.

        * 'ignore' : will ignore KeyError if keys listed in meta are not
          always present.
        * 'raise' : will raise KeyError if keys listed in meta are not
          always present.
    sep : str, default '.'
        Nested records will generate names separated by sep.
        e.g., for sep='.', {'foo': {'bar': 0}} -> foo.bar.
    max_level : int, default None
        Max number of levels(depth of dict) to normalize.
        if None, normalizes all levels.

    Returns
    -------
    frame : DataFrame
    Normalize semi-structured JSON data into a flat table.

    Examples
    --------
    >>> data = [
    ...     {"id": 1, "name": {"first": "Coleen", "last": "Volk"}},
    ...     {"name": {"given": "Mark", "family": "Regner"}},
    ...     {"id": 2, "name": "Faye Raker"},
    ... ]
    >>> pd.json_normalize(data)
        id name.first name.last name.given name.family        name
    0  1.0     Coleen      Volk        NaN         NaN         NaN
    1  NaN        NaN       NaN       Mark      Regner         NaN
    2  2.0        NaN       NaN        NaN         NaN  Faye Raker

    >>> data = [
    ...     {
    ...         "id": 1,
    ...         "name": "Cole Volk",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ...     {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
    ...     {
    ...         "id": 2,
    ...         "name": "Faye Raker",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ... ]
    >>> pd.json_normalize(data, max_level=0)
        id        name                        fitness
    0  1.0   Cole Volk  {'height': 130, 'weight': 60}
    1  NaN    Mark Reg  {'height': 130, 'weight': 60}
    2  2.0  Faye Raker  {'height': 130, 'weight': 60}

    Normalizes nested data up to level 1.

    >>> data = [
    ...     {
    ...         "id": 1,
    ...         "name": "Cole Volk",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ...     {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
    ...     {
    ...         "id": 2,
    ...         "name": "Faye Raker",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ... ]
    >>> pd.json_normalize(data, max_level=1)
        id        name  fitness.height  fitness.weight
    0  1.0   Cole Volk             130              60
    1  NaN    Mark Reg             130              60
    2  2.0  Faye Raker             130              60

    >>> data = [
    ...     {
    ...         "state": "Florida",
    ...         "shortname": "FL",
    ...         "info": {"governor": "Rick Scott"},
    ...         "counties": [
    ...             {"name": "Dade", "population": 12345},
    ...             {"name": "Broward", "population": 40000},
    ...             {"name": "Palm Beach", "population": 60000},
    ...         ],
    ...     },
    ...     {
    ...         "state": "Ohio",
    ...         "shortname": "OH",
    ...         "info": {"governor": "John Kasich"},
    ...         "counties": [
    ...             {"name": "Summit", "population": 1234},
    ...             {"name": "Cuyahoga", "population": 1337},
    ...         ],
    ...     },
    ... ]
    >>> result = pd.json_normalize(
    ...     data, "counties", ["state", "shortname", ["info", "governor"]]
    ... )
    >>> result
             name  population    state shortname info.governor
    0        Dade       12345  Florida        FL    Rick Scott
    1     Broward       40000  Florida        FL    Rick Scott
    2  Palm Beach       60000  Florida        FL    Rick Scott
    3      Summit        1234     Ohio        OH   John Kasich
    4    Cuyahoga        1337     Ohio        OH   John Kasich

    >>> data = {"A": [1, 2]}
    >>> pd.json_normalize(data, "A", record_prefix="Prefix.")
        Prefix.0
    0          1
    1          2

    Returns normalized data with columns prefixed with the given string.
    """

    def _pull_field(
        js: dict[str, Any], spec: list | str, extract_record: bool = False
    ) -> Scalar | Iterable:
        """Internal function to pull field"""
        # Walk `spec` (a single key or a path of keys) into `js`. A missing
        # key raises unless errors='ignore' and we're pulling metadata, in
        # which case np.nan is substituted.
        result = js
        try:
            if isinstance(spec, list):
                for field in spec:
                    if result is None:
                        # a None along the path means the key can't exist
                        raise KeyError(field)
                    result = result[field]
            else:
                result = result[spec]
        except KeyError as e:
            if extract_record:
                # record_path lookups are never optional, even with
                # errors='ignore'
                raise KeyError(
                    f"Key {e} not found. If specifying a record_path, all elements of "
                    f"data should have the path."
                ) from e
            if errors == "ignore":
                return np.nan
            else:
                raise KeyError(
                    f"Key {e} not found. To replace missing values of {e} with "
                    f"np.nan, pass in errors='ignore'"
                ) from e

        return result

    def _pull_records(js: dict[str, Any], spec: list | str) -> list:
        """
        Internal function to pull field for records, and similar to
        _pull_field, but require to return list. And will raise error
        if has non iterable value.
        """
        result = _pull_field(js, spec, extract_record=True)

        # GH 31507 GH 30145, GH 26284 if result is not list, raise TypeError if not
        # null, otherwise return an empty list
        if not isinstance(result, list):
            if pd.isnull(result):
                result = []
            else:
                raise TypeError(
                    f"{js} has non list value {result} for path {spec}. "
                    "Must be list or null."
                )
        return result

    # Normalize `data` itself to a list of dicts: empty list short-circuits,
    # a single dict is wrapped, and any other non-string iterable (including
    # generators, GH35923) is materialized so it can be traversed twice.
    if isinstance(data, list) and not data:
        return DataFrame()
    elif isinstance(data, dict):
        # A bit of a hackjob
        data = [data]
    elif isinstance(data, abc.Iterable) and not isinstance(data, str):
        # GH35923 Fix pd.json_normalize to not skip the first element of a
        # generator input
        data = list(data)
    else:
        raise NotImplementedError

    # check to see if a simple recursive function is possible to
    # improve performance (see #15621) but only for cases such
    # as pd.Dataframe(data) or pd.Dataframe(data, sep)
    if (
        record_path is None
        and meta is None
        and meta_prefix is None
        and record_prefix is None
        and max_level is None
    ):
        return DataFrame(_simple_json_normalize(data, sep=sep))

    if record_path is None:
        # NOTE(review): the inner list here is truthy whenever a record has
        # ANY values (regardless of whether one is a dict), so this condition
        # effectively tests "any record is non-empty". nested_to_record is
        # idempotent for flat records, so the result is still correct —
        # confirm whether the dict check was intended to gate the call.
        if any([isinstance(x, dict) for x in y.values()] for y in data):
            # naive normalization, this is idempotent for flat records
            # and potentially will inflate the data considerably for
            # deeply nested structures:
            #  {VeryLong: { b: 1,c:2}} -> {VeryLong.b:1 ,VeryLong.c:@}
            #
            # TODO: handle record value which are lists, at least error
            #       reasonably
            data = nested_to_record(data, sep=sep, max_level=max_level)
        return DataFrame(data)
    elif not isinstance(record_path, list):
        record_path = [record_path]

    if meta is None:
        meta = []
    elif not isinstance(meta, list):
        meta = [meta]

    # Each meta entry becomes a path (list of keys); scalar entries are
    # wrapped so all entries are uniform below.
    _meta = [m if isinstance(m, list) else [m] for m in meta]

    # Disastrously inefficient for now
    records: list = []
    lengths = []

    # meta_vals collects one value per record-group for every meta key;
    # meta_keys are the joined column names (e.g. "info.governor").
    meta_vals: DefaultDict = defaultdict(list)
    meta_keys = [sep.join(val) for val in _meta]

    def _recursive_extract(data, path, seen_meta, level: int = 0) -> None:
        # Descend along `path`, capturing meta values whose full path is
        # consumed at this depth into `seen_meta`, then flatten the records
        # found at the final path element.
        if isinstance(data, dict):
            data = [data]
        if len(path) > 1:
            for obj in data:
                for val, key in zip(_meta, meta_keys):
                    if level + 1 == len(val):
                        # the meta path ends here: remember it for the
                        # records extracted deeper down
                        seen_meta[key] = _pull_field(obj, val[-1])

                _recursive_extract(obj[path[0]], path[1:], seen_meta, level=level + 1)
        else:
            for obj in data:
                recs = _pull_records(obj, path[0])
                # flatten each record dict; non-dict records pass through
                recs = [
                    nested_to_record(r, sep=sep, max_level=max_level)
                    if isinstance(r, dict)
                    else r
                    for r in recs
                ]

                # For repeating the metadata later
                lengths.append(len(recs))
                for val, key in zip(_meta, meta_keys):
                    if level + 1 > len(val):
                        # meta value was already captured at a shallower level
                        meta_val = seen_meta[key]
                    else:
                        # remaining portion of the meta path is resolved
                        # relative to the current object
                        meta_val = _pull_field(obj, val[level:])
                    meta_vals[key].append(meta_val)
                records.extend(recs)

    _recursive_extract(data, record_path, {}, level=0)

    result = DataFrame(records)

    if record_prefix is not None:
        result = result.rename(columns=lambda x: f"{record_prefix}{x}")

    # Data types, a problem
    for k, v in meta_vals.items():
        if meta_prefix is not None:
            k = meta_prefix + k

        if k in result:
            raise ValueError(
                f"Conflicting metadata name {k}, need distinguishing prefix "
            )
        # each meta value is repeated once per record in its group; dtype
        # object keeps heterogeneous meta values intact
        result[k] = np.array(v, dtype=object).repeat(lengths)
    return result