Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pandas/io/json/_normalize.py: 10%


145 statements  

# ---------------------------------------------------------------------
# JSON normalization routines
from __future__ import annotations

from collections import (
    abc,
    defaultdict,
)
import copy
from typing import (
    TYPE_CHECKING,
    Any,
    DefaultDict,
)

import numpy as np

from pandas._libs.writers import convert_json_to_lines

import pandas as pd
from pandas import DataFrame

if TYPE_CHECKING:
    from collections.abc import Iterable

    from pandas._typing import (
        IgnoreRaise,
        Scalar,
    )


def convert_to_line_delimits(s: str) -> str:
    """
    Helper function that converts JSON lists to line-delimited JSON.
    """
    # Determine whether we have a JSON list to turn into lines; otherwise just
    # return the JSON object unchanged, since only lists can be converted.
    if not (s[0] == "[" and s[-1] == "]"):
        return s
    s = s[1:-1]

    return convert_json_to_lines(s)
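
# A minimal usage sketch of the helper above (assuming the input is a JSON
# string such as the one produced by DataFrame.to_json(orient="records")):
#
# >>> convert_to_line_delimits('[{"a":1},{"b":2}]')
# roughly '{"a":1}\n{"b":2}' -- the list brackets are stripped and the
# record-separating commas become newlines, one object per line.
# >>> convert_to_line_delimits('{"a":1}')
# a non-list object should pass through unchanged.
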

def nested_to_record(
    ds,
    prefix: str = "",
    sep: str = ".",
    level: int = 0,
    max_level: int | None = None,
):
    """
    A simplified json_normalize.

    Converts a nested dict into a flat dict ("record"); unlike json_normalize,
    it does not attempt to extract a subset of the data.

    Parameters
    ----------
    ds : dict or list of dicts
    prefix : str, optional, default: ""
        The prefix prepended to nested keys.
    sep : str, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', {'foo': {'bar': 0}} -> foo.bar
    level : int, optional, default: 0
        The current recursion depth (used internally).
    max_level : int, optional, default: None
        The max depth to normalize.

    Returns
    -------
    d - dict or list of dicts, matching `ds`

    Examples
    --------
    >>> nested_to_record(
    ...     dict(flat1=1, dict1=dict(c=1, d=2), nested=dict(e=dict(c=1, d=2), d=2))
    ... )
    {\
'flat1': 1, \
'dict1.c': 1, \
'dict1.d': 2, \
'nested.e.c': 1, \
'nested.e.d': 2, \
'nested.d': 2\
}
    """
    singleton = False
    if isinstance(ds, dict):
        ds = [ds]
        singleton = True
    new_ds = []
    for d in ds:
        new_d = copy.deepcopy(d)
        for k, v in d.items():
            # each key gets renamed with prefix
            if not isinstance(k, str):
                k = str(k)
            if level == 0:
                newkey = k
            else:
                newkey = prefix + sep + k

            # flatten if type is dict and
            # current dict level < maximum level provided and
            # only dicts get recurse-flattened
            # only at level>1 do we rename the rest of the keys
            if not isinstance(v, dict) or (
                max_level is not None and level >= max_level
            ):
                if level != 0:  # so we skip copying for top level, common case
                    v = new_d.pop(k)
                    new_d[newkey] = v
                continue

            v = new_d.pop(k)
            new_d.update(nested_to_record(v, newkey, sep, level + 1, max_level))
        new_ds.append(new_d)

    if singleton:
        return new_ds[0]
    return new_ds
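
# A minimal sketch (not from the original docstring) of how ``max_level`` caps
# the flattening depth in ``nested_to_record``: dicts below the cap are kept
# as-is under the joined key.
#
# >>> nested_to_record({"a": {"b": {"c": 1}}})
# {'a.b.c': 1}
# >>> nested_to_record({"a": {"b": {"c": 1}}}, max_level=1)
# {'a.b': {'c': 1}}
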

def _normalise_json(
    data: Any,
    key_string: str,
    normalized_dict: dict[str, Any],
    separator: str,
) -> dict[str, Any]:
    """
    Main recursive function.
    Designed for the most basic use case of pd.json_normalize(data),
    intended as a performance improvement, see #15621.

    Parameters
    ----------
    data : Any
        Type dependent on types contained within nested JSON
    key_string : str
        New key (with separator(s) in) for data
    normalized_dict : dict
        The new normalized/flattened JSON dict
    separator : str, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', {'foo': {'bar': 0}} -> foo.bar
    """
    if isinstance(data, dict):
        for key, value in data.items():
            new_key = f"{key_string}{separator}{key}"

            if not key_string:
                new_key = new_key.removeprefix(separator)

            _normalise_json(
                data=value,
                key_string=new_key,
                normalized_dict=normalized_dict,
                separator=separator,
            )
    else:
        normalized_dict[key_string] = data
    return normalized_dict
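
# A minimal sketch (assuming the call shape used by ``_normalise_json_ordered``
# below: empty ``key_string`` and a fresh accumulator dict). Leading separators
# are stripped from top-level keys and nested keys are joined with the separator.
#
# >>> _normalise_json({"a": {"b": 1}, "c": 2}, key_string="",
# ...                 normalized_dict={}, separator=".")
# {'a.b': 1, 'c': 2}
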

def _normalise_json_ordered(data: dict[str, Any], separator: str) -> dict[str, Any]:
    """
    Order the top level keys and then recursively go to depth.

    Parameters
    ----------
    data : dict
    separator : str, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', {'foo': {'bar': 0}} -> foo.bar

    Returns
    -------
    dict
        The flattened JSON object, with un-nested top-level keys first.
    """
    top_dict_ = {k: v for k, v in data.items() if not isinstance(v, dict)}
    nested_dict_ = _normalise_json(
        data={k: v for k, v in data.items() if isinstance(v, dict)},
        key_string="",
        normalized_dict={},
        separator=separator,
    )
    return {**top_dict_, **nested_dict_}
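
# A minimal sketch (not from the original docstring): scalar top-level keys are
# kept in front, then the nested part is flattened recursively.
#
# >>> _normalise_json_ordered({"flat": 0, "nested": {"a": 1}}, separator=".")
# {'flat': 0, 'nested.a': 1}
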

def _simple_json_normalize(
    ds: dict | list[dict],
    sep: str = ".",
) -> dict | list[dict] | Any:
    """
    An optimized, basic json_normalize.

    Converts a nested dict into a flat dict ("record"); unlike
    json_normalize and nested_to_record it doesn't do anything clever,
    but for the most basic use cases it enhances performance,
    e.g. pd.json_normalize(data).

    Parameters
    ----------
    ds : dict or list of dicts
    sep : str, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', {'foo': {'bar': 0}} -> foo.bar

    Returns
    -------
    dict or list of dicts, matching `ds`

    Examples
    --------
    >>> _simple_json_normalize(
    ...     {
    ...         "flat1": 1,
    ...         "dict1": {"c": 1, "d": 2},
    ...         "nested": {"e": {"c": 1, "d": 2}, "d": 2},
    ...     }
    ... )
    {\
'flat1': 1, \
'dict1.c': 1, \
'dict1.d': 2, \
'nested.e.c': 1, \
'nested.e.d': 2, \
'nested.d': 2\
}

    """
    normalised_json_object = {}
    # expect a dictionary, as most jsons are. However, lists are perfectly valid
    if isinstance(ds, dict):
        normalised_json_object = _normalise_json_ordered(data=ds, separator=sep)
    elif isinstance(ds, list):
        normalised_json_list = [_simple_json_normalize(row, sep=sep) for row in ds]
        return normalised_json_list
    return normalised_json_object
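
# A minimal sketch (not from the original docstring): a list input is normalised
# row by row, which is the shape ``json_normalize`` feeds straight into
# ``DataFrame`` on its fast path.
#
# >>> _simple_json_normalize([{"a": 1, "b": {"c": 2}}, {"a": 3, "b": {"c": 4}}])
# [{'a': 1, 'b.c': 2}, {'a': 3, 'b.c': 4}]
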

def json_normalize(
    data: dict | list[dict],
    record_path: str | list | None = None,
    meta: str | list[str | list[str]] | None = None,
    meta_prefix: str | None = None,
    record_prefix: str | None = None,
    errors: IgnoreRaise = "raise",
    sep: str = ".",
    max_level: int | None = None,
) -> DataFrame:
    """
    Normalize semi-structured JSON data into a flat table.

    Parameters
    ----------
    data : dict or list of dicts
        Unserialized JSON objects.
    record_path : str or list of str, default None
        Path in each object to list of records. If not passed, data will be
        assumed to be an array of records.
    meta : list of paths (str or list of str), default None
        Fields to use as metadata for each record in resulting table.
    meta_prefix : str, default None
        If not None, prefix records with dotted path, e.g. foo.bar.field if
        meta is ['foo', 'bar'].
    record_prefix : str, default None
        If not None, prefix records with dotted path, e.g. foo.bar.field if
        path to records is ['foo', 'bar'].
    errors : {'raise', 'ignore'}, default 'raise'
        Configures error handling.

        * 'ignore' : will ignore KeyError if keys listed in meta are not
          always present.
        * 'raise' : will raise KeyError if keys listed in meta are not
          always present.
    sep : str, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', {'foo': {'bar': 0}} -> foo.bar.
    max_level : int, default None
        Max number of levels (depth of dict) to normalize.
        If None, normalizes all levels.

    Returns
    -------
    frame : DataFrame
        Normalized semi-structured JSON data as a flat table.

    Examples
    --------
    >>> data = [
    ...     {"id": 1, "name": {"first": "Coleen", "last": "Volk"}},
    ...     {"name": {"given": "Mark", "family": "Regner"}},
    ...     {"id": 2, "name": "Faye Raker"},
    ... ]
    >>> pd.json_normalize(data)
        id name.first name.last name.given name.family        name
    0  1.0     Coleen      Volk        NaN         NaN         NaN
    1  NaN        NaN       NaN       Mark      Regner         NaN
    2  2.0        NaN       NaN        NaN         NaN  Faye Raker

    >>> data = [
    ...     {
    ...         "id": 1,
    ...         "name": "Cole Volk",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ...     {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
    ...     {
    ...         "id": 2,
    ...         "name": "Faye Raker",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ... ]
    >>> pd.json_normalize(data, max_level=0)
        id        name                        fitness
    0  1.0   Cole Volk  {'height': 130, 'weight': 60}
    1  NaN    Mark Reg  {'height': 130, 'weight': 60}
    2  2.0  Faye Raker  {'height': 130, 'weight': 60}

    Normalizes nested data up to level 1.

    >>> data = [
    ...     {
    ...         "id": 1,
    ...         "name": "Cole Volk",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ...     {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
    ...     {
    ...         "id": 2,
    ...         "name": "Faye Raker",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ... ]
    >>> pd.json_normalize(data, max_level=1)
        id        name  fitness.height  fitness.weight
    0  1.0   Cole Volk             130              60
    1  NaN    Mark Reg             130              60
    2  2.0  Faye Raker             130              60

    >>> data = [
    ...     {
    ...         "state": "Florida",
    ...         "shortname": "FL",
    ...         "info": {"governor": "Rick Scott"},
    ...         "counties": [
    ...             {"name": "Dade", "population": 12345},
    ...             {"name": "Broward", "population": 40000},
    ...             {"name": "Palm Beach", "population": 60000},
    ...         ],
    ...     },
    ...     {
    ...         "state": "Ohio",
    ...         "shortname": "OH",
    ...         "info": {"governor": "John Kasich"},
    ...         "counties": [
    ...             {"name": "Summit", "population": 1234},
    ...             {"name": "Cuyahoga", "population": 1337},
    ...         ],
    ...     },
    ... ]
    >>> result = pd.json_normalize(
    ...     data, "counties", ["state", "shortname", ["info", "governor"]]
    ... )
    >>> result
             name  population    state shortname info.governor
    0        Dade       12345  Florida        FL    Rick Scott
    1     Broward       40000  Florida        FL    Rick Scott
    2  Palm Beach       60000  Florida        FL    Rick Scott
    3      Summit        1234     Ohio        OH   John Kasich
    4    Cuyahoga        1337     Ohio        OH   John Kasich

    >>> data = {"A": [1, 2]}
    >>> pd.json_normalize(data, "A", record_prefix="Prefix.")
       Prefix.0
    0         1
    1         2

    Returns normalized data with columns prefixed with the given string.
    """

    def _pull_field(
        js: dict[str, Any], spec: list | str, extract_record: bool = False
    ) -> Scalar | Iterable:
        """Internal function to pull field"""
        result = js
        try:
            if isinstance(spec, list):
                for field in spec:
                    if result is None:
                        raise KeyError(field)
                    result = result[field]
            else:
                result = result[spec]
        except KeyError as e:
            if extract_record:
                raise KeyError(
                    f"Key {e} not found. If specifying a record_path, all elements of "
                    "data should have the path."
                ) from e
            if errors == "ignore":
                return np.nan
            else:
                raise KeyError(
                    f"Key {e} not found. To replace missing values of {e} with "
                    "np.nan, pass in errors='ignore'"
                ) from e

        return result

    def _pull_records(js: dict[str, Any], spec: list | str) -> list:
        """
        Internal function to pull the field for records; similar to
        _pull_field, but the result is required to be a list, and an
        error is raised for non-iterable, non-null values.
        """
        result = _pull_field(js, spec, extract_record=True)

        # GH 31507, GH 30145, GH 26284: if result is not a list, raise TypeError
        # if not null, otherwise return an empty list
        if not isinstance(result, list):
            if pd.isnull(result):
                result = []
            else:
                raise TypeError(
                    f"{js} has non list value {result} for path {spec}. "
                    "Must be list or null."
                )
        return result
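
    # A minimal sketch (not part of the original source) of what these helpers
    # do, assuming the record/meta shapes used in the docstring above:
    #
    #   _pull_field({"info": {"governor": "Rick Scott"}}, ["info", "governor"])
    #       -> "Rick Scott"
    #   _pull_records({"counties": [{"name": "Dade"}]}, "counties")
    #       -> [{"name": "Dade"}]
    #   _pull_records({"counties": None}, "counties")
    #       -> []   (null record values become an empty list)
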

    if isinstance(data, list) and not data:
        return DataFrame()
    elif isinstance(data, dict):
        # A bit of a hackjob
        data = [data]
    elif isinstance(data, abc.Iterable) and not isinstance(data, str):
        # GH35923 Fix pd.json_normalize to not skip the first element of a
        # generator input
        data = list(data)
    else:
        raise NotImplementedError

    # check to see if a simple recursive function is possible to
    # improve performance (see #15621) but only for cases such
    # as pd.json_normalize(data) or pd.json_normalize(data, sep=sep)
    if (
        record_path is None
        and meta is None
        and meta_prefix is None
        and record_prefix is None
        and max_level is None
    ):
        return DataFrame(_simple_json_normalize(data, sep=sep))
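
    # A rough equivalence sketch (not part of the original source): on this fast
    # path the result is just DataFrame(_simple_json_normalize(data, sep=sep)), so
    #
    #   pd.json_normalize([{"a": 1, "b": {"c": 2}}])
    #
    # should match
    #
    #   pd.DataFrame([{"a": 1, "b.c": 2}])
    #
    # Passing record_path, meta, a prefix, or max_level skips this shortcut.
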

    if record_path is None:
        if any([isinstance(x, dict) for x in y.values()] for y in data):
            # naive normalization, this is idempotent for flat records
            # and potentially will inflate the data considerably for
            # deeply nested structures:
            #  {VeryLong: {b: 1, c: 2}} -> {VeryLong.b: 1, VeryLong.c: 2}
            #
            # TODO: handle record values which are lists, at least error
            #       reasonably
            data = nested_to_record(data, sep=sep, max_level=max_level)
        return DataFrame(data)
    elif not isinstance(record_path, list):
        record_path = [record_path]

    if meta is None:
        meta = []
    elif not isinstance(meta, list):
        meta = [meta]

    _meta = [m if isinstance(m, list) else [m] for m in meta]

    # Disastrously inefficient for now
    records: list = []
    lengths = []

    meta_vals: DefaultDict = defaultdict(list)
    meta_keys = [sep.join(val) for val in _meta]

    def _recursive_extract(data, path, seen_meta, level: int = 0) -> None:
        if isinstance(data, dict):
            data = [data]
        if len(path) > 1:
            for obj in data:
                for val, key in zip(_meta, meta_keys):
                    if level + 1 == len(val):
                        seen_meta[key] = _pull_field(obj, val[-1])

                _recursive_extract(obj[path[0]], path[1:], seen_meta, level=level + 1)
        else:
            for obj in data:
                recs = _pull_records(obj, path[0])
                recs = [
                    nested_to_record(r, sep=sep, max_level=max_level)
                    if isinstance(r, dict)
                    else r
                    for r in recs
                ]

                # For repeating the metadata later
                lengths.append(len(recs))
                for val, key in zip(_meta, meta_keys):
                    if level + 1 > len(val):
                        meta_val = seen_meta[key]
                    else:
                        meta_val = _pull_field(obj, val[level:])
                    meta_vals[key].append(meta_val)
                records.extend(recs)

    _recursive_extract(data, record_path, {}, level=0)

    result = DataFrame(records)

    if record_prefix is not None:
        result = result.rename(columns=lambda x: f"{record_prefix}{x}")

    # Data types, a problem
    for k, v in meta_vals.items():
        if meta_prefix is not None:
            k = meta_prefix + k

        if k in result:
            raise ValueError(
                f"Conflicting metadata name {k}, need distinguishing prefix "
            )
        # GH 37782

        values = np.array(v, dtype=object)

        if values.ndim > 1:
            # GH 37782
            values = np.empty((len(v),), dtype=object)
            for i, v in enumerate(v):
                values[i] = v

        result[k] = values.repeat(lengths)
    return result
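
# A minimal end-to-end sketch (illustrative only, not part of the pandas source):
# combining record_path with meta and errors="ignore". The data and the
# approximate output shown here are made up for illustration; the missing
# "info" block in the second object becomes NaN rather than raising.
#
# >>> data = [
# ...     {"team": "A", "info": {"city": "Oslo"},
# ...      "players": [{"name": "Ann"}, {"name": "Bo"}]},
# ...     {"team": "B", "players": [{"name": "Cy"}]},
# ... ]
# >>> pd.json_normalize(data, record_path="players",
# ...                   meta=["team", ["info", "city"]], errors="ignore")
#   name team info.city
# 0  Ann    A      Oslo
# 1   Bo    A      Oslo
# 2   Cy    B       NaN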