Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/io/json/_normalize.py: 11%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

142 statements  

1# --------------------------------------------------------------------- 

2# JSON normalization routines 

3from __future__ import annotations 

4 

5from collections import ( 

6 abc, 

7 defaultdict, 

8) 

9import copy 

10import sys 

11from typing import ( 

12 Any, 

13 DefaultDict, 

14 Iterable, 

15) 

16 

17import numpy as np 

18 

19from pandas._libs.writers import convert_json_to_lines 

20from pandas._typing import ( 

21 IgnoreRaise, 

22 Scalar, 

23) 

24 

25import pandas as pd 

26from pandas import DataFrame 

27 

28 

def convert_to_line_delimits(s: str) -> str:
    """
    Helper function that converts JSON lists to line delimited JSON.

    Parameters
    ----------
    s : str
        A JSON document as a string.

    Returns
    -------
    str
        Line-delimited JSON if `s` is a JSON list ("[...]"); otherwise `s`
        unchanged (only lists can be split into lines).
    """
    # BUG FIX: the original check `not s[0] == "[" and s[-1] == "]"` parsed as
    # `(not s[0] == "[") and (s[-1] == "]")` due to precedence, so a JSON
    # object such as '{"a": 1}' slipped past the guard and had its outer
    # braces stripped.  Also guard the empty string (s[0] would raise).
    if not (s and s[0] == "[" and s[-1] == "]"):
        return s
    # Drop the enclosing brackets; the C helper rewrites the top-level
    # comma separators between records as newlines.
    s = s[1:-1]

    return convert_json_to_lines(s)

40 

41 

42def nested_to_record( 

43 ds, 

44 prefix: str = "", 

45 sep: str = ".", 

46 level: int = 0, 

47 max_level: int | None = None, 

48): 

49 """ 

50 A simplified json_normalize 

51 

52 Converts a nested dict into a flat dict ("record"), unlike json_normalize, 

53 it does not attempt to extract a subset of the data. 

54 

55 Parameters 

56 ---------- 

57 ds : dict or list of dicts 

58 prefix: the prefix, optional, default: "" 

59 sep : str, default '.' 

60 Nested records will generate names separated by sep, 

61 e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar 

62 level: int, optional, default: 0 

63 The number of levels in the json string. 

64 

65 max_level: int, optional, default: None 

66 The max depth to normalize. 

67 

68 Returns 

69 ------- 

70 d - dict or list of dicts, matching `ds` 

71 

72 Examples 

73 -------- 

74 >>> nested_to_record( 

75 ... dict(flat1=1, dict1=dict(c=1, d=2), nested=dict(e=dict(c=1, d=2), d=2)) 

76 ... ) 

77 {\ 

78'flat1': 1, \ 

79'dict1.c': 1, \ 

80'dict1.d': 2, \ 

81'nested.e.c': 1, \ 

82'nested.e.d': 2, \ 

83'nested.d': 2\ 

84} 

85 """ 

86 singleton = False 

87 if isinstance(ds, dict): 

88 ds = [ds] 

89 singleton = True 

90 new_ds = [] 

91 for d in ds: 

92 new_d = copy.deepcopy(d) 

93 for k, v in d.items(): 

94 # each key gets renamed with prefix 

95 if not isinstance(k, str): 

96 k = str(k) 

97 if level == 0: 

98 newkey = k 

99 else: 

100 newkey = prefix + sep + k 

101 

102 # flatten if type is dict and 

103 # current dict level < maximum level provided and 

104 # only dicts gets recurse-flattened 

105 # only at level>1 do we rename the rest of the keys 

106 if not isinstance(v, dict) or ( 

107 max_level is not None and level >= max_level 

108 ): 

109 if level != 0: # so we skip copying for top level, common case 

110 v = new_d.pop(k) 

111 new_d[newkey] = v 

112 continue 

113 

114 v = new_d.pop(k) 

115 new_d.update(nested_to_record(v, newkey, sep, level + 1, max_level)) 

116 new_ds.append(new_d) 

117 

118 if singleton: 

119 return new_ds[0] 

120 return new_ds 

121 

122 

123def _normalise_json( 

124 data: Any, 

125 key_string: str, 

126 normalized_dict: dict[str, Any], 

127 separator: str, 

128) -> dict[str, Any]: 

129 """ 

130 Main recursive function 

131 Designed for the most basic use case of pd.json_normalize(data) 

132 intended as a performance improvement, see #15621 

133 

134 Parameters 

135 ---------- 

136 data : Any 

137 Type dependent on types contained within nested Json 

138 key_string : str 

139 New key (with separator(s) in) for data 

140 normalized_dict : dict 

141 The new normalized/flattened Json dict 

142 separator : str, default '.' 

143 Nested records will generate names separated by sep, 

144 e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar 

145 """ 

146 if isinstance(data, dict): 

147 for key, value in data.items(): 

148 new_key = f"{key_string}{separator}{key}" 

149 

150 if not key_string: 

151 if sys.version_info < (3, 9): 

152 from pandas.util._str_methods import removeprefix 

153 

154 new_key = removeprefix(new_key, separator) 

155 else: 

156 new_key = new_key.removeprefix(separator) 

157 

158 _normalise_json( 

159 data=value, 

160 key_string=new_key, 

161 normalized_dict=normalized_dict, 

162 separator=separator, 

163 ) 

164 else: 

165 normalized_dict[key_string] = data 

166 return normalized_dict 

167 

168 

169def _normalise_json_ordered(data: dict[str, Any], separator: str) -> dict[str, Any]: 

170 """ 

171 Order the top level keys and then recursively go to depth 

172 

173 Parameters 

174 ---------- 

175 data : dict or list of dicts 

176 separator : str, default '.' 

177 Nested records will generate names separated by sep, 

178 e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar 

179 

180 Returns 

181 ------- 

182 dict or list of dicts, matching `normalised_json_object` 

183 """ 

184 top_dict_ = {k: v for k, v in data.items() if not isinstance(v, dict)} 

185 nested_dict_ = _normalise_json( 

186 data={k: v for k, v in data.items() if isinstance(v, dict)}, 

187 key_string="", 

188 normalized_dict={}, 

189 separator=separator, 

190 ) 

191 return {**top_dict_, **nested_dict_} 

192 

193 

194def _simple_json_normalize( 

195 ds: dict | list[dict], 

196 sep: str = ".", 

197) -> dict | list[dict] | Any: 

198 """ 

199 A optimized basic json_normalize 

200 

201 Converts a nested dict into a flat dict ("record"), unlike 

202 json_normalize and nested_to_record it doesn't do anything clever. 

203 But for the most basic use cases it enhances performance. 

204 E.g. pd.json_normalize(data) 

205 

206 Parameters 

207 ---------- 

208 ds : dict or list of dicts 

209 sep : str, default '.' 

210 Nested records will generate names separated by sep, 

211 e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar 

212 

213 Returns 

214 ------- 

215 frame : DataFrame 

216 d - dict or list of dicts, matching `normalised_json_object` 

217 

218 Examples 

219 -------- 

220 >>> _simple_json_normalize( 

221 ... { 

222 ... "flat1": 1, 

223 ... "dict1": {"c": 1, "d": 2}, 

224 ... "nested": {"e": {"c": 1, "d": 2}, "d": 2}, 

225 ... } 

226 ... ) 

227 {\ 

228'flat1': 1, \ 

229'dict1.c': 1, \ 

230'dict1.d': 2, \ 

231'nested.e.c': 1, \ 

232'nested.e.d': 2, \ 

233'nested.d': 2\ 

234} 

235 

236 """ 

237 normalised_json_object = {} 

238 # expect a dictionary, as most jsons are. However, lists are perfectly valid 

239 if isinstance(ds, dict): 

240 normalised_json_object = _normalise_json_ordered(data=ds, separator=sep) 

241 elif isinstance(ds, list): 

242 normalised_json_list = [_simple_json_normalize(row, sep=sep) for row in ds] 

243 return normalised_json_list 

244 return normalised_json_object 

245 

246 

247def json_normalize( 

248 data: dict | list[dict], 

249 record_path: str | list | None = None, 

250 meta: str | list[str | list[str]] | None = None, 

251 meta_prefix: str | None = None, 

252 record_prefix: str | None = None, 

253 errors: IgnoreRaise = "raise", 

254 sep: str = ".", 

255 max_level: int | None = None, 

256) -> DataFrame: 

257 """ 

258 Normalize semi-structured JSON data into a flat table. 

259 

260 Parameters 

261 ---------- 

262 data : dict or list of dicts 

263 Unserialized JSON objects. 

264 record_path : str or list of str, default None 

265 Path in each object to list of records. If not passed, data will be 

266 assumed to be an array of records. 

267 meta : list of paths (str or list of str), default None 

268 Fields to use as metadata for each record in resulting table. 

269 meta_prefix : str, default None 

270 If True, prefix records with dotted (?) path, e.g. foo.bar.field if 

271 meta is ['foo', 'bar']. 

272 record_prefix : str, default None 

273 If True, prefix records with dotted (?) path, e.g. foo.bar.field if 

274 path to records is ['foo', 'bar']. 

275 errors : {'raise', 'ignore'}, default 'raise' 

276 Configures error handling. 

277 

278 * 'ignore' : will ignore KeyError if keys listed in meta are not 

279 always present. 

280 * 'raise' : will raise KeyError if keys listed in meta are not 

281 always present. 

282 sep : str, default '.' 

283 Nested records will generate names separated by sep. 

284 e.g., for sep='.', {'foo': {'bar': 0}} -> foo.bar. 

285 max_level : int, default None 

286 Max number of levels(depth of dict) to normalize. 

287 if None, normalizes all levels. 

288 

289 Returns 

290 ------- 

291 frame : DataFrame 

292 Normalize semi-structured JSON data into a flat table. 

293 

294 Examples 

295 -------- 

296 >>> data = [ 

297 ... {"id": 1, "name": {"first": "Coleen", "last": "Volk"}}, 

298 ... {"name": {"given": "Mark", "family": "Regner"}}, 

299 ... {"id": 2, "name": "Faye Raker"}, 

300 ... ] 

301 >>> pd.json_normalize(data) 

302 id name.first name.last name.given name.family name 

303 0 1.0 Coleen Volk NaN NaN NaN 

304 1 NaN NaN NaN Mark Regner NaN 

305 2 2.0 NaN NaN NaN NaN Faye Raker 

306 

307 >>> data = [ 

308 ... { 

309 ... "id": 1, 

310 ... "name": "Cole Volk", 

311 ... "fitness": {"height": 130, "weight": 60}, 

312 ... }, 

313 ... {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}}, 

314 ... { 

315 ... "id": 2, 

316 ... "name": "Faye Raker", 

317 ... "fitness": {"height": 130, "weight": 60}, 

318 ... }, 

319 ... ] 

320 >>> pd.json_normalize(data, max_level=0) 

321 id name fitness 

322 0 1.0 Cole Volk {'height': 130, 'weight': 60} 

323 1 NaN Mark Reg {'height': 130, 'weight': 60} 

324 2 2.0 Faye Raker {'height': 130, 'weight': 60} 

325 

326 Normalizes nested data up to level 1. 

327 

328 >>> data = [ 

329 ... { 

330 ... "id": 1, 

331 ... "name": "Cole Volk", 

332 ... "fitness": {"height": 130, "weight": 60}, 

333 ... }, 

334 ... {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}}, 

335 ... { 

336 ... "id": 2, 

337 ... "name": "Faye Raker", 

338 ... "fitness": {"height": 130, "weight": 60}, 

339 ... }, 

340 ... ] 

341 >>> pd.json_normalize(data, max_level=1) 

342 id name fitness.height fitness.weight 

343 0 1.0 Cole Volk 130 60 

344 1 NaN Mark Reg 130 60 

345 2 2.0 Faye Raker 130 60 

346 

347 >>> data = [ 

348 ... { 

349 ... "state": "Florida", 

350 ... "shortname": "FL", 

351 ... "info": {"governor": "Rick Scott"}, 

352 ... "counties": [ 

353 ... {"name": "Dade", "population": 12345}, 

354 ... {"name": "Broward", "population": 40000}, 

355 ... {"name": "Palm Beach", "population": 60000}, 

356 ... ], 

357 ... }, 

358 ... { 

359 ... "state": "Ohio", 

360 ... "shortname": "OH", 

361 ... "info": {"governor": "John Kasich"}, 

362 ... "counties": [ 

363 ... {"name": "Summit", "population": 1234}, 

364 ... {"name": "Cuyahoga", "population": 1337}, 

365 ... ], 

366 ... }, 

367 ... ] 

368 >>> result = pd.json_normalize( 

369 ... data, "counties", ["state", "shortname", ["info", "governor"]] 

370 ... ) 

371 >>> result 

372 name population state shortname info.governor 

373 0 Dade 12345 Florida FL Rick Scott 

374 1 Broward 40000 Florida FL Rick Scott 

375 2 Palm Beach 60000 Florida FL Rick Scott 

376 3 Summit 1234 Ohio OH John Kasich 

377 4 Cuyahoga 1337 Ohio OH John Kasich 

378 

379 >>> data = {"A": [1, 2]} 

380 >>> pd.json_normalize(data, "A", record_prefix="Prefix.") 

381 Prefix.0 

382 0 1 

383 1 2 

384 

385 Returns normalized data with columns prefixed with the given string. 

386 """ 

387 

388 def _pull_field( 

389 js: dict[str, Any], spec: list | str, extract_record: bool = False 

390 ) -> Scalar | Iterable: 

391 """Internal function to pull field""" 

392 result = js 

393 try: 

394 if isinstance(spec, list): 

395 for field in spec: 

396 if result is None: 

397 raise KeyError(field) 

398 result = result[field] 

399 else: 

400 result = result[spec] 

401 except KeyError as e: 

402 if extract_record: 

403 raise KeyError( 

404 f"Key {e} not found. If specifying a record_path, all elements of " 

405 f"data should have the path." 

406 ) from e 

407 if errors == "ignore": 

408 return np.nan 

409 else: 

410 raise KeyError( 

411 f"Key {e} not found. To replace missing values of {e} with " 

412 f"np.nan, pass in errors='ignore'" 

413 ) from e 

414 

415 return result 

416 

417 def _pull_records(js: dict[str, Any], spec: list | str) -> list: 

418 """ 

419 Internal function to pull field for records, and similar to 

420 _pull_field, but require to return list. And will raise error 

421 if has non iterable value. 

422 """ 

423 result = _pull_field(js, spec, extract_record=True) 

424 

425 # GH 31507 GH 30145, GH 26284 if result is not list, raise TypeError if not 

426 # null, otherwise return an empty list 

427 if not isinstance(result, list): 

428 if pd.isnull(result): 

429 result = [] 

430 else: 

431 raise TypeError( 

432 f"{js} has non list value {result} for path {spec}. " 

433 "Must be list or null." 

434 ) 

435 return result 

436 

437 if isinstance(data, list) and not data: 

438 return DataFrame() 

439 elif isinstance(data, dict): 

440 # A bit of a hackjob 

441 data = [data] 

442 elif isinstance(data, abc.Iterable) and not isinstance(data, str): 

443 # GH35923 Fix pd.json_normalize to not skip the first element of a 

444 # generator input 

445 data = list(data) 

446 else: 

447 raise NotImplementedError 

448 

449 # check to see if a simple recursive function is possible to 

450 # improve performance (see #15621) but only for cases such 

451 # as pd.Dataframe(data) or pd.Dataframe(data, sep) 

452 if ( 

453 record_path is None 

454 and meta is None 

455 and meta_prefix is None 

456 and record_prefix is None 

457 and max_level is None 

458 ): 

459 return DataFrame(_simple_json_normalize(data, sep=sep)) 

460 

461 if record_path is None: 

462 if any([isinstance(x, dict) for x in y.values()] for y in data): 

463 # naive normalization, this is idempotent for flat records 

464 # and potentially will inflate the data considerably for 

465 # deeply nested structures: 

466 # {VeryLong: { b: 1,c:2}} -> {VeryLong.b:1 ,VeryLong.c:@} 

467 # 

468 # TODO: handle record value which are lists, at least error 

469 # reasonably 

470 data = nested_to_record(data, sep=sep, max_level=max_level) 

471 return DataFrame(data) 

472 elif not isinstance(record_path, list): 

473 record_path = [record_path] 

474 

475 if meta is None: 

476 meta = [] 

477 elif not isinstance(meta, list): 

478 meta = [meta] 

479 

480 _meta = [m if isinstance(m, list) else [m] for m in meta] 

481 

482 # Disastrously inefficient for now 

483 records: list = [] 

484 lengths = [] 

485 

486 meta_vals: DefaultDict = defaultdict(list) 

487 meta_keys = [sep.join(val) for val in _meta] 

488 

489 def _recursive_extract(data, path, seen_meta, level: int = 0) -> None: 

490 if isinstance(data, dict): 

491 data = [data] 

492 if len(path) > 1: 

493 for obj in data: 

494 for val, key in zip(_meta, meta_keys): 

495 if level + 1 == len(val): 

496 seen_meta[key] = _pull_field(obj, val[-1]) 

497 

498 _recursive_extract(obj[path[0]], path[1:], seen_meta, level=level + 1) 

499 else: 

500 for obj in data: 

501 recs = _pull_records(obj, path[0]) 

502 recs = [ 

503 nested_to_record(r, sep=sep, max_level=max_level) 

504 if isinstance(r, dict) 

505 else r 

506 for r in recs 

507 ] 

508 

509 # For repeating the metadata later 

510 lengths.append(len(recs)) 

511 for val, key in zip(_meta, meta_keys): 

512 if level + 1 > len(val): 

513 meta_val = seen_meta[key] 

514 else: 

515 meta_val = _pull_field(obj, val[level:]) 

516 meta_vals[key].append(meta_val) 

517 records.extend(recs) 

518 

519 _recursive_extract(data, record_path, {}, level=0) 

520 

521 result = DataFrame(records) 

522 

523 if record_prefix is not None: 

524 result = result.rename(columns=lambda x: f"{record_prefix}{x}") 

525 

526 # Data types, a problem 

527 for k, v in meta_vals.items(): 

528 if meta_prefix is not None: 

529 k = meta_prefix + k 

530 

531 if k in result: 

532 raise ValueError( 

533 f"Conflicting metadata name {k}, need distinguishing prefix " 

534 ) 

535 result[k] = np.array(v, dtype=object).repeat(lengths) 

536 return result