1# ---------------------------------------------------------------------
2# JSON normalization routines
3from __future__ import annotations
4
5from collections import (
6 abc,
7 defaultdict,
8)
9import copy
10import sys
11from typing import (
12 Any,
13 DefaultDict,
14 Iterable,
15)
16
17import numpy as np
18
19from pandas._libs.writers import convert_json_to_lines
20from pandas._typing import (
21 IgnoreRaise,
22 Scalar,
23)
24
25import pandas as pd
26from pandas import DataFrame
27
28
def convert_to_line_delimits(s: str) -> str:
    """
    Helper function that converts JSON lists to line delimited JSON.

    Parameters
    ----------
    s : str
        A JSON document as a string.

    Returns
    -------
    str
        Line-delimited JSON if ``s`` is a JSON list (``[...]``), otherwise
        ``s`` unchanged.
    """
    # Determine we have a JSON list to turn to lines otherwise just return the
    # json object, only lists can be converted.
    # BUG FIX: the original condition `not s[0] == "[" and s[-1] == "]"`
    # parsed as `(not s[0] == "[") and (s[-1] == "]")` due to operator
    # precedence, so non-list inputs such as '{"a": 1}' fell through and had
    # their first/last characters stripped. Using startswith/endswith inside
    # an explicit group fixes that and also handles the empty string safely.
    if not (s.startswith("[") and s.endswith("]")):
        return s
    # Drop the surrounding brackets; the C helper emits one record per line.
    s = s[1:-1]

    return convert_json_to_lines(s)
40
41
def nested_to_record(
    ds,
    prefix: str = "",
    sep: str = ".",
    level: int = 0,
    max_level: int | None = None,
):
    """
    A simplified json_normalize

    Converts a nested dict into a flat dict ("record"), unlike json_normalize,
    it does not attempt to extract a subset of the data.

    Parameters
    ----------
    ds : dict or list of dicts
    prefix: the prefix, optional, default: ""
    sep : str, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
    level: int, optional, default: 0
        The number of levels in the json string.

    max_level: int, optional, default: None
        The max depth to normalize.

    Returns
    -------
    d - dict or list of dicts, matching `ds`

    Examples
    --------
    >>> nested_to_record(
    ...     dict(flat1=1, dict1=dict(c=1, d=2), nested=dict(e=dict(c=1, d=2), d=2))
    ... )
    {\
'flat1': 1, \
'dict1.c': 1, \
'dict1.d': 2, \
'nested.e.c': 1, \
'nested.e.d': 2, \
'nested.d': 2\
}
    """
    # A bare dict is processed as a one-element list and unwrapped at the end.
    singleton = isinstance(ds, dict)
    records = [ds] if singleton else ds

    flattened = []
    for record in records:
        flat = copy.deepcopy(record)
        for raw_key, value in record.items():
            # Keys are coerced to str so they can be joined with `sep`.
            key = raw_key if isinstance(raw_key, str) else str(raw_key)
            new_key = key if level == 0 else prefix + sep + key

            # Stop descending when the value is not a dict, or when the
            # caller-supplied max_level has been reached.
            if not isinstance(value, dict) or (
                max_level is not None and level >= max_level
            ):
                # Top-level keys keep their original name, so the common
                # flat case avoids any re-insertion work.
                if level != 0:
                    flat[new_key] = flat.pop(key)
                continue

            # Recurse into the nested dict and splice its flattened keys in.
            flat.update(
                nested_to_record(flat.pop(key), new_key, sep, level + 1, max_level)
            )
        flattened.append(flat)

    return flattened[0] if singleton else flattened
121
122
123def _normalise_json(
124 data: Any,
125 key_string: str,
126 normalized_dict: dict[str, Any],
127 separator: str,
128) -> dict[str, Any]:
129 """
130 Main recursive function
131 Designed for the most basic use case of pd.json_normalize(data)
132 intended as a performance improvement, see #15621
133
134 Parameters
135 ----------
136 data : Any
137 Type dependent on types contained within nested Json
138 key_string : str
139 New key (with separator(s) in) for data
140 normalized_dict : dict
141 The new normalized/flattened Json dict
142 separator : str, default '.'
143 Nested records will generate names separated by sep,
144 e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
145 """
146 if isinstance(data, dict):
147 for key, value in data.items():
148 new_key = f"{key_string}{separator}{key}"
149
150 if not key_string:
151 if sys.version_info < (3, 9):
152 from pandas.util._str_methods import removeprefix
153
154 new_key = removeprefix(new_key, separator)
155 else:
156 new_key = new_key.removeprefix(separator)
157
158 _normalise_json(
159 data=value,
160 key_string=new_key,
161 normalized_dict=normalized_dict,
162 separator=separator,
163 )
164 else:
165 normalized_dict[key_string] = data
166 return normalized_dict
167
168
def _normalise_json_ordered(data: dict[str, Any], separator: str) -> dict[str, Any]:
    """
    Order the top level keys and then recursively go to depth

    Parameters
    ----------
    data : dict or list of dicts
    separator : str, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar

    Returns
    -------
    dict or list of dicts, matching `normalised_json_object`
    """
    # Split the record once: scalar (non-dict) values keep their top-level
    # keys as-is, while dict values are flattened recursively.
    flat_items: dict[str, Any] = {}
    nested_items: dict[str, Any] = {}
    for key, value in data.items():
        target = nested_items if isinstance(value, dict) else flat_items
        target[key] = value

    normalized_nested = _normalise_json(
        data=nested_items,
        key_string="",
        normalized_dict={},
        separator=separator,
    )
    # Scalars first, then the flattened nested keys (same ordering as before).
    return {**flat_items, **normalized_nested}
192
193
def _simple_json_normalize(
    ds: dict | list[dict],
    sep: str = ".",
) -> dict | list[dict] | Any:
    """
    A optimized basic json_normalize

    Converts a nested dict into a flat dict ("record"), unlike
    json_normalize and nested_to_record it doesn't do anything clever.
    But for the most basic use cases it enhances performance.
    E.g. pd.json_normalize(data)

    Parameters
    ----------
    ds : dict or list of dicts
    sep : str, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar

    Returns
    -------
    frame : DataFrame
    d - dict or list of dicts, matching `normalised_json_object`

    Examples
    --------
    >>> _simple_json_normalize(
    ...     {
    ...         "flat1": 1,
    ...         "dict1": {"c": 1, "d": 2},
    ...         "nested": {"e": {"c": 1, "d": 2}, "d": 2},
    ...     }
    ... )
    {\
'flat1': 1, \
'dict1.c': 1, \
'dict1.d': 2, \
'nested.e.c': 1, \
'nested.e.d': 2, \
'nested.d': 2\
}

    """
    # Most JSON payloads are a single object: flatten it directly.
    if isinstance(ds, dict):
        return _normalise_json_ordered(data=ds, separator=sep)
    # A list of records is normalized element by element.
    if isinstance(ds, list):
        return [_simple_json_normalize(row, sep=sep) for row in ds]
    # Any other input yields an empty record, matching the original fallback.
    return {}
245
246
def json_normalize(
    data: dict | list[dict],
    record_path: str | list | None = None,
    meta: str | list[str | list[str]] | None = None,
    meta_prefix: str | None = None,
    record_prefix: str | None = None,
    errors: IgnoreRaise = "raise",
    sep: str = ".",
    max_level: int | None = None,
) -> DataFrame:
    """
    Normalize semi-structured JSON data into a flat table.

    Parameters
    ----------
    data : dict or list of dicts
        Unserialized JSON objects.
    record_path : str or list of str, default None
        Path in each object to list of records. If not passed, data will be
        assumed to be an array of records.
    meta : list of paths (str or list of str), default None
        Fields to use as metadata for each record in resulting table.
    meta_prefix : str, default None
        If True, prefix records with dotted (?) path, e.g. foo.bar.field if
        meta is ['foo', 'bar'].
    record_prefix : str, default None
        If True, prefix records with dotted (?) path, e.g. foo.bar.field if
        path to records is ['foo', 'bar'].
    errors : {'raise', 'ignore'}, default 'raise'
        Configures error handling.

        * 'ignore' : will ignore KeyError if keys listed in meta are not
          always present.
        * 'raise' : will raise KeyError if keys listed in meta are not
          always present.
    sep : str, default '.'
        Nested records will generate names separated by sep.
        e.g., for sep='.', {'foo': {'bar': 0}} -> foo.bar.
    max_level : int, default None
        Max number of levels(depth of dict) to normalize.
        if None, normalizes all levels.

    Returns
    -------
    frame : DataFrame
    Normalize semi-structured JSON data into a flat table.

    Examples
    --------
    >>> data = [
    ...     {"id": 1, "name": {"first": "Coleen", "last": "Volk"}},
    ...     {"name": {"given": "Mark", "family": "Regner"}},
    ...     {"id": 2, "name": "Faye Raker"},
    ... ]
    >>> pd.json_normalize(data)
        id name.first name.last name.given name.family        name
    0  1.0     Coleen      Volk        NaN         NaN         NaN
    1  NaN        NaN       NaN       Mark      Regner         NaN
    2  2.0        NaN       NaN        NaN         NaN  Faye Raker

    >>> data = [
    ...     {
    ...         "id": 1,
    ...         "name": "Cole Volk",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ...     {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
    ...     {
    ...         "id": 2,
    ...         "name": "Faye Raker",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ... ]
    >>> pd.json_normalize(data, max_level=0)
        id        name                        fitness
    0  1.0   Cole Volk  {'height': 130, 'weight': 60}
    1  NaN    Mark Reg  {'height': 130, 'weight': 60}
    2  2.0  Faye Raker  {'height': 130, 'weight': 60}

    Normalizes nested data up to level 1.

    >>> data = [
    ...     {
    ...         "id": 1,
    ...         "name": "Cole Volk",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ...     {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
    ...     {
    ...         "id": 2,
    ...         "name": "Faye Raker",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ... ]
    >>> pd.json_normalize(data, max_level=1)
        id        name  fitness.height  fitness.weight
    0  1.0   Cole Volk             130              60
    1  NaN    Mark Reg             130              60
    2  2.0  Faye Raker             130              60

    >>> data = [
    ...     {
    ...         "state": "Florida",
    ...         "shortname": "FL",
    ...         "info": {"governor": "Rick Scott"},
    ...         "counties": [
    ...             {"name": "Dade", "population": 12345},
    ...             {"name": "Broward", "population": 40000},
    ...             {"name": "Palm Beach", "population": 60000},
    ...         ],
    ...     },
    ...     {
    ...         "state": "Ohio",
    ...         "shortname": "OH",
    ...         "info": {"governor": "John Kasich"},
    ...         "counties": [
    ...             {"name": "Summit", "population": 1234},
    ...             {"name": "Cuyahoga", "population": 1337},
    ...         ],
    ...     },
    ... ]
    >>> result = pd.json_normalize(
    ...     data, "counties", ["state", "shortname", ["info", "governor"]]
    ... )
    >>> result
             name  population    state shortname info.governor
    0        Dade       12345  Florida        FL    Rick Scott
    1     Broward       40000  Florida        FL    Rick Scott
    2  Palm Beach       60000  Florida        FL    Rick Scott
    3      Summit        1234     Ohio        OH   John Kasich
    4    Cuyahoga        1337     Ohio        OH   John Kasich

    >>> data = {"A": [1, 2]}
    >>> pd.json_normalize(data, "A", record_prefix="Prefix.")
        Prefix.0
    0          1
    1          2

    Returns normalized data with columns prefixed with the given string.
    """

    def _pull_field(
        js: dict[str, Any], spec: list | str, extract_record: bool = False
    ) -> Scalar | Iterable:
        """Internal function to pull field"""
        # Walk `spec` (a single key or a path of keys) into `js`. A missing
        # key raises unless errors='ignore' and we're pulling metadata, in
        # which case np.nan is substituted.
        result = js
        try:
            if isinstance(spec, list):
                for field in spec:
                    if result is None:
                        # a None along the path means the key can't exist
                        raise KeyError(field)
                    result = result[field]
            else:
                result = result[spec]
        except KeyError as e:
            if extract_record:
                # record_path lookups are never optional, even with
                # errors='ignore'
                raise KeyError(
                    f"Key {e} not found. If specifying a record_path, all elements of "
                    f"data should have the path."
                ) from e
            if errors == "ignore":
                return np.nan
            else:
                raise KeyError(
                    f"Key {e} not found. To replace missing values of {e} with "
                    f"np.nan, pass in errors='ignore'"
                ) from e

        return result

    def _pull_records(js: dict[str, Any], spec: list | str) -> list:
        """
        Internal function to pull field for records, and similar to
        _pull_field, but require to return list. And will raise error
        if has non iterable value.
        """
        result = _pull_field(js, spec, extract_record=True)

        # GH 31507 GH 30145, GH 26284 if result is not list, raise TypeError if not
        # null, otherwise return an empty list
        if not isinstance(result, list):
            if pd.isnull(result):
                result = []
            else:
                raise TypeError(
                    f"{js} has non list value {result} for path {spec}. "
                    "Must be list or null."
                )
        return result

    # Normalize `data` itself to a list of dicts: empty list short-circuits,
    # a single dict is wrapped, and any other non-string iterable (including
    # generators, GH35923) is materialized so it can be traversed twice.
    if isinstance(data, list) and not data:
        return DataFrame()
    elif isinstance(data, dict):
        # A bit of a hackjob
        data = [data]
    elif isinstance(data, abc.Iterable) and not isinstance(data, str):
        # GH35923 Fix pd.json_normalize to not skip the first element of a
        # generator input
        data = list(data)
    else:
        raise NotImplementedError

    # check to see if a simple recursive function is possible to
    # improve performance (see #15621) but only for cases such
    # as pd.Dataframe(data) or pd.Dataframe(data, sep)
    if (
        record_path is None
        and meta is None
        and meta_prefix is None
        and record_prefix is None
        and max_level is None
    ):
        return DataFrame(_simple_json_normalize(data, sep=sep))

    if record_path is None:
        # NOTE(review): the inner list here is truthy whenever a record has
        # ANY values (regardless of whether one is a dict), so this condition
        # effectively tests "any record is non-empty". nested_to_record is
        # idempotent for flat records, so the result is still correct —
        # confirm whether the dict check was intended to gate the call.
        if any([isinstance(x, dict) for x in y.values()] for y in data):
            # naive normalization, this is idempotent for flat records
            # and potentially will inflate the data considerably for
            # deeply nested structures:
            #  {VeryLong: { b: 1,c:2}} -> {VeryLong.b:1 ,VeryLong.c:@}
            #
            # TODO: handle record value which are lists, at least error
            #       reasonably
            data = nested_to_record(data, sep=sep, max_level=max_level)
        return DataFrame(data)
    elif not isinstance(record_path, list):
        record_path = [record_path]

    if meta is None:
        meta = []
    elif not isinstance(meta, list):
        meta = [meta]

    # Each meta entry becomes a path (list of keys); scalar entries are
    # wrapped so all entries are uniform below.
    _meta = [m if isinstance(m, list) else [m] for m in meta]

    # Disastrously inefficient for now
    records: list = []
    lengths = []

    # meta_vals collects one value per record-group for every meta key;
    # meta_keys are the joined column names (e.g. "info.governor").
    meta_vals: DefaultDict = defaultdict(list)
    meta_keys = [sep.join(val) for val in _meta]

    def _recursive_extract(data, path, seen_meta, level: int = 0) -> None:
        # Descend along `path`, capturing meta values whose full path is
        # consumed at this depth into `seen_meta`, then flatten the records
        # found at the final path element.
        if isinstance(data, dict):
            data = [data]
        if len(path) > 1:
            for obj in data:
                for val, key in zip(_meta, meta_keys):
                    if level + 1 == len(val):
                        # the meta path ends here: remember it for the
                        # records extracted deeper down
                        seen_meta[key] = _pull_field(obj, val[-1])

                _recursive_extract(obj[path[0]], path[1:], seen_meta, level=level + 1)
        else:
            for obj in data:
                recs = _pull_records(obj, path[0])
                # flatten each record dict; non-dict records pass through
                recs = [
                    nested_to_record(r, sep=sep, max_level=max_level)
                    if isinstance(r, dict)
                    else r
                    for r in recs
                ]

                # For repeating the metadata later
                lengths.append(len(recs))
                for val, key in zip(_meta, meta_keys):
                    if level + 1 > len(val):
                        # meta value was already captured at a shallower level
                        meta_val = seen_meta[key]
                    else:
                        # remaining portion of the meta path is resolved
                        # relative to the current object
                        meta_val = _pull_field(obj, val[level:])
                    meta_vals[key].append(meta_val)
                records.extend(recs)

    _recursive_extract(data, record_path, {}, level=0)

    result = DataFrame(records)

    if record_prefix is not None:
        result = result.rename(columns=lambda x: f"{record_prefix}{x}")

    # Data types, a problem
    for k, v in meta_vals.items():
        if meta_prefix is not None:
            k = meta_prefix + k

        if k in result:
            raise ValueError(
                f"Conflicting metadata name {k}, need distinguishing prefix "
            )
        # each meta value is repeated once per record in its group; dtype
        # object keeps heterogeneous meta values intact
        result[k] = np.array(v, dtype=object).repeat(lengths)
    return result