1# ---------------------------------------------------------------------
2# JSON normalization routines
3from __future__ import annotations
4
5from collections import (
6 abc,
7 defaultdict,
8)
9import copy
10from typing import (
11 TYPE_CHECKING,
12 Any,
13 DefaultDict,
14)
15
16import numpy as np
17
18from pandas._libs.writers import convert_json_to_lines
19
20import pandas as pd
21from pandas import DataFrame
22
23if TYPE_CHECKING:
24 from collections.abc import Iterable
25
26 from pandas._typing import (
27 IgnoreRaise,
28 Scalar,
29 )
30
31
def convert_to_line_delimits(s: str) -> str:
    """
    Helper function that converts JSON lists to line delimited JSON.

    Parameters
    ----------
    s : str
        A serialized JSON document.  Only a top-level JSON array
        (``"[...]"``) is converted; any other document is returned unchanged.

    Returns
    -------
    str
        Line-delimited JSON for list input, otherwise ``s`` itself.
    """
    # Determine we have a JSON list to turn to lines otherwise just return the
    # json object, only lists can.  An empty string is passed through as well.
    # NOTE: the previous condition `not s[0] == "[" and s[-1] == "]"` parsed
    # as `(not s[0] == "[") and (s[-1] == "]")` due to operator precedence,
    # which let non-list input (e.g. '{"a": 1}') fall through and get its
    # outer braces stripped.
    if not s or not (s[0] == "[" and s[-1] == "]"):
        return s
    # Strip the enclosing brackets before handing off to the C helper, which
    # splits on top-level commas.
    s = s[1:-1]

    return convert_json_to_lines(s)
43
44
def nested_to_record(
    ds,
    prefix: str = "",
    sep: str = ".",
    level: int = 0,
    max_level: int | None = None,
):
    """
    A simplified json_normalize

    Converts a nested dict into a flat dict ("record"), unlike json_normalize,
    it does not attempt to extract a subset of the data.

    Parameters
    ----------
    ds : dict or list of dicts
    prefix: the prefix, optional, default: ""
    sep : str, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
    level: int, optional, default: 0
        The number of levels in the json string.

    max_level: int, optional, default: None
        The max depth to normalize.

    Returns
    -------
    d - dict or list of dicts, matching `ds`

    Examples
    --------
    >>> nested_to_record(
    ...     dict(flat1=1, dict1=dict(c=1, d=2), nested=dict(e=dict(c=1, d=2), d=2))
    ... )
    {\
'flat1': 1, \
'dict1.c': 1, \
'dict1.d': 2, \
'nested.e.c': 1, \
'nested.e.d': 2, \
'nested.d': 2\
}
    """
    is_single_record = isinstance(ds, dict)
    records = [ds] if is_single_record else ds

    flattened_records = []
    for record in records:
        flat = copy.deepcopy(record)
        for key, value in record.items():
            # Keys are coerced to str before being joined with the prefix.
            key_str = key if isinstance(key, str) else str(key)
            new_key = key_str if level == 0 else prefix + sep + key_str

            # Recurse only into dict values, and only while above the
            # requested max depth; everything else is kept as a leaf.
            reached_leaf = not isinstance(value, dict) or (
                max_level is not None and level >= max_level
            )
            if reached_leaf:
                # Top-level keys keep their original name, so the copied
                # record needs no rewrite here (common fast path).
                if level != 0:
                    flat[new_key] = flat.pop(key_str)
                continue

            # Replace the nested dict with its flattened key/value pairs.
            flat.update(
                nested_to_record(flat.pop(key_str), new_key, sep, level + 1, max_level)
            )
        flattened_records.append(flat)

    return flattened_records[0] if is_single_record else flattened_records
124
125
126def _normalise_json(
127 data: Any,
128 key_string: str,
129 normalized_dict: dict[str, Any],
130 separator: str,
131) -> dict[str, Any]:
132 """
133 Main recursive function
134 Designed for the most basic use case of pd.json_normalize(data)
135 intended as a performance improvement, see #15621
136
137 Parameters
138 ----------
139 data : Any
140 Type dependent on types contained within nested Json
141 key_string : str
142 New key (with separator(s) in) for data
143 normalized_dict : dict
144 The new normalized/flattened Json dict
145 separator : str, default '.'
146 Nested records will generate names separated by sep,
147 e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
148 """
149 if isinstance(data, dict):
150 for key, value in data.items():
151 new_key = f"{key_string}{separator}{key}"
152
153 if not key_string:
154 new_key = new_key.removeprefix(separator)
155
156 _normalise_json(
157 data=value,
158 key_string=new_key,
159 normalized_dict=normalized_dict,
160 separator=separator,
161 )
162 else:
163 normalized_dict[key_string] = data
164 return normalized_dict
165
166
def _normalise_json_ordered(data: dict[str, Any], separator: str) -> dict[str, Any]:
    """
    Order the top level keys and then recursively go to depth

    Parameters
    ----------
    data : dict or list of dicts
    separator : str, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar

    Returns
    -------
    dict or list of dicts, matching `normalised_json_object`
    """
    # Partition the record in one pass: scalar (non-dict) values keep their
    # keys untouched, while dict values are flattened recursively.
    scalar_items: dict[str, Any] = {}
    nested_items: dict[str, Any] = {}
    for key, value in data.items():
        if isinstance(value, dict):
            nested_items[key] = value
        else:
            scalar_items[key] = value

    flattened_nested = _normalise_json(
        data=nested_items,
        key_string="",
        normalized_dict={},
        separator=separator,
    )
    # Scalars first, then the flattened nested keys, mirroring the original
    # top-level ordering of the scalar keys.
    return {**scalar_items, **flattened_nested}
190
191
def _simple_json_normalize(
    ds: dict | list[dict],
    sep: str = ".",
) -> dict | list[dict] | Any:
    """
    A optimized basic json_normalize

    Converts a nested dict into a flat dict ("record"), unlike
    json_normalize and nested_to_record it doesn't do anything clever.
    But for the most basic use cases it enhances performance.
    E.g. pd.json_normalize(data)

    Parameters
    ----------
    ds : dict or list of dicts
    sep : str, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar

    Returns
    -------
    dict or list of dicts, matching `ds`; any other input type yields an
    empty dict.

    Examples
    --------
    >>> _simple_json_normalize(
    ...     {
    ...         "flat1": 1,
    ...         "dict1": {"c": 1, "d": 2},
    ...         "nested": {"e": {"c": 1, "d": 2}, "d": 2},
    ...     }
    ... )
    {\
'flat1': 1, \
'dict1.c': 1, \
'dict1.d': 2, \
'nested.e.c': 1, \
'nested.e.d': 2, \
'nested.d': 2\
}

    """
    # Expect a dictionary, as most jsons are.  However, lists are perfectly
    # valid too: normalize each row independently, preserving order.
    if isinstance(ds, dict):
        return _normalise_json_ordered(data=ds, separator=sep)
    if isinstance(ds, list):
        return [_simple_json_normalize(row, sep=sep) for row in ds]
    # Any other type falls through to an empty record, matching the
    # permissive behaviour of the original implementation.
    return {}
243
244
def json_normalize(
    data: dict | list[dict],
    record_path: str | list | None = None,
    meta: str | list[str | list[str]] | None = None,
    meta_prefix: str | None = None,
    record_prefix: str | None = None,
    errors: IgnoreRaise = "raise",
    sep: str = ".",
    max_level: int | None = None,
) -> DataFrame:
    """
    Normalize semi-structured JSON data into a flat table.

    Parameters
    ----------
    data : dict or list of dicts
        Unserialized JSON objects.
    record_path : str or list of str, default None
        Path in each object to list of records. If not passed, data will be
        assumed to be an array of records.
    meta : list of paths (str or list of str), default None
        Fields to use as metadata for each record in resulting table.
    meta_prefix : str, default None
        If True, prefix records with dotted (?) path, e.g. foo.bar.field if
        meta is ['foo', 'bar'].
    record_prefix : str, default None
        If True, prefix records with dotted (?) path, e.g. foo.bar.field if
        path to records is ['foo', 'bar'].
    errors : {'raise', 'ignore'}, default 'raise'
        Configures error handling.

        * 'ignore' : will ignore KeyError if keys listed in meta are not
          always present.
        * 'raise' : will raise KeyError if keys listed in meta are not
          always present.
    sep : str, default '.'
        Nested records will generate names separated by sep.
        e.g., for sep='.', {'foo': {'bar': 0}} -> foo.bar.
    max_level : int, default None
        Max number of levels(depth of dict) to normalize.
        if None, normalizes all levels.

    Returns
    -------
    frame : DataFrame
    Normalize semi-structured JSON data into a flat table.

    Examples
    --------
    >>> data = [
    ...     {"id": 1, "name": {"first": "Coleen", "last": "Volk"}},
    ...     {"name": {"given": "Mark", "family": "Regner"}},
    ...     {"id": 2, "name": "Faye Raker"},
    ... ]
    >>> pd.json_normalize(data)
        id name.first name.last name.given name.family        name
    0  1.0     Coleen      Volk        NaN         NaN         NaN
    1  NaN        NaN       NaN       Mark      Regner         NaN
    2  2.0        NaN       NaN        NaN         NaN  Faye Raker

    >>> data = [
    ...     {
    ...         "id": 1,
    ...         "name": "Cole Volk",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ...     {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
    ...     {
    ...         "id": 2,
    ...         "name": "Faye Raker",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ... ]
    >>> pd.json_normalize(data, max_level=0)
        id        name                        fitness
    0  1.0   Cole Volk  {'height': 130, 'weight': 60}
    1  NaN    Mark Reg  {'height': 130, 'weight': 60}
    2  2.0  Faye Raker  {'height': 130, 'weight': 60}

    Normalizes nested data up to level 1.

    >>> data = [
    ...     {
    ...         "id": 1,
    ...         "name": "Cole Volk",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ...     {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
    ...     {
    ...         "id": 2,
    ...         "name": "Faye Raker",
    ...         "fitness": {"height": 130, "weight": 60},
    ...     },
    ... ]
    >>> pd.json_normalize(data, max_level=1)
        id        name  fitness.height  fitness.weight
    0  1.0   Cole Volk             130              60
    1  NaN    Mark Reg             130              60
    2  2.0  Faye Raker             130              60

    >>> data = [
    ...     {
    ...         "state": "Florida",
    ...         "shortname": "FL",
    ...         "info": {"governor": "Rick Scott"},
    ...         "counties": [
    ...             {"name": "Dade", "population": 12345},
    ...             {"name": "Broward", "population": 40000},
    ...             {"name": "Palm Beach", "population": 60000},
    ...         ],
    ...     },
    ...     {
    ...         "state": "Ohio",
    ...         "shortname": "OH",
    ...         "info": {"governor": "John Kasich"},
    ...         "counties": [
    ...             {"name": "Summit", "population": 1234},
    ...             {"name": "Cuyahoga", "population": 1337},
    ...         ],
    ...     },
    ... ]
    >>> result = pd.json_normalize(
    ...     data, "counties", ["state", "shortname", ["info", "governor"]]
    ... )
    >>> result
             name  population    state shortname info.governor
    0        Dade       12345  Florida        FL    Rick Scott
    1     Broward       40000  Florida        FL    Rick Scott
    2  Palm Beach       60000  Florida        FL    Rick Scott
    3      Summit        1234     Ohio        OH   John Kasich
    4    Cuyahoga        1337     Ohio        OH   John Kasich

    >>> data = {"A": [1, 2]}
    >>> pd.json_normalize(data, "A", record_prefix="Prefix.")
       Prefix.0
    0         1
    1         2

    Returns normalized data with columns prefixed with the given string.
    """

    def _pull_field(
        js: dict[str, Any], spec: list | str, extract_record: bool = False
    ) -> Scalar | Iterable:
        """
        Internal function to pull field.

        Walks ``spec`` (a single key, or a list of keys forming a path) into
        ``js``.  A missing key raises; when ``errors == "ignore"`` and the
        lookup is not for a record path, ``np.nan`` is returned instead.
        """
        result = js
        try:
            if isinstance(spec, list):
                # Descend one key at a time; a None intermediate value cannot
                # be indexed further, so surface it as a missing key.
                for field in spec:
                    if result is None:
                        raise KeyError(field)
                    result = result[field]
            else:
                result = result[spec]
        except KeyError as e:
            # record_path lookups must always succeed, regardless of `errors`.
            if extract_record:
                raise KeyError(
                    f"Key {e} not found. If specifying a record_path, all elements of "
                    f"data should have the path."
                ) from e
            if errors == "ignore":
                return np.nan
            else:
                raise KeyError(
                    f"Key {e} not found. To replace missing values of {e} with "
                    f"np.nan, pass in errors='ignore'"
                ) from e

        return result

    def _pull_records(js: dict[str, Any], spec: list | str) -> list:
        """
        Internal function to pull field for records, and similar to
        _pull_field, but require to return list. And will raise error
        if has non iterable value.
        """
        result = _pull_field(js, spec, extract_record=True)

        # GH 31507 GH 30145, GH 26284 if result is not list, raise TypeError if not
        # null, otherwise return an empty list
        if not isinstance(result, list):
            if pd.isnull(result):
                result = []
            else:
                raise TypeError(
                    f"{js} has non list value {result} for path {spec}. "
                    "Must be list or null."
                )
        return result

    # Coerce `data` to a list of dicts so the extraction below has one shape
    # to deal with.
    if isinstance(data, list) and not data:
        # Empty list of records -> empty frame.
        return DataFrame()
    elif isinstance(data, dict):
        # A bit of a hackjob
        data = [data]
    elif isinstance(data, abc.Iterable) and not isinstance(data, str):
        # GH35923 Fix pd.json_normalize to not skip the first element of a
        # generator input
        data = list(data)
    else:
        raise NotImplementedError

    # check to see if a simple recursive function is possible to
    # improve performance (see #15621) but only for cases such
    # as pd.Dataframe(data) or pd.Dataframe(data, sep)
    if (
        record_path is None
        and meta is None
        and meta_prefix is None
        and record_prefix is None
        and max_level is None
    ):
        return DataFrame(_simple_json_normalize(data, sep=sep))

    if record_path is None:
        # NOTE(review): the generator below yields *lists*, and `any` only
        # tests their truthiness — so this fires for any record with at least
        # one value, whether or not any value is actually a dict.  Harmless
        # here, since nested_to_record is idempotent for flat records, but the
        # isinstance check itself appears to have no effect; confirm before
        # relying on it.
        if any([isinstance(x, dict) for x in y.values()] for y in data):
            # naive normalization, this is idempotent for flat records
            # and potentially will inflate the data considerably for
            # deeply nested structures:
            #  {VeryLong: { b: 1,c:2}} -> {VeryLong.b:1 ,VeryLong.c:@}
            #
            # TODO: handle record value which are lists, at least error
            #       reasonably
            data = nested_to_record(data, sep=sep, max_level=max_level)
        return DataFrame(data)
    elif not isinstance(record_path, list):
        # Normalize a single key into the path-list form used below.
        record_path = [record_path]

    if meta is None:
        meta = []
    elif not isinstance(meta, list):
        meta = [meta]

    # Each meta entry becomes a list-path; single keys are wrapped.
    _meta = [m if isinstance(m, list) else [m] for m in meta]

    # Disastrously inefficient for now
    records: list = []
    # Number of records pulled from each object, used later to repeat the
    # per-object meta values so they line up row-for-row with `records`.
    lengths = []

    meta_vals: DefaultDict = defaultdict(list)
    meta_keys = [sep.join(val) for val in _meta]

    def _recursive_extract(data, path, seen_meta, level: int = 0) -> None:
        """
        Walk `path` into `data`, accumulating records at the deepest level.

        Mutates the enclosing `records`, `lengths` and `meta_vals`;
        `seen_meta` carries meta values already resolved at shallower levels.
        """
        if isinstance(data, dict):
            data = [data]
        if len(path) > 1:
            # Not at the record level yet: capture any meta fields that live
            # at this depth, then descend one path component.
            for obj in data:
                for val, key in zip(_meta, meta_keys):
                    if level + 1 == len(val):
                        seen_meta[key] = _pull_field(obj, val[-1])

                _recursive_extract(obj[path[0]], path[1:], seen_meta, level=level + 1)
        else:
            for obj in data:
                recs = _pull_records(obj, path[0])
                # Flatten each record dict; non-dict records pass through.
                recs = [
                    nested_to_record(r, sep=sep, max_level=max_level)
                    if isinstance(r, dict)
                    else r
                    for r in recs
                ]

                # For repeating the metadata later
                lengths.append(len(recs))
                for val, key in zip(_meta, meta_keys):
                    if level + 1 > len(val):
                        # Meta field lives above this level; use the value
                        # captured on the way down.
                        meta_val = seen_meta[key]
                    else:
                        meta_val = _pull_field(obj, val[level:])
                    meta_vals[key].append(meta_val)
                records.extend(recs)

    _recursive_extract(data, record_path, {}, level=0)

    result = DataFrame(records)

    if record_prefix is not None:
        # Prefix every record-derived column name.
        result = result.rename(columns=lambda x: f"{record_prefix}{x}")

    # Data types, a problem
    for k, v in meta_vals.items():
        if meta_prefix is not None:
            k = meta_prefix + k

        if k in result:
            raise ValueError(
                f"Conflicting metadata name {k}, need distinguishing prefix "
            )
        # GH 37782

        values = np.array(v, dtype=object)

        if values.ndim > 1:
            # GH 37782
            # np.array collapsed equal-length list values into a 2-D array;
            # rebuild a 1-D object array so each entry stays a list.
            values = np.empty((len(v),), dtype=object)
            for i, v in enumerate(v):
                values[i] = v

        # Repeat each object's meta value once per record pulled from it.
        result[k] = values.repeat(lengths)
    return result