Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pandas/io/json/_table_schema.py: 16%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

138 statements  

1""" 

2Table Schema builders 

3 

4https://specs.frictionlessdata.io/table-schema/ 

5""" 

6from __future__ import annotations 

7 

8from typing import ( 

9 TYPE_CHECKING, 

10 Any, 

11 cast, 

12) 

13import warnings 

14 

15from pandas._libs import lib 

16from pandas._libs.json import ujson_loads 

17from pandas._libs.tslibs import timezones 

18from pandas._libs.tslibs.dtypes import freq_to_period_freqstr 

19from pandas.util._exceptions import find_stack_level 

20 

21from pandas.core.dtypes.base import _registry as registry 

22from pandas.core.dtypes.common import ( 

23 is_bool_dtype, 

24 is_integer_dtype, 

25 is_numeric_dtype, 

26 is_string_dtype, 

27) 

28from pandas.core.dtypes.dtypes import ( 

29 CategoricalDtype, 

30 DatetimeTZDtype, 

31 ExtensionDtype, 

32 PeriodDtype, 

33) 

34 

35from pandas import DataFrame 

36import pandas.core.common as com 

37 

38from pandas.tseries.frequencies import to_offset 

39 

40if TYPE_CHECKING: 

41 from pandas._typing import ( 

42 DtypeObj, 

43 JSONSerializable, 

44 ) 

45 

46 from pandas import Series 

47 from pandas.core.indexes.multi import MultiIndex 

48 

49 

50TABLE_SCHEMA_VERSION = "1.4.0" 

51 

52 

53def as_json_table_type(x: DtypeObj) -> str: 

54 """ 

55 Convert a NumPy / pandas type to its corresponding json_table. 

56 

57 Parameters 

58 ---------- 

59 x : np.dtype or ExtensionDtype 

60 

61 Returns 

62 ------- 

63 str 

64 the Table Schema data types 

65 

66 Notes 

67 ----- 

68 This table shows the relationship between NumPy / pandas dtypes, 

69 and Table Schema dtypes. 

70 

71 ============== ================= 

72 Pandas type Table Schema type 

73 ============== ================= 

74 int64 integer 

75 float64 number 

76 bool boolean 

77 datetime64[ns] datetime 

78 timedelta64[ns] duration 

79 object str 

80 categorical any 

81 =============== ================= 

82 """ 

83 if is_integer_dtype(x): 

84 return "integer" 

85 elif is_bool_dtype(x): 

86 return "boolean" 

87 elif is_numeric_dtype(x): 

88 return "number" 

89 elif lib.is_np_dtype(x, "M") or isinstance(x, (DatetimeTZDtype, PeriodDtype)): 

90 return "datetime" 

91 elif lib.is_np_dtype(x, "m"): 

92 return "duration" 

93 elif isinstance(x, ExtensionDtype): 

94 return "any" 

95 elif is_string_dtype(x): 

96 return "string" 

97 else: 

98 return "any" 

99 

100 

def set_default_names(data):
    """
    Return ``data`` with every index level named.

    Unnamed levels get ``'index'`` (single-level index) or ``'level_<i>'``
    (MultiIndex) as their name.

    Parameters
    ----------
    data : Series or DataFrame
        Object whose index names are inspected.

    Returns
    -------
    Series or DataFrame
        ``data`` itself when all index names are already set, otherwise a
        copy of ``data`` carrying the filled-in names. The input is never
        renamed in place.

    Warns
    -----
    UserWarning
        If the index is already named ``'index'``, or a MultiIndex level
        name starts with ``'level_'``; those names collide with the
        defaults assigned here and therefore cannot be round-tripped.
    """
    names = data.index.names
    if com.all_not_none(*names):
        if len(names) == 1 and data.index.name == "index":
            warnings.warn(
                "Index name of 'index' is not round-trippable.",
                stacklevel=find_stack_level(),
            )
        elif len(names) > 1 and any(
            # Level names may be any hashable (e.g. ints); only strings can
            # collide with the generated 'level_<i>' defaults.
            isinstance(name, str) and name.startswith("level_")
            for name in names
        ):
            warnings.warn(
                "Index names beginning with 'level_' are not round-trippable.",
                stacklevel=find_stack_level(),
            )
        return data

    # Work on a copy so the caller's object keeps its unnamed index.
    data = data.copy()
    if data.index.nlevels > 1:
        data.index.names = com.fill_missing_names(data.index.names)
    else:
        data.index.name = data.index.name or "index"
    return data

123 

124 

def convert_pandas_type_to_json_field(arr) -> dict[str, JSONSerializable]:
    """
    Build the Table Schema field descriptor for a Series or Index.

    Parameters
    ----------
    arr : Series or Index
        Object exposing ``dtype`` and ``name``.

    Returns
    -------
    dict
        Always contains ``"name"`` (``"values"`` when ``arr.name`` is None)
        and ``"type"``. Depending on the dtype it additionally contains:

        * categorical: ``"constraints"`` (``{"enum": [...]}``) and ``"ordered"``
        * period: ``"freq"``
        * timezone-aware datetime: ``"tz"``
        * any other extension dtype: ``"extDtype"``
    """
    dtype = arr.dtype
    name: JSONSerializable = "values" if arr.name is None else arr.name
    field: dict[str, JSONSerializable] = {
        "name": name,
        "type": as_json_table_type(dtype),
    }

    if isinstance(dtype, CategoricalDtype):
        field["constraints"] = {"enum": list(dtype.categories)}
        field["ordered"] = dtype.ordered
    elif isinstance(dtype, PeriodDtype):
        field["freq"] = dtype.freq.freqstr
    elif isinstance(dtype, DatetimeTZDtype):
        if timezones.is_utc(dtype.tz):
            # UTC is handled explicitly: timezone.utc has no "zone" attr
            field["tz"] = "UTC"
        else:
            # ``tz.zone`` only exists on pytz timezones; get_timezone also
            # resolves other tzinfo implementations to a name where it can.
            zone = timezones.get_timezone(dtype.tz)
            # A tzinfo without a usable name comes back as the object itself;
            # emit None then, as was already done for unnamed pytz zones.
            field["tz"] = zone if isinstance(zone, str) else None
    elif isinstance(dtype, ExtensionDtype):
        field["extDtype"] = dtype.name
    return field

155 

156 

def convert_json_field_to_pandas_type(field) -> str | CategoricalDtype:
    """
    Translate a Table Schema field descriptor into a NumPy / pandas dtype.

    Parameters
    ----------
    field
        A JSON field descriptor. Must contain ``"type"``; the optional keys
        ``"extDtype"``, ``"tz"``, ``"freq"``, ``"constraints"`` and
        ``"ordered"`` refine the result.

    Returns
    -------
    dtype

    Raises
    ------
    ValueError
        If the type of the provided field is unknown or currently unsupported

    Examples
    --------
    >>> convert_json_field_to_pandas_type({"name": "an_int", "type": "integer"})
    'int64'

    >>> convert_json_field_to_pandas_type(
    ...     {
    ...         "name": "a_categorical",
    ...         "type": "any",
    ...         "constraints": {"enum": ["a", "b", "c"]},
    ...         "ordered": True,
    ...     }
    ... )
    CategoricalDtype(categories=['a', 'b', 'c'], ordered=True, categories_dtype=object)

    >>> convert_json_field_to_pandas_type({"name": "a_datetime", "type": "datetime"})
    'datetime64[ns]'

    >>> convert_json_field_to_pandas_type(
    ...     {"name": "a_datetime_with_tz", "type": "datetime", "tz": "US/Central"}
    ... )
    'datetime64[ns, US/Central]'
    """
    typ = field["type"]

    if typ == "string":
        return "object"

    # For these types an explicit "extDtype" wins over the NumPy default.
    for schema_type, numpy_default in (
        ("integer", "int64"),
        ("number", "float64"),
        ("boolean", "bool"),
    ):
        if typ == schema_type:
            return field.get("extDtype", numpy_default)

    if typ == "duration":
        return "timedelta64"

    if typ == "datetime":
        if field.get("tz"):
            return f"datetime64[ns, {field['tz']}]"
        if not field.get("freq"):
            return "datetime64[ns]"
        # GH#9586 rename frequency M to ME for offsets
        offset = to_offset(field["freq"])
        period_freq = freq_to_period_freqstr(offset.n, offset.name)
        return f"period[{period_freq}]"

    if typ == "any":
        is_categorical = "constraints" in field and "ordered" in field
        if is_categorical:
            return CategoricalDtype(
                categories=field["constraints"]["enum"], ordered=field["ordered"]
            )
        if "extDtype" in field:
            return registry.find(field["extDtype"])
        return "object"

    raise ValueError(f"Unsupported or invalid field type: {typ}")

232 

233 

def build_table_schema(
    data: DataFrame | Series,
    index: bool = True,
    primary_key: bool | None = None,
    version: bool = True,
) -> dict[str, JSONSerializable]:
    """
    Create a Table schema from ``data``.

    Parameters
    ----------
    data : Series, DataFrame
    index : bool, default True
        Whether to include ``data.index`` in the schema.
    primary_key : bool or None, default None
        Column names to designate as the primary key.
        The default `None` will set `'primaryKey'` to the index
        level or levels if the index is unique. Any other value is stored
        under `'primaryKey'` as given.
    version : bool, default True
        Whether to include a field `pandas_version` with the version
        of pandas that last revised the table schema. This version
        can be different from the installed pandas version.

    Returns
    -------
    dict
        Contains ``'fields'`` and, when applicable, ``'primaryKey'`` and
        ``'pandas_version'``.

    Notes
    -----
    See `Table Schema
    <https://pandas.pydata.org/docs/user_guide/io.html#table-schema>`__ for
    conversion types.
    Timedeltas are converted to ISO8601 duration format with
    9 decimal places after the seconds field for nanosecond precision.

    Categoricals are converted to the `any` dtype, and use the `enum` field
    constraint to list the allowed values. The `ordered` attribute is included
    in an `ordered` field.

    Examples
    --------
    >>> from pandas.io.json._table_schema import build_table_schema
    >>> df = pd.DataFrame(
    ...     {'A': [1, 2, 3],
    ...      'B': ['a', 'b', 'c'],
    ...      'C': pd.date_range('2016-01-01', freq='D', periods=3),
    ...      }, index=pd.Index(range(3), name='idx'))
    >>> build_table_schema(df)
    {'fields': \
[{'name': 'idx', 'type': 'integer'}, \
{'name': 'A', 'type': 'integer'}, \
{'name': 'B', 'type': 'string'}, \
{'name': 'C', 'type': 'datetime'}], \
'primaryKey': ['idx'], \
'pandas_version': '1.4.0'}
    """
    schema: dict[str, Any] = {}
    fields = []

    if index:
        # Every index level needs a name before it can become a field; the
        # same truthiness test guards both the naming and the field output.
        data = set_default_names(data)
        if data.index.nlevels > 1:
            # cast() only informs the type checker; use a local instead of
            # assigning back onto ``data``, which may be the caller's object.
            multi_index = cast("MultiIndex", data.index)
            for level, name in zip(multi_index.levels, multi_index.names):
                new_field = convert_pandas_type_to_json_field(level)
                # A level's own name can differ from the name it has within
                # the MultiIndex, so the MultiIndex name is written explicitly.
                new_field["name"] = name
                fields.append(new_field)
        else:
            fields.append(convert_pandas_type_to_json_field(data.index))

    if data.ndim > 1:
        fields.extend(convert_pandas_type_to_json_field(s) for _, s in data.items())
    else:
        fields.append(convert_pandas_type_to_json_field(data))

    schema["fields"] = fields
    if index and data.index.is_unique and primary_key is None:
        if data.index.nlevels == 1:
            schema["primaryKey"] = [data.index.name]
        else:
            schema["primaryKey"] = data.index.names
    elif primary_key is not None:
        schema["primaryKey"] = primary_key

    if version:
        schema["pandas_version"] = TABLE_SCHEMA_VERSION
    return schema

324 

325 

def parse_table_schema(json, precise_float: bool) -> DataFrame:
    """
    Builds a DataFrame from a given schema

    Parameters
    ----------
    json :
        A JSON document with a ``"schema"`` entry (holding ``"fields"`` and
        optionally ``"primaryKey"``) and a ``"data"`` entry.
    precise_float : bool
        Flag controlling precision when decoding string to double values, as
        dictated by ``read_json``

    Returns
    -------
    df : DataFrame

    Raises
    ------
    NotImplementedError
        If the JSON table schema contains timedelta (``"duration"``) data

    Notes
    -----
    Because :func:`DataFrame.to_json` uses the string 'index' to denote a
    name-less :class:`Index`, this function sets the name of the returned
    :class:`DataFrame` to ``None`` when said string is encountered with a
    normal :class:`Index`. For a :class:`MultiIndex`, the same limitation
    applies to any strings beginning with 'level_'. Therefore, an
    :class:`Index` name of 'index' and :class:`MultiIndex` names starting
    with 'level_' are not supported.

    See Also
    --------
    build_table_schema : Inverse function.
    pandas.read_json
    """
    table = ujson_loads(json, precise_float=precise_float)
    schema_fields = table["schema"]["fields"]
    col_order = [field["name"] for field in schema_fields]
    df = DataFrame(table["data"], columns=col_order)[col_order]

    dtypes = {
        field["name"]: convert_json_field_to_pandas_type(field)
        for field in schema_fields
    }

    # No ISO constructor for Timedelta as of yet, so need to raise
    if "timedelta64" in dtypes.values():
        raise NotImplementedError(
            'orient="table" can not yet read ISO-formatted Timedelta data'
        )

    df = df.astype(dtypes)

    if "primaryKey" in table["schema"]:
        df = df.set_index(table["schema"]["primaryKey"])
        if len(df.index.names) == 1:
            if df.index.name == "index":
                df.index.name = None
        else:
            # Only string names can be generated 'level_<i>' placeholders;
            # non-string names (e.g. ints) are kept untouched.
            df.index.names = [
                None if isinstance(x, str) and x.startswith("level_") else x
                for x in df.index.names
            ]

    return df