Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/io/json/_table_schema.py: 15%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

134 statements  

1""" 

2Table Schema builders 

3 

4https://specs.frictionlessdata.io/table-schema/ 

5""" 

6from __future__ import annotations 

7 

8from typing import ( 

9 TYPE_CHECKING, 

10 Any, 

11 cast, 

12) 

13import warnings 

14 

15from pandas._libs.json import loads 

16from pandas._libs.tslibs import timezones 

17from pandas._typing import ( 

18 DtypeObj, 

19 JSONSerializable, 

20) 

21from pandas.util._exceptions import find_stack_level 

22 

23from pandas.core.dtypes.base import _registry as registry 

24from pandas.core.dtypes.common import ( 

25 is_bool_dtype, 

26 is_categorical_dtype, 

27 is_datetime64_dtype, 

28 is_datetime64tz_dtype, 

29 is_extension_array_dtype, 

30 is_integer_dtype, 

31 is_numeric_dtype, 

32 is_period_dtype, 

33 is_string_dtype, 

34 is_timedelta64_dtype, 

35) 

36from pandas.core.dtypes.dtypes import CategoricalDtype 

37 

38from pandas import DataFrame 

39import pandas.core.common as com 

40 

41if TYPE_CHECKING: 

42 from pandas import Series 

43 from pandas.core.indexes.multi import MultiIndex 

44 

45 

46TABLE_SCHEMA_VERSION = "1.4.0" 

47 

48 

def as_json_table_type(x: DtypeObj) -> str:
    """
    Convert a NumPy / pandas type to its corresponding json_table.

    Parameters
    ----------
    x : np.dtype or ExtensionDtype

    Returns
    -------
    str
        the Table Schema data types

    Notes
    -----
    This table shows the relationship between NumPy / pandas dtypes,
    and Table Schema dtypes.

    ===============  =================
    Pandas type      Table Schema type
    ===============  =================
    int64            integer
    float64          number
    bool             boolean
    datetime64[ns]   datetime
    timedelta64[ns]  duration
    object           string
    categorical      any
    ===============  =================

    Timezone-aware datetime and period dtypes are also reported as
    ``datetime``.  Every other extension dtype, and anything that matches
    none of the checks, is reported as ``any``.
    """
    if is_integer_dtype(x):
        return "integer"
    # bool is tested before the generic numeric check so that it is not
    # reported as "number"
    if is_bool_dtype(x):
        return "boolean"
    if is_numeric_dtype(x):
        return "number"
    if is_datetime64_dtype(x) or is_datetime64tz_dtype(x) or is_period_dtype(x):
        return "datetime"
    if is_timedelta64_dtype(x):
        return "duration"
    # categorical / extension dtypes are tested before is_string_dtype, so an
    # extension dtype that reaches this point is always reported as "any"
    if is_categorical_dtype(x) or is_extension_array_dtype(x):
        return "any"
    if is_string_dtype(x):
        return "string"
    return "any"

97 

98 

def set_default_names(data):
    """
    Sets index names to 'index' for regular, or 'level_x' for Multi.

    Parameters
    ----------
    data : Series or DataFrame

    Returns
    -------
    Series or DataFrame
        ``data`` itself (not a copy) when every index level already has a
        name; otherwise a copy of ``data`` whose unnamed index levels have
        been given default names.

    Warns
    -----
    UserWarning
        If all levels are named but a name collides with the defaults this
        function would generate: a single-level index named ``'index'``, or
        a multi-level index with a name starting with ``'level_'``.
    """
    names = data.index.names
    if com.all_not_none(*names):
        if len(names) == 1 and data.index.name == "index":
            warnings.warn(
                "Index name of 'index' is not round-trippable.",
                stacklevel=find_stack_level(),
            )
        elif len(names) > 1 and any(
            # level names may be any hashable (e.g. ints); only strings can
            # collide with the generated 'level_x' names
            isinstance(name, str) and name.startswith("level_")
            for name in names
        ):
            warnings.warn(
                "Index names beginning with 'level_' are not round-trippable.",
                stacklevel=find_stack_level(),
            )
        return data

    # work on a copy so that naming the index does not touch the caller's object
    data = data.copy()
    if data.index.nlevels > 1:
        data.index.names = com.fill_missing_names(data.index.names)
    else:
        data.index.name = data.index.name or "index"
    return data

121 

122 

def convert_pandas_type_to_json_field(arr) -> dict[str, JSONSerializable]:
    """
    Describe an Index or Series as a Table Schema field descriptor.

    Parameters
    ----------
    arr : Index or Series
        Only ``arr.dtype`` and ``arr.name`` are read.

    Returns
    -------
    dict
        Always contains ``"name"`` (``"values"`` when ``arr.name`` is None)
        and ``"type"``.  Depending on the dtype it additionally contains:

        * categorical: ``"constraints"`` (``{"enum": [...categories]}``)
          and ``"ordered"``
        * period: ``"freq"``
        * timezone-aware datetime: ``"tz"``
        * any other extension dtype: ``"extDtype"``
    """
    dtype = arr.dtype
    name: JSONSerializable = "values" if arr.name is None else arr.name
    field: dict[str, JSONSerializable] = {
        "name": name,
        "type": as_json_table_type(dtype),
    }

    if is_categorical_dtype(dtype):
        field["constraints"] = {"enum": list(dtype.categories)}
        field["ordered"] = dtype.ordered
    elif is_period_dtype(dtype):
        field["freq"] = dtype.freq.freqstr
    elif is_datetime64tz_dtype(dtype):
        tz = dtype.tz
        if timezones.is_utc(tz):
            # timezone.utc has no "zone" attr
            field["tz"] = "UTC"
        else:
            # pytz timezones expose their name as ``zone``; zoneinfo.ZoneInfo
            # exposes it as ``key``.  Reading ``tz.zone`` unconditionally
            # raised AttributeError for zoneinfo timezones.  A tz with
            # neither attribute is recorded as None.
            field["tz"] = getattr(tz, "zone", None) or getattr(tz, "key", None)
    elif is_extension_array_dtype(dtype):
        field["extDtype"] = dtype.name
    return field

152 

153 

def convert_json_field_to_pandas_type(field) -> str | CategoricalDtype:
    """
    Map a Table Schema field descriptor back to a NumPy / pandas dtype.

    Parameters
    ----------
    field : dict
        A JSON field descriptor.  ``field["type"]`` selects the mapping; the
        optional keys ``extDtype``, ``tz``, ``freq``, ``constraints`` and
        ``ordered`` refine it.

    Returns
    -------
    dtype
        A dtype string, a ``CategoricalDtype``, or whatever the extension
        dtype registry returns for ``field["extDtype"]``.

    Raises
    ------
    ValueError
        If the type of the provided field is unknown or currently unsupported

    Examples
    --------
    >>> convert_json_field_to_pandas_type({"name": "an_int", "type": "integer"})
    'int64'

    >>> convert_json_field_to_pandas_type(
    ...     {
    ...         "name": "a_categorical",
    ...         "type": "any",
    ...         "constraints": {"enum": ["a", "b", "c"]},
    ...         "ordered": True,
    ...     }
    ... )
    CategoricalDtype(categories=['a', 'b', 'c'], ordered=True)

    >>> convert_json_field_to_pandas_type({"name": "a_datetime", "type": "datetime"})
    'datetime64[ns]'

    >>> convert_json_field_to_pandas_type(
    ...     {"name": "a_datetime_with_tz", "type": "datetime", "tz": "US/Central"}
    ... )
    'datetime64[ns, US/Central]'
    """
    kind = field["type"]

    if kind == "string":
        return "object"

    # Scalar types for which a recorded extension dtype takes precedence
    # over the plain NumPy default.
    if kind == "integer":
        return field.get("extDtype", "int64")
    if kind == "number":
        return field.get("extDtype", "float64")
    if kind == "boolean":
        return field.get("extDtype", "bool")

    if kind == "duration":
        return "timedelta64"

    if kind == "datetime":
        tz = field.get("tz")
        if tz:
            return f"datetime64[ns, {tz}]"
        freq = field.get("freq")
        if freq:
            # GH#47747 using datetime over period to minimize the change surface
            return f"period[{freq}]"
        return "datetime64[ns]"

    if kind == "any":
        is_categorical = "constraints" in field and "ordered" in field
        if is_categorical:
            categories = field["constraints"]["enum"]
            return CategoricalDtype(categories=categories, ordered=field["ordered"])
        if "extDtype" in field:
            return registry.find(field["extDtype"])
        return "object"

    raise ValueError(f"Unsupported or invalid field type: {kind}")

225 

226 

def build_table_schema(
    data: DataFrame | Series,
    index: bool = True,
    primary_key: bool | None = None,
    version: bool = True,
) -> dict[str, JSONSerializable]:
    """
    Create a Table schema from ``data``.

    Parameters
    ----------
    data : Series, DataFrame
    index : bool, default True
        Whether to include ``data.index`` in the schema.
    primary_key : bool or None, default None
        Column names to designate as the primary key.  Any value other
        than `None` is stored unchanged under `'primaryKey'`.
        The default `None` will set `'primaryKey'` to the index
        level or levels if the index is included and unique.
    version : bool, default True
        Whether to include a field `pandas_version` with the version
        of pandas that last revised the table schema. This version
        can be different from the installed pandas version.

    Returns
    -------
    dict

    Notes
    -----
    See `Table Schema
    <https://pandas.pydata.org/docs/user_guide/io.html#table-schema>`__ for
    conversion types.
    Timedeltas are converted to ISO8601 duration format with
    9 decimal places after the seconds field for nanosecond precision.

    Categoricals are converted to the `any` dtype, and use the `enum` field
    constraint to list the allowed values. The `ordered` attribute is included
    in an `ordered` field.

    Examples
    --------
    >>> from pandas.io.json._table_schema import build_table_schema
    >>> df = pd.DataFrame(
    ...     {'A': [1, 2, 3],
    ...      'B': ['a', 'b', 'c'],
    ...      'C': pd.date_range('2016-01-01', freq='d', periods=3),
    ...      }, index=pd.Index(range(3), name='idx'))
    >>> build_table_schema(df)
    {'fields': \
[{'name': 'idx', 'type': 'integer'}, \
{'name': 'A', 'type': 'integer'}, \
{'name': 'B', 'type': 'string'}, \
{'name': 'C', 'type': 'datetime'}], \
'primaryKey': ['idx'], \
'pandas_version': '1.4.0'}
    """
    # Use plain truthiness here, matching the ``if index`` checks below, so
    # that a truthy non-``True`` flag (e.g. ``np.True_``) still gets default
    # index names before the index fields are emitted.
    if index:
        data = set_default_names(data)

    schema: dict[str, Any] = {}
    fields = []

    if index:
        if data.index.nlevels > 1:
            # narrow the type in a local instead of assigning back onto
            # ``data``, which may be the caller's own object
            multi_index = cast("MultiIndex", data.index)
            for level, name in zip(multi_index.levels, multi_index.names):
                new_field = convert_pandas_type_to_json_field(level)
                # the field is built from the level values, so the name is
                # taken from the index names instead
                new_field["name"] = name
                fields.append(new_field)
        else:
            fields.append(convert_pandas_type_to_json_field(data.index))

    if data.ndim > 1:
        for _, column_values in data.items():
            fields.append(convert_pandas_type_to_json_field(column_values))
    else:
        fields.append(convert_pandas_type_to_json_field(data))

    schema["fields"] = fields
    if index and data.index.is_unique and primary_key is None:
        if data.index.nlevels == 1:
            schema["primaryKey"] = [data.index.name]
        else:
            schema["primaryKey"] = data.index.names
    elif primary_key is not None:
        schema["primaryKey"] = primary_key

    if version:
        schema["pandas_version"] = TABLE_SCHEMA_VERSION
    return schema

317 

318 

def parse_table_schema(json, precise_float):
    """
    Builds a DataFrame from a given schema

    Parameters
    ----------
    json :
        A JSON table schema
    precise_float : bool
        Flag controlling precision when decoding string to double values, as
        dictated by ``read_json``

    Returns
    -------
    df : DataFrame

    Raises
    ------
    NotImplementedError
        If the JSON table schema contains timedelta data
    ValueError
        If a field has an unknown or unsupported type

    Notes
    -----
    Because :func:`DataFrame.to_json` uses the string 'index' to denote a
    name-less :class:`Index`, this function sets the name of the returned
    :class:`DataFrame`'s index to ``None`` when said string is encountered
    with a normal :class:`Index`. For a :class:`MultiIndex`, the same
    limitation applies to any strings beginning with 'level_'. Therefore, an
    :class:`Index` name of 'index' and :class:`MultiIndex` names starting
    with 'level_' are not supported.

    See Also
    --------
    build_table_schema : Inverse function.
    pandas.read_json
    """
    table = loads(json, precise_float=precise_float)
    schema_fields = table["schema"]["fields"]
    col_order = [field["name"] for field in schema_fields]
    df = DataFrame(table["data"], columns=col_order)[col_order]

    dtypes = {
        field["name"]: convert_json_field_to_pandas_type(field)
        for field in schema_fields
    }

    # No ISO constructor for Timedelta as of yet, so need to raise
    if "timedelta64" in dtypes.values():
        raise NotImplementedError(
            'table="orient" can not yet read ISO-formatted Timedelta data'
        )

    df = df.astype(dtypes)

    if "primaryKey" in table["schema"]:
        df = df.set_index(table["schema"]["primaryKey"])
        if len(df.index.names) == 1:
            if df.index.name == "index":
                df.index.name = None
        else:
            # Strip only generated 'level_x' placeholders.  Names decoded
            # from JSON are not necessarily strings, so check the type before
            # calling ``startswith``.
            df.index.names = [
                None if isinstance(x, str) and x.startswith("level_") else x
                for x in df.index.names
            ]

    return df