Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/io/json/_table

1"""

2Table Schema builders

4https://specs.frictionlessdata.io/table-schema/

5"""

6from __future__ import annotations

8from typing import (

9 TYPE_CHECKING,

10 Any,

11 cast,

12)

13import warnings

15from pandas._libs.json import loads

16from pandas._libs.tslibs import timezones

17from pandas._typing import (

18 DtypeObj,

19 JSONSerializable,

20)

21from pandas.util._exceptions import find_stack_level

23from pandas.core.dtypes.base import _registry as registry

24from pandas.core.dtypes.common import (

25 is_bool_dtype,

26 is_categorical_dtype,

27 is_datetime64_dtype,

28 is_datetime64tz_dtype,

29 is_extension_array_dtype,

30 is_integer_dtype,

31 is_numeric_dtype,

32 is_period_dtype,

33 is_string_dtype,

34 is_timedelta64_dtype,

35)

36from pandas.core.dtypes.dtypes import CategoricalDtype

38from pandas import DataFrame

39import pandas.core.common as com

41if TYPE_CHECKING:

42 from pandas import Series

43 from pandas.core.indexes.multi import MultiIndex

# Version of pandas that last revised the Table Schema layout; written to the
# "pandas_version" field of schemas built by ``build_table_schema``.
TABLE_SCHEMA_VERSION = "1.4.0"

def as_json_table_type(x: DtypeObj) -> str:
    """
    Map a NumPy / pandas dtype onto the name of a Table Schema field type.

    Parameters
    ----------
    x : np.dtype or ExtensionDtype
        The dtype to classify.

    Returns
    -------
    str
        The Table Schema type name.

    Notes
    -----
    The dtype checks run in the order listed below and the first match wins.

    ==================================  =================
    dtype check                         Table Schema type
    ==================================  =================
    integer                             integer
    bool                                boolean
    any other numeric                   number
    datetime64 / datetime64tz / period  datetime
    timedelta64                         duration
    categorical                         any
    any other extension dtype           any
    string                              string
    everything else                     any
    ==================================  =================
    """
    # Ordered (predicate, label) pairs; order matters because several
    # predicates overlap (e.g. extension dtypes are tested before strings).
    ordered_checks = (
        (is_integer_dtype, "integer"),
        (is_bool_dtype, "boolean"),
        (is_numeric_dtype, "number"),
        (is_datetime64_dtype, "datetime"),
        (is_datetime64tz_dtype, "datetime"),
        (is_period_dtype, "datetime"),
        (is_timedelta64_dtype, "duration"),
        (is_categorical_dtype, "any"),
        (is_extension_array_dtype, "any"),
        (is_string_dtype, "string"),
    )
    for matches, label in ordered_checks:
        if matches(x):
            return label
    return "any"

def set_default_names(data):
    """
    Give unnamed index levels a default name.

    Parameters
    ----------
    data : Series or DataFrame
        Object whose index names are inspected.

    Returns
    -------
    Series or DataFrame
        ``data`` itself when every index level already has a name.
        Otherwise a copy: a single unnamed index is named ``"index"`` and
        the names of a multi-level index are completed with
        ``com.fill_missing_names``.

    Notes
    -----
    A warning is emitted when the existing names collide with the defaults
    (a single index called ``"index"``, or multi-level names starting with
    ``"level_"``), because such names are not round-trippable.
    """
    if com.all_not_none(*data.index.names):
        nms = data.index.names
        if len(nms) == 1 and data.index.name == "index":
            warnings.warn(
                "Index name of 'index' is not round-trippable.",
                stacklevel=find_stack_level(),
            )
        elif len(nms) > 1 and any(
            # Level names may be any hashable (e.g. ints); only strings can
            # collide with the generated "level_<n>" defaults.
            isinstance(x, str) and x.startswith("level_")
            for x in nms
        ):
            warnings.warn(
                "Index names beginning with 'level_' are not round-trippable.",
                stacklevel=find_stack_level(),
            )
        return data

    # Work on a copy so the caller's object keeps its original index names.
    data = data.copy()
    if data.index.nlevels > 1:
        data.index.names = com.fill_missing_names(data.index.names)
    else:
        data.index.name = data.index.name or "index"
    return data

121

122

def convert_pandas_type_to_json_field(arr) -> dict[str, JSONSerializable]:
    """
    Build the Table Schema field descriptor for one index or column.

    Parameters
    ----------
    arr
        Object exposing ``dtype`` and ``name`` attributes (callers in this
        module pass an index, an index level, or a Series).

    Returns
    -------
    dict
        Always contains ``"name"`` (``"values"`` when ``arr.name`` is None)
        and ``"type"``. Depending on the dtype it additionally carries
        ``"constraints"`` and ``"ordered"`` (categorical), ``"freq"``
        (period), ``"tz"`` (timezone-aware datetime) or ``"extDtype"``
        (any other extension dtype).
    """
    dtype = arr.dtype
    name: JSONSerializable = "values" if arr.name is None else arr.name
    field: dict[str, JSONSerializable] = {
        "name": name,
        "type": as_json_table_type(dtype),
    }

    if is_categorical_dtype(dtype):
        field.update(
            constraints={"enum": list(dtype.categories)},
            ordered=dtype.ordered,
        )
        return field

    if is_period_dtype(dtype):
        field["freq"] = dtype.freq.freqstr
        return field

    if is_datetime64tz_dtype(dtype):
        # timezone.utc has no "zone" attr, so UTC is spelled out explicitly
        field["tz"] = "UTC" if timezones.is_utc(dtype.tz) else dtype.tz.zone
        return field

    if is_extension_array_dtype(dtype):
        field["extDtype"] = dtype.name
    return field

152

153

def convert_json_field_to_pandas_type(field) -> str | CategoricalDtype:
    """
    Translate a JSON field descriptor into a NumPy / pandas dtype.

    Parameters
    ----------
    field
        A JSON field descriptor; its ``"type"`` entry selects the dtype and
        the optional ``"extDtype"``, ``"tz"``, ``"freq"``, ``"constraints"``
        and ``"ordered"`` entries refine it.

    Returns
    -------
    dtype
        A dtype string, a ``CategoricalDtype``, or whatever the extension
        dtype registry returns for ``field["extDtype"]``.

    Raises
    ------
    ValueError
        If the type of the provided field is unknown or currently unsupported

    Examples
    --------
    >>> convert_json_field_to_pandas_type({"name": "an_int", "type": "integer"})
    'int64'

    >>> convert_json_field_to_pandas_type(
    ...     {
    ...         "name": "a_categorical",
    ...         "type": "any",
    ...         "constraints": {"enum": ["a", "b", "c"]},
    ...         "ordered": True,
    ...     }
    ... )
    CategoricalDtype(categories=['a', 'b', 'c'], ordered=True)

    >>> convert_json_field_to_pandas_type({"name": "a_datetime", "type": "datetime"})
    'datetime64[ns]'

    >>> convert_json_field_to_pandas_type(
    ...     {"name": "a_datetime_with_tz", "type": "datetime", "tz": "US/Central"}
    ... )
    'datetime64[ns, US/Central]'
    """
    typ = field["type"]

    if typ == "string":
        return "object"

    # Scalar types whose default dtype may be overridden by "extDtype".
    for schema_type, default_dtype in (
        ("integer", "int64"),
        ("number", "float64"),
        ("boolean", "bool"),
    ):
        if typ == schema_type:
            return field.get("extDtype", default_dtype)

    if typ == "duration":
        return "timedelta64"

    if typ == "datetime":
        if field.get("tz"):
            return f"datetime64[ns, {field['tz']}]"
        if field.get("freq"):
            # GH#47747 using datetime over period to minimize the change surface
            return f"period[{field['freq']}]"
        return "datetime64[ns]"

    if typ == "any":
        if "constraints" in field and "ordered" in field:
            enum_values = field["constraints"]["enum"]
            return CategoricalDtype(categories=enum_values, ordered=field["ordered"])
        if "extDtype" in field:
            return registry.find(field["extDtype"])
        return "object"

    raise ValueError(f"Unsupported or invalid field type: {typ}")

225

226

def build_table_schema(
    data: DataFrame | Series,
    index: bool = True,
    primary_key: bool | None = None,
    version: bool = True,
) -> dict[str, JSONSerializable]:
    """
    Create a Table schema from ``data``.

    Parameters
    ----------
    data : Series, DataFrame
    index : bool, default True
        Whether to include ``data.index`` in the schema.
    primary_key : bool or None, default None
        Column names to designate as the primary key.
        The default `None` will set `'primaryKey'` to the index
        level or levels if the index is unique. Any other value is stored
        under `'primaryKey'` unchanged.
    version : bool, default True
        Whether to include a field `pandas_version` with the version
        of pandas that last revised the table schema. This version
        can be different from the installed pandas version.

    Returns
    -------
    dict

    Notes
    -----
    See `Table Schema
    <https://pandas.pydata.org/docs/user_guide/io.html#table-schema>`__ for
    conversion types.
    Timedeltas are converted to ISO8601 duration format with
    9 decimal places after the seconds field for nanosecond precision.

    Categoricals are converted to the `any` dtype, and use the `enum` field
    constraint to list the allowed values. The `ordered` attribute is included
    in an `ordered` field.

    Examples
    --------
    >>> from pandas.io.json._table_schema import build_table_schema
    >>> df = pd.DataFrame(
    ...     {'A': [1, 2, 3],
    ...      'B': ['a', 'b', 'c'],
    ...      'C': pd.date_range('2016-01-01', freq='d', periods=3),
    ...      }, index=pd.Index(range(3), name='idx'))
    >>> build_table_schema(df)
    {'fields': \
[{'name': 'idx', 'type': 'integer'}, \
{'name': 'A', 'type': 'integer'}, \
{'name': 'B', 'type': 'string'}, \
{'name': 'C', 'type': 'datetime'}], \
'primaryKey': ['idx'], \
'pandas_version': '1.4.0'}
    """
    schema: dict[str, Any] = {}
    fields = []

    if index:
        # Use the same truthiness test for naming and for emitting the index
        # so an included index never ends up without a default name.
        data = set_default_names(data)

        if data.index.nlevels > 1:
            # Narrow the type locally instead of re-assigning ``data.index``,
            # which could touch the caller's object.
            multi_index = cast("MultiIndex", data.index)
            for level, name in zip(multi_index.levels, multi_index.names):
                new_field = convert_pandas_type_to_json_field(level)
                # The level object does not carry the index-level name.
                new_field["name"] = name
                fields.append(new_field)
        else:
            fields.append(convert_pandas_type_to_json_field(data.index))

    if data.ndim > 1:
        fields.extend(
            convert_pandas_type_to_json_field(column) for _, column in data.items()
        )
    else:
        fields.append(convert_pandas_type_to_json_field(data))

    schema["fields"] = fields
    if index and data.index.is_unique and primary_key is None:
        if data.index.nlevels == 1:
            schema["primaryKey"] = [data.index.name]
        else:
            schema["primaryKey"] = data.index.names
    elif primary_key is not None:
        schema["primaryKey"] = primary_key

    if version:
        schema["pandas_version"] = TABLE_SCHEMA_VERSION
    return schema

317

318

def parse_table_schema(json, precise_float):
    """
    Builds a DataFrame from a given schema

    Parameters
    ----------
    json :
        A JSON table schema
    precise_float : bool
        Flag controlling precision when decoding string to double values, as
        dictated by ``read_json``

    Returns
    -------
    df : DataFrame

    Raises
    ------
    NotImplementedError
        If the JSON table schema contains timedelta (``"duration"``) data

    Notes
    -----
    Because :func:`DataFrame.to_json` uses the string 'index' to denote a
    name-less :class:`Index`, this function sets the index name of the
    returned :class:`DataFrame` to ``None`` when said string is encountered
    with a normal :class:`Index`. For a :class:`MultiIndex`, the same
    limitation applies to any strings beginning with 'level_'. Therefore, an
    :class:`Index` name of 'index' and :class:`MultiIndex` names starting
    with 'level_' are not supported.

    See Also
    --------
    build_table_schema : Inverse function.
    pandas.read_json
    """
    table = loads(json, precise_float=precise_float)
    schema = table["schema"]
    col_order = [field["name"] for field in schema["fields"]]
    df = DataFrame(table["data"], columns=col_order)[col_order]

    dtypes = {
        field["name"]: convert_json_field_to_pandas_type(field)
        for field in schema["fields"]
    }

    # No ISO constructor for Timedelta as of yet, so need to raise
    if "timedelta64" in dtypes.values():
        raise NotImplementedError(
            'orient="table" can not yet read ISO-formatted Timedelta data'
        )

    df = df.astype(dtypes)

    if "primaryKey" in schema:
        df = df.set_index(schema["primaryKey"])
        if len(df.index.names) == 1:
            if df.index.name == "index":
                df.index.name = None
        else:
            # Only string names can be generated "level_<n>" placeholders;
            # other names (e.g. numbers) are kept as they are.
            df.index.names = [
                None if isinstance(x, str) and x.startswith("level_") else x
                for x in df.index.names
            ]

    return df

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/io/json/_table_schema.py: 15%

134 statements