1"""
2Table Schema builders
3
4https://specs.frictionlessdata.io/table-schema/
5"""
6from __future__ import annotations
7
8from typing import (
9 TYPE_CHECKING,
10 Any,
11 cast,
12)
13import warnings
14
15from pandas._libs import lib
16from pandas._libs.json import ujson_loads
17from pandas._libs.tslibs import timezones
18from pandas._libs.tslibs.dtypes import freq_to_period_freqstr
19from pandas.util._exceptions import find_stack_level
20
21from pandas.core.dtypes.base import _registry as registry
22from pandas.core.dtypes.common import (
23 is_bool_dtype,
24 is_integer_dtype,
25 is_numeric_dtype,
26 is_string_dtype,
27)
28from pandas.core.dtypes.dtypes import (
29 CategoricalDtype,
30 DatetimeTZDtype,
31 ExtensionDtype,
32 PeriodDtype,
33)
34
35from pandas import DataFrame
36import pandas.core.common as com
37
38from pandas.tseries.frequencies import to_offset
39
40if TYPE_CHECKING:
41 from pandas._typing import (
42 DtypeObj,
43 JSONSerializable,
44 )
45
46 from pandas import Series
47 from pandas.core.indexes.multi import MultiIndex
48
49
50TABLE_SCHEMA_VERSION = "1.4.0"
51
52
53def as_json_table_type(x: DtypeObj) -> str:
    """
    Convert a NumPy / pandas type to its corresponding Table Schema type.

    Parameters
    ----------
    x : np.dtype or ExtensionDtype

    Returns
    -------
    str
        The Table Schema data type.

    Notes
    -----
    This table shows the relationship between NumPy / pandas dtypes
    and Table Schema dtypes.

    =============== =================
    Pandas type     Table Schema type
    =============== =================
    int64           integer
    float64         number
    bool            boolean
    datetime64[ns]  datetime
    timedelta64[ns] duration
    object          string
    categorical     any
    =============== =================
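
    Examples
    --------
    A minimal illustration of the mapping above (``np`` is not imported in
    this module, hence the explicit import):

    >>> import numpy as np
    >>> as_json_table_type(np.dtype("int64"))
    'integer'
    >>> as_json_table_type(np.dtype("float64"))
    'number'
    >>> as_json_table_type(np.dtype("datetime64[ns]"))
    'datetime'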
    """
    if is_integer_dtype(x):
        return "integer"
    elif is_bool_dtype(x):
        return "boolean"
    elif is_numeric_dtype(x):
        return "number"
    elif lib.is_np_dtype(x, "M") or isinstance(x, (DatetimeTZDtype, PeriodDtype)):
        return "datetime"
    elif lib.is_np_dtype(x, "m"):
        return "duration"
    elif isinstance(x, ExtensionDtype):
        return "any"
    elif is_string_dtype(x):
        return "string"
    else:
        return "any"


def set_default_names(data):
    """
    Set index names to 'index' for a regular Index, or 'level_x' for
    unnamed MultiIndex levels.
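
    Examples
    --------
    A minimal illustration (assuming a DataFrame with an unnamed
    RangeIndex):

    >>> set_default_names(DataFrame({"a": [1, 2]})).index.name
    'index'
    """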
    if com.all_not_none(*data.index.names):
        nms = data.index.names
        if len(nms) == 1 and data.index.name == "index":
            warnings.warn(
                "Index name of 'index' is not round-trippable.",
                stacklevel=find_stack_level(),
            )
        elif len(nms) > 1 and any(x.startswith("level_") for x in nms):
            warnings.warn(
                "Index names beginning with 'level_' are not round-trippable.",
                stacklevel=find_stack_level(),
            )
        return data

    data = data.copy()
    if data.index.nlevels > 1:
        data.index.names = com.fill_missing_names(data.index.names)
    else:
        data.index.name = data.index.name or "index"
    return data


def convert_pandas_type_to_json_field(arr) -> dict[str, JSONSerializable]:
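    """
    Build a Table Schema field descriptor for a Series or Index.

    The field always carries 'name' (falling back to 'values' for unnamed
    data) and 'type'; categorical, period, timezone-aware and other
    extension dtypes add further keys such as 'constraints', 'freq', 'tz'
    or 'extDtype'.

    Examples
    --------
    A minimal illustration, assuming a plain int64 Series:

    >>> import pandas as pd
    >>> convert_pandas_type_to_json_field(pd.Series([1, 2, 3], name="a"))
    {'name': 'a', 'type': 'integer'}
    """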
    dtype = arr.dtype
    name: JSONSerializable
    if arr.name is None:
        name = "values"
    else:
        name = arr.name
    field: dict[str, JSONSerializable] = {
        "name": name,
        "type": as_json_table_type(dtype),
    }

    if isinstance(dtype, CategoricalDtype):
        cats = dtype.categories
        ordered = dtype.ordered

        field["constraints"] = {"enum": list(cats)}
        field["ordered"] = ordered
    elif isinstance(dtype, PeriodDtype):
        field["freq"] = dtype.freq.freqstr
    elif isinstance(dtype, DatetimeTZDtype):
        if timezones.is_utc(dtype.tz):
            # timezone.utc has no "zone" attr
            field["tz"] = "UTC"
        else:
            # error: "tzinfo" has no attribute "zone"
            field["tz"] = dtype.tz.zone  # type: ignore[attr-defined]
    elif isinstance(dtype, ExtensionDtype):
        field["extDtype"] = dtype.name
    return field


def convert_json_field_to_pandas_type(field) -> str | CategoricalDtype:
    """
    Convert a JSON field descriptor into its corresponding NumPy / pandas type.

    Parameters
    ----------
    field : dict
        A JSON field descriptor

    Returns
    -------
    dtype

    Raises
    ------
    ValueError
        If the type of the provided field is unknown or currently unsupported

    Examples
    --------
    >>> convert_json_field_to_pandas_type({"name": "an_int", "type": "integer"})
    'int64'

    >>> convert_json_field_to_pandas_type(
    ...     {
    ...         "name": "a_categorical",
    ...         "type": "any",
    ...         "constraints": {"enum": ["a", "b", "c"]},
    ...         "ordered": True,
    ...     }
    ... )
    CategoricalDtype(categories=['a', 'b', 'c'], ordered=True, categories_dtype=object)

    >>> convert_json_field_to_pandas_type({"name": "a_datetime", "type": "datetime"})
    'datetime64[ns]'

    >>> convert_json_field_to_pandas_type(
    ...     {"name": "a_datetime_with_tz", "type": "datetime", "tz": "US/Central"}
    ... )
    'datetime64[ns, US/Central]'
    """
    typ = field["type"]
    if typ == "string":
        return "object"
    elif typ == "integer":
        return field.get("extDtype", "int64")
    elif typ == "number":
        return field.get("extDtype", "float64")
    elif typ == "boolean":
        return field.get("extDtype", "bool")
    elif typ == "duration":
        return "timedelta64"
    elif typ == "datetime":
        if field.get("tz"):
            return f"datetime64[ns, {field['tz']}]"
        elif field.get("freq"):
            # GH#9586 rename frequency M to ME for offsets
            offset = to_offset(field["freq"])
            freq_n, freq_name = offset.n, offset.name
            freq = freq_to_period_freqstr(freq_n, freq_name)
            # GH#47747 using datetime over period to minimize the change surface
            return f"period[{freq}]"
        else:
            return "datetime64[ns]"
    elif typ == "any":
        if "constraints" in field and "ordered" in field:
            return CategoricalDtype(
                categories=field["constraints"]["enum"], ordered=field["ordered"]
            )
        elif "extDtype" in field:
            return registry.find(field["extDtype"])
        else:
            return "object"

    raise ValueError(f"Unsupported or invalid field type: {typ}")


def build_table_schema(
    data: DataFrame | Series,
    index: bool = True,
    primary_key: bool | None = None,
    version: bool = True,
) -> dict[str, JSONSerializable]:
    """
    Create a Table schema from ``data``.

    Parameters
    ----------
    data : Series, DataFrame
    index : bool, default True
        Whether to include ``data.index`` in the schema.
    primary_key : bool or None, default None
        Column names to designate as the primary key.
        The default `None` will set `'primaryKey'` to the index
        level or levels if the index is unique.
    version : bool, default True
        Whether to include a field `pandas_version` with the version
        of pandas that last revised the table schema. This version
        can be different from the installed pandas version.

    Returns
    -------
    dict

    Notes
    -----
    See `Table Schema
    <https://pandas.pydata.org/docs/user_guide/io.html#table-schema>`__ for
    conversion types.
    Timedeltas are converted to ISO8601 duration format with
    9 decimal places after the seconds field for nanosecond precision.

    Categoricals are converted to the `any` dtype, and use the `enum` field
    constraint to list the allowed values. The `ordered` attribute is included
    in an `ordered` field.

    Examples
    --------
    >>> from pandas.io.json._table_schema import build_table_schema
    >>> df = pd.DataFrame(
    ...     {'A': [1, 2, 3],
    ...      'B': ['a', 'b', 'c'],
    ...      'C': pd.date_range('2016-01-01', freq='d', periods=3),
    ...     }, index=pd.Index(range(3), name='idx'))
    >>> build_table_schema(df)
    {'fields': \
[{'name': 'idx', 'type': 'integer'}, \
{'name': 'A', 'type': 'integer'}, \
{'name': 'B', 'type': 'string'}, \
{'name': 'C', 'type': 'datetime'}], \
'primaryKey': ['idx'], \
'pandas_version': '1.4.0'}
    """
    if index is True:
        data = set_default_names(data)

    schema: dict[str, Any] = {}
    fields = []

    if index:
        if data.index.nlevels > 1:
            data.index = cast("MultiIndex", data.index)
            # each MultiIndex level becomes its own field
            for level, name in zip(data.index.levels, data.index.names):
                new_field = convert_pandas_type_to_json_field(level)
                new_field["name"] = name
                fields.append(new_field)
        else:
            fields.append(convert_pandas_type_to_json_field(data.index))

    if data.ndim > 1:
        for column, s in data.items():
            fields.append(convert_pandas_type_to_json_field(s))
    else:
        fields.append(convert_pandas_type_to_json_field(data))

    schema["fields"] = fields
    # a unique index serves as the default primary key when none is given
    if index and data.index.is_unique and primary_key is None:
        if data.index.nlevels == 1:
            schema["primaryKey"] = [data.index.name]
        else:
            schema["primaryKey"] = data.index.names
    elif primary_key is not None:
        schema["primaryKey"] = primary_key

    if version:
        schema["pandas_version"] = TABLE_SCHEMA_VERSION
    return schema


def parse_table_schema(json, precise_float: bool) -> DataFrame:
    """
    Build a DataFrame from a given Table Schema.

    Parameters
    ----------
    json :
        A JSON table schema
    precise_float : bool
        Flag controlling precision when decoding string to double values, as
        dictated by ``read_json``

    Returns
    -------
    df : DataFrame

    Raises
    ------
    NotImplementedError
        If the JSON table schema contains either timezone or timedelta data

    Notes
    -----
    Because :func:`DataFrame.to_json` uses the string 'index' to denote a
    name-less :class:`Index`, this function sets the name of the returned
    :class:`DataFrame` index to ``None`` when said string is encountered with
    a normal :class:`Index`. For a :class:`MultiIndex`, the same limitation
    applies to any strings beginning with 'level_'. Therefore, an
    :class:`Index` name of 'index' and :class:`MultiIndex` names starting
    with 'level_' are not supported.

    See Also
    --------
    build_table_schema : Inverse function.
    pandas.read_json
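
    Examples
    --------
    A minimal sketch with a single integer column; the hand-written payload
    below mirrors the layout produced by ``DataFrame.to_json(orient="table")``:

    >>> js = (
    ...     '{"schema": {"fields": [{"name": "index", "type": "integer"}, '
    ...     '{"name": "a", "type": "integer"}], "primaryKey": ["index"]}, '
    ...     '"data": [{"index": 0, "a": 1}, {"index": 1, "a": 2}]}'
    ... )
    >>> parse_table_schema(js, precise_float=False)
       a
    0  1
    1  2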
    """
    table = ujson_loads(json, precise_float=precise_float)
    col_order = [field["name"] for field in table["schema"]["fields"]]
    df = DataFrame(table["data"], columns=col_order)[col_order]

    dtypes = {
        field["name"]: convert_json_field_to_pandas_type(field)
        for field in table["schema"]["fields"]
    }

    # No ISO constructor for Timedelta as of yet, so need to raise
    if "timedelta64" in dtypes.values():
        raise NotImplementedError(
            'orient="table" can not yet read ISO-formatted Timedelta data'
        )

    df = df.astype(dtypes)

    if "primaryKey" in table["schema"]:
        df = df.set_index(table["schema"]["primaryKey"])
        if len(df.index.names) == 1:
            if df.index.name == "index":
                df.index.name = None
        else:
            df.index.names = [
                None if x.startswith("level_") else x for x in df.index.names
            ]

    return df