1"""
2Table Schema builders
3
4https://specs.frictionlessdata.io/table-schema/
5"""
6from __future__ import annotations
7
8from typing import (
9 TYPE_CHECKING,
10 Any,
11 cast,
12)
13import warnings
14
15from pandas._libs.json import loads
16from pandas._libs.tslibs import timezones
17from pandas._typing import (
18 DtypeObj,
19 JSONSerializable,
20)
21from pandas.util._exceptions import find_stack_level
22
23from pandas.core.dtypes.base import _registry as registry
24from pandas.core.dtypes.common import (
25 is_bool_dtype,
26 is_categorical_dtype,
27 is_datetime64_dtype,
28 is_datetime64tz_dtype,
29 is_extension_array_dtype,
30 is_integer_dtype,
31 is_numeric_dtype,
32 is_period_dtype,
33 is_string_dtype,
34 is_timedelta64_dtype,
35)
36from pandas.core.dtypes.dtypes import CategoricalDtype
37
38from pandas import DataFrame
39import pandas.core.common as com
40
41if TYPE_CHECKING:
42 from pandas import Series
43 from pandas.core.indexes.multi import MultiIndex
44
45
# Version string written to the ``pandas_version`` key of generated schemas.
# It identifies the pandas release that last revised the table schema layout
# and may therefore differ from the installed pandas version.
TABLE_SCHEMA_VERSION = "1.4.0"
47
48
def as_json_table_type(x: DtypeObj) -> str:
    """
    Convert a NumPy / pandas type to its corresponding json_table.

    Parameters
    ----------
    x : np.dtype or ExtensionDtype

    Returns
    -------
    str
        the Table Schema data types

    Notes
    -----
    This table shows the relationship between NumPy / pandas dtypes,
    and Table Schema dtypes.

    ===============  =================
    Pandas type      Table Schema type
    ===============  =================
    int64            integer
    float64          number
    bool             boolean
    datetime64[ns]   datetime
    timedelta64[ns]  duration
    object           string
    categorical      any
    ===============  =================

    Timezone-aware datetimes and periods are also reported as ``datetime``.
    Any other extension dtype, and anything not matched above, is reported
    as ``any``.
    """
    # Order matters: integer and bool dtypes also count as numeric, so the
    # narrower checks have to run before ``is_numeric_dtype``.
    if is_integer_dtype(x):
        return "integer"
    if is_bool_dtype(x):
        return "boolean"
    if is_numeric_dtype(x):
        return "number"
    if is_datetime64_dtype(x) or is_datetime64tz_dtype(x) or is_period_dtype(x):
        return "datetime"
    if is_timedelta64_dtype(x):
        return "duration"
    if is_extension_array_dtype(x):
        # Categoricals are extension dtypes as well, so they land here too.
        return "any"
    if is_string_dtype(x):
        return "string"
    return "any"
97
98
def set_default_names(data):
    """
    Give unnamed index levels a default name.

    A single unnamed index is named 'index'; unnamed levels of a MultiIndex
    are filled in by ``com.fill_missing_names`` ('level_x' style names).

    Parameters
    ----------
    data : Series or DataFrame
        Object whose index names are inspected.

    Returns
    -------
    Series or DataFrame
        ``data`` itself when every index level already has a name,
        otherwise a copy carrying the default names.

    Warns
    -----
    UserWarning
        If an existing name collides with the defaults ('index' for a single
        index, or a name starting with 'level_' for a MultiIndex), since such
        names are not round-trippable.
    """
    names = data.index.names
    if com.all_not_none(*names):
        if len(names) == 1:
            if data.index.name == "index":
                warnings.warn(
                    "Index name of 'index' is not round-trippable.",
                    stacklevel=find_stack_level(),
                )
        elif any(isinstance(nm, str) and nm.startswith("level_") for nm in names):
            # Level names may be arbitrary hashables (e.g. ints); only
            # strings can collide with the 'level_x' defaults.
            warnings.warn(
                "Index names beginning with 'level_' are not round-trippable.",
                stacklevel=find_stack_level(),
            )
        return data

    # Work on a copy so the caller's object keeps its original index names.
    data = data.copy()
    if data.index.nlevels > 1:
        data.index.names = com.fill_missing_names(data.index.names)
    else:
        # Reaching this branch with a single level means its name is None.
        data.index.name = "index"
    return data
121
122
def convert_pandas_type_to_json_field(arr) -> dict[str, JSONSerializable]:
    """
    Build the Table Schema field descriptor for a Series or Index.

    Parameters
    ----------
    arr : Series or Index
        Any object exposing ``dtype`` and ``name`` attributes.

    Returns
    -------
    dict
        Always holds ``"name"`` (``"values"`` when ``arr.name`` is None) and
        ``"type"``. Depending on the dtype it additionally holds
        ``"constraints"`` and ``"ordered"`` (categorical), ``"freq"``
        (period), ``"tz"`` (timezone-aware datetime) or ``"extDtype"``
        (any other extension dtype).
    """
    dtype = arr.dtype
    name: JSONSerializable = "values" if arr.name is None else arr.name
    field: dict[str, JSONSerializable] = {
        "name": name,
        "type": as_json_table_type(dtype),
    }

    if is_categorical_dtype(dtype):
        field["constraints"] = {"enum": list(dtype.categories)}
        field["ordered"] = dtype.ordered
    elif is_period_dtype(dtype):
        field["freq"] = dtype.freq.freqstr
    elif is_datetime64tz_dtype(dtype):
        if timezones.is_utc(dtype.tz):
            # timezone.utc has no "zone" attr
            field["tz"] = "UTC"
        else:
            # Only pytz timezones carry a ``zone`` attribute; ask pandas for
            # the timezone's name so zoneinfo / dateutil zones work as well.
            zone = timezones.get_timezone(dtype.tz)
            if isinstance(zone, str):
                field["tz"] = zone
            else:
                # No name available (e.g. a fixed offset): keep the pytz
                # ``zone`` value if there is one, otherwise emit None.
                field["tz"] = getattr(dtype.tz, "zone", None)
    elif is_extension_array_dtype(dtype):
        field["extDtype"] = dtype.name
    return field
152
153
154def convert_json_field_to_pandas_type(field) -> str | CategoricalDtype:
155 """
156 Converts a JSON field descriptor into its corresponding NumPy / pandas type
157
158 Parameters
159 ----------
160 field
161 A JSON field descriptor
162
163 Returns
164 -------
165 dtype
166
167 Raises
168 ------
169 ValueError
170 If the type of the provided field is unknown or currently unsupported
171
172 Examples
173 --------
174 >>> convert_json_field_to_pandas_type({"name": "an_int", "type": "integer"})
175 'int64'
176
177 >>> convert_json_field_to_pandas_type(
178 ... {
179 ... "name": "a_categorical",
180 ... "type": "any",
181 ... "constraints": {"enum": ["a", "b", "c"]},
182 ... "ordered": True,
183 ... }
184 ... )
185 CategoricalDtype(categories=['a', 'b', 'c'], ordered=True)
186
187 >>> convert_json_field_to_pandas_type({"name": "a_datetime", "type": "datetime"})
188 'datetime64[ns]'
189
190 >>> convert_json_field_to_pandas_type(
191 ... {"name": "a_datetime_with_tz", "type": "datetime", "tz": "US/Central"}
192 ... )
193 'datetime64[ns, US/Central]'
194 """
195 typ = field["type"]
196 if typ == "string":
197 return "object"
198 elif typ == "integer":
199 return field.get("extDtype", "int64")
200 elif typ == "number":
201 return field.get("extDtype", "float64")
202 elif typ == "boolean":
203 return field.get("extDtype", "bool")
204 elif typ == "duration":
205 return "timedelta64"
206 elif typ == "datetime":
207 if field.get("tz"):
208 return f"datetime64[ns, {field['tz']}]"
209 elif field.get("freq"):
210 # GH#47747 using datetime over period to minimize the change surface
211 return f"period[{field['freq']}]"
212 else:
213 return "datetime64[ns]"
214 elif typ == "any":
215 if "constraints" in field and "ordered" in field:
216 return CategoricalDtype(
217 categories=field["constraints"]["enum"], ordered=field["ordered"]
218 )
219 elif "extDtype" in field:
220 return registry.find(field["extDtype"])
221 else:
222 return "object"
223
224 raise ValueError(f"Unsupported or invalid field type: {typ}")
225
226
def build_table_schema(
    data: DataFrame | Series,
    index: bool = True,
    primary_key: bool | None = None,
    version: bool = True,
) -> dict[str, JSONSerializable]:
    """
    Create a Table schema from ``data``.

    Parameters
    ----------
    data : Series, DataFrame
    index : bool, default True
        Whether to include ``data.index`` in the schema. Unnamed index
        levels are given default names before their fields are built.
    primary_key : bool or None, default None
        Column names to designate as the primary key.
        The default `None` will set `'primaryKey'` to the index
        level or levels if the index is unique. Any other value is stored
        unchanged under `'primaryKey'`.
    version : bool, default True
        Whether to include a field `pandas_version` with the version
        of pandas that last revised the table schema. This version
        can be different from the installed pandas version.

    Returns
    -------
    dict

    Notes
    -----
    See `Table Schema
    <https://pandas.pydata.org/docs/user_guide/io.html#table-schema>`__ for
    conversion types.
    Timedeltas are converted to ISO8601 duration format with
    9 decimal places after the seconds field for nanosecond precision.

    Categoricals are converted to the `any` dtype, and use the `enum` field
    constraint to list the allowed values. The `ordered` attribute is included
    in an `ordered` field.

    Examples
    --------
    >>> from pandas.io.json._table_schema import build_table_schema
    >>> df = pd.DataFrame(
    ...     {'A': [1, 2, 3],
    ...      'B': ['a', 'b', 'c'],
    ...      'C': pd.date_range('2016-01-01', freq='d', periods=3),
    ...      }, index=pd.Index(range(3), name='idx'))
    >>> build_table_schema(df)
    {'fields': \
[{'name': 'idx', 'type': 'integer'}, \
{'name': 'A', 'type': 'integer'}, \
{'name': 'B', 'type': 'string'}, \
{'name': 'C', 'type': 'datetime'}], \
'primaryKey': ['idx'], \
'pandas_version': '1.4.0'}
    """
    schema: dict[str, Any] = {}
    fields = []

    if index:
        # Use the same truthiness test for naming and for emitting the index
        # fields, so every emitted index field has a usable name.
        data = set_default_names(data)

        if data.index.nlevels > 1:
            # Narrow the type locally instead of re-assigning ``data.index``.
            multi = cast("MultiIndex", data.index)
            for level, name in zip(multi.levels, multi.names):
                new_field = convert_pandas_type_to_json_field(level)
                # The level's own name may differ from the name it carries
                # within the MultiIndex; the latter is the one to publish.
                new_field["name"] = name
                fields.append(new_field)
        else:
            fields.append(convert_pandas_type_to_json_field(data.index))

    if data.ndim > 1:
        fields.extend(
            convert_pandas_type_to_json_field(column) for _, column in data.items()
        )
    else:
        fields.append(convert_pandas_type_to_json_field(data))

    schema["fields"] = fields

    if primary_key is not None:
        schema["primaryKey"] = primary_key
    elif index and data.index.is_unique:
        # Default primary key: the (unique) index level or levels.
        if data.index.nlevels == 1:
            schema["primaryKey"] = [data.index.name]
        else:
            schema["primaryKey"] = data.index.names

    if version:
        schema["pandas_version"] = TABLE_SCHEMA_VERSION
    return schema
317
318
def parse_table_schema(json, precise_float):
    """
    Builds a DataFrame from a given schema

    Parameters
    ----------
    json :
        A JSON table schema
    precise_float : bool
        Flag controlling precision when decoding string to double values, as
        dictated by ``read_json``

    Returns
    -------
    df : DataFrame

    Raises
    ------
    NotImplementedError
        If the JSON table schema contains timedelta (``duration``) data

    Notes
    -----
    Because :func:`DataFrame.to_json` uses the string 'index' to denote a
    name-less :class:`Index`, this function sets the index name of the
    returned :class:`DataFrame` to ``None`` when said string is encountered
    with a normal :class:`Index`. For a :class:`MultiIndex`, the same
    limitation applies to any strings beginning with 'level_'. Therefore, an
    :class:`Index` name of 'index' and :class:`MultiIndex` names starting
    with 'level_' are not supported.

    See Also
    --------
    build_table_schema : Inverse function.
    pandas.read_json
    """
    table = loads(json, precise_float=precise_float)
    schema = table["schema"]
    fields = schema["fields"]

    col_order = [field["name"] for field in fields]
    df = DataFrame(table["data"], columns=col_order)[col_order]

    dtypes = {
        field["name"]: convert_json_field_to_pandas_type(field) for field in fields
    }

    # No ISO constructor for Timedelta as of yet, so need to raise
    if "timedelta64" in dtypes.values():
        raise NotImplementedError(
            'orient="table" can not yet read ISO-formatted Timedelta data'
        )

    df = df.astype(dtypes)

    if "primaryKey" in schema:
        df = df.set_index(schema["primaryKey"])
        if len(df.index.names) == 1:
            if df.index.name == "index":
                df.index.name = None
        else:
            # Field names taken from the schema need not be strings, so only
            # strings are tested for the 'level_' placeholder prefix.
            df.index.names = [
                None if isinstance(x, str) and x.startswith("level_") else x
                for x in df.index.names
            ]

    return df