1"""
2Table Schema builders
3
4https://specs.frictionlessdata.io/table-schema/
5"""
6from __future__ import annotations
7
8from typing import (
9 TYPE_CHECKING,
10 Any,
11 cast,
12)
13import warnings
14
15from pandas._libs import lib
16from pandas._libs.json import ujson_loads
17from pandas._libs.tslibs import timezones
18from pandas._libs.tslibs.dtypes import freq_to_period_freqstr
19from pandas.util._exceptions import find_stack_level
20
21from pandas.core.dtypes.base import _registry as registry
22from pandas.core.dtypes.common import (
23 is_bool_dtype,
24 is_integer_dtype,
25 is_numeric_dtype,
26 is_string_dtype,
27)
28from pandas.core.dtypes.dtypes import (
29 CategoricalDtype,
30 DatetimeTZDtype,
31 ExtensionDtype,
32 PeriodDtype,
33)
34
35from pandas import DataFrame
36import pandas.core.common as com
37
38from pandas.tseries.frequencies import to_offset
39
40if TYPE_CHECKING:
41 from pandas._typing import (
42 DtypeObj,
43 JSONSerializable,
44 )
45
46 from pandas import Series
47 from pandas.core.indexes.multi import MultiIndex
48
49
50TABLE_SCHEMA_VERSION = "1.4.0"
51
52
53def as_json_table_type(x: DtypeObj) -> str:
    """
    Convert a NumPy / pandas type to its corresponding Table Schema type.

    Parameters
    ----------
    x : np.dtype or ExtensionDtype

    Returns
    -------
    str
        The Table Schema data type.

    Notes
    -----
    This table shows the relationship between NumPy / pandas dtypes
    and Table Schema dtypes.

    =============== =================
    Pandas type     Table Schema type
    =============== =================
    int64           integer
    float64         number
    bool            boolean
    datetime64[ns]  datetime
    timedelta64[ns] duration
    object          string
    categorical     any
    =============== =================
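
    Examples
    --------
    A minimal illustration of the mapping above (``np`` is not imported in
    this module, hence the explicit import):

    >>> import numpy as np
    >>> as_json_table_type(np.dtype("int64"))
    'integer'
    >>> as_json_table_type(np.dtype("float64"))
    'number'
    >>> as_json_table_type(np.dtype("datetime64[ns]"))
    'datetime'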
    """
    if is_integer_dtype(x):
        return "integer"
    elif is_bool_dtype(x):
        return "boolean"
    elif is_numeric_dtype(x):
        return "number"
    elif lib.is_np_dtype(x, "M") or isinstance(x, (DatetimeTZDtype, PeriodDtype)):
        return "datetime"
    elif lib.is_np_dtype(x, "m"):
        return "duration"
    elif isinstance(x, ExtensionDtype):
        return "any"
    elif is_string_dtype(x):
        return "string"
    else:
        return "any"


def set_default_names(data):
    """
    Set index names to 'index' for a regular Index, or 'level_x' for
    unnamed MultiIndex levels.
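
    Examples
    --------
    A minimal illustration (assuming a DataFrame with an unnamed
    RangeIndex):

    >>> set_default_names(DataFrame({"a": [1, 2]})).index.name
    'index'
    """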
    if com.all_not_none(*data.index.names):
        nms = data.index.names
        if len(nms) == 1 and data.index.name == "index":
            warnings.warn(
                "Index name of 'index' is not round-trippable.",
                stacklevel=find_stack_level(),
            )
        elif len(nms) > 1 and any(x.startswith("level_") for x in nms):
            warnings.warn(
                "Index names beginning with 'level_' are not round-trippable.",
                stacklevel=find_stack_level(),
            )
        return data

    data = data.copy()
    if data.index.nlevels > 1:
        data.index.names = com.fill_missing_names(data.index.names)
    else:
        data.index.name = data.index.name or "index"
    return data


def convert_pandas_type_to_json_field(arr) -> dict[str, JSONSerializable]:
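    """
    Build a Table Schema field descriptor for a Series or Index.

    The field always carries 'name' (falling back to 'values' for unnamed
    data) and 'type'; categorical, period, timezone-aware and other
    extension dtypes add further keys such as 'constraints', 'freq', 'tz'
    or 'extDtype'.

    Examples
    --------
    A minimal illustration, assuming a plain int64 Series:

    >>> import pandas as pd
    >>> convert_pandas_type_to_json_field(pd.Series([1, 2, 3], name="a"))
    {'name': 'a', 'type': 'integer'}
    """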
    dtype = arr.dtype
    name: JSONSerializable
    if arr.name is None:
        name = "values"
    else:
        name = arr.name
    field: dict[str, JSONSerializable] = {
        "name": name,
        "type": as_json_table_type(dtype),
    }

    if isinstance(dtype, CategoricalDtype):
        cats = dtype.categories
        ordered = dtype.ordered

        field["constraints"] = {"enum": list(cats)}
        field["ordered"] = ordered
    elif isinstance(dtype, PeriodDtype):
        field["freq"] = dtype.freq.freqstr
    elif isinstance(dtype, DatetimeTZDtype):
        if timezones.is_utc(dtype.tz):
            # timezone.utc has no "zone" attr
            field["tz"] = "UTC"
        else:
            # error: "tzinfo" has no attribute "zone"
            field["tz"] = dtype.tz.zone  # type: ignore[attr-defined]
    elif isinstance(dtype, ExtensionDtype):
        field["extDtype"] = dtype.name
    return field


def convert_json_field_to_pandas_type(field) -> str | CategoricalDtype:
    """
    Convert a JSON field descriptor into its corresponding NumPy / pandas type.

    Parameters
    ----------
    field : dict
        A JSON field descriptor

    Returns
    -------
    dtype

    Raises
    ------
    ValueError
        If the type of the provided field is unknown or currently unsupported

    Examples
    --------
    >>> convert_json_field_to_pandas_type({"name": "an_int", "type": "integer"})
    'int64'

    >>> convert_json_field_to_pandas_type(
    ...     {
    ...         "name": "a_categorical",
    ...         "type": "any",
    ...         "constraints": {"enum": ["a", "b", "c"]},
    ...         "ordered": True,
    ...     }
    ... )
    CategoricalDtype(categories=['a', 'b', 'c'], ordered=True, categories_dtype=object)

    >>> convert_json_field_to_pandas_type({"name": "a_datetime", "type": "datetime"})
    'datetime64[ns]'

    >>> convert_json_field_to_pandas_type(
    ...     {"name": "a_datetime_with_tz", "type": "datetime", "tz": "US/Central"}
    ... )
    'datetime64[ns, US/Central]'
    """
    typ = field["type"]
    if typ == "string":
        return "object"
    elif typ == "integer":
        return field.get("extDtype", "int64")
    elif typ == "number":
        return field.get("extDtype", "float64")
    elif typ == "boolean":
        return field.get("extDtype", "bool")
    elif typ == "duration":
        return "timedelta64"
    elif typ == "datetime":
        if field.get("tz"):
            return f"datetime64[ns, {field['tz']}]"
        elif field.get("freq"):
            # GH#9586 rename frequency M to ME for offsets
            offset = to_offset(field["freq"])
            freq_n, freq_name = offset.n, offset.name
            freq = freq_to_period_freqstr(freq_n, freq_name)
            # GH#47747 using datetime over period to minimize the change surface
            return f"period[{freq}]"
        else:
            return "datetime64[ns]"
    elif typ == "any":
        if "constraints" in field and "ordered" in field:
            return CategoricalDtype(
                categories=field["constraints"]["enum"], ordered=field["ordered"]
            )
        elif "extDtype" in field:
            return registry.find(field["extDtype"])
        else:
            return "object"

    raise ValueError(f"Unsupported or invalid field type: {typ}")


def build_table_schema(
    data: DataFrame | Series,
    index: bool = True,
    primary_key: bool | None = None,
    version: bool = True,
) -> dict[str, JSONSerializable]:
    """
    Create a Table schema from ``data``.

    Parameters
    ----------
    data : Series, DataFrame
    index : bool, default True
        Whether to include ``data.index`` in the schema.
    primary_key : bool or None, default None
        Column names to designate as the primary key.
        The default `None` will set `'primaryKey'` to the index
        level or levels if the index is unique.
    version : bool, default True
        Whether to include a field `pandas_version` with the version
        of pandas that last revised the table schema. This version
        can be different from the installed pandas version.

    Returns
    -------
    dict

    Notes
    -----
    See `Table Schema
    <https://pandas.pydata.org/docs/user_guide/io.html#table-schema>`__ for
    conversion types.
    Timedeltas are converted to ISO8601 duration format with
    9 decimal places after the seconds field for nanosecond precision.

    Categoricals are converted to the `any` dtype, and use the `enum` field
    constraint to list the allowed values. The `ordered` attribute is included
    in an `ordered` field.

    Examples
    --------
    >>> from pandas.io.json._table_schema import build_table_schema
    >>> df = pd.DataFrame(
    ...     {'A': [1, 2, 3],
    ...      'B': ['a', 'b', 'c'],
    ...      'C': pd.date_range('2016-01-01', freq='d', periods=3),
    ...     }, index=pd.Index(range(3), name='idx'))
    >>> build_table_schema(df)
    {'fields': \
[{'name': 'idx', 'type': 'integer'}, \
{'name': 'A', 'type': 'integer'}, \
{'name': 'B', 'type': 'string'}, \
{'name': 'C', 'type': 'datetime'}], \
'primaryKey': ['idx'], \
'pandas_version': '1.4.0'}
    """
    if index is True:
        data = set_default_names(data)

    schema: dict[str, Any] = {}
    fields = []

    if index:
        if data.index.nlevels > 1:
            data.index = cast("MultiIndex", data.index)
            # each MultiIndex level becomes its own field
            for level, name in zip(data.index.levels, data.index.names):
                new_field = convert_pandas_type_to_json_field(level)
                new_field["name"] = name
                fields.append(new_field)
        else:
            fields.append(convert_pandas_type_to_json_field(data.index))

    if data.ndim > 1:
        for column, s in data.items():
            fields.append(convert_pandas_type_to_json_field(s))
    else:
        fields.append(convert_pandas_type_to_json_field(data))

    schema["fields"] = fields
    # a unique index serves as the default primary key when none is given
    if index and data.index.is_unique and primary_key is None:
        if data.index.nlevels == 1:
            schema["primaryKey"] = [data.index.name]
        else:
            schema["primaryKey"] = data.index.names
    elif primary_key is not None:
        schema["primaryKey"] = primary_key

    if version:
        schema["pandas_version"] = TABLE_SCHEMA_VERSION
    return schema


def parse_table_schema(json, precise_float: bool) -> DataFrame:
    """
    Build a DataFrame from a given Table Schema.

    Parameters
    ----------
    json :
        A JSON table schema
    precise_float : bool
        Flag controlling precision when decoding string to double values, as
        dictated by ``read_json``

    Returns
    -------
    df : DataFrame

    Raises
    ------
    NotImplementedError
        If the JSON table schema contains either timezone or timedelta data

    Notes
    -----
    Because :func:`DataFrame.to_json` uses the string 'index' to denote a
    name-less :class:`Index`, this function sets the name of the returned
    :class:`DataFrame` index to ``None`` when said string is encountered with
    a normal :class:`Index`. For a :class:`MultiIndex`, the same limitation
    applies to any strings beginning with 'level_'. Therefore, an
    :class:`Index` name of 'index' and :class:`MultiIndex` names starting
    with 'level_' are not supported.

    See Also
    --------
    build_table_schema : Inverse function.
    pandas.read_json
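
    Examples
    --------
    A minimal sketch with a single integer column; the hand-written payload
    below mirrors the layout produced by ``DataFrame.to_json(orient="table")``:

    >>> js = (
    ...     '{"schema": {"fields": [{"name": "index", "type": "integer"}, '
    ...     '{"name": "a", "type": "integer"}], "primaryKey": ["index"]}, '
    ...     '"data": [{"index": 0, "a": 1}, {"index": 1, "a": 2}]}'
    ... )
    >>> parse_table_schema(js, precise_float=False)
       a
    0  1
    1  2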
    """
    table = ujson_loads(json, precise_float=precise_float)
    col_order = [field["name"] for field in table["schema"]["fields"]]
    df = DataFrame(table["data"], columns=col_order)[col_order]

    dtypes = {
        field["name"]: convert_json_field_to_pandas_type(field)
        for field in table["schema"]["fields"]
    }

    # No ISO constructor for Timedelta as of yet, so need to raise
    if "timedelta64" in dtypes.values():
        raise NotImplementedError(
            'orient="table" can not yet read ISO-formatted Timedelta data'
        )

    df = df.astype(dtypes)

    if "primaryKey" in table["schema"]:
        df = df.set_index(table["schema"]["primaryKey"])
        if len(df.index.names) == 1:
            if df.index.name == "index":
                df.index.name = None
        else:
            df.index.names = [
                None if x.startswith("level_") else x for x in df.index.names
            ]

    return df