Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pandas/io/orc.py: 22%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1""" orc compat """
2from __future__ import annotations
4import io
5from types import ModuleType
6from typing import (
7 TYPE_CHECKING,
8 Any,
9 Literal,
10)
12from pandas._config import using_pyarrow_string_dtype
14from pandas._libs import lib
15from pandas.compat._optional import import_optional_dependency
16from pandas.util._validators import check_dtype_backend
18import pandas as pd
19from pandas.core.indexes.api import default_index
21from pandas.io._util import arrow_string_types_mapper
22from pandas.io.common import (
23 get_handle,
24 is_fsspec_url,
25)
27if TYPE_CHECKING:
28 import fsspec
29 import pyarrow.fs
31 from pandas._typing import (
32 DtypeBackend,
33 FilePath,
34 ReadBuffer,
35 WriteBuffer,
36 )
38 from pandas.core.frame import DataFrame
def read_orc(
    path: FilePath | ReadBuffer[bytes],
    columns: list[str] | None = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
    filesystem: pyarrow.fs.FileSystem | fsspec.spec.AbstractFileSystem | None = None,
    **kwargs: Any,
) -> DataFrame:
    """
    Load an ORC object from the file path, returning a DataFrame.

    Parameters
    ----------
    path : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a binary ``read()`` function. The string could be a URL.
        Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.orc``.
    columns : list, default None
        If not None, only these columns will be read from the file.
        Output always follows the ordering of the file and not the columns list.
        This mirrors the original behaviour of
        :external+pyarrow:py:meth:`pyarrow.orc.ORCFile.read`.
    dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable'
        Back-end data type applied to the resultant :class:`DataFrame`
        (still experimental). Behaviour is as follows:

        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
          (default).
        * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
          DataFrame.

        .. versionadded:: 2.0

    filesystem : fsspec or pyarrow filesystem, default None
        Filesystem object to use when reading the ORC file.

        .. versionadded:: 2.1.0

    **kwargs
        Any additional kwargs are passed to pyarrow.

    Returns
    -------
    DataFrame

    Notes
    -----
    Before using this function you should read the :ref:`user guide about ORC <io.orc>`
    and :ref:`install optional dependencies <install.warn_orc>`.

    If ``path`` is a URI scheme pointing to a local or remote file (e.g. "s3://"),
    a ``pyarrow.fs`` filesystem will be attempted to read the file. You can also pass a
    pyarrow or fsspec filesystem object into the filesystem keyword to override this
    behavior.

    Examples
    --------
    >>> result = pd.read_orc("example_pa.orc")  # doctest: +SKIP
    """
    # we require a newer version of pyarrow than we support for parquet
    orc = import_optional_dependency("pyarrow.orc")

    check_dtype_backend(dtype_backend)

    with get_handle(path, "rb", is_text=False) as handles:
        source = handles.handle
        if is_fsspec_url(path) and filesystem is None:
            # Let pyarrow resolve the URI into a (filesystem, path) pair so
            # remote stores (e.g. "s3://") are read natively; fall back to the
            # already-open handle when pyarrow cannot parse the URI.
            pa = import_optional_dependency("pyarrow")
            pa_fs = import_optional_dependency("pyarrow.fs")
            try:
                filesystem, source = pa_fs.FileSystem.from_uri(path)
            except (TypeError, pa.ArrowInvalid):
                pass

        pa_table = orc.read_table(
            source=source, columns=columns, filesystem=filesystem, **kwargs
        )
    if dtype_backend is not lib.no_default:
        if dtype_backend == "pyarrow":
            df = pa_table.to_pandas(types_mapper=pd.ArrowDtype)
        else:
            # Local import to avoid paying for the mapping unless requested.
            from pandas.io._util import _arrow_dtype_mapping

            mapping = _arrow_dtype_mapping()
            df = pa_table.to_pandas(types_mapper=mapping.get)
        return df
    else:
        # Honor the global option mapping arrow strings to pandas'
        # pyarrow-backed string dtype; otherwise use pyarrow's defaults.
        if using_pyarrow_string_dtype():
            types_mapper = arrow_string_types_mapper()
        else:
            types_mapper = None
        return pa_table.to_pandas(types_mapper=types_mapper)
def to_orc(
    df: DataFrame,
    path: FilePath | WriteBuffer[bytes] | None = None,
    *,
    engine: Literal["pyarrow"] = "pyarrow",
    index: bool | None = None,
    engine_kwargs: dict[str, Any] | None = None,
) -> bytes | None:
    """
    Write a DataFrame to the ORC format.

    .. versionadded:: 1.5.0

    Parameters
    ----------
    df : DataFrame
        The dataframe to be written to ORC. Raises NotImplementedError
        if dtype of one or more columns is category, unsigned integers,
        intervals, periods or sparse.
    path : str, file-like object or None, default None
        If a string, it will be used as Root Directory path
        when writing a partitioned dataset. By file-like object,
        we refer to objects with a write() method, such as a file handle
        (e.g. via builtin open function). If path is None,
        a bytes object is returned.
    engine : str, default 'pyarrow'
        ORC library to use.
    index : bool, optional
        If ``True``, include the dataframe's index(es) in the file output. If
        ``False``, they will not be written to the file.
        If ``None``, similar to ``infer`` the dataframe's index(es)
        will be saved. However, instead of being saved as values,
        the RangeIndex will be stored as a range in the metadata so it
        doesn't require much space and is faster. Other indexes will
        be included as columns in the file output.
    engine_kwargs : dict[str, Any] or None, default None
        Additional keyword arguments passed to :func:`pyarrow.orc.write_table`.

    Returns
    -------
    bytes if no path argument is provided else None

    Raises
    ------
    NotImplementedError
        Dtype of one or more columns is category, unsigned integers, interval,
        period or sparse.
    ValueError
        engine is not pyarrow.

    Notes
    -----
    * Before using this function you should read the
      :ref:`user guide about ORC <io.orc>` and
      :ref:`install optional dependencies <install.warn_orc>`.
    * This function requires `pyarrow <https://arrow.apache.org/docs/python/>`_
      library.
    * For supported dtypes please refer to `supported ORC features in Arrow
      <https://arrow.apache.org/docs/cpp/orc.html#data-types>`__.
    * Currently timezones in datetime columns are not preserved when a
      dataframe is converted into ORC files.
    """
    # "infer" behaviour: persist the index only when it carries a name.
    index = df.index.names[0] is not None if index is None else index
    engine_kwargs = {} if engine_kwargs is None else engine_kwargs

    # validate index
    # --------------

    # validate that we have only a default index
    # raise on anything else as we don't serialize the index

    if not df.index.equals(default_index(len(df))):
        raise ValueError(
            "orc does not support serializing a non-default index for the index; "
            "you can .reset_index() to make the index into column(s)"
        )

    if df.index.name is not None:
        raise ValueError("orc does not serialize index meta-data on a default index")

    if engine != "pyarrow":
        raise ValueError("engine must be 'pyarrow'")
    # Rebind ``engine`` to the imported pyarrow module itself.
    engine = import_optional_dependency(engine, min_version="10.0.1")
    pa = import_optional_dependency("pyarrow")
    orc = import_optional_dependency("pyarrow.orc")

    # When no target is given, write into an in-memory buffer and hand the
    # caller its bytes afterwards.
    return_bytes = path is None
    if return_bytes:
        path = io.BytesIO()
    assert path is not None  # For mypy
    with get_handle(path, "wb", is_text=False) as handles:
        assert isinstance(engine, ModuleType)  # For mypy
        try:
            orc.write_table(
                engine.Table.from_pandas(df, preserve_index=index),
                handles.handle,
                **engine_kwargs,
            )
        except (TypeError, pa.ArrowNotImplementedError) as err:
            # Arrow rejects dtypes it cannot represent in ORC; surface a
            # pandas-level error instead of leaking the pyarrow exception.
            raise NotImplementedError(
                "The dtype of one or more columns is not supported yet."
            ) from err

    if return_bytes:
        assert isinstance(path, io.BytesIO)  # For mypy
        return path.getvalue()
    return None