Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/io/orc.py: 27%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1""" orc compat """
2from __future__ import annotations
4import io
5from types import ModuleType
6from typing import (
7 Any,
8 Literal,
9)
11from pandas._libs import lib
12from pandas._typing import (
13 DtypeBackend,
14 FilePath,
15 ReadBuffer,
16 WriteBuffer,
17)
18from pandas.compat._optional import import_optional_dependency
19from pandas.util._validators import check_dtype_backend
21from pandas.core.dtypes.common import (
22 is_categorical_dtype,
23 is_interval_dtype,
24 is_period_dtype,
25 is_unsigned_integer_dtype,
26)
28import pandas as pd
29from pandas.core.frame import DataFrame
31from pandas.io.common import get_handle
def read_orc(
    path: FilePath | ReadBuffer[bytes],
    columns: list[str] | None = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
    **kwargs,
) -> DataFrame:
    """
    Load an ORC object from the file path, returning a DataFrame.

    Parameters
    ----------
    path : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a binary ``read()`` function. The string could be a URL.
        Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.orc``.
    columns : list, default None
        If not None, only these columns will be read from the file.
        Output always follows the ordering of the file and not the columns list.
        This mirrors the original behaviour of
        :external+pyarrow:py:meth:`pyarrow.orc.ORCFile.read`.
    dtype_backend : {"numpy_nullable", "pyarrow"}, defaults to NumPy backed DataFrames
        Which dtype_backend to use, e.g. whether a DataFrame should have NumPy
        arrays, nullable dtypes are used for all dtypes that have a nullable
        implementation when "numpy_nullable" is set, pyarrow is used for all
        dtypes if "pyarrow" is set.

        The dtype_backends are still experimental.

        .. versionadded:: 2.0

    **kwargs
        Any additional kwargs are passed to pyarrow.

    Returns
    -------
    DataFrame

    Raises
    ------
    ImportError
        If the optional ``pyarrow`` dependency (with ORC support) is missing.
    ValueError
        If ``dtype_backend`` is not one of the accepted values.

    Notes
    -----
    Before using this function you should read the :ref:`user guide about ORC <io.orc>`
    and :ref:`install optional dependencies <install.warn_orc>`.
    """
    # we require a newer version of pyarrow than we support for parquet
    orc = import_optional_dependency("pyarrow.orc")

    # Reject anything other than no_default / "numpy_nullable" / "pyarrow" early.
    check_dtype_backend(dtype_backend)

    # get_handle resolves URLs / paths / buffers into a binary file handle and
    # guarantees it is closed once the Arrow table has been materialized.
    with get_handle(path, "rb", is_text=False) as handles:
        orc_file = orc.ORCFile(handles.handle)
        pa_table = orc_file.read(columns=columns, **kwargs)

    if dtype_backend is lib.no_default:
        # Default: plain NumPy-backed conversion.
        return pa_table.to_pandas()

    if dtype_backend == "pyarrow":
        # Keep Arrow memory; every column becomes an ArrowDtype-backed array.
        return pa_table.to_pandas(types_mapper=pd.ArrowDtype)

    # "numpy_nullable": map Arrow types onto pandas' nullable extension dtypes.
    from pandas.io._util import _arrow_dtype_mapping

    mapping = _arrow_dtype_mapping()
    return pa_table.to_pandas(types_mapper=mapping.get)
def to_orc(
    df: DataFrame,
    path: FilePath | WriteBuffer[bytes] | None = None,
    *,
    engine: Literal["pyarrow"] = "pyarrow",
    index: bool | None = None,
    engine_kwargs: dict[str, Any] | None = None,
) -> bytes | None:
    """
    Write a DataFrame to the ORC format.

    .. versionadded:: 1.5.0

    Parameters
    ----------
    df : DataFrame
        The dataframe to be written to ORC. Raises NotImplementedError
        if dtype of one or more columns is category, unsigned integers,
        intervals, periods or sparse.
    path : str, file-like object or None, default None
        If a string, it will be used as Root Directory path
        when writing a partitioned dataset. By file-like object,
        we refer to objects with a write() method, such as a file handle
        (e.g. via builtin open function). If path is None,
        a bytes object is returned.
    engine : str, default 'pyarrow'
        ORC library to use. Pyarrow must be >= 7.0.0.
    index : bool, optional
        If ``True``, include the dataframe's index(es) in the file output. If
        ``False``, they will not be written to the file.
        If ``None``, similar to ``infer`` the dataframe's index(es)
        will be saved. However, instead of being saved as values,
        the RangeIndex will be stored as a range in the metadata so it
        doesn't require much space and is faster. Other indexes will
        be included as columns in the file output.
    engine_kwargs : dict[str, Any] or None, default None
        Additional keyword arguments passed to :func:`pyarrow.orc.write_table`.

    Returns
    -------
    bytes if no path argument is provided else None

    Raises
    ------
    NotImplementedError
        Dtype of one or more columns is category, unsigned integers, interval,
        period or sparse.
    ValueError
        engine is not pyarrow.

    Notes
    -----
    * Before using this function you should read the
      :ref:`user guide about ORC <io.orc>` and
      :ref:`install optional dependencies <install.warn_orc>`.
    * This function requires `pyarrow <https://arrow.apache.org/docs/python/>`_
      library.
    * For supported dtypes please refer to `supported ORC features in Arrow
      <https://arrow.apache.org/docs/cpp/orc.html#data-types>`__.
    * Currently timezones in datetime columns are not preserved when a
      dataframe is converted into ORC files.
    """
    if index is None:
        # "Infer" mode: keep the index only when it actually carries a name.
        index = df.index.names[0] is not None
    engine_kwargs = {} if engine_kwargs is None else engine_kwargs

    # Reject dtypes pyarrow cannot serialize to ORC, before touching pyarrow.
    # In Pyarrow 9.0.0 this check will no longer be needed.
    unsupported_checks = (
        is_categorical_dtype,
        is_interval_dtype,
        is_period_dtype,
        is_unsigned_integer_dtype,
    )
    for dtype in df.dtypes:
        if any(check(dtype) for check in unsupported_checks):
            raise NotImplementedError(
                "The dtype of one or more columns is not supported yet."
            )

    if engine != "pyarrow":
        raise ValueError("engine must be 'pyarrow'")
    # Rebind ``engine`` to the imported pyarrow module itself.
    engine = import_optional_dependency(engine, min_version="7.0.0")
    orc = import_optional_dependency("pyarrow.orc")

    return_bytes = path is None
    if return_bytes:
        # No destination given: serialize into memory and hand the bytes back.
        path = io.BytesIO()
    assert path is not None  # For mypy
    with get_handle(path, "wb", is_text=False) as handles:
        assert isinstance(engine, ModuleType)  # For mypy
        try:
            orc.write_table(
                engine.Table.from_pandas(df, preserve_index=index),
                handles.handle,
                **engine_kwargs,
            )
        except TypeError as e:
            # pyarrow signals unsupported conversions with TypeError; translate
            # it into the same error the dtype pre-check raises.
            raise NotImplementedError(
                "The dtype of one or more columns is not supported yet."
            ) from e

    if not return_bytes:
        return None
    assert isinstance(path, io.BytesIO)  # For mypy
    return path.getvalue()