Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pandas/io/orc.py: 22%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

69 statements  

1""" orc compat """ 

2from __future__ import annotations 

3 

4import io 

5from types import ModuleType 

6from typing import ( 

7 TYPE_CHECKING, 

8 Any, 

9 Literal, 

10) 

11 

12from pandas._config import using_pyarrow_string_dtype 

13 

14from pandas._libs import lib 

15from pandas.compat._optional import import_optional_dependency 

16from pandas.util._validators import check_dtype_backend 

17 

18import pandas as pd 

19from pandas.core.indexes.api import default_index 

20 

21from pandas.io._util import arrow_string_types_mapper 

22from pandas.io.common import ( 

23 get_handle, 

24 is_fsspec_url, 

25) 

26 

27if TYPE_CHECKING: 

28 import fsspec 

29 import pyarrow.fs 

30 

31 from pandas._typing import ( 

32 DtypeBackend, 

33 FilePath, 

34 ReadBuffer, 

35 WriteBuffer, 

36 ) 

37 

38 from pandas.core.frame import DataFrame 

39 

40 

def read_orc(
    path: FilePath | ReadBuffer[bytes],
    columns: list[str] | None = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
    filesystem: pyarrow.fs.FileSystem | fsspec.spec.AbstractFileSystem | None = None,
    **kwargs: Any,
) -> DataFrame:
    """
    Load an ORC object from the file path, returning a DataFrame.

    Parameters
    ----------
    path : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a binary ``read()`` function. The string could be a URL.
        Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.orc``.
    columns : list, default None
        If not None, only these columns will be read from the file.
        Output always follows the ordering of the file and not the columns list.
        This mirrors the original behaviour of
        :external+pyarrow:py:meth:`pyarrow.orc.ORCFile.read`.
    dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable'
        Back-end data type applied to the resultant :class:`DataFrame`
        (still experimental). Behaviour is as follows:

        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
          (default).
        * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
          DataFrame.

        .. versionadded:: 2.0

    filesystem : fsspec or pyarrow filesystem, default None
        Filesystem object to use when reading the ORC file.

        .. versionadded:: 2.1.0

    **kwargs
        Any additional kwargs are passed to pyarrow.

    Returns
    -------
    DataFrame

    Notes
    -----
    Before using this function you should read the :ref:`user guide about ORC <io.orc>`
    and :ref:`install optional dependencies <install.warn_orc>`.

    If ``path`` is a URI scheme pointing to a local or remote file (e.g. "s3://"),
    a ``pyarrow.fs`` filesystem will be attempted to read the file. You can also pass a
    pyarrow or fsspec filesystem object into the filesystem keyword to override this
    behavior.

    Examples
    --------
    >>> result = pd.read_orc("example_pa.orc")  # doctest: +SKIP
    """
    # we require a newer version of pyarrow than we support for parquet

    orc = import_optional_dependency("pyarrow.orc")

    # Raises if dtype_backend is neither lib.no_default nor one of the
    # supported {"numpy_nullable", "pyarrow"} values.
    check_dtype_backend(dtype_backend)

    with get_handle(path, "rb", is_text=False) as handles:
        source = handles.handle
        if is_fsspec_url(path) and filesystem is None:
            # For fsspec-style URLs, prefer letting pyarrow resolve a native
            # filesystem from the URI; on failure, fall back to the already
            # opened file handle above.
            pa = import_optional_dependency("pyarrow")
            pa_fs = import_optional_dependency("pyarrow.fs")
            try:
                filesystem, source = pa_fs.FileSystem.from_uri(path)
            except (TypeError, pa.ArrowInvalid):
                pass

        pa_table = orc.read_table(
            source=source, columns=columns, filesystem=filesystem, **kwargs
        )
    if dtype_backend is not lib.no_default:
        # Explicit backend requested: map Arrow types either straight to
        # ArrowDtype or through the nullable-numpy dtype mapping.
        if dtype_backend == "pyarrow":
            df = pa_table.to_pandas(types_mapper=pd.ArrowDtype)
        else:
            from pandas.io._util import _arrow_dtype_mapping

            mapping = _arrow_dtype_mapping()
            df = pa_table.to_pandas(types_mapper=mapping.get)
        return df
    else:
        # Default conversion; optionally map string columns to the
        # pyarrow-backed string dtype when that global option is enabled.
        if using_pyarrow_string_dtype():
            types_mapper = arrow_string_types_mapper()
        else:
            types_mapper = None
        return pa_table.to_pandas(types_mapper=types_mapper)

135 

136 

def to_orc(
    df: DataFrame,
    path: FilePath | WriteBuffer[bytes] | None = None,
    *,
    engine: Literal["pyarrow"] = "pyarrow",
    index: bool | None = None,
    engine_kwargs: dict[str, Any] | None = None,
) -> bytes | None:
    """
    Write a DataFrame to the ORC format.

    .. versionadded:: 1.5.0

    Parameters
    ----------
    df : DataFrame
        The dataframe to be written to ORC. Raises NotImplementedError
        if dtype of one or more columns is category, unsigned integers,
        intervals, periods or sparse.
    path : str, file-like object or None, default None
        If a string, it will be used as Root Directory path
        when writing a partitioned dataset. By file-like object,
        we refer to objects with a write() method, such as a file handle
        (e.g. via builtin open function). If path is None,
        a bytes object is returned.
    engine : str, default 'pyarrow'
        ORC library to use.
    index : bool, optional
        If ``True``, include the dataframe's index(es) in the file output. If
        ``False``, they will not be written to the file.
        If ``None``, similar to ``infer`` the dataframe's index(es)
        will be saved. However, instead of being saved as values,
        the RangeIndex will be stored as a range in the metadata so it
        doesn't require much space and is faster. Other indexes will
        be included as columns in the file output.
    engine_kwargs : dict[str, Any] or None, default None
        Additional keyword arguments passed to :func:`pyarrow.orc.write_table`.

    Returns
    -------
    bytes if no path argument is provided else None

    Raises
    ------
    NotImplementedError
        Dtype of one or more columns is category, unsigned integers, interval,
        period or sparse.
    ValueError
        engine is not pyarrow.

    Notes
    -----
    * Before using this function you should read the
      :ref:`user guide about ORC <io.orc>` and
      :ref:`install optional dependencies <install.warn_orc>`.
    * This function requires `pyarrow <https://arrow.apache.org/docs/python/>`_
      library.
    * For supported dtypes please refer to `supported ORC features in Arrow
      <https://arrow.apache.org/docs/cpp/orc.html#data-types>`__.
    * Currently timezones in datetime columns are not preserved when a
      dataframe is converted into ORC files.
    """
    # ``index=None`` means "infer": keep the index only when it carries a name.
    if index is None:
        index = df.index.names[0] is not None
    if engine_kwargs is None:
        engine_kwargs = {}

    # The index itself is never serialized, so reject anything other than an
    # unnamed default RangeIndex before touching pyarrow at all.
    if not df.index.equals(default_index(len(df))):
        raise ValueError(
            "orc does not support serializing a non-default index for the index; "
            "you can .reset_index() to make the index into column(s)"
        )
    if df.index.name is not None:
        raise ValueError("orc does not serialize index meta-data on a default index")

    if engine != "pyarrow":
        raise ValueError("engine must be 'pyarrow'")
    # Rebind ``engine`` from the string name to the imported pyarrow module,
    # enforcing the minimum supported version for ORC writing.
    engine = import_optional_dependency(engine, min_version="10.0.1")
    pa = import_optional_dependency("pyarrow")
    orc = import_optional_dependency("pyarrow.orc")

    # With no path, write into an in-memory buffer and hand the bytes back.
    return_bytes = path is None
    target = io.BytesIO() if return_bytes else path
    assert target is not None  # For mypy
    with get_handle(target, "wb", is_text=False) as handles:
        assert isinstance(engine, ModuleType)  # For mypy
        try:
            orc.write_table(
                engine.Table.from_pandas(df, preserve_index=index),
                handles.handle,
                **engine_kwargs,
            )
        except (TypeError, pa.ArrowNotImplementedError) as err:
            # Arrow signals unsupported column dtypes this way; surface them
            # under the exception type documented above.
            raise NotImplementedError(
                "The dtype of one or more columns is not supported yet."
            ) from err

    if return_bytes:
        assert isinstance(target, io.BytesIO)  # For mypy
        return target.getvalue()
    return None