Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/io/orc.py: 27%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

52 statements  

1""" orc compat """ 

2from __future__ import annotations 

3 

4import io 

5from types import ModuleType 

6from typing import ( 

7 Any, 

8 Literal, 

9) 

10 

11from pandas._libs import lib 

12from pandas._typing import ( 

13 DtypeBackend, 

14 FilePath, 

15 ReadBuffer, 

16 WriteBuffer, 

17) 

18from pandas.compat._optional import import_optional_dependency 

19from pandas.util._validators import check_dtype_backend 

20 

21from pandas.core.dtypes.common import ( 

22 is_categorical_dtype, 

23 is_interval_dtype, 

24 is_period_dtype, 

25 is_unsigned_integer_dtype, 

26) 

27 

28import pandas as pd 

29from pandas.core.frame import DataFrame 

30 

31from pandas.io.common import get_handle 

32 

33 

def read_orc(
    path: FilePath | ReadBuffer[bytes],
    columns: list[str] | None = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
    **kwargs,
) -> DataFrame:
    """
    Load an ORC object from the file path, returning a DataFrame.

    Parameters
    ----------
    path : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a binary ``read()`` function. The string could be a URL.
        Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.orc``.
    columns : list, default None
        If not None, only these columns will be read from the file.
        Output always follows the ordering of the file and not the columns list.
        This mirrors the original behaviour of
        :external+pyarrow:py:meth:`pyarrow.orc.ORCFile.read`.
    dtype_backend : {"numpy_nullable", "pyarrow"}, defaults to NumPy backed DataFrames
        Which dtype_backend to use, e.g. whether a DataFrame should have NumPy
        arrays, nullable dtypes are used for all dtypes that have a nullable
        implementation when "numpy_nullable" is set, pyarrow is used for all
        dtypes if "pyarrow" is set.

        The dtype_backends are still experimental.

        .. versionadded:: 2.0

    **kwargs
        Any additional kwargs are passed to pyarrow.

    Returns
    -------
    DataFrame

    Notes
    -----
    Before using this function you should read the :ref:`user guide about ORC <io.orc>`
    and :ref:`install optional dependencies <install.warn_orc>`.
    """
    # pyarrow.orc needs a newer pyarrow than the parquet reader requires.
    orc = import_optional_dependency("pyarrow.orc")

    check_dtype_backend(dtype_backend)

    with get_handle(path, "rb", is_text=False) as handles:
        pa_table = orc.ORCFile(handles.handle).read(columns=columns, **kwargs)

        # Default backend: plain NumPy-backed conversion.
        if dtype_backend is lib.no_default:
            return pa_table.to_pandas()

        # Arrow-backed dtypes requested explicitly.
        if dtype_backend == "pyarrow":
            return pa_table.to_pandas(types_mapper=pd.ArrowDtype)

        # Otherwise "numpy_nullable": map Arrow types onto pandas nullable dtypes.
        from pandas.io._util import _arrow_dtype_mapping

        return pa_table.to_pandas(types_mapper=_arrow_dtype_mapping().get)

98 

99 

def to_orc(
    df: DataFrame,
    path: FilePath | WriteBuffer[bytes] | None = None,
    *,
    engine: Literal["pyarrow"] = "pyarrow",
    index: bool | None = None,
    engine_kwargs: dict[str, Any] | None = None,
) -> bytes | None:
    """
    Write a DataFrame to the ORC format.

    .. versionadded:: 1.5.0

    Parameters
    ----------
    df : DataFrame
        The dataframe to be written to ORC. Raises NotImplementedError
        if dtype of one or more columns is category, unsigned integers,
        intervals, periods or sparse.
    path : str, file-like object or None, default None
        If a string, it will be used as Root Directory path
        when writing a partitioned dataset. By file-like object,
        we refer to objects with a write() method, such as a file handle
        (e.g. via builtin open function). If path is None,
        a bytes object is returned.
    engine : str, default 'pyarrow'
        ORC library to use. Pyarrow must be >= 7.0.0.
    index : bool, optional
        If ``True``, include the dataframe's index(es) in the file output. If
        ``False``, they will not be written to the file.
        If ``None``, similar to ``infer`` the dataframe's index(es)
        will be saved. However, instead of being saved as values,
        the RangeIndex will be stored as a range in the metadata so it
        doesn't require much space and is faster. Other indexes will
        be included as columns in the file output.
    engine_kwargs : dict[str, Any] or None, default None
        Additional keyword arguments passed to :func:`pyarrow.orc.write_table`.

    Returns
    -------
    bytes if no path argument is provided else None

    Raises
    ------
    NotImplementedError
        Dtype of one or more columns is category, unsigned integers, interval,
        period or sparse.
    ValueError
        engine is not pyarrow.

    Notes
    -----
    * Before using this function you should read the
      :ref:`user guide about ORC <io.orc>` and
      :ref:`install optional dependencies <install.warn_orc>`.
    * This function requires `pyarrow <https://arrow.apache.org/docs/python/>`_
      library.
    * For supported dtypes please refer to `supported ORC features in Arrow
      <https://arrow.apache.org/docs/cpp/orc.html#data-types>`__.
    * Currently timezones in datetime columns are not preserved when a
      dataframe is converted into ORC files.
    """
    # Validate arguments up front so an unsupported engine always raises the
    # documented ValueError, never a later NotImplementedError from the dtype
    # scan (the original checked the engine only after scanning dtypes).
    if engine != "pyarrow":
        raise ValueError("engine must be 'pyarrow'")

    if index is None:
        # Infer: write the index only if it carries a name (i.e. is meaningful).
        index = df.index.names[0] is not None
    if engine_kwargs is None:
        engine_kwargs = {}

    # If unsupported dtypes are found raise NotImplementedError
    # In Pyarrow 9.0.0 this check will no longer be needed
    for dtype in df.dtypes:
        if (
            is_categorical_dtype(dtype)
            or is_interval_dtype(dtype)
            or is_period_dtype(dtype)
            or is_unsigned_integer_dtype(dtype)
        ):
            raise NotImplementedError(
                "The dtype of one or more columns is not supported yet."
            )

    # Bind the pyarrow module under its own name instead of rebinding the
    # string parameter `engine` to a module (which forced an isinstance
    # assertion purely to appease mypy in the original).
    pyarrow_mod = import_optional_dependency(engine, min_version="7.0.0")
    orc = import_optional_dependency("pyarrow.orc")

    was_none = path is None
    if was_none:
        # No destination given: collect the output in memory and return bytes.
        path = io.BytesIO()
    assert path is not None  # For mypy
    with get_handle(path, "wb", is_text=False) as handles:
        try:
            orc.write_table(
                pyarrow_mod.Table.from_pandas(df, preserve_index=index),
                handles.handle,
                **engine_kwargs,
            )
        except TypeError as e:
            # pyarrow signals unsupported column types as TypeError; re-raise
            # with the same NotImplementedError contract as the dtype scan.
            raise NotImplementedError(
                "The dtype of one or more columns is not supported yet."
            ) from e

    if was_none:
        assert isinstance(path, io.BytesIO)  # For mypy
        return path.getvalue()
    return None