Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pandas/io/orc.py: 22%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

69 statements  

1""" orc compat """ 

2from __future__ import annotations 

3 

4import io 

5from types import ModuleType 

6from typing import ( 

7 TYPE_CHECKING, 

8 Any, 

9 Literal, 

10) 

11 

12from pandas._config import using_pyarrow_string_dtype 

13 

14from pandas._libs import lib 

15from pandas.compat._optional import import_optional_dependency 

16from pandas.util._validators import check_dtype_backend 

17 

18import pandas as pd 

19from pandas.core.indexes.api import default_index 

20 

21from pandas.io._util import arrow_string_types_mapper 

22from pandas.io.common import ( 

23 get_handle, 

24 is_fsspec_url, 

25) 

26 

27if TYPE_CHECKING: 

28 import fsspec 

29 import pyarrow.fs 

30 

31 from pandas._typing import ( 

32 DtypeBackend, 

33 FilePath, 

34 ReadBuffer, 

35 WriteBuffer, 

36 ) 

37 

38 from pandas.core.frame import DataFrame 

39 

40 

def read_orc(
    path: FilePath | ReadBuffer[bytes],
    columns: list[str] | None = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
    filesystem: pyarrow.fs.FileSystem | fsspec.spec.AbstractFileSystem | None = None,
    **kwargs: Any,
) -> DataFrame:
    """
    Load an ORC object from the file path, returning a DataFrame.

    Parameters
    ----------
    path : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a binary ``read()`` function. The string could be a URL.
        Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.orc``.
    columns : list, default None
        If not None, only these columns will be read from the file.
        Output always follows the ordering of the file and not the columns list.
        This mirrors the original behaviour of
        :external+pyarrow:py:meth:`pyarrow.orc.ORCFile.read`.
    dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable'
        Back-end data type applied to the resultant :class:`DataFrame`
        (still experimental). Behaviour is as follows:

        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
          (default).
        * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
          DataFrame.

        .. versionadded:: 2.0

    filesystem : fsspec or pyarrow filesystem, default None
        Filesystem object to use when reading the ORC file.

        .. versionadded:: 2.1.0

    **kwargs
        Any additional kwargs are passed to pyarrow.

    Returns
    -------
    DataFrame

    Notes
    -----
    Before using this function you should read the :ref:`user guide about ORC <io.orc>`
    and :ref:`install optional dependencies <install.warn_orc>`.

    If ``path`` is a URI scheme pointing to a local or remote file (e.g. "s3://"),
    a ``pyarrow.fs`` filesystem will be attempted to read the file. You can also pass a
    pyarrow or fsspec filesystem object into the filesystem keyword to override this
    behavior.

    Examples
    --------
    >>> result = pd.read_orc("example_pa.orc")  # doctest: +SKIP
    """
    # we require a newer version of pyarrow than we support for parquet

    orc = import_optional_dependency("pyarrow.orc")

    # Raises if dtype_backend is neither lib.no_default nor one of the
    # supported {"numpy_nullable", "pyarrow"} values.
    check_dtype_backend(dtype_backend)

    with get_handle(path, "rb", is_text=False) as handles:
        source = handles.handle
        if is_fsspec_url(path) and filesystem is None:
            # For fsspec-style URLs, prefer letting pyarrow resolve a native
            # filesystem from the URI; on failure, fall back to the already
            # opened file handle above.
            pa = import_optional_dependency("pyarrow")
            pa_fs = import_optional_dependency("pyarrow.fs")
            try:
                filesystem, source = pa_fs.FileSystem.from_uri(path)
            except (TypeError, pa.ArrowInvalid):
                pass

        pa_table = orc.read_table(
            source=source, columns=columns, filesystem=filesystem, **kwargs
        )
    if dtype_backend is not lib.no_default:
        # Explicit backend requested: map Arrow types either straight to
        # ArrowDtype or through the nullable-numpy dtype mapping.
        if dtype_backend == "pyarrow":
            df = pa_table.to_pandas(types_mapper=pd.ArrowDtype)
        else:
            from pandas.io._util import _arrow_dtype_mapping

            mapping = _arrow_dtype_mapping()
            df = pa_table.to_pandas(types_mapper=mapping.get)
        return df
    else:
        # Default conversion; optionally map string columns to the
        # pyarrow-backed string dtype when that global option is enabled.
        if using_pyarrow_string_dtype():
            types_mapper = arrow_string_types_mapper()
        else:
            types_mapper = None
        return pa_table.to_pandas(types_mapper=types_mapper)

135 

136 

def to_orc(
    df: DataFrame,
    path: FilePath | WriteBuffer[bytes] | None = None,
    *,
    engine: Literal["pyarrow"] = "pyarrow",
    index: bool | None = None,
    engine_kwargs: dict[str, Any] | None = None,
) -> bytes | None:
    """
    Write a DataFrame to the ORC format.

    .. versionadded:: 1.5.0

    Parameters
    ----------
    df : DataFrame
        The dataframe to be written to ORC. Raises NotImplementedError
        if dtype of one or more columns is category, unsigned integers,
        intervals, periods or sparse.
    path : str, file-like object or None, default None
        If a string, it will be used as Root Directory path
        when writing a partitioned dataset. By file-like object,
        we refer to objects with a write() method, such as a file handle
        (e.g. via builtin open function). If path is None,
        a bytes object is returned.
    engine : str, default 'pyarrow'
        ORC library to use.
    index : bool, optional
        If ``True``, include the dataframe's index(es) in the file output. If
        ``False``, they will not be written to the file.
        If ``None``, similar to ``infer`` the dataframe's index(es)
        will be saved. However, instead of being saved as values,
        the RangeIndex will be stored as a range in the metadata so it
        doesn't require much space and is faster. Other indexes will
        be included as columns in the file output.
    engine_kwargs : dict[str, Any] or None, default None
        Additional keyword arguments passed to :func:`pyarrow.orc.write_table`.

    Returns
    -------
    bytes if no path argument is provided else None

    Raises
    ------
    NotImplementedError
        Dtype of one or more columns is category, unsigned integers, interval,
        period or sparse.
    ValueError
        engine is not pyarrow.

    Notes
    -----
    * Before using this function you should read the
      :ref:`user guide about ORC <io.orc>` and
      :ref:`install optional dependencies <install.warn_orc>`.
    * This function requires `pyarrow <https://arrow.apache.org/docs/python/>`_
      library.
    * For supported dtypes please refer to `supported ORC features in Arrow
      <https://arrow.apache.org/docs/cpp/orc.html#data-types>`__.
    * Currently timezones in datetime columns are not preserved when a
      dataframe is converted into ORC files.
    """
    # ``index=None`` means "infer": keep the index only when it carries a name.
    if index is None:
        index = df.index.names[0] is not None
    if engine_kwargs is None:
        engine_kwargs = {}

    # The index itself is never serialized, so reject anything other than an
    # unnamed default RangeIndex before touching pyarrow at all.
    if not df.index.equals(default_index(len(df))):
        raise ValueError(
            "orc does not support serializing a non-default index for the index; "
            "you can .reset_index() to make the index into column(s)"
        )
    if df.index.name is not None:
        raise ValueError("orc does not serialize index meta-data on a default index")

    if engine != "pyarrow":
        raise ValueError("engine must be 'pyarrow'")
    # Rebind ``engine`` from the string name to the imported pyarrow module,
    # enforcing the minimum supported version for ORC writing.
    engine = import_optional_dependency(engine, min_version="10.0.1")
    pa = import_optional_dependency("pyarrow")
    orc = import_optional_dependency("pyarrow.orc")

    # With no path, write into an in-memory buffer and hand the bytes back.
    return_bytes = path is None
    target = io.BytesIO() if return_bytes else path
    assert target is not None  # For mypy
    with get_handle(target, "wb", is_text=False) as handles:
        assert isinstance(engine, ModuleType)  # For mypy
        try:
            orc.write_table(
                engine.Table.from_pandas(df, preserve_index=index),
                handles.handle,
                **engine_kwargs,
            )
        except (TypeError, pa.ArrowNotImplementedError) as err:
            # Arrow signals unsupported column dtypes this way; surface them
            # under the exception type documented above.
            raise NotImplementedError(
                "The dtype of one or more columns is not supported yet."
            ) from err

    if return_bytes:
        assert isinstance(target, io.BytesIO)  # For mypy
        return target.getvalue()
    return None