Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/io/orc.py: 27%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

52 statements  

1""" orc compat """ 

2from __future__ import annotations 

3 

4import io 

5from types import ModuleType 

6from typing import ( 

7 Any, 

8 Literal, 

9) 

10 

11from pandas._libs import lib 

12from pandas._typing import ( 

13 DtypeBackend, 

14 FilePath, 

15 ReadBuffer, 

16 WriteBuffer, 

17) 

18from pandas.compat._optional import import_optional_dependency 

19from pandas.util._validators import check_dtype_backend 

20 

21from pandas.core.dtypes.common import ( 

22 is_categorical_dtype, 

23 is_interval_dtype, 

24 is_period_dtype, 

25 is_unsigned_integer_dtype, 

26) 

27 

28import pandas as pd 

29from pandas.core.frame import DataFrame 

30 

31from pandas.io.common import get_handle 

32 

33 

def read_orc(
    path: FilePath | ReadBuffer[bytes],
    columns: list[str] | None = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
    **kwargs,
) -> DataFrame:
    """
    Load an ORC object from the file path, returning a DataFrame.

    Parameters
    ----------
    path : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a binary ``read()`` function. The string could be a URL.
        Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.orc``.
    columns : list, default None
        If not None, only these columns will be read from the file.
        Output always follows the ordering of the file and not the columns list.
        This mirrors the original behaviour of
        :external+pyarrow:py:meth:`pyarrow.orc.ORCFile.read`.
    dtype_backend : {"numpy_nullable", "pyarrow"}, defaults to NumPy backed DataFrames
        Which dtype_backend to use, e.g. whether a DataFrame should have NumPy
        arrays, nullable dtypes are used for all dtypes that have a nullable
        implementation when "numpy_nullable" is set, pyarrow is used for all
        dtypes if "pyarrow" is set.

        The dtype_backends are still experimental.

        .. versionadded:: 2.0

    **kwargs
        Any additional kwargs are passed to pyarrow.

    Returns
    -------
    DataFrame

    Notes
    -----
    Before using this function you should read the :ref:`user guide about ORC <io.orc>`
    and :ref:`install optional dependencies <install.warn_orc>`.
    """
    # pyarrow.orc needs a newer pyarrow than the parquet reader requires.
    orc = import_optional_dependency("pyarrow.orc")

    check_dtype_backend(dtype_backend)

    with get_handle(path, "rb", is_text=False) as handles:
        pa_table = orc.ORCFile(handles.handle).read(columns=columns, **kwargs)

        # Default backend: plain NumPy-backed conversion.
        if dtype_backend is lib.no_default:
            return pa_table.to_pandas()

        # Arrow-backed dtypes requested explicitly.
        if dtype_backend == "pyarrow":
            return pa_table.to_pandas(types_mapper=pd.ArrowDtype)

        # Otherwise "numpy_nullable": map Arrow types onto pandas nullable dtypes.
        from pandas.io._util import _arrow_dtype_mapping

        return pa_table.to_pandas(types_mapper=_arrow_dtype_mapping().get)

98 

99 

def to_orc(
    df: DataFrame,
    path: FilePath | WriteBuffer[bytes] | None = None,
    *,
    engine: Literal["pyarrow"] = "pyarrow",
    index: bool | None = None,
    engine_kwargs: dict[str, Any] | None = None,
) -> bytes | None:
    """
    Write a DataFrame to the ORC format.

    .. versionadded:: 1.5.0

    Parameters
    ----------
    df : DataFrame
        The dataframe to be written to ORC. Raises NotImplementedError
        if dtype of one or more columns is category, unsigned integers,
        intervals, periods or sparse.
    path : str, file-like object or None, default None
        If a string, it will be used as Root Directory path
        when writing a partitioned dataset. By file-like object,
        we refer to objects with a write() method, such as a file handle
        (e.g. via builtin open function). If path is None,
        a bytes object is returned.
    engine : str, default 'pyarrow'
        ORC library to use. Pyarrow must be >= 7.0.0.
    index : bool, optional
        If ``True``, include the dataframe's index(es) in the file output. If
        ``False``, they will not be written to the file.
        If ``None``, similar to ``infer`` the dataframe's index(es)
        will be saved. However, instead of being saved as values,
        the RangeIndex will be stored as a range in the metadata so it
        doesn't require much space and is faster. Other indexes will
        be included as columns in the file output.
    engine_kwargs : dict[str, Any] or None, default None
        Additional keyword arguments passed to :func:`pyarrow.orc.write_table`.

    Returns
    -------
    bytes if no path argument is provided else None

    Raises
    ------
    NotImplementedError
        Dtype of one or more columns is category, unsigned integers, interval,
        period or sparse.
    ValueError
        engine is not pyarrow.

    Notes
    -----
    * Before using this function you should read the
      :ref:`user guide about ORC <io.orc>` and
      :ref:`install optional dependencies <install.warn_orc>`.
    * This function requires `pyarrow <https://arrow.apache.org/docs/python/>`_
      library.
    * For supported dtypes please refer to `supported ORC features in Arrow
      <https://arrow.apache.org/docs/cpp/orc.html#data-types>`__.
    * Currently timezones in datetime columns are not preserved when a
      dataframe is converted into ORC files.
    """
    # Validate arguments up front so an unsupported engine always raises the
    # documented ValueError, never a later NotImplementedError from the dtype
    # scan (the original checked the engine only after scanning dtypes).
    if engine != "pyarrow":
        raise ValueError("engine must be 'pyarrow'")

    if index is None:
        # Infer: write the index only if it carries a name (i.e. is meaningful).
        index = df.index.names[0] is not None
    if engine_kwargs is None:
        engine_kwargs = {}

    # If unsupported dtypes are found raise NotImplementedError
    # In Pyarrow 9.0.0 this check will no longer be needed
    for dtype in df.dtypes:
        if (
            is_categorical_dtype(dtype)
            or is_interval_dtype(dtype)
            or is_period_dtype(dtype)
            or is_unsigned_integer_dtype(dtype)
        ):
            raise NotImplementedError(
                "The dtype of one or more columns is not supported yet."
            )

    # Bind the pyarrow module under its own name instead of rebinding the
    # string parameter `engine` to a module (which forced an isinstance
    # assertion purely to appease mypy in the original).
    pyarrow_mod = import_optional_dependency(engine, min_version="7.0.0")
    orc = import_optional_dependency("pyarrow.orc")

    was_none = path is None
    if was_none:
        # No destination given: collect the output in memory and return bytes.
        path = io.BytesIO()
    assert path is not None  # For mypy
    with get_handle(path, "wb", is_text=False) as handles:
        try:
            orc.write_table(
                pyarrow_mod.Table.from_pandas(df, preserve_index=index),
                handles.handle,
                **engine_kwargs,
            )
        except TypeError as e:
            # pyarrow signals unsupported column types as TypeError; re-raise
            # with the same NotImplementedError contract as the dtype scan.
            raise NotImplementedError(
                "The dtype of one or more columns is not supported yet."
            ) from e

    if was_none:
        assert isinstance(path, io.BytesIO)  # For mypy
        return path.getvalue()
    return None