1"""
2Read SAS sas7bdat or xport files.
3"""
4from __future__ import annotations
5
6from abc import (
7 ABC,
8 abstractmethod,
9)
10from typing import (
11 TYPE_CHECKING,
12 overload,
13)
14
15from pandas.util._decorators import doc
16
17from pandas.core.shared_docs import _shared_docs
18
19from pandas.io.common import stringify_path
20
21if TYPE_CHECKING:
22 from collections.abc import Hashable
23 from types import TracebackType
24
25 from pandas._typing import (
26 CompressionOptions,
27 FilePath,
28 ReadBuffer,
29 Self,
30 )
31
32 from pandas import DataFrame
33
34
class ReaderBase(ABC):
    """
    Protocol for XportReader and SAS7BDATReader classes.

    Subclasses must provide ``read`` and ``close``. The context-manager
    methods defined here hand back the reader itself on entry and call
    ``close`` on exit.
    """

    @abstractmethod
    def read(self, nrows: int | None = None) -> DataFrame:
        """Return data as a DataFrame; must be implemented by subclasses."""

    @abstractmethod
    def close(self) -> None:
        """Close the reader; must be implemented by subclasses."""

    def __enter__(self) -> Self:
        # The reader is its own context object.
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        traceback: TracebackType | None,
    ) -> None:
        # Always close, whether or not the ``with`` body raised; returning
        # None lets any in-flight exception propagate.
        self.close()
58
59
# Typing-only overload: when ``chunksize`` is an ``int`` the call is annotated
# as returning a ``ReaderBase`` (a reader object rather than a DataFrame).
@overload
def read_sas(
    filepath_or_buffer: FilePath | ReadBuffer[bytes],
    *,
    format: str | None = ...,
    index: Hashable | None = ...,
    encoding: str | None = ...,
    chunksize: int = ...,
    iterator: bool = ...,
    compression: CompressionOptions = ...,
) -> ReaderBase:
    ...
72
73
# Typing-only overload: when ``chunksize`` is ``None`` the result is annotated
# as ``DataFrame | ReaderBase`` because ``iterator`` may still request a reader.
@overload
def read_sas(
    filepath_or_buffer: FilePath | ReadBuffer[bytes],
    *,
    format: str | None = ...,
    index: Hashable | None = ...,
    encoding: str | None = ...,
    chunksize: None = ...,
    iterator: bool = ...,
    compression: CompressionOptions = ...,
) -> DataFrame | ReaderBase:
    ...
86
87
@doc(decompression_options=_shared_docs["decompression_options"] % "filepath_or_buffer")
def read_sas(
    filepath_or_buffer: FilePath | ReadBuffer[bytes],
    *,
    format: str | None = None,
    index: Hashable | None = None,
    encoding: str | None = None,
    chunksize: int | None = None,
    iterator: bool = False,
    compression: CompressionOptions = "infer",
) -> DataFrame | ReaderBase:
    """
    Read SAS files stored as either XPORT or SAS7BDAT format files.

    Parameters
    ----------
    filepath_or_buffer : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a binary ``read()`` function. The string could be a URL.
        Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.sas7bdat``.
    format : str {{'xport', 'sas7bdat'}} or None
        If None, file format is inferred from the file name by looking for
        ``.xpt`` or ``.sas7bdat`` (case-insensitive); if both appear, the one
        occurring last in the name is used. If 'xport' or 'sas7bdat', uses the
        corresponding format (matched case-insensitively).
    index : identifier of index column, defaults to None
        Identifier of column that should be used as index of the DataFrame.
    encoding : str, default is None
        Encoding for text data.  If None, text data are stored as raw bytes.
    chunksize : int, optional
        Read file `chunksize` lines at a time, returns iterator.
    iterator : bool, defaults to False
        If True, returns an iterator for reading the file incrementally.
    {decompression_options}

    Returns
    -------
    DataFrame if iterator=False and chunksize=None, else SAS7BDATReader
    or XportReader

    Raises
    ------
    ValueError
        If ``format`` is None and ``filepath_or_buffer`` is not a string or
        path object, if the format cannot be inferred from the file name, or
        if ``format`` is not a recognised SAS format.

    Examples
    --------
    >>> df = pd.read_sas("sas_data.sas7bdat")  # doctest: +SKIP
    """
    if format is None:
        filepath_or_buffer = stringify_path(filepath_or_buffer)
        if not isinstance(filepath_or_buffer, str):
            # A buffer carries no file name to inspect.
            raise ValueError(
                "If this is a buffer object rather "
                "than a string name, you must specify a format string"
            )
        fname = filepath_or_buffer.lower()
        # Substring search (not a strict suffix test) keeps names such as
        # ``data.xpt.gz`` working.  Comparing the *last* position of each
        # marker stops an earlier path component (e.g. a directory called
        # ``export.xpt``) from overriding the real file extension.
        xport_pos = fname.rfind(".xpt")
        sas7bdat_pos = fname.rfind(".sas7bdat")
        if xport_pos == sas7bdat_pos:
            # The two markers can never share a position, so equality means
            # both searches returned -1: neither marker is present.
            raise ValueError(
                f"unable to infer format of SAS file from filename: {repr(fname)}"
            )
        format = "xport" if xport_pos > sas7bdat_pos else "sas7bdat"

    normalized_format = format.lower()

    reader: ReaderBase
    if normalized_format == "xport":
        # Imported lazily so the reader module is only loaded when needed.
        from pandas.io.sas.sas_xport import XportReader

        reader = XportReader(
            filepath_or_buffer,
            index=index,
            encoding=encoding,
            chunksize=chunksize,
            compression=compression,
        )
    elif normalized_format == "sas7bdat":
        from pandas.io.sas.sas7bdat import SAS7BDATReader

        reader = SAS7BDATReader(
            filepath_or_buffer,
            index=index,
            encoding=encoding,
            chunksize=chunksize,
            compression=compression,
        )
    else:
        raise ValueError("unknown SAS format")

    if iterator or chunksize:
        # Incremental reading: the caller owns the reader and must close it.
        return reader

    # One-shot read: the context manager closes the reader even if read() fails.
    with reader:
        return reader.read()