from contextlib import nullcontext
import operator

import numpy as np
from .._utils import set_module
from .numeric import uint8, ndarray, dtype
from numpy.compat import os_fspath, is_pathlib_path
7
# Names exported by ``from <this module> import *``.
__all__ = ["memmap"]
9
# Alias for the ``dtype`` constructor.  ``memmap.__new__`` takes a parameter
# that is itself called ``dtype``, so the constructor needs a second name.
dtypedescr = dtype

# Canonical mode strings understood by ``memmap``.
valid_filemodes = ["r", "c", "r+", "w+"]

# The canonical modes whose names request read *and* write access.
writeable_filemodes = ["r+", "w+"]

# Long-form aliases and the canonical mode each one stands for.  Insertion
# order is kept because the keys are listed in an error message.
mode_equivalents = dict(
    readonly="r",
    copyonwrite="c",
    readwrite="r+",
    write="w+",
)
20
21
@set_module('numpy')
class memmap(ndarray):
    """Create a memory-map to an array stored in a *binary* file on disk.

    Memory-mapped files are used for accessing small segments of large files
    on disk, without reading the entire file into memory. NumPy's
    memmap's are array-like objects. This differs from Python's ``mmap``
    module, which uses file-like objects.

    This subclass of ndarray has some unpleasant interactions with
    some operations, because it doesn't quite fit properly as a subclass.
    An alternative to using this subclass is to create the ``mmap``
    object yourself, then create an ndarray with ndarray.__new__ directly,
    passing the object created in its 'buffer=' parameter.

    This class may at some point be turned into a factory function
    which returns a view into an mmap buffer.

    Flush the memmap instance to write the changes to the file. Currently there
    is no API to close the underlying ``mmap``. It is tricky to ensure the
    resource is actually closed, since it may be shared between different
    memmap instances.


    Parameters
    ----------
    filename : str, file-like object, or pathlib.Path instance
        The file name or file object to be used as the array data buffer.
    dtype : data-type, optional
        The data-type used to interpret the file contents.
        Default is `uint8`.
    mode : {'r+', 'r', 'w+', 'c'}, optional
        The file is opened in this mode:

        +------+-------------------------------------------------------------+
        | 'r'  | Open existing file for reading only.                        |
        +------+-------------------------------------------------------------+
        | 'r+' | Open existing file for reading and writing.                 |
        +------+-------------------------------------------------------------+
        | 'w+' | Create or overwrite existing file for reading and writing.  |
        |      | If ``mode == 'w+'`` then `shape` must also be specified.    |
        +------+-------------------------------------------------------------+
        | 'c'  | Copy-on-write: assignments affect data in memory, but       |
        |      | changes are not saved to disk. The file on disk is          |
        |      | read-only.                                                  |
        +------+-------------------------------------------------------------+

        Default is 'r+'.
    offset : int, optional
        In the file, array data starts at this offset. Since `offset` is
        measured in bytes, it should normally be a multiple of the byte-size
        of `dtype`. When ``mode != 'r'``, even positive offsets beyond end of
        file are valid; the file will be extended to accommodate the
        additional data. By default, ``memmap`` will start at the beginning of
        the file, even if ``filename`` is a file pointer ``fp`` and
        ``fp.tell() != 0``.
    shape : int or sequence of ints, optional
        The desired shape of the array. A single integer is treated as a
        1-D shape; any other sequence (tuple, list, ...) is converted to a
        tuple. If ``mode == 'r'`` and the number
        of remaining bytes after `offset` is not a multiple of the byte-size
        of `dtype`, you must specify `shape`. By default, the returned array
        will be 1-D with the number of elements determined by file size
        and data-type.
    order : {'C', 'F'}, optional
        Specify the order of the ndarray memory layout:
        :term:`row-major`, C-style or :term:`column-major`,
        Fortran-style. This only has an effect if the shape is
        greater than 1-D. The default order is 'C'.

    Attributes
    ----------
    filename : str or pathlib.Path instance
        Path to the mapped file.
    offset : int
        Offset position in the file.
    mode : str
        File mode.

    Methods
    -------
    flush
        Flush any changes in memory to file on disk.
        When you delete a memmap object, flush is called first to write
        changes to disk.


    See Also
    --------
    lib.format.open_memmap : Create or load a memory-mapped ``.npy`` file.

    Notes
    -----
    The memmap object can be used anywhere an ndarray is accepted.
    Given a memmap ``fp``, ``isinstance(fp, numpy.ndarray)`` returns
    ``True``.

    Memory-mapped files cannot be larger than 2GB on 32-bit systems.

    When a memmap causes a file to be created or extended beyond its
    current size in the filesystem, the contents of the new part are
    unspecified. On systems with POSIX filesystem semantics, the extended
    part will be filled with zero bytes.

    Examples
    --------
    >>> data = np.arange(12, dtype='float32')
    >>> data.resize((3,4))

    This example uses a temporary file so that doctest doesn't write
    files to your directory. You would use a 'normal' filename.

    >>> from tempfile import mkdtemp
    >>> import os.path as path
    >>> filename = path.join(mkdtemp(), 'newfile.dat')

    Create a memmap with dtype and shape that matches our data:

    >>> fp = np.memmap(filename, dtype='float32', mode='w+', shape=(3,4))
    >>> fp
    memmap([[0., 0., 0., 0.],
            [0., 0., 0., 0.],
            [0., 0., 0., 0.]], dtype=float32)

    Write data to memmap array:

    >>> fp[:] = data[:]
    >>> fp
    memmap([[ 0.,  1.,  2.,  3.],
            [ 4.,  5.,  6.,  7.],
            [ 8.,  9., 10., 11.]], dtype=float32)

    >>> fp.filename == path.abspath(filename)
    True

    Flushes memory changes to disk in order to read them back

    >>> fp.flush()

    Load the memmap and verify data was stored:

    >>> newfp = np.memmap(filename, dtype='float32', mode='r', shape=(3,4))
    >>> newfp
    memmap([[ 0.,  1.,  2.,  3.],
            [ 4.,  5.,  6.,  7.],
            [ 8.,  9., 10., 11.]], dtype=float32)

    Read-only memmap:

    >>> fpr = np.memmap(filename, dtype='float32', mode='r', shape=(3,4))
    >>> fpr.flags.writeable
    False

    Copy-on-write memmap:

    >>> fpc = np.memmap(filename, dtype='float32', mode='c', shape=(3,4))
    >>> fpc.flags.writeable
    True

    It's possible to assign to copy-on-write array, but values are only
    written into the memory copy of the array, and not written to disk:

    >>> fpc
    memmap([[ 0.,  1.,  2.,  3.],
            [ 4.,  5.,  6.,  7.],
            [ 8.,  9., 10., 11.]], dtype=float32)
    >>> fpc[0,:] = 0
    >>> fpc
    memmap([[ 0.,  0.,  0.,  0.],
            [ 4.,  5.,  6.,  7.],
            [ 8.,  9., 10., 11.]], dtype=float32)

    File on disk is unchanged:

    >>> fpr
    memmap([[ 0.,  1.,  2.,  3.],
            [ 4.,  5.,  6.,  7.],
            [ 8.,  9., 10., 11.]], dtype=float32)

    Offset into a memmap:

    >>> fpo = np.memmap(filename, dtype='float32', mode='r', offset=16)
    >>> fpo
    memmap([ 4.,  5.,  6.,  7.,  8.,  9., 10., 11.], dtype=float32)

    """

    __array_priority__ = -100.0

    def __new__(subtype, filename, dtype=uint8, mode='r+', offset=0,
                shape=None, order='C'):
        # Import here to minimize 'import numpy' overhead
        import mmap
        import os.path

        # Translate long aliases ('readonly', ...) into canonical mode
        # strings; anything that is not an alias must already be canonical.
        try:
            mode = mode_equivalents[mode]
        except KeyError:
            if mode not in valid_filemodes:
                raise ValueError(
                    "mode must be one of {!r} (got {!r})"
                    .format(valid_filemodes + list(mode_equivalents.keys()), mode)
                ) from None

        if mode == 'w+' and shape is None:
            raise ValueError("shape must be given if mode == 'w+'")

        if hasattr(filename, 'read'):
            # A file object supplied by the caller is used as-is and is
            # left open when the ``with`` block below exits.
            f_ctx = nullcontext(filename)
        else:
            # Copy-on-write mode opens the file itself read-only ('rb').
            f_ctx = open(os_fspath(filename), ('r' if mode == 'c' else mode)+'b')

        with f_ctx as fid:
            # Seek to the end to learn the current file length.
            fid.seek(0, 2)
            flen = fid.tell()
            descr = dtypedescr(dtype)
            itemsize = descr.itemsize

            if shape is None:
                # Infer a 1-D shape from whatever follows `offset`.
                nbytes = flen - offset
                if nbytes % itemsize:
                    raise ValueError("Size of available data is not a "
                                     "multiple of the data-type size.")
                size = nbytes // itemsize
                shape = (size,)
            else:
                # Accept a bare integer-like as a 1-D shape and any other
                # sequence (e.g. a list) as a full shape.  Wrapping every
                # non-tuple in a 1-tuple would turn ``[3, 4]`` into
                # ``([3, 4],)`` and break the size computation below.
                if type(shape) not in (tuple, list):
                    try:
                        shape = [operator.index(shape)]
                    except TypeError:
                        pass
                shape = tuple(shape)
                size = np.intp(1)  # avoid default choice of np.int_, which might overflow
                for k in shape:
                    size *= k

            nbytes = int(offset + size*itemsize)

            # In the writable modes, grow the file so that the requested
            # region exists before it is mapped.
            if mode in ('w+', 'r+') and flen < nbytes:
                fid.seek(nbytes - 1, 0)
                fid.write(b'\0')
                fid.flush()

            if mode == 'c':
                acc = mmap.ACCESS_COPY
            elif mode == 'r':
                acc = mmap.ACCESS_READ
            else:
                acc = mmap.ACCESS_WRITE

            # The mmap offset must be a multiple of ALLOCATIONGRANULARITY:
            # map from the preceding aligned position and skip the
            # remainder inside the array via `array_offset`.
            start = offset - offset % mmap.ALLOCATIONGRANULARITY
            nbytes -= start
            array_offset = offset - start
            mm = mmap.mmap(fid.fileno(), nbytes, access=acc, offset=start)

            self = ndarray.__new__(subtype, shape, dtype=descr, buffer=mm,
                                   offset=array_offset, order=order)
            self._mmap = mm
            self.offset = offset
            self.mode = mode

            if is_pathlib_path(filename):
                # special case - if we were constructed with a pathlib.path,
                # then filename is a path object, not a string
                self.filename = filename.resolve()
            elif hasattr(fid, "name") and isinstance(fid.name, str):
                # py3 returns int for TemporaryFile().name
                self.filename = os.path.abspath(fid.name)
            # same as memmap copies (e.g. memmap + 1)
            else:
                self.filename = None

        return self

    def __array_finalize__(self, obj):
        # Inherit the mapping metadata only when the new array may share
        # memory with a source that carries an ``_mmap`` attribute;
        # otherwise mark this array as not backed by a mapping.
        if hasattr(obj, '_mmap') and np.may_share_memory(self, obj):
            self._mmap = obj._mmap
            self.filename = obj.filename
            self.offset = obj.offset
            self.mode = obj.mode
        else:
            self._mmap = None
            self.filename = None
            self.offset = None
            self.mode = None

    def flush(self):
        """
        Write any changes in the array to the file on disk.

        For further information, see `memmap`.

        Parameters
        ----------
        None

        See Also
        --------
        memmap

        """
        # Delegate to the base object when it offers ``flush`` (the array
        # is built on the mmap buffer in ``__new__``); otherwise do nothing.
        if self.base is not None and hasattr(self.base, 'flush'):
            self.base.flush()

    def __array_wrap__(self, arr, context=None):
        arr = super().__array_wrap__(arr, context)

        # Return a memmap if a memmap was given as the output of the
        # ufunc. Leave the arr class unchanged if self is not a memmap
        # to keep original memmap subclasses behavior
        if self is arr or type(self) is not memmap:
            return arr
        # Return scalar instead of 0d memmap, e.g. for np.sum with
        # axis=None
        if arr.shape == ():
            return arr[()]
        # Return ndarray otherwise
        return arr.view(np.ndarray)

    def __getitem__(self, index):
        res = super().__getitem__(index)
        # A memmap-typed result without a mapping attached (``_mmap`` is
        # None) is handed back as a plain ndarray view.
        if type(res) is memmap and res._mmap is None:
            return res.view(type=ndarray)
        return res