1from contextlib import nullcontext
2
3import numpy as np
4from .numeric import uint8, ndarray, dtype
5from numpy.compat import os_fspath, is_pathlib_path
6from numpy.core.overrides import set_module
7
8__all__ = ['memmap']
9
# Alias for the dtype constructor.  ``memmap.__new__`` takes a keyword
# argument that is itself called ``dtype``, so the constructor needs a
# second name that the argument does not shadow.
dtypedescr = dtype

# Canonical (short) mode strings accepted by ``memmap``.
valid_filemodes = ["r", "c", "r+", "w+"]

# The canonical modes that open the file for reading *and* writing.
writeable_filemodes = ["r+", "w+"]

# Long-form mode names mapped onto their canonical short spelling.
mode_equivalents = dict(
    readonly="r",
    copyonwrite="c",
    readwrite="r+",
    write="w+",
)
20
21
@set_module('numpy')
class memmap(ndarray):
    """Create a memory-map to an array stored in a *binary* file on disk.

    Memory-mapped files are used for accessing small segments of large files
    on disk, without reading the entire file into memory.  NumPy's
    memmap's are array-like objects.  This differs from Python's ``mmap``
    module, which uses file-like objects.

    This subclass of ndarray has some unpleasant interactions with
    some operations, because it doesn't quite fit properly as a subclass.
    An alternative to using this subclass is to create the ``mmap``
    object yourself, then create an ndarray with ndarray.__new__ directly,
    passing the object created in its 'buffer=' parameter.

    This class may at some point be turned into a factory function
    which returns a view into an mmap buffer.

    Flush the memmap instance to write the changes to the file. Currently there
    is no API to close the underlying ``mmap``. It is tricky to ensure the
    resource is actually closed, since it may be shared between different
    memmap instances.


    Parameters
    ----------
    filename : str, file-like object, or pathlib.Path instance
        The file name or file object to be used as the array data buffer.
    dtype : data-type, optional
        The data-type used to interpret the file contents.
        Default is `uint8`.
    mode : {'r+', 'r', 'w+', 'c'}, optional
        The file is opened in this mode:

        +------+-------------------------------------------------------------+
        | 'r'  | Open existing file for reading only.                        |
        +------+-------------------------------------------------------------+
        | 'r+' | Open existing file for reading and writing.                 |
        +------+-------------------------------------------------------------+
        | 'w+' | Create or overwrite existing file for reading and writing.  |
        +------+-------------------------------------------------------------+
        | 'c'  | Copy-on-write: assignments affect data in memory, but       |
        |      | changes are not saved to disk. The file on disk is          |
        |      | read-only.                                                  |
        +------+-------------------------------------------------------------+

        Default is 'r+'.
    offset : int, optional
        In the file, array data starts at this offset. Since `offset` is
        measured in bytes, it should normally be a multiple of the byte-size
        of `dtype`. When ``mode != 'r'``, even positive offsets beyond end of
        file are valid; The file will be extended to accommodate the
        additional data. By default, ``memmap`` will start at the beginning of
        the file, even if ``filename`` is a file pointer ``fp`` and
        ``fp.tell() != 0``.
    shape : int or sequence of ints, optional
        The desired shape of the array. If ``mode == 'r'`` and the number
        of remaining bytes after `offset` is not a multiple of the byte-size
        of `dtype`, you must specify `shape`. By default, the returned array
        will be 1-D with the number of elements determined by file size
        and data-type. `shape` is required when ``mode == 'w+'``.
    order : {'C', 'F'}, optional
        Specify the order of the ndarray memory layout:
        :term:`row-major`, C-style or :term:`column-major`,
        Fortran-style.  This only has an effect if the shape is
        greater than 1-D.  The default order is 'C'.

    Attributes
    ----------
    filename : str or pathlib.Path instance
        Path to the mapped file.
    offset : int
        Offset position in the file.
    mode : str
        File mode.

    Methods
    -------
    flush
        Flush any changes in memory to file on disk.
        When you delete a memmap object, flush is called first to write
        changes to disk.


    See Also
    --------
    lib.format.open_memmap : Create or load a memory-mapped ``.npy`` file.

    Notes
    -----
    The memmap object can be used anywhere an ndarray is accepted.
    Given a memmap ``fp``, ``isinstance(fp, numpy.ndarray)`` returns
    ``True``.

    Memory-mapped files cannot be larger than 2GB on 32-bit systems.

    When a memmap causes a file to be created or extended beyond its
    current size in the filesystem, the contents of the new part are
    unspecified. On systems with POSIX filesystem semantics, the extended
    part will be filled with zero bytes.

    In the writable modes (``'r+'`` and ``'w+'``) a request that would map
    zero bytes is padded to a single byte so that an empty array can still
    be memory-mapped; an empty file is extended to one byte in that case.

    Examples
    --------
    >>> data = np.arange(12, dtype='float32')
    >>> data.resize((3,4))

    This example uses a temporary file so that doctest doesn't write
    files to your directory. You would use a 'normal' filename.

    >>> from tempfile import mkdtemp
    >>> import os.path as path
    >>> filename = path.join(mkdtemp(), 'newfile.dat')

    Create a memmap with dtype and shape that matches our data:

    >>> fp = np.memmap(filename, dtype='float32', mode='w+', shape=(3,4))
    >>> fp
    memmap([[0., 0., 0., 0.],
            [0., 0., 0., 0.],
            [0., 0., 0., 0.]], dtype=float32)

    Write data to memmap array:

    >>> fp[:] = data[:]
    >>> fp
    memmap([[ 0.,  1.,  2.,  3.],
            [ 4.,  5.,  6.,  7.],
            [ 8.,  9., 10., 11.]], dtype=float32)

    >>> fp.filename == path.abspath(filename)
    True

    Flushes memory changes to disk in order to read them back

    >>> fp.flush()

    Load the memmap and verify data was stored:

    >>> newfp = np.memmap(filename, dtype='float32', mode='r', shape=(3,4))
    >>> newfp
    memmap([[ 0.,  1.,  2.,  3.],
            [ 4.,  5.,  6.,  7.],
            [ 8.,  9., 10., 11.]], dtype=float32)

    Read-only memmap:

    >>> fpr = np.memmap(filename, dtype='float32', mode='r', shape=(3,4))
    >>> fpr.flags.writeable
    False

    Copy-on-write memmap:

    >>> fpc = np.memmap(filename, dtype='float32', mode='c', shape=(3,4))
    >>> fpc.flags.writeable
    True

    It's possible to assign to copy-on-write array, but values are only
    written into the memory copy of the array, and not written to disk:

    >>> fpc
    memmap([[ 0.,  1.,  2.,  3.],
            [ 4.,  5.,  6.,  7.],
            [ 8.,  9., 10., 11.]], dtype=float32)
    >>> fpc[0,:] = 0
    >>> fpc
    memmap([[ 0.,  0.,  0.,  0.],
            [ 4.,  5.,  6.,  7.],
            [ 8.,  9., 10., 11.]], dtype=float32)

    File on disk is unchanged:

    >>> fpr
    memmap([[ 0.,  1.,  2.,  3.],
            [ 4.,  5.,  6.,  7.],
            [ 8.,  9., 10., 11.]], dtype=float32)

    Offset into a memmap:

    >>> fpo = np.memmap(filename, dtype='float32', mode='r', offset=16)
    >>> fpo
    memmap([ 4.,  5.,  6.,  7.,  8.,  9., 10., 11.], dtype=float32)

    """

    __array_priority__ = -100.0

    def __new__(subtype, filename, dtype=uint8, mode='r+', offset=0,
                shape=None, order='C'):
        # Import here to minimize 'import numpy' overhead
        import mmap
        import operator
        import os.path

        # Translate long-form mode names ("readonly", ...) to their short
        # spelling; a name that is not a long form must already be one of
        # the canonical short modes.
        try:
            mode = mode_equivalents[mode]
        except KeyError:
            if mode not in valid_filemodes:
                raise ValueError(
                    "mode must be one of {!r} (got {!r})"
                    .format(valid_filemodes + list(mode_equivalents.keys()), mode)
                ) from None

        # A freshly created file has no size to infer the shape from.
        if mode == 'w+' and shape is None:
            raise ValueError("shape must be given")

        if hasattr(filename, 'read'):
            # Caller-supplied file object: use it without closing it.
            f_ctx = nullcontext(filename)
        else:
            # Copy-on-write never touches the file, so open it read-only.
            f_ctx = open(os_fspath(filename), ('r' if mode == 'c' else mode)+'b')

        with f_ctx as fid:
            # Seek to the end to learn the current file length.
            fid.seek(0, 2)
            flen = fid.tell()
            descr = dtypedescr(dtype)
            _dbytes = descr.itemsize

            if shape is None:
                # Infer a 1-D shape from the bytes available after `offset`.
                bytes = flen - offset
                if bytes % _dbytes:
                    raise ValueError("Size of available data is not a "
                                     "multiple of the data-type size.")
                size = bytes // _dbytes
                shape = (size,)
            else:
                # Accept a single integer-like value or any sequence of
                # ints.  Wrapping a list blindly as ``(shape,)`` would make
                # the size computation below fail.
                if not isinstance(shape, (tuple, list)):
                    try:
                        shape = (operator.index(shape),)
                    except TypeError:
                        # Not integer-like: treat it as an iterable of dims.
                        pass
                shape = tuple(shape)
                size = np.intp(1)  # avoid default choice of np.int_, which might overflow
                for k in shape:
                    size *= k

            bytes = int(offset + size*_dbytes)

            if mode in ('w+', 'r+'):
                # ``mmap`` cannot map an empty file, so always request at
                # least one byte; this is what makes empty memmaps possible.
                bytes = max(bytes, 1)
                if flen < bytes:
                    # Extend the file by writing a single byte at the new end.
                    fid.seek(bytes - 1, 0)
                    fid.write(b'\0')
                    fid.flush()

            if mode == 'c':
                acc = mmap.ACCESS_COPY
            elif mode == 'r':
                acc = mmap.ACCESS_READ
            else:
                acc = mmap.ACCESS_WRITE

            # The mmap offset must be a multiple of the allocation
            # granularity: map from the preceding boundary and skip the
            # remainder inside the buffer via `array_offset`.
            start = offset - offset % mmap.ALLOCATIONGRANULARITY
            bytes -= start
            array_offset = offset - start
            mm = mmap.mmap(fid.fileno(), bytes, access=acc, offset=start)

            self = ndarray.__new__(subtype, shape, dtype=descr, buffer=mm,
                                   offset=array_offset, order=order)
            self._mmap = mm
            self.offset = offset
            self.mode = mode

            if is_pathlib_path(filename):
                # special case - if we were constructed with a pathlib.path,
                # then filename is a path object, not a string
                self.filename = filename.resolve()
            elif hasattr(fid, "name") and isinstance(fid.name, str):
                # py3 returns int for TemporaryFile().name
                self.filename = os.path.abspath(fid.name)
            # same as memmap copies (e.g. memmap + 1)
            else:
                self.filename = None

        return self

    def __array_finalize__(self, obj):
        """Propagate or reset the mapping metadata on a new instance.

        The metadata (``_mmap``, ``filename``, ``offset``, ``mode``) is
        copied from `obj` only when `obj` carries a ``_mmap`` attribute and
        may share memory with `self`; otherwise all four are set to ``None``.
        """
        if hasattr(obj, '_mmap') and np.may_share_memory(self, obj):
            self._mmap = obj._mmap
            self.filename = obj.filename
            self.offset = obj.offset
            self.mode = obj.mode
        else:
            self._mmap = None
            self.filename = None
            self.offset = None
            self.mode = None

    def flush(self):
        """
        Write any changes in the array to the file on disk.

        For further information, see `memmap`.

        Parameters
        ----------
        None

        See Also
        --------
        memmap

        """
        # Delegate to the base object when it exposes ``flush``; otherwise
        # there is nothing to do.
        if self.base is not None and hasattr(self.base, 'flush'):
            self.base.flush()

    def __array_wrap__(self, arr, context=None):
        """Wrap a ufunc result, demoting it unless it is `self` itself."""
        arr = super().__array_wrap__(arr, context)

        # Return a memmap if a memmap was given as the output of the
        # ufunc. Leave the arr class unchanged if self is not a memmap
        # to keep original memmap subclasses behavior
        if self is arr or type(self) is not memmap:
            return arr
        # Return scalar instead of 0d memmap, e.g. for np.sum with
        # axis=None
        if arr.shape == ():
            return arr[()]
        # Return ndarray otherwise
        return arr.view(np.ndarray)

    def __getitem__(self, index):
        """Index the array, returning a plain ndarray for unmapped results."""
        res = super().__getitem__(index)
        # A memmap result whose ``_mmap`` is None is not backed by the
        # mapping, so hand it back as a plain ndarray.
        if type(res) is memmap and res._mmap is None:
            return res.view(type=ndarray)
        return res