1"""
2A buffered iterator for big arrays.
3
4This module solves the problem of iterating over a big file-based array
5without having to read it into memory. The `Arrayterator` class wraps
6an array object, and when iterated it will return sub-arrays with at most
7a user-specified number of elements.
8
9"""
10from operator import mul
11from functools import reduce
12
13__all__ = ['Arrayterator']
14
15
class Arrayterator:
    """
    Buffered iterator for big arrays.

    `Arrayterator` creates a buffered iterator for reading big arrays in small
    contiguous blocks. The class is useful for objects stored in the
    file system. It allows iteration over the object *without* reading
    everything in memory; instead, small blocks are read and iterated over.

    `Arrayterator` can be used with any object that supports multidimensional
    slices. This includes NumPy arrays, but also variables from
    Scientific.IO.NetCDF or pynetcdf for example.

    Parameters
    ----------
    var : array_like
        The object to iterate over. It must expose a ``shape`` attribute and
        accept a tuple of slices as an index.
    buf_size : int, optional
        The buffer size. If `buf_size` is supplied, the maximum amount of
        data that will be read into memory is `buf_size` elements.
        Default is None, which will read as many elements as possible
        into memory. A value of 0 is treated like None.

    Attributes
    ----------
    var
    buf_size
    start
    stop
    step
    shape
    flat

    See Also
    --------
    numpy.ndenumerate : Multidimensional array iterator.
    numpy.flatiter : Flat array iterator.
    numpy.memmap : Create a memory-map to an array stored
                   in a binary file on disk.

    Notes
    -----
    The algorithm works by first finding a "running dimension", along which
    the blocks will be extracted. The search starts at the last (fastest
    varying) dimension. Given an array of dimensions ``(d1, d2, ..., dn)``,
    if `buf_size` is smaller than ``dn`` the last dimension will be used.
    If, on the other hand, ``dn <= buf_size < dn*d(n-1)`` the second-to-last
    dimension will be used, and so on. Blocks are extracted along this
    dimension, and when the last block is returned the process continues
    from the next dimension, until all elements have been read.

    Indexing an `Arrayterator` returns a new `Arrayterator` restricted to the
    selected region. An integer index keeps its dimension with length one,
    and only positive slice steps are supported.

    Examples
    --------
    >>> import numpy as np
    >>> a = np.arange(3 * 4 * 5 * 6).reshape(3, 4, 5, 6)
    >>> a_itor = np.lib.Arrayterator(a, 2)
    >>> a_itor.shape
    (3, 4, 5, 6)

    Now we can iterate over ``a_itor``, and it will return arrays of size
    two. Since `buf_size` was smaller than any dimension, the blocks are
    built along the last dimension:

    >>> for subarr in a_itor:
    ...     if not subarr.all():
    ...         print(subarr, subarr.shape) # doctest: +SKIP
    >>> # [[[[0 1]]]] (1, 1, 1, 2)

    """

    __module__ = "numpy.lib"

    def __init__(self, var, buf_size=None):
        self.var = var
        self.buf_size = buf_size

        # The iterated region is described per dimension by absolute
        # (start, stop, step) triplets into `var`; initially it is all of it.
        ndim = len(var.shape)
        self.start = [0] * ndim
        self.stop = list(var.shape)
        self.step = [1] * ndim

    def __getattr__(self, attr):
        """
        Delegate unknown attribute lookups to the wrapped object.

        """
        # `__getattr__` only runs when normal lookup failed.  If `var` itself
        # is missing (e.g. on the bare instance created by `copy`/`pickle`
        # before its state is restored), delegating would look up `self.var`
        # again and recurse without end.
        if attr == "var":
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute 'var'")
        return getattr(self.var, attr)

    @staticmethod
    def _normalize_index(item, length, axis):
        """
        Convert one index item into a ``(start, stop, step)`` triplet.

        The triplet is relative to the current view, i.e. it is expressed in
        element positions ``0 .. length`` of dimension `axis`, with
        ``start <= stop`` and ``step >= 1``.

        Raises
        ------
        IndexError
            If `item` is an out-of-bounds integer or not an integer/slice.
        ValueError
            If `item` is a slice with a zero or negative step.

        """
        if isinstance(item, slice):
            # `slice.indices` resolves None, negative and out-of-range
            # bounds; it raises ValueError for a zero step.
            rel_start, rel_stop, rel_step = item.indices(length)
            if rel_step < 0:
                raise ValueError(
                    "Arrayterator does not support negative slice steps")
            return rel_start, max(rel_start, rel_stop), rel_step

        # Indexing a `range` validates the integer for us: it accepts any
        # object implementing `__index__`, wraps negative values and
        # rejects out-of-bounds positions.
        try:
            position = range(length)[item]
        except TypeError:
            raise IndexError(
                "only integers, slices (`:`) and ellipsis (`...`) are "
                "valid Arrayterator indices") from None
        except IndexError:
            raise IndexError(
                f"index {item} is out of bounds for axis {axis} "
                f"with size {length}") from None
        return position, position + 1, 1

    def __getitem__(self, index):
        """
        Return a new arrayterator.

        Parameters
        ----------
        index : int, slice, Ellipsis or tuple of those
            Selection relative to the current view. Missing trailing
            dimensions are selected completely.

        Returns
        -------
        Arrayterator
            A new object of the same class over the same `var` and
            `buf_size`, restricted to the selected region.

        Raises
        ------
        IndexError
            For too many indices, more than one ellipsis, an out-of-bounds
            integer or an unsupported index type.
        ValueError
            For a zero or negative slice step.

        """
        if not isinstance(index, tuple):
            index = (index,)
        ndim = len(self.start)

        ellipsis_count = sum(1 for item in index if item is Ellipsis)
        if ellipsis_count > 1:
            raise IndexError(
                "an index can only have a single ellipsis ('...')")
        explicit = len(index) - ellipsis_count
        if explicit > ndim:
            raise IndexError(
                f"too many indices: Arrayterator is {ndim}-dimensional, "
                f"but {explicit} were indexed")

        # Expand the ellipsis and pad incomplete indices with full slices.
        expanded = []
        for item in index:
            if item is Ellipsis:
                expanded.extend([slice(None)] * (ndim - explicit))
            else:
                expanded.append(item)
        expanded.extend([slice(None)] * (ndim - len(expanded)))

        # Return a new arrayterator object.
        out = self.__class__(self.var, self.buf_size)
        for axis, (start, stop, step, length, item) in enumerate(
                zip(self.start, self.stop, self.step, self.shape, expanded)):
            rel_start, rel_stop, rel_step = self._normalize_index(
                item, length, axis)
            # Relative positions count elements of the current view, so they
            # are scaled by the current step to get offsets into `var`.
            out.start[axis] = start + rel_start * step
            out.step[axis] = step * rel_step
            out.stop[axis] = max(out.start[axis],
                                 min(stop, start + rel_stop * step))
        return out

    def __array__(self, dtype=None, copy=None):
        """
        Return corresponding data.

        `dtype` and `copy` are accepted for compatibility with the array
        protocol signature but are not used; the result is whatever slicing
        `var` with the current region returns.

        """
        slice_ = tuple(slice(*t) for t in zip(
                self.start, self.stop, self.step))
        return self.var[slice_]

    @property
    def flat(self):
        """
        A 1-D flat iterator for Arrayterator objects.

        This iterator returns elements of the array to be iterated over in
        `~lib.Arrayterator` one by one.
        It is similar to `flatiter`.

        See Also
        --------
        lib.Arrayterator
        flatiter

        Examples
        --------
        >>> import numpy as np
        >>> a = np.arange(3 * 4 * 5 * 6).reshape(3, 4, 5, 6)
        >>> a_itor = np.lib.Arrayterator(a, 2)

        >>> for subarr in a_itor.flat:
        ...     if not subarr:
        ...         print(subarr, type(subarr))
        ...
        0 <class 'numpy.int64'>

        """
        for block in self:
            yield from block.flat

    @property
    def shape(self):
        """
        The shape of the array to be iterated over.

        Empty dimensions are reported with length 0.

        For an example, see `Arrayterator`.

        """
        return tuple(max(0, (stop - start - 1) // step + 1)
                     for start, stop, step in
                     zip(self.start, self.stop, self.step))

    def __iter__(self):
        """
        Yield consecutive blocks of `var` with at most `buf_size` elements.

        Raises
        ------
        ValueError
            If `buf_size` is negative.

        """
        # `shape` is a computed property; evaluate it once for the whole
        # iteration instead of once per dimension per block.
        shape = self.shape

        # Skip arrays with degenerate dimensions
        if any(dim <= 0 for dim in shape):
            return

        ndims = len(self.start)
        if ndims == 0:
            # A 0-d object has a single element and nothing to step along.
            yield self.var[()]
            return

        # A falsy buffer size (None or 0) means "read everything at once".
        buf_size = self.buf_size or reduce(mul, shape, 1)
        if buf_size < 0:
            # A negative size would move the read position backwards and
            # never reach the end.
            raise ValueError("buf_size must be a non-negative integer or None")

        start = self.start[:]
        stop = self.stop[:]
        step = self.step[:]

        while True:
            count = buf_size

            # iterate over each dimension, looking for the
            # running dimension (ie, the dimension along which
            # the blocks will be built from)
            rundim = 0
            for i in range(ndims - 1, -1, -1):
                # if count is zero we ran out of elements to read
                # along higher dimensions, so we read only a single position
                if count == 0:
                    stop[i] = start[i] + 1
                elif count <= shape[i]:
                    # limit along this dimension
                    stop[i] = start[i] + count * step[i]
                    rundim = i
                else:
                    # read everything along this dimension
                    stop[i] = self.stop[i]
                stop[i] = min(self.stop[i], stop[i])
                count = count // shape[i]

            # yield a block
            slice_ = tuple(slice(*t) for t in zip(start, stop, step))
            yield self.var[slice_]

            # Update start position, taking care of overflow to
            # other dimensions
            start[rundim] = stop[rundim]  # start where we stopped
            for i in range(ndims - 1, 0, -1):
                if start[i] >= self.stop[i]:
                    start[i] = self.start[i]
                    start[i - 1] += self.step[i - 1]
            if start[0] >= self.stop[0]:
                return