# Copyright (c) 2013, Mahmoud Hashemi
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials provided
# with the distribution.
#
# * The names of the contributors may not be used to endorse or
# promote products derived from this software without specific
# prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

""":mod:`itertools` is full of great examples of Python generator
usage. However, there are still some critical gaps. ``iterutils``
fills many of those gaps with featureful, tested, and Pythonic
solutions.

Many of the functions below have two versions: one that returns an
iterator (denoted by the ``*_iter`` naming pattern), and a
shorter-named convenience form that returns a list. Some of the
following are based on examples from the itertools docs.
"""

import os
import math
import time
import codecs
import random
import itertools
from itertools import zip_longest
from collections.abc import Mapping, Sequence, Set, ItemsView, Iterable


try:
    from .typeutils import make_sentinel
    _UNSET = make_sentinel('_UNSET')
    _REMAP_EXIT = make_sentinel('_REMAP_EXIT')
except ImportError:
    _REMAP_EXIT = object()
    _UNSET = object()


def is_iterable(obj):
    """Similar in nature to :func:`callable`, ``is_iterable`` returns
    ``True`` if an object is `iterable`_, ``False`` if not.

    >>> is_iterable([])
    True
    >>> is_iterable(object())
    False

    .. _iterable: https://docs.python.org/3/glossary.html#term-iterable
    """
    try:
        iter(obj)
    except TypeError:
        return False
    return True


def is_scalar(obj):
    """A near-mirror of :func:`is_iterable`. Returns ``False`` if an
    object is an iterable container type. Strings are considered
    scalar as well, because strings are more often treated as whole
    values as opposed to iterables of 1-character substrings.

    >>> is_scalar(object())
    True
    >>> is_scalar(range(10))
    False
    >>> is_scalar('hello')
    True
    """
    return not is_iterable(obj) or isinstance(obj, (str, bytes))


def is_collection(obj):
    """The opposite of :func:`is_scalar`. Returns ``True`` if an object
    is an iterable other than a string.

    >>> is_collection(object())
    False
    >>> is_collection(range(10))
    True
    >>> is_collection('hello')
    False
    """
    return is_iterable(obj) and not isinstance(obj, (str, bytes))


def split(src, sep=None, maxsplit=None):
    """Splits an iterable based on a separator. Like :meth:`str.split`,
    but for all iterables. Returns a list of lists.

    >>> split(['hi', 'hello', None, None, 'sup', None, 'soap', None])
    [['hi', 'hello'], ['sup'], ['soap']]

    See :func:`split_iter` docs for more info.
    """
    return list(split_iter(src, sep, maxsplit))


def split_iter(src, sep=None, maxsplit=None):
    """Splits an iterable based on a separator, *sep*, a max of
    *maxsplit* times (no max by default). *sep* can be:

    * a single value
    * an iterable of separators
    * a single-argument callable that returns True when a separator is
      encountered

    ``split_iter()`` yields lists of non-separator values. A separator will
    never appear in the output.

    >>> list(split_iter(['hi', 'hello', None, None, 'sup', None, 'soap', None]))
    [['hi', 'hello'], ['sup'], ['soap']]

    Note that ``split_iter`` is based on :meth:`str.split`, so if
    *sep* is ``None``, ``split_iter()`` **groups** separators. If empty lists
    are desired between two contiguous ``None`` values, simply use
    ``sep=[None]``:

    >>> list(split_iter(['hi', 'hello', None, None, 'sup', None]))
    [['hi', 'hello'], ['sup']]
    >>> list(split_iter(['hi', 'hello', None, None, 'sup', None], sep=[None]))
    [['hi', 'hello'], [], ['sup'], []]

    Using a callable separator:

    >>> falsy_sep = lambda x: not x
    >>> list(split_iter(['hi', 'hello', None, '', 'sup', False], falsy_sep))
    [['hi', 'hello'], [], ['sup'], []]

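    A quick *maxsplit* example; once the limit is reached, remaining
    separators are treated as ordinary values:

    >>> list(split_iter(['a', None, 'b', None, 'c'], maxsplit=1))
    [['a'], ['b', None, 'c']]
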
    See :func:`split` for a list-returning version.

    """
    if not is_iterable(src):
        raise TypeError('expected an iterable')

    if maxsplit is not None:
        maxsplit = int(maxsplit)
        if maxsplit == 0:
            # like str.split() with maxsplit=0: a single group with everything
            yield list(src)
            return

    if callable(sep):
        sep_func = sep
    elif not is_scalar(sep):
        sep = frozenset(sep)
        def sep_func(x): return x in sep
    else:
        def sep_func(x): return x == sep

    cur_group = []
    split_count = 0
    for s in src:
        if maxsplit is not None and split_count >= maxsplit:
            def sep_func(x): return False
        if sep_func(s):
            if sep is None and not cur_group:
                # If sep is None, str.split() "groups" separators
                # check the str.split() docs for more info
                continue
            split_count += 1
            yield cur_group
            cur_group = []
        else:
            cur_group.append(s)

    if cur_group or sep is not None:
        yield cur_group
    return


def lstrip(iterable, strip_value=None):
    """Strips values from the beginning of an iterable. Stripped items will
    match the value of the argument strip_value. Functionality is analogous
    to that of the method str.lstrip. Returns a list.

    >>> lstrip(['Foo', 'Bar', 'Bam'], 'Foo')
    ['Bar', 'Bam']

    """
    return list(lstrip_iter(iterable, strip_value))


def lstrip_iter(iterable, strip_value=None):
    """Strips values from the beginning of an iterable. Stripped items will
    match the value of the argument strip_value. Functionality is analogous
    to that of the method str.lstrip. Returns a generator.

    >>> list(lstrip_iter(['Foo', 'Bar', 'Bam'], 'Foo'))
    ['Bar', 'Bam']

    """
    iterator = iter(iterable)
    for i in iterator:
        if i != strip_value:
            yield i
            break
    for i in iterator:
        yield i


def rstrip(iterable, strip_value=None):
    """Strips values from the end of an iterable. Stripped items will
    match the value of the argument strip_value. Functionality is analogous
    to that of the method str.rstrip. Returns a list.

    >>> rstrip(['Foo', 'Bar', 'Bam'], 'Bam')
    ['Foo', 'Bar']

    """
    return list(rstrip_iter(iterable, strip_value))


def rstrip_iter(iterable, strip_value=None):
    """Strips values from the end of an iterable. Stripped items will
    match the value of the argument strip_value. Functionality is analogous
    to that of the method str.rstrip. Returns a generator.

    >>> list(rstrip_iter(['Foo', 'Bar', 'Bam'], 'Bam'))
    ['Foo', 'Bar']

    """
    iterator = iter(iterable)
    for i in iterator:
        if i == strip_value:
            cache = list()
            cache.append(i)
            broken = False
            for i in iterator:
                if i == strip_value:
                    cache.append(i)
                else:
                    broken = True
                    break
            if not broken:  # Return to caller here because the end of the
                return      # iterator has been reached
            yield from cache
        yield i


def strip(iterable, strip_value=None):
    """Strips values from the beginning and end of an iterable. Stripped items
    will match the value of the argument strip_value. Functionality is
    analogous to that of the method str.strip. Returns a list.

    >>> strip(['Fu', 'Foo', 'Bar', 'Bam', 'Fu'], 'Fu')
    ['Foo', 'Bar', 'Bam']

    """
    return list(strip_iter(iterable, strip_value))


def strip_iter(iterable, strip_value=None):
    """Strips values from the beginning and end of an iterable. Stripped items
    will match the value of the argument strip_value. Functionality is
    analogous to that of the method str.strip. Returns a generator.

    >>> list(strip_iter(['Fu', 'Foo', 'Bar', 'Bam', 'Fu'], 'Fu'))
    ['Foo', 'Bar', 'Bam']

    """
    return rstrip_iter(lstrip_iter(iterable, strip_value), strip_value)


def chunked(src, size, count=None, **kw):
    """Returns a list of *count* chunks, each with *size* elements,
    generated from iterable *src*. If *src* is not evenly divisible by
    *size*, the final chunk will have fewer than *size* elements.
    Pass the *fill* keyword argument to supply a pad value and
    enable padding; otherwise, no padding will take place.

    >>> chunked(range(10), 3)
    [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
    >>> chunked(range(10), 3, fill=None)
    [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, None, None]]
    >>> chunked(range(10), 3, count=2)
    [[0, 1, 2], [3, 4, 5]]

    See :func:`chunked_iter` for more info.
    """
    chunk_iter = chunked_iter(src, size, **kw)
    if count is None:
        return list(chunk_iter)
    else:
        return list(itertools.islice(chunk_iter, count))


def _validate_positive_int(value, name, strictly_positive=True):
    value = int(value)
    if value < 0 or (strictly_positive and value == 0):
        raise ValueError('expected a positive integer ' + name)
    return value


def chunked_iter(src, size, **kw):
    """Generates *size*-sized chunks from *src* iterable. Unless the
    optional *fill* keyword argument is provided, iterables not evenly
    divisible by *size* will have a final chunk that is smaller than
    *size*.

    >>> list(chunked_iter(range(10), 3))
    [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
    >>> list(chunked_iter(range(10), 3, fill=None))
    [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, None, None]]

    Note that ``fill=None`` in fact uses ``None`` as the fill value.
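
    If *src* is a string or bytes object, chunks are joined back into
    the same type before being yielded:

    >>> list(chunked_iter('abcdefg', 3))
    ['abc', 'def', 'g']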
    """
    # TODO: add count kwarg?
    if not is_iterable(src):
        raise TypeError('expected an iterable')
    size = _validate_positive_int(size, 'chunk size')
    do_fill = True
    try:
        fill_val = kw.pop('fill')
    except KeyError:
        do_fill = False
        fill_val = None
    if kw:
        raise ValueError('got unexpected keyword arguments: %r' % kw.keys())
    if not src:
        return

    def postprocess(chk): return chk
    if isinstance(src, (str, bytes)):
        def postprocess(chk, _sep=type(src)()): return _sep.join(chk)
        if isinstance(src, bytes):
            def postprocess(chk): return bytes(chk)
    src_iter = iter(src)
    while True:
        cur_chunk = list(itertools.islice(src_iter, size))
        if not cur_chunk:
            break
        lc = len(cur_chunk)
        if lc < size and do_fill:
            cur_chunk[lc:] = [fill_val] * (size - lc)
        yield postprocess(cur_chunk)
    return


def chunk_ranges(input_size, chunk_size, input_offset=0, overlap_size=0, align=False):
    """Generates *chunk_size*-sized chunk ranges for an input with length *input_size*.
    Optionally, a start of the input can be set via *input_offset*, and
    an overlap between the chunks may be specified via *overlap_size*.
    Also, if *align* is set to *True*, any items with *i % (chunk_size-overlap_size) == 0*
    are always at the beginning of the chunk.

    Returns an iterator of (start, end) tuples, one tuple per chunk.

    >>> list(chunk_ranges(input_offset=10, input_size=10, chunk_size=5))
    [(10, 15), (15, 20)]
    >>> list(chunk_ranges(input_offset=10, input_size=10, chunk_size=5, overlap_size=1))
    [(10, 15), (14, 19), (18, 20)]
    >>> list(chunk_ranges(input_offset=10, input_size=10, chunk_size=5, overlap_size=2))
    [(10, 15), (13, 18), (16, 20)]

    >>> list(chunk_ranges(input_offset=4, input_size=15, chunk_size=5, align=False))
    [(4, 9), (9, 14), (14, 19)]
    >>> list(chunk_ranges(input_offset=4, input_size=15, chunk_size=5, align=True))
    [(4, 5), (5, 10), (10, 15), (15, 19)]

    >>> list(chunk_ranges(input_offset=2, input_size=15, chunk_size=5, overlap_size=1, align=False))
    [(2, 7), (6, 11), (10, 15), (14, 17)]
    >>> list(chunk_ranges(input_offset=2, input_size=15, chunk_size=5, overlap_size=1, align=True))
    [(2, 5), (4, 9), (8, 13), (12, 17)]
    >>> list(chunk_ranges(input_offset=3, input_size=15, chunk_size=5, overlap_size=1, align=True))
    [(3, 5), (4, 9), (8, 13), (12, 17), (16, 18)]
    """
    input_size = _validate_positive_int(
        input_size, 'input_size', strictly_positive=False)
    chunk_size = _validate_positive_int(chunk_size, 'chunk_size')
    input_offset = _validate_positive_int(
        input_offset, 'input_offset', strictly_positive=False)
    overlap_size = _validate_positive_int(
        overlap_size, 'overlap_size', strictly_positive=False)

    input_stop = input_offset + input_size

    if align:
        initial_chunk_len = chunk_size - \
            input_offset % (chunk_size - overlap_size)
        if initial_chunk_len != overlap_size:
            yield (input_offset, min(input_offset + initial_chunk_len, input_stop))
        if input_offset + initial_chunk_len >= input_stop:
            return
        input_offset = input_offset + initial_chunk_len - overlap_size

    for i in range(input_offset, input_stop, chunk_size - overlap_size):
        yield (i, min(i + chunk_size, input_stop))

        if i + chunk_size >= input_stop:
            return


def pairwise(src, end=_UNSET):
    """Convenience function for calling :func:`windowed` on *src*, with
    *size* set to 2.

    >>> pairwise(range(5))
    [(0, 1), (1, 2), (2, 3), (3, 4)]
    >>> pairwise([])
    []

    Unless *end* is set, the number of pairs is always one less than
    the number of elements in the iterable passed in, except on an empty input,
    which will return an empty list.

    With *end* set, a number of pairs equal to the length of *src* is returned,
    with the last item of the last pair being equal to *end*.

    >>> list(pairwise(range(3), end=None))
    [(0, 1), (1, 2), (2, None)]

    This way, *end* values can be useful as sentinels to signal the end of the iterable.
    """
    return windowed(src, 2, fill=end)


def pairwise_iter(src, end=_UNSET):
    """Convenience function for calling :func:`windowed_iter` on *src*,
    with *size* set to 2.

    >>> list(pairwise_iter(range(5)))
    [(0, 1), (1, 2), (2, 3), (3, 4)]
    >>> list(pairwise_iter([]))
    []

    Unless *end* is set, the number of pairs is always one less
    than the number of elements in the iterable passed in,
    or zero, when *src* is empty.

    With *end* set, a number of pairs equal to the length of *src* is returned,
    with the last item of the last pair being equal to *end*.

    >>> list(pairwise_iter(range(3), end=None))
    [(0, 1), (1, 2), (2, None)]

    This way, *end* values can be useful as sentinels to signal the end
    of the iterable. For infinite iterators, setting *end* has no effect.
    """
    return windowed_iter(src, 2, fill=end)


def windowed(src, size, fill=_UNSET):
    """Returns tuples with exactly length *size*. If *fill* is unset
    and the iterable is too short to make a window of length *size*,
    no tuples are returned. See :func:`windowed_iter` for more.
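
    >>> windowed(range(5), 3)
    [(0, 1, 2), (1, 2, 3), (2, 3, 4)]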
    """
    return list(windowed_iter(src, size, fill=fill))


def windowed_iter(src, size, fill=_UNSET):
    """Returns tuples with length *size* which represent a sliding
    window over iterable *src*.

    >>> list(windowed_iter(range(7), 3))
    [(0, 1, 2), (1, 2, 3), (2, 3, 4), (3, 4, 5), (4, 5, 6)]

    If *fill* is unset, and the iterable is too short to make a window
    of length *size*, then no window tuples are returned.

    >>> list(windowed_iter(range(3), 5))
    []

    With *fill* set, the iterator always yields a number of windows
    equal to the length of the *src* iterable.

    >>> windowed(range(4), 3, fill=None)
    [(0, 1, 2), (1, 2, 3), (2, 3, None), (3, None, None)]

    This way, *fill* values can be useful to signal the end of the iterable.
    For infinite iterators, setting *fill* has no effect.
    """
    tees = itertools.tee(src, size)
    if fill is _UNSET:
        try:
            for i, t in enumerate(tees):
                for _ in range(i):
                    next(t)
        except StopIteration:
            return zip([])
        return zip(*tees)

    for i, t in enumerate(tees):
        for _ in range(i):
            try:
                next(t)
            except StopIteration:
                continue
    return zip_longest(*tees, fillvalue=fill)


def xfrange(stop, start=None, step=1.0):
    """Same as :func:`frange`, but generator-based instead of returning a
    list.

    >>> tuple(xfrange(1, 3, step=0.75))
    (1.0, 1.75, 2.5)

    See :func:`frange` for more details.
    """
    if not step:
        raise ValueError('step must be non-zero')
    if start is None:
        start, stop = 0.0, stop * 1.0
    else:
        # swap when all args are used
        stop, start = start * 1.0, stop * 1.0
    cur = start
    while cur < stop:
        yield cur
        cur += step


def frange(stop, start=None, step=1.0):
    """A :func:`range` clone for float-based ranges.

    >>> frange(5)
    [0.0, 1.0, 2.0, 3.0, 4.0]
    >>> frange(6, step=1.25)
    [0.0, 1.25, 2.5, 3.75, 5.0]
    >>> frange(100.5, 101.5, 0.25)
    [100.5, 100.75, 101.0, 101.25]
    >>> frange(5, 0)
    []
    >>> frange(5, 0, step=-1.25)
    [5.0, 3.75, 2.5, 1.25]
    """
    if not step:
        raise ValueError('step must be non-zero')
    if start is None:
        start, stop = 0.0, stop * 1.0
    else:
        # swap when all args are used
        stop, start = start * 1.0, stop * 1.0
    count = int(math.ceil((stop - start) / step))
    ret = [None] * count
    if not ret:
        return ret
    ret[0] = start
    for i in range(1, count):
        ret[i] = ret[i - 1] + step
    return ret


def backoff(start, stop, count=None, factor=2.0, jitter=False):
    """Returns a list of geometrically-increasing floating-point numbers,
    suitable for usage with `exponential backoff`_. Exactly like
    :func:`backoff_iter`, but without the ``'repeat'`` option for
    *count*. See :func:`backoff_iter` for more details.

    .. _exponential backoff: https://en.wikipedia.org/wiki/Exponential_backoff

    >>> backoff(1, 10)
    [1.0, 2.0, 4.0, 8.0, 10.0]
    """
    if count == 'repeat':
        raise ValueError("'repeat' supported in backoff_iter, not backoff")
    return list(backoff_iter(start, stop, count=count,
                             factor=factor, jitter=jitter))


def backoff_iter(start, stop, count=None, factor=2.0, jitter=False):
    """Generates a sequence of geometrically-increasing floats, suitable
    for usage with `exponential backoff`_. Starts with *start*,
    increasing by *factor* until *stop* is reached, optionally
    stopping iteration once *count* numbers are yielded. *factor*
    defaults to 2. In general, retrying with properly-configured
    backoff creates a better-behaved component for a larger service
    ecosystem.

    .. _exponential backoff: https://en.wikipedia.org/wiki/Exponential_backoff

    >>> list(backoff_iter(1.0, 10.0, count=5))
    [1.0, 2.0, 4.0, 8.0, 10.0]
    >>> list(backoff_iter(1.0, 10.0, count=8))
    [1.0, 2.0, 4.0, 8.0, 10.0, 10.0, 10.0, 10.0]
    >>> list(backoff_iter(0.25, 100.0, factor=10))
    [0.25, 2.5, 25.0, 100.0]

    A simplified usage example:

    .. code-block:: python

        for timeout in backoff_iter(0.25, 5.0):
            try:
                res = network_call()
                break
            except Exception as e:
                log(e)
                time.sleep(timeout)

    An enhancement for large-scale systems would be to add variation,
    or *jitter*, to timeout values. This is done to avoid a thundering
    herd on the receiving end of the network call.

    Finally, for *count*, the special value ``'repeat'`` can be passed to
    continue yielding indefinitely.

    Args:

        start (float): Positive number for baseline.
        stop (float): Positive number for maximum.
        count (int): Number of steps before stopping
            iteration. Defaults to the number of steps between *start* and
            *stop*. Pass the string `'repeat'` to continue iteration
            indefinitely.
        factor (float): Rate of exponential increase. Defaults to `2.0`,
            e.g., `[1, 2, 4, 8, 16]`.
        jitter (float): A factor between `-1.0` and `1.0`, used to
            uniformly randomize and thus spread out timeouts in a distributed
            system, avoiding rhythm effects. Positive values use the base
            backoff curve as a maximum, negative values use the curve as a
            minimum. Set to 1.0 or `True` for a jitter approximating
            Ethernet's time-tested backoff solution. Defaults to `False`.

    """
    start = float(start)
    stop = float(stop)
    factor = float(factor)
    if start < 0.0:
        raise ValueError('expected start >= 0, not %r' % start)
    if factor < 1.0:
        raise ValueError('expected factor >= 1.0, not %r' % factor)
    if stop == 0.0:
        raise ValueError('expected stop greater than zero, not %r' % stop)
    if stop < start:
        raise ValueError('expected stop >= start, not %r' % stop)
    if count is None:
        denom = start if start else 1
        count = 1 + math.ceil(math.log(stop/denom, factor))
        count = count if start else count + 1
    if count != 'repeat' and count < 0:
        raise ValueError('count must be positive or "repeat", not %r' % count)
    if jitter:
        jitter = float(jitter)
        if not (-1.0 <= jitter <= 1.0):
            raise ValueError('expected jitter -1 <= j <= 1, not: %r' % jitter)

    cur, i = start, 0
    while count == 'repeat' or i < count:
        if not jitter:
            cur_ret = cur
        else:
            cur_ret = cur - (cur * jitter * random.random())
        yield cur_ret
        i += 1
        if cur == 0:
            cur = 1
        elif cur < stop:
            cur *= factor
        if cur > stop:
            cur = stop
    return


def bucketize(src, key=bool, value_transform=None, key_filter=None):
    """Group values in the *src* iterable by the value returned by *key*.

    >>> bucketize(range(5))
    {False: [0], True: [1, 2, 3, 4]}
    >>> is_odd = lambda x: x % 2 == 1
    >>> bucketize(range(5), is_odd)
    {False: [0, 2, 4], True: [1, 3]}

    *key* is :class:`bool` by default, but can be a callable, a string,
    or a list. If it is a string, it is the name of the attribute on
    which to bucketize objects.

    >>> bucketize([1+1j, 2+2j, 1, 2], key='real')
    {1.0: [(1+1j), 1], 2.0: [(2+2j), 2]}

    If *key* is a list, it contains the bucket for each corresponding
    object:

    >>> bucketize([1, 2, 365, 4, 98], key=[0, 1, 2, 0, 2])
    {0: [1, 4], 1: [2], 2: [365, 98]}

    Value lists are not deduplicated:

    >>> bucketize([None, None, None, 'hello'])
    {False: [None, None, None], True: ['hello']}

    Bucketize into more than two groups:

    >>> bucketize(range(10), lambda x: x % 3)
    {0: [0, 3, 6, 9], 1: [1, 4, 7], 2: [2, 5, 8]}

    ``bucketize`` has a couple of advanced options useful in certain
    cases. *value_transform* can be used to modify values as they are
    added to buckets, and *key_filter* will allow excluding certain
    buckets from being collected.

    >>> bucketize(range(5), value_transform=lambda x: x*x)
    {False: [0], True: [1, 4, 9, 16]}

    >>> bucketize(range(10), key=lambda x: x % 3, key_filter=lambda k: k % 3 != 1)
    {0: [0, 3, 6, 9], 2: [2, 5, 8]}

    Note that in some of these examples there were at most two keys, ``True`` and
    ``False``, and each key present has a list with at least one
    item. See :func:`partition` for a version specialized for binary
    use cases.

    """
    if not is_iterable(src):
        raise TypeError('expected an iterable')
    elif isinstance(key, list):
        if len(key) != len(src):
            raise ValueError("key and src have to be the same length")
        src = zip(key, src)

    if isinstance(key, str):
        def key_func(x): return getattr(x, key, x)
    elif callable(key):
        key_func = key
    elif isinstance(key, list):
        def key_func(x): return x[0]
    else:
        raise TypeError('expected key to be callable or a string or a list')

    if value_transform is None:
        def value_transform(x): return x
    if not callable(value_transform):
        raise TypeError('expected callable value transform function')
    if isinstance(key, list):
        f = value_transform
        def value_transform(x): return f(x[1])

    ret = {}
    for val in src:
        key_of_val = key_func(val)
        if key_filter is None or key_filter(key_of_val):
            ret.setdefault(key_of_val, []).append(value_transform(val))
    return ret


def partition(src, key=bool):
    """No relation to :meth:`str.partition`, ``partition`` is like
    :func:`bucketize`, but for added convenience returns a tuple of
    ``(truthy_values, falsy_values)``.

    >>> nonempty, empty = partition(['', '', 'hi', '', 'bye'])
    >>> nonempty
    ['hi', 'bye']

    *key* defaults to :class:`bool`, but can be carefully overridden to
    use either a function that returns ``True`` or ``False``, or a
    string name of the attribute on which to partition objects.

    >>> import string
    >>> is_digit = lambda x: x in string.digits
    >>> decimal_digits, hexletters = partition(string.hexdigits, is_digit)
    >>> ''.join(decimal_digits), ''.join(hexletters)
    ('0123456789', 'abcdefABCDEF')
    """
    bucketized = bucketize(src, key)
    return bucketized.get(True, []), bucketized.get(False, [])


def unique(src, key=None):
    """``unique()`` returns a list of unique values, as determined by
    *key*, in the order they first appeared in the input iterable,
    *src*.

    >>> ones_n_zeros = '11010110001010010101010'
    >>> ''.join(unique(ones_n_zeros))
    '10'

    See :func:`unique_iter` docs for more details.
    """
    return list(unique_iter(src, key))


def unique_iter(src, key=None):
    """Yield unique elements from the iterable, *src*, based on *key*,
    in the order in which they first appeared in *src*.

    >>> repetitious = [1, 2, 3] * 10
    >>> list(unique_iter(repetitious))
    [1, 2, 3]

    By default, *key* is the object itself, but *key* can either be a
    callable or, for convenience, a string name of the attribute on
    which to uniqueify objects, falling back on identity when the
    attribute is not present.

    >>> pleasantries = ['hi', 'hello', 'ok', 'bye', 'yes']
    >>> list(unique_iter(pleasantries, key=lambda x: len(x)))
    ['hi', 'hello', 'bye']
    """
    if not is_iterable(src):
        raise TypeError('expected an iterable, not %r' % type(src))
    if key is None:
        def key_func(x): return x
    elif callable(key):
        key_func = key
    elif isinstance(key, str):
        def key_func(x): return getattr(x, key, x)
    else:
        raise TypeError('"key" expected a string or callable, not %r' % key)
    seen = set()
    for i in src:
        k = key_func(i)
        if k not in seen:
            seen.add(k)
            yield i
    return


def redundant(src, key=None, groups=False):
    """The complement of :func:`unique()`.

    By default returns non-unique/duplicate values as a list of the
    *first* redundant value in *src*. Pass ``groups=True`` to get
    groups of all values with redundancies, ordered by position of the
    first redundant value. This is useful in conjunction with some
    normalizing *key* function.

    >>> redundant([1, 2, 3, 4])
    []
    >>> redundant([1, 2, 3, 2, 3, 3, 4])
    [2, 3]
    >>> redundant([1, 2, 3, 2, 3, 3, 4], groups=True)
    [[2, 2], [3, 3, 3]]

    An example using a *key* function to do case-insensitive
    redundancy detection.

    >>> redundant(['hi', 'Hi', 'HI', 'hello'], key=str.lower)
    ['Hi']
    >>> redundant(['hi', 'Hi', 'HI', 'hello'], groups=True, key=str.lower)
    [['hi', 'Hi', 'HI']]

    *key* should also be used when the values in *src* are not hashable.

    .. note::

        The output of this function is designed for reporting
        duplicates in contexts when a unique input is desired. Due to
        the grouped return type, there is no streaming equivalent of
        this function for the time being.

    """
    if key is None:
        pass
    elif callable(key):
        key_func = key
    elif isinstance(key, (str, bytes)):
        def key_func(x): return getattr(x, key, x)
    else:
        raise TypeError('"key" expected a string or callable, not %r' % key)
    seen = {}  # key to first seen item
    redundant_order = []
    redundant_groups = {}
    for i in src:
        k = key_func(i) if key else i
        if k not in seen:
            seen[k] = i
        else:
            if k in redundant_groups:
                if groups:
                    redundant_groups[k].append(i)
            else:
                redundant_order.append(k)
                redundant_groups[k] = [seen[k], i]
    if not groups:
        ret = [redundant_groups[k][1] for k in redundant_order]
    else:
        ret = [redundant_groups[k] for k in redundant_order]
    return ret


def one(src, default=None, key=None):
    """Along the same lines as builtins, :func:`all` and :func:`any`, and
    similar to :func:`first`, ``one()`` returns the single object in
    the given iterable *src* that evaluates to ``True``, as determined
    by callable *key*. If unset, *key* defaults to :class:`bool`. If
    no such objects are found, *default* is returned. If *default* is
    not passed, ``None`` is returned.

    If *src* has more than one object that evaluates to ``True``, or
    if there is no object that fulfills the condition, *default* is
    returned. It's like an `XOR`_ over an iterable.

    >>> one((True, False, False))
    True
    >>> one((True, False, True))
    >>> one((0, 0, 'a'))
    'a'
    >>> one((0, False, None))
    >>> one((True, True), default=False)
    False
    >>> bool(one(('', 1)))
    True
    >>> one((10, 20, 30, 42), key=lambda i: i > 40)
    42

    See `Martín Gaitán's original repo`_ for further use cases.

    .. _Martín Gaitán's original repo: https://github.com/mgaitan/one
    .. _XOR: https://en.wikipedia.org/wiki/Exclusive_or

    """
    ones = list(itertools.islice(filter(key, src), 2))
    return ones[0] if len(ones) == 1 else default


def first(iterable, default=None, key=None):
    """Return first element of *iterable* that evaluates to ``True``, else
    return ``None`` or optional *default*. Similar to :func:`one`.

    >>> first([0, False, None, [], (), 42])
    42
    >>> first([0, False, None, [], ()]) is None
    True
    >>> first([0, False, None, [], ()], default='ohai')
    'ohai'
    >>> import re
    >>> m = first(re.match(regex, 'abc') for regex in ['b.*', 'a(.*)'])
    >>> m.group(1)
    'bc'

    The optional *key* argument specifies a one-argument predicate function
    like that used for *filter()*. The *key* argument, if supplied, should be
    in keyword form. For example, finding the first even number in an iterable:

    >>> first([1, 1, 3, 4, 5], key=lambda x: x % 2 == 0)
    4

    Contributed by Hynek Schlawack, author of `the original standalone module`_.

    .. _the original standalone module: https://github.com/hynek/first
    """
    return next(filter(key, iterable), default)


def flatten_iter(iterable):
    """``flatten_iter()`` yields all the elements from *iterable* while
    collapsing any nested iterables.

    >>> nested = [[1, 2], [[3], [4, 5]]]
    >>> list(flatten_iter(nested))
    [1, 2, 3, 4, 5]
    """
    for item in iterable:
        if isinstance(item, Iterable) and not isinstance(item, (str, bytes)):
            yield from flatten_iter(item)
        else:
            yield item


def flatten(iterable):
    """``flatten()`` returns a collapsed list of all the elements from
    *iterable* while collapsing any nested iterables.

    >>> nested = [[1, 2], [[3], [4, 5]]]
    >>> flatten(nested)
    [1, 2, 3, 4, 5]
    """
    return list(flatten_iter(iterable))


def same(iterable, ref=_UNSET):
    """``same()`` returns ``True`` when all values in *iterable* are
    equal to one another, or optionally a reference value,
    *ref*. Similar to :func:`all` and :func:`any` in that it evaluates
    an iterable and returns a :class:`bool`. ``same()`` returns
    ``True`` for empty iterables.

    >>> same([])
    True
    >>> same([1])
    True
    >>> same(['a', 'a', 'a'])
    True
    >>> same(range(20))
    False
    >>> same([[], []])
    True
    >>> same([[], []], ref='test')
    False

    """
    iterator = iter(iterable)
    if ref is _UNSET:
        ref = next(iterator, ref)
    return all(val == ref for val in iterator)


def default_visit(path, key, value):
    # print('visit(%r, %r, %r)' % (path, key, value))
    return key, value


# enable the extreme: monkeypatching iterutils with a different default_visit
_orig_default_visit = default_visit


def default_enter(path, key, value):
    # print('enter(%r, %r)' % (key, value))
    if isinstance(value, (str, bytes)):
        return value, False
    elif isinstance(value, Mapping):
        return value.__class__(), ItemsView(value)
    elif isinstance(value, Sequence):
        return value.__class__(), enumerate(value)
    elif isinstance(value, Set):
        return value.__class__(), enumerate(value)
    else:
        # files, strings, other iterables, and scalars are not
        # traversed
        return value, False


def default_exit(path, key, old_parent, new_parent, new_items):
    # print('exit(%r, %r, %r, %r, %r)'
    #       % (path, key, old_parent, new_parent, new_items))
    ret = new_parent
    if isinstance(new_parent, Mapping):
        new_parent.update(new_items)
    elif isinstance(new_parent, Sequence):
        vals = [v for i, v in new_items]
        try:
            new_parent.extend(vals)
        except AttributeError:
            ret = new_parent.__class__(vals)  # tuples
    elif isinstance(new_parent, Set):
        vals = [v for i, v in new_items]
        try:
            new_parent.update(vals)
        except AttributeError:
            ret = new_parent.__class__(vals)  # frozensets
    else:
        raise RuntimeError('unexpected iterable type: %r' % type(new_parent))
    return ret


def remap(root, visit=default_visit, enter=default_enter, exit=default_exit,
          **kwargs):
    """The remap ("recursive map") function is used to traverse and
    transform nested structures. Lists, tuples, sets, and dictionaries
    are just a few of the data structures nested into heterogeneous
    tree-like structures that are so common in programming.
    Unfortunately, Python's built-in ways to manipulate collections
    are almost all flat. List comprehensions may be fast and succinct,
    but they do not recurse, making it tedious to apply quick changes
    or complex transforms to real-world data.

    remap goes where list comprehensions cannot.

    Here's an example of removing all Nones from some data:

    >>> from pprint import pprint
    >>> reviews = {'Star Trek': {'TNG': 10, 'DS9': 8.5, 'ENT': None},
    ...            'Babylon 5': 6, 'Dr. Who': None}
    >>> pprint(remap(reviews, lambda p, k, v: v is not None))
    {'Babylon 5': 6, 'Star Trek': {'DS9': 8.5, 'TNG': 10}}

    Notice how both Nones have been removed despite the nesting in the
    dictionary. Not bad for a one-liner, and that's just the beginning.
    See `this remap cookbook`_ for more delicious recipes.

    .. _this remap cookbook: http://sedimental.org/remap.html

    remap takes four main arguments: the object to traverse and three
    optional callables which determine how the remapped object will be
    created.

    Args:

        root: The target object to traverse. By default, remap
            supports iterables like :class:`list`, :class:`tuple`,
            :class:`dict`, and :class:`set`, but any object traversable by
            *enter* will work.
        visit (callable): This function is called on every item in
            *root*. It must accept three positional arguments, *path*,
            *key*, and *value*. *path* is simply a tuple of parents'
            keys. *visit* should return the new key-value pair. It may
            also return ``True`` as shorthand to keep the old item
            unmodified, or ``False`` to drop the item from the new
            structure. *visit* is called after *enter*, on the new parent.

            The *visit* function is called for every item in root,
            including duplicate items. For traversable values, it is
            called on the new parent object, after all its children
            have been visited. The default visit behavior simply
            returns the key-value pair unmodified.
        enter (callable): This function controls which items in *root*
            are traversed. It accepts the same arguments as *visit*: the
            path, the key, and the value of the current item. It returns a
            pair of the blank new parent, and an iterator over the items
            which should be visited. If ``False`` is returned instead of
            an iterator, the value will not be traversed.

            The *enter* function is only called once per unique value. The
            default enter behavior supports mappings, sequences, and
            sets. Strings and all other iterables will not be traversed.
        exit (callable): This function determines how to handle items
            once they have been visited. It gets the same three
            arguments as the other functions -- *path*, *key*, *value*
            -- plus two more: the blank new parent object returned
            from *enter*, and a list of the new items, as remapped by
            *visit*.

            Like *enter*, the *exit* function is only called once per
            unique value. The default exit behavior is to simply add
            all new items to the new parent, e.g., using
            :meth:`list.extend` and :meth:`dict.update` to add to the
            new parent. Immutable objects, such as a :class:`tuple` or
            :class:`namedtuple`, must be recreated from scratch, but
            use the same type as the new parent passed back from the
            *enter* function.
        reraise_visit (bool): A pragmatic convenience for the *visit*
            callable. When set to ``False``, remap ignores any errors
            raised by the *visit* callback. Items causing exceptions
            are kept. See examples for more details.
        trace (bool): Pass ``trace=True`` to print out the entire
            traversal. Or pass a tuple of ``'visit'``, ``'enter'``,
            or ``'exit'`` to print only the selected events.

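    As a quick illustration of the ``True`` shorthand, here is a *visit*
    callable that rounds floats and returns ``True`` to keep everything
    else as-is:

    >>> round_floats = lambda p, k, v: (k, round(v, 2)) if isinstance(v, float) else True
    >>> remap({'a': 1.23456, 'b': {'c': 2.34567}}, visit=round_floats)
    {'a': 1.23, 'b': {'c': 2.35}}
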
    remap is designed to cover the majority of cases with just the
    *visit* callable. While passing in multiple callables is very
    empowering, remap is designed so very few cases should require
    passing more than one function.

    When passing *enter* and *exit*, it's common and easiest to build
    on the default behavior. Simply add ``from boltons.iterutils import
    default_enter`` (or ``default_exit``), and have your enter/exit
    function call the default behavior before or after your custom
    logic. See `this example`_.

    Duplicate and self-referential objects (aka reference loops) are
    automatically handled internally, `as shown here`_.

    .. _this example: http://sedimental.org/remap.html#sort_all_lists
    .. _as shown here: http://sedimental.org/remap.html#corner_cases

    """
    # TODO: improve argument formatting in sphinx doc
    # TODO: enter() return (False, items) to continue traverse but cancel copy?
    if not callable(visit):
        raise TypeError('visit expected callable, not: %r' % visit)
    if not callable(enter):
        raise TypeError('enter expected callable, not: %r' % enter)
    if not callable(exit):
        raise TypeError('exit expected callable, not: %r' % exit)
    reraise_visit = kwargs.pop('reraise_visit', True)
    trace = kwargs.pop('trace', ())
    if trace is True:
        trace = ('visit', 'enter', 'exit')
    elif isinstance(trace, str):
        trace = (trace,)
    if not isinstance(trace, (tuple, list, set)):
        raise TypeError('trace expected tuple of event names, not: %r' % trace)
    trace_enter, trace_exit, trace_visit = 'enter' in trace, 'exit' in trace, 'visit' in trace

    if kwargs:
        raise TypeError('unexpected keyword arguments: %r' % kwargs.keys())

    path, registry, stack = (), {}, [(None, root)]
    new_items_stack = []
    while stack:
        key, value = stack.pop()
        id_value = id(value)
        if key is _REMAP_EXIT:
            key, new_parent, old_parent = value
            id_value = id(old_parent)
            path, new_items = new_items_stack.pop()
            if trace_exit:
                print(' .. remap exit:', path, '-', key, '-',
                      old_parent, '-', new_parent, '-', new_items)
            value = exit(path, key, old_parent, new_parent, new_items)
            if trace_exit:
                print(' .. remap exit result:', value)
            registry[id_value] = value
            if not new_items_stack:
                continue
        elif id_value in registry:
            value = registry[id_value]
        else:
            if trace_enter:
                print(' .. remap enter:', path, '-', key, '-', value)
            res = enter(path, key, value)
            if trace_enter:
                print(' .. remap enter result:', res)
            try:
                new_parent, new_items = res
            except TypeError:
                # TODO: handle False?
                raise TypeError('enter should return a tuple of (new_parent,'
                                ' items_iterator), not: %r' % res)
            if new_items is not False:
                # traverse unless False is explicitly passed
                registry[id_value] = new_parent
                new_items_stack.append((path, []))
                if value is not root:
                    path += (key,)
                stack.append((_REMAP_EXIT, (key, new_parent, value)))
                if new_items:
                    stack.extend(reversed(list(new_items)))
                if trace_enter:
                    print(' .. remap stack size now:', len(stack))
                continue
        if visit is _orig_default_visit:
            # avoid function call overhead by inlining identity operation
            visited_item = (key, value)
        else:
            try:
                if trace_visit:
                    print(' .. remap visit:', path, '-', key, '-', value)
                visited_item = visit(path, key, value)
            except Exception:
                if reraise_visit:
                    raise
                visited_item = True
            if visited_item is False:
                if trace_visit:
                    print(' .. remap visit result: <drop>')
                continue  # drop
            elif visited_item is True:
                visited_item = (key, value)
            if trace_visit:
                print(' .. remap visit result:', visited_item)
            # TODO: typecheck?
            #     raise TypeError('expected (key, value) from visit(),'
            #                     ' not: %r' % visited_item)
        try:
            new_items_stack[-1][1].append(visited_item)
        except IndexError:
            raise TypeError('expected remappable root, not: %r' % root)
    return value


class PathAccessError(KeyError, IndexError, TypeError):
    """An amalgamation of KeyError, IndexError, and TypeError,
    representing what can occur when looking up a path in a nested
    object.
    """

    def __init__(self, exc, seg, path):
        self.exc = exc
        self.seg = seg
        self.path = path

    def __repr__(self):
        cn = self.__class__.__name__
        return f'{cn}({self.exc!r}, {self.seg!r}, {self.path!r})'

    def __str__(self):
        return ('could not access %r from path %r, got error: %r'
                % (self.seg, self.path, self.exc))


def get_path(root, path, default=_UNSET):
    """Retrieve a value from a nested object via a tuple representing the
    lookup path.

    >>> root = {'a': {'b': {'c': [[1], [2], [3]]}}}
    >>> get_path(root, ('a', 'b', 'c', 2, 0))
    3

    The path tuple format is intentionally consistent with that of
    :func:`remap`, but a single dotted string can also be passed.

    One of get_path's chief aims is improved error messaging. EAFP is
    great, but the error messages are not.

    For instance, ``root['a']['b']['c'][2][1]`` gives back
    ``IndexError: list index out of range``

    What went out of range where? get_path currently raises
    ``PathAccessError: could not access 1 from path ('a', 'b', 'c', 2,
    1), got error: IndexError('list index out of range')``, a
    subclass of IndexError and KeyError.

    You can also pass a default that covers the entire operation,
    should the lookup fail at any level.
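
    For example, reusing *root* from above:

    >>> get_path(root, ('a', 'b', 'z'), default='missing')
    'missing'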

    Args:
        root: The target nesting of dictionaries, lists, or other
            objects supporting ``__getitem__``.
        path (tuple): A sequence of strings and integers to be successively
            looked up within *root*. A dot-separated (``a.b``) string may
            also be passed.
        default: The value to be returned should any
            ``PathAccessError`` exceptions be raised.
    """
    if isinstance(path, str):
        path = path.split('.')
    cur = root
    try:
        for seg in path:
            try:
                cur = cur[seg]
            except (KeyError, IndexError) as exc:
                raise PathAccessError(exc, seg, path)
            except TypeError as exc:
                # either string index in a list, or a parent that
                # doesn't support indexing
                try:
                    seg = int(seg)
                    cur = cur[seg]
                except (ValueError, KeyError, IndexError, TypeError):
                    if not is_iterable(cur):
                        exc = TypeError('%r object is not indexable'
                                        % type(cur).__name__)
                    raise PathAccessError(exc, seg, path)
    except PathAccessError:
        if default is _UNSET:
            raise
        return default
    return cur


def research(root, query=lambda p, k, v: True, reraise=False, enter=default_enter):
    """The :func:`research` function uses :func:`remap` to recurse over
    any data nested in *root*, and find values which match a given
    criterion, specified by the *query* callable.

    Results are returned as a list of ``(path, value)`` pairs. The
    paths are tuples in the same format accepted by
    :func:`get_path`. This can be useful for comparing values nested
    in two or more different structures.

    Here's a simple example that finds all integers:

    >>> root = {'a': {'b': 1, 'c': (2, 'd', 3)}, 'e': None}
    >>> res = research(root, query=lambda p, k, v: isinstance(v, int))
    >>> print(sorted(res))
    [(('a', 'b'), 1), (('a', 'c', 0), 2), (('a', 'c', 2), 3)]

    Note how *query* follows the same, familiar ``path, key, value``
    signature as the ``visit`` and ``enter`` functions on
    :func:`remap`, and returns a :class:`bool`.

    Args:
        root: The target object to search. Supports the same types of
            objects as :func:`remap`, including :class:`list`,
            :class:`tuple`, :class:`dict`, and :class:`set`.
        query (callable): The function called on every object to
            determine whether to include it in the search results. The
            callable must accept three arguments, *path*, *key*, and
            *value*, commonly abbreviated *p*, *k*, and *v*, same as
            *enter* and *visit* from :func:`remap`.
        reraise (bool): Whether to reraise exceptions raised by *query*
            or to simply drop the result that caused the error.


    With :func:`research` it's easy to inspect the details of a data
    structure, like finding values that are at a certain depth (using
    ``len(p)``) and much more. If more advanced functionality is
    needed, check out the code and make your own :func:`remap`
    wrapper, and consider `submitting a patch`_!
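
    For instance, a depth-based query, reusing *root* from the example above:

    >>> sorted(research(root, query=lambda p, k, v: len(p) == 1))
    [(('a', 'b'), 1), (('a', 'c'), (2, 'd', 3))]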

    .. _submitting a patch: https://github.com/mahmoud/boltons/pulls
    """
    ret = []

    if not callable(query):
        raise TypeError('query expected callable, not: %r' % query)

    def _enter(path, key, value):
        try:
            if query(path, key, value):
                ret.append((path + (key,), value))
        except Exception:
            if reraise:
                raise
        return enter(path, key, value)

    remap(root, enter=_enter)
    return ret


# TODO: recollect()
# TODO: refilter()
# TODO: reiter()


# GUID iterators: 10x faster and somewhat more compact than uuid.

class GUIDerator:
    """The GUIDerator is an iterator that yields a globally-unique
    identifier (GUID) on every iteration. The GUIDs produced are
    hexadecimal strings.

    Testing shows it to be around 12x faster than the uuid module. By
    default it is also more compact, partly due to its default 96-bit
    (24-hexdigit) length. 96 bits of randomness means that there is a
    1 in 2 ^ 32 chance of collision after 2 ^ 64 iterations. If more
    or less uniqueness is desired, the *size* argument can be adjusted
    accordingly.

    Args:
        size (int): character length of the GUID, defaults to 24. Lengths
            between 20 and 36 are considered valid.

    The GUIDerator has built-in fork protection that causes it to
    detect a fork on next iteration and reseed accordingly.
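
    A small usage example; values vary from run to run, but the default
    length is fixed:

    >>> len(next(GUIDerator()))
    24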

    """

    def __init__(self, size=24):
        self.size = size
        if size < 20 or size > 36:
            raise ValueError('expected 20 <= size <= 36')
        import hashlib
        self._sha1 = hashlib.sha1
        self.count = itertools.count()
        self.reseed()

    def reseed(self):
        import socket
        self.pid = os.getpid()
        self.salt = '-'.join([str(self.pid),
                              socket.gethostname() or '<nohostname>',
                              str(time.time()),
                              os.urandom(6).hex()])
        return

    def __iter__(self):
        return self

    def __next__(self):
        if os.getpid() != self.pid:
            self.reseed()
        target_bytes = (self.salt + str(next(self.count))).encode('utf8')
        hash_text = self._sha1(target_bytes).hexdigest()[:self.size]
        return hash_text

    next = __next__


class SequentialGUIDerator(GUIDerator):
    """Much like the standard GUIDerator, the SequentialGUIDerator is an
    iterator that yields a globally-unique identifier (GUID) on every
    iteration. The GUIDs produced are hexadecimal strings.

    The SequentialGUIDerator differs in that it picks a starting GUID
    value and increments every iteration. This yields GUIDs which are
    of course unique, but also ordered and lexicographically sortable.

    The SequentialGUIDerator is around 50% faster than the normal
    GUIDerator, making it almost 20x as fast as the built-in uuid
    module. By default it is also more compact, partly due to its
    96-bit (24-hexdigit) default length. 96 bits of randomness means that
    there is a 1 in 2 ^ 32 chance of collision after 2 ^ 64
    iterations. If more or less uniqueness is desired, the *size*
    argument can be adjusted accordingly.

    Args:
        size (int): character length of the GUID, defaults to 24.

    Note that with SequentialGUIDerator there is a chance of GUIDs
    growing larger than the size configured. The SequentialGUIDerator
    has built-in fork protection that causes it to detect a fork on
    next iteration and reseed accordingly.

    """

    def reseed(self):
        super().reseed()
        start_str = self._sha1(self.salt.encode('utf8')).hexdigest()
        self.start = int(start_str[:self.size], 16)
        self.start |= (1 << ((self.size * 4) - 2))

    def __next__(self):
        if os.getpid() != self.pid:
            self.reseed()
        return '%x' % (next(self.count) + self.start)

    next = __next__


guid_iter = GUIDerator()
seq_guid_iter = SequentialGUIDerator()


def soft_sorted(iterable, first=None, last=None, key=None, reverse=False):
    """For when you care about the order of some elements, but not about
    others.

    Use this to float to the top and/or sink to the bottom a specific
    ordering, while sorting the rest of the elements according to
    normal :func:`sorted` rules.

    >>> soft_sorted(['two', 'b', 'one', 'a'], first=['one', 'two'])
    ['one', 'two', 'a', 'b']
    >>> soft_sorted(range(7), first=[6, 15], last=[2, 4], reverse=True)
    [6, 5, 3, 1, 0, 2, 4]
    >>> import string
    >>> ''.join(soft_sorted(string.hexdigits, first='za1', last='b', key=str.lower))
    'aA1023456789cCdDeEfFbB'

    Args:
        iterable (list): A list or other iterable to sort.
        first (list): A sequence to enforce for elements which should
            appear at the beginning of the returned list.
        last (list): A sequence to enforce for elements which should
            appear at the end of the returned list.
        key (callable): Callable used to generate a comparable key for
            each item to be sorted, same as the key in
            :func:`sorted`. Note that entries in *first* and *last*
            should be the keys for the items. Defaults to
            passthrough/the identity function.
        reverse (bool): Whether or not elements not explicitly ordered
            by *first* and *last* should be in reverse order or not.

    Returns a new list in sorted order.
    """
    first = first or []
    last = last or []
    key = key or (lambda x: x)
    seq = list(iterable)
    other = [x for x in seq if not (
        (first and key(x) in first) or (last and key(x) in last))]
    other.sort(key=key, reverse=reverse)

    if first:
        first = sorted([x for x in seq if key(x) in first],
                       key=lambda x: first.index(key(x)))
    if last:
        last = sorted([x for x in seq if key(x) in last],
                      key=lambda x: last.index(key(x)))
    return first + other + last


def untyped_sorted(iterable, key=None, reverse=False):
    """A version of :func:`sorted` which will happily sort an iterable of
    heterogeneous types and return a new list, similar to legacy Python's
    behavior.

    >>> untyped_sorted(['abc', 2.0, 1, 2, 'def'])
    [1, 2.0, 2, 'abc', 'def']

    Note how mutually orderable types are sorted as expected, as in
    the case of the integers and floats above.

    .. note::

        Results may vary across Python versions and builds, but the
        function will produce a sorted list, except in the case of
        explicitly unorderable objects.

    """
    class _Wrapper:
        __slots__ = ('obj',)

        def __init__(self, obj):
            self.obj = obj

        def __lt__(self, other):
            obj = key(self.obj) if key is not None else self.obj
            other = key(other.obj) if key is not None else other.obj
            try:
                ret = obj < other
            except TypeError:
                ret = ((type(obj).__name__, id(type(obj)), obj)
                       < (type(other).__name__, id(type(other)), other))
            return ret

    if key is not None and not callable(key):
        raise TypeError('expected function or callable object for key, not: %r'
                        % key)

    return sorted(iterable, key=_Wrapper, reverse=reverse)


"""
May actually be faster to do an isinstance check for a str path

$ python -m timeit -s "x = [1]" "x[0]"
10000000 loops, best of 3: 0.0207 usec per loop
$ python -m timeit -s "x = [1]" "try: x[0] \nexcept: pass"
10000000 loops, best of 3: 0.029 usec per loop
$ python -m timeit -s "x = [1]" "try: x[1] \nexcept: pass"
1000000 loops, best of 3: 0.315 usec per loop
# setting up try/except is fast, only around 0.01us
# actually triggering the exception takes almost 10x as long

$ python -m timeit -s "x = [1]" "isinstance(x, basestring)"
10000000 loops, best of 3: 0.141 usec per loop
$ python -m timeit -s "x = [1]" "isinstance(x, str)"
10000000 loops, best of 3: 0.131 usec per loop
$ python -m timeit -s "x = [1]" "try: x.split('.')\n except: pass"
1000000 loops, best of 3: 0.443 usec per loop
$ python -m timeit -s "x = [1]" "try: x.split('.') \nexcept AttributeError: pass"
1000000 loops, best of 3: 0.544 usec per loop
"""