1"""
2Fast cryptographic hash of Python objects, with a special case for fast
3hashing of numpy arrays.
4"""

# Author: Gael Varoquaux <gael dot varoquaux at normalesup dot org>
# Copyright (c) 2009 Gael Varoquaux
# License: BSD Style, 3 clauses.

import pickle
import hashlib
import sys
import types
import struct
import io
import decimal

Pickler = pickle._Pickler


class _ConsistentSet(object):
    """ Class used to ensure the hash of sets is preserved
        whatever the order of their items.
    """
    def __init__(self, set_sequence):
        # Force the order of elements in the set to ensure a consistent hash.
        try:
            # Try first to sort the elements, assuming their type is
            # consistent and orderable.
            # This fails on Python 3 when elements are unorderable,
            # but we keep it in a try block as it is faster.
            self._sequence = sorted(set_sequence)
        except (TypeError, decimal.InvalidOperation):
            # If elements are unorderable, sort them by their hash.
            # This is slower but works in any case.
            self._sequence = sorted(hash(e) for e in set_sequence)
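

# A sketch of what _ConsistentSet buys us (illustrative doctest, not part of
# the joblib API): because sets are wrapped in _ConsistentSet before being
# pickled (see Hasher.save_set below), the digest of a set does not depend
# on the order in which its items were inserted:
#
#     >>> hash({'a', 'b', 'c'}) == hash({'c', 'b', 'a'})  # module-level hash
#     True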


class _MyHash(object):
    """ Class used to hash objects that won't normally pickle. """

    def __init__(self, *args):
        self.args = args


class Hasher(Pickler):
    """ A subclass of pickler used for cryptographic hashing, rather than
        pickling.
    """

    def __init__(self, hash_name='md5'):
        self.stream = io.BytesIO()
        # By default we want a pickle protocol that only changes with
        # the major Python version, not the minor one.
        protocol = 3
        Pickler.__init__(self, self.stream, protocol=protocol)
        # Initialise the hash object.
        self._hash = hashlib.new(hash_name)

    def hash(self, obj, return_digest=True):
        try:
            self.dump(obj)
        except pickle.PicklingError as e:
            e.args += ('PicklingError while hashing %r: %r' % (obj, e),)
            raise
        dumps = self.stream.getvalue()
        self._hash.update(dumps)
        if return_digest:
            return self._hash.hexdigest()
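
    # Usage sketch (illustrative doctest): a Hasher is meant to hash a single
    # object, because the underlying BytesIO stream accumulates every dump;
    # use a fresh instance per object:
    #
    #     >>> Hasher().hash([1, 2, 3]) == Hasher().hash([1, 2, 3])
    #     True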

    def save(self, obj):
        if isinstance(obj, (types.MethodType, type({}.pop))):
            # The Pickler cannot pickle instance methods; here we decompose
            # them into components that make them uniquely identifiable.
            if hasattr(obj, '__func__'):
                func_name = obj.__func__.__name__
            else:
                func_name = obj.__name__
            inst = obj.__self__
            if type(inst) is type(pickle):
                obj = _MyHash(func_name, inst.__name__)
            elif inst is None:
                # type(None) or type(module) do not pickle
                obj = _MyHash(func_name, inst)
            else:
                cls = obj.__self__.__class__
                obj = _MyHash(func_name, inst, cls)
        Pickler.save(self, obj)

    def memoize(self, obj):
        # We want hashing to be sensitive to value instead of reference.
        # For example, we want ['aa', 'aa'] and ['aa', 'aaZ'[:2]]
        # to hash to the same value; that is why we disable memoization
        # for strings.
        if isinstance(obj, (bytes, str)):
            return
        Pickler.memoize(self, obj)
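
    # What disabling string memoization buys (illustrative doctest):
    # 'aaZ'[:2] builds a string equal to 'aa' but not guaranteed to be the
    # same object; with memoization enabled, the second occurrence of a
    # repeated string object would pickle as a memo reference, so the two
    # equal-valued lists below could hash differently:
    #
    #     >>> Hasher().hash(['aa', 'aa']) == Hasher().hash(['aa', 'aaZ'[:2]])
    #     True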

    # The dispatch table of the pickler is not accessible in Python 3;
    # as these lines are only bugware for IPython, we skip them.
    def save_global(self, obj, name=None, pack=struct.pack):
        # We have to override this method in order to deal with objects
        # defined interactively in IPython that are not injected in
        # __main__.
        kwargs = dict(name=name, pack=pack)
        del kwargs['pack']
        try:
            Pickler.save_global(self, obj, **kwargs)
        except pickle.PicklingError:
            Pickler.save_global(self, obj, **kwargs)
            module = getattr(obj, "__module__", None)
            if module == '__main__':
                my_name = name
                if my_name is None:
                    my_name = obj.__name__
                mod = sys.modules[module]
                if not hasattr(mod, my_name):
                    # IPython doesn't inject variables defined
                    # interactively into __main__.
                    setattr(mod, my_name, obj)
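
    # Hypothetical sketch of the IPython case this targets (only meaningful
    # inside an interactive session, hence skipped): a class defined at the
    # prompt may not be reachable as an attribute of sys.modules['__main__'],
    # and the setattr above injects it there so that later pickling by
    # reference can find it:
    #
    #     >>> class DefinedAtThePrompt:       # doctest: +SKIP
    #     ...     pass
    #     >>> hash(DefinedAtThePrompt)        # doctest: +SKIP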

    dispatch = Pickler.dispatch.copy()
    # builtin
    dispatch[type(len)] = save_global
    # type
    dispatch[type(object)] = save_global
    # classobj
    dispatch[type(Pickler)] = save_global
    # function
    dispatch[type(pickle.dump)] = save_global

    def _batch_setitems(self, items):
        # Force the order of keys in the dict to ensure a consistent hash.
        try:
            # Try first to sort the items, assuming the type of the keys is
            # consistent and orderable.
            # This fails on Python 3 when keys are unorderable,
            # but we keep it in a try block as it is faster.
            Pickler._batch_setitems(self, iter(sorted(items)))
        except TypeError:
            # If keys are unorderable, sort them by their hash. This is
            # slower but works in any case.
            Pickler._batch_setitems(self, iter(sorted((hash(k), v)
                                                      for k, v in items)))
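
    # Effect of sorting dict items (illustrative doctest): two dicts with the
    # same items but different insertion orders serialise identically here,
    # even though plain pickle would emit them in insertion order:
    #
    #     >>> Hasher().hash({'a': 1, 'b': 2}) == Hasher().hash({'b': 2, 'a': 1})
    #     True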

    def save_set(self, set_items):
        # Force the order of items in the set to ensure a consistent hash.
        Pickler.save(self, _ConsistentSet(set_items))

    dispatch[type(set())] = save_set


class NumpyHasher(Hasher):
    """ Special case the hasher for when numpy is loaded.
    """

    def __init__(self, hash_name='md5', coerce_mmap=False):
        """
            Parameters
            ----------
            hash_name: string
                The hash algorithm to be used
            coerce_mmap: boolean
                Make no difference between np.memmap and np.ndarray
                objects.
        """
        self.coerce_mmap = coerce_mmap
        Hasher.__init__(self, hash_name=hash_name)
        # Delayed import of numpy, to avoid tight coupling.
        import numpy as np
        self.np = np
        if hasattr(np, 'getbuffer'):
            self._getbuffer = np.getbuffer
        else:
            self._getbuffer = memoryview

    def save(self, obj):
        """ Subclass the save method, to hash ndarray subclasses rather
            than pickling them. Of course, this is a total abuse of
            the Pickler class.
        """
        if isinstance(obj, self.np.ndarray) and not obj.dtype.hasobject:
            # Compute a hash of the object.
            # The update function of the hash requires a c_contiguous buffer.
            if obj.shape == ():
                # 0d arrays need to be flattened because viewing them as bytes
                # raises a ValueError exception.
                obj_c_contiguous = obj.flatten()
            elif obj.flags.c_contiguous:
                obj_c_contiguous = obj
            elif obj.flags.f_contiguous:
                obj_c_contiguous = obj.T
            else:
                # Cater for non-single-segment arrays: this creates a
                # copy, and thus alleviates this issue.
                # XXX: There might be a more efficient way of doing this.
                obj_c_contiguous = obj.flatten()

            # memoryview is not supported for some dtypes, e.g. datetime64,
            # see https://github.com/numpy/numpy/issues/4983. The workaround
            # is to view the array as bytes before taking the memoryview.
            self._hash.update(
                self._getbuffer(obj_c_contiguous.view(self.np.uint8)))

            # We store the class, to be able to distinguish between
            # objects with the same binary content but different classes.
            if self.coerce_mmap and isinstance(obj, self.np.memmap):
                # We don't make the difference between memmap and
                # normal ndarrays, to be able to reload previously
                # computed results with memmap.
                klass = self.np.ndarray
            else:
                klass = obj.__class__
            # We also store the dtype and the shape, to distinguish
            # different views on the same data with different dtypes.

            # The object will be pickled by the pickler hashed at the end.
            obj = (klass, ('HASHED', obj.dtype, obj.shape, obj.strides))
        elif isinstance(obj, self.np.dtype):
            # Consistent hashing of numpy.dtype is tricky to get right. This
            # comes from the fact that atomic np.dtype objects are interned:
            # ``np.dtype('f4') is np.dtype('f4')``. The situation is
            # complicated by the fact that this interning does not survive a
            # simple pickle.dumps/loads roundtrip:
            # ``pickle.loads(pickle.dumps(np.dtype('f4'))) is not
            # np.dtype('f4')``. Because pickle relies on memoization during
            # pickling, it is easy to produce different hashes for seemingly
            # identical objects, such as ``[np.dtype('f4'), np.dtype('f4')]``
            # and ``[np.dtype('f4'), pickle.loads(pickle.dumps('f4'))]``.
            # To prevent memoization from interfering with hashing, we isolate
            # the serialization (and thus the pickle memoization) of each
            # dtype by using a fresh ``pickle.dumps`` call, unrelated to the
            # current Hasher instance, each time.
            self._hash.update("_HASHED_DTYPE".encode('utf-8'))
            self._hash.update(pickle.dumps(obj))
            return
        Hasher.save(self, obj)
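

# A sketch of the sensitivity this gives us (illustrative doctest; assumes
# numpy is importable, so that hash() below picks NumpyHasher). The digest
# covers the raw bytes plus (class, dtype, shape, strides), so views that
# reinterpret the same buffer hash differently:
#
#     >>> import numpy as np
#     >>> a = np.arange(6, dtype=np.int64)
#     >>> hash(a) == hash(a.copy())            # same class/dtype/shape/bytes
#     True
#     >>> hash(a) == hash(a.reshape(2, 3))     # same bytes, different shape
#     False
#     >>> hash(a) == hash(a.view(np.uint64))   # same bytes, different dtype
#     False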


def hash(obj, hash_name='md5', coerce_mmap=False):
    """ Quick calculation of a hash to uniquely identify Python objects
        containing numpy arrays.

        Parameters
        ----------
        hash_name: 'md5' or 'sha1'
            Hashing algorithm used. sha1 is supposedly safer, but md5 is
            faster.
        coerce_mmap: boolean
            Make no difference between np.memmap and np.ndarray
    """
    valid_hash_names = ('md5', 'sha1')
    if hash_name not in valid_hash_names:
        raise ValueError("Valid options for 'hash_name' are {}. "
                         "Got hash_name={!r} instead."
                         .format(valid_hash_names, hash_name))
    if 'numpy' in sys.modules:
        hasher = NumpyHasher(hash_name=hash_name, coerce_mmap=coerce_mmap)
    else:
        hasher = Hasher(hash_name=hash_name)
    return hasher.hash(obj)
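

# Usage sketch (illustrative doctests):
#
#     >>> hash([1, {'a': (2, 3)}]) == hash([1, {'a': (2, 3)}])
#     True
#     >>> hash(b'abc', hash_name='sha1') != hash(b'abc')  # different digests
#     True
#     >>> hash(b'abc', hash_name='crc32')
#     Traceback (most recent call last):
#         ...
#     ValueError: Valid options for 'hash_name' are ('md5', 'sha1'). Got hash_name='crc32' instead.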