1"""
2Fast cryptographic hash of Python objects, with a special case for fast
3hashing of numpy arrays.
4"""
6# Author: Gael Varoquaux <gael dot varoquaux at normalesup dot org>
7# Copyright (c) 2009 Gael Varoquaux
8# License: BSD Style, 3 clauses.
10import decimal
11import hashlib
12import io
13import pickle
14import struct
15import sys
16import types
18Pickler = pickle._Pickler

class _ConsistentSet(object):
    """Class used to ensure the hash of sets is preserved
    whatever the order of their items.
    """

    def __init__(self, set_sequence):
        # Force the order of elements in the set to ensure a consistent
        # hash.
        try:
            # Try first to sort the set, assuming the type of the elements
            # is consistent and orderable. This fails on Python 3 when
            # elements are unorderable, but we keep it in a try block as
            # it is faster.
            self._sequence = sorted(set_sequence)
        except (TypeError, decimal.InvalidOperation):
            # If elements are unorderable, sort them by their hash. This
            # is slower but works in any case.
            self._sequence = sorted(hash(e) for e in set_sequence)
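    # Illustrative note (not part of the original source): under Python 3,
    # sorted() raises TypeError on a mixed-type set such as {1, "a"}, so
    # the fallback above orders the elements by hash() instead, e.g.
    #     _ConsistentSet({1, "a"})._sequence == sorted([hash(1), hash("a")])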

class _MyHash(object):
    """Class used to hash objects that won't normally pickle."""

    def __init__(self, *args):
        self.args = args

class Hasher(Pickler):
    """A subclass of pickler that hashes, rather than pickles. It is used
    to produce a unique hash of the given Python object; the hash is not
    necessarily cryptographically secure.
    """

    def __init__(self, hash_name="md5"):
        self.stream = io.BytesIO()
        # By default we want a pickle protocol that only changes with
        # the major Python version, not the minor one.
        protocol = 3
        Pickler.__init__(self, self.stream, protocol=protocol)
        # Initialise the hash object.
        self._hash = hashlib.new(hash_name, usedforsecurity=False)

    def hash(self, obj, return_digest=True):
        try:
            self.dump(obj)
        except pickle.PicklingError as e:
            e.args += ("PicklingError while hashing %r: %r" % (obj, e),)
            raise
        dumps = self.stream.getvalue()
        self._hash.update(dumps)
        if return_digest:
            return self._hash.hexdigest()
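    # Illustrative usage sketch (not part of the original source): the
    # object is pickled into the in-memory stream and the resulting bytes
    # are fed to the digest, e.g.
    #     Hasher(hash_name="md5").hash([1, 2, 3])
    # returns a 32-character hexadecimal md5 digest. A fresh Hasher is
    # needed per call, since the stream accumulates across dumps.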

    def save(self, obj):
        if isinstance(obj, (types.MethodType, type({}.pop))):
            # The Pickler cannot pickle instance methods; here we
            # decompose them into components that make them uniquely
            # identifiable.
            if hasattr(obj, "__func__"):
                func_name = obj.__func__.__name__
            else:
                func_name = obj.__name__
            inst = obj.__self__
            if type(inst) is type(pickle):
                obj = _MyHash(func_name, inst.__name__)
            elif inst is None:
                # type(None) or type(module) do not pickle.
                obj = _MyHash(func_name, inst)
            else:
                cls = obj.__self__.__class__
                obj = _MyHash(func_name, inst, cls)
        Pickler.save(self, obj)
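    # Illustrative note (not part of the original source): a bound builtin
    # method such as {}.pop has no __func__, so it is replaced by the
    # picklable surrogate _MyHash("pop", {}, dict); two methods therefore
    # hash alike only when name, instance and class all match.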

    def memoize(self, obj):
        # We want hashing to be sensitive to value instead of reference.
        # For example we want ['aa', 'aa'] and ['aa', 'aaZ'[:2]] to hash
        # to the same value, and that's why we disable memoization for
        # strings.
        if isinstance(obj, (bytes, str)):
            return
        Pickler.memoize(self, obj)
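    # Illustrative note (not part of the original source): without the
    # early return, the pickler would emit a memo reference for the second
    # 'aa' in ['aa', 'aa'] but not in ['aa', 'aaZ'[:2]] (a distinct string
    # object), so the two lists would produce different pickle bytes and
    # hence different hashes.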

    # The dispatch table of the pickler is not accessible in Python 3;
    # as these lines are only bugware for IPython, we skip them.
    def save_global(self, obj, name=None, pack=struct.pack):
        # We have to override this method in order to deal with objects
        # defined interactively in IPython that are not injected in
        # __main__.
        kwargs = dict(name=name, pack=pack)
        del kwargs["pack"]
        try:
            Pickler.save_global(self, obj, **kwargs)
        except pickle.PicklingError:
            Pickler.save_global(self, obj, **kwargs)
            module = getattr(obj, "__module__", None)
            if module == "__main__":
                my_name = name
                if my_name is None:
                    my_name = obj.__name__
                mod = sys.modules[module]
                if not hasattr(mod, my_name):
                    # IPython doesn't inject the variables defined
                    # interactively in __main__.
                    setattr(mod, my_name, obj)

    dispatch = Pickler.dispatch.copy()
    # builtin
    dispatch[type(len)] = save_global
    # type
    dispatch[type(object)] = save_global
    # classobj
    dispatch[type(Pickler)] = save_global
    # function
    dispatch[type(pickle.dump)] = save_global

    # We use *args in the _batch_setitems signature because
    # _batch_setitems takes an additional 'obj' argument in Python 3.14.
    def _batch_setitems(self, items, *args):
        # Force the order of keys in the dict to ensure a consistent hash.
        try:
            # Try first to sort the items, assuming the type of the keys
            # is consistent and orderable. This fails on Python 3 when
            # keys are unorderable, but we keep it in a try block as it
            # is faster.
            Pickler._batch_setitems(self, iter(sorted(items)), *args)
        except TypeError:
            # If keys are unorderable, sort them by their hash. This is
            # slower but works in any case.
            Pickler._batch_setitems(
                self, iter(sorted((hash(k), v) for k, v in items)), *args
            )
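    # Illustrative note (not part of the original source): sorting the
    # items makes the hash insensitive to insertion order, e.g.
    #     Hasher().hash({"a": 1, "b": 2}) == Hasher().hash({"b": 2, "a": 1})
    # while a dict with unorderable keys such as {1: "x", "y": 2} falls
    # back to ordering its items by hash(key).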

    def save_set(self, set_items):
        # Force the order of items in the set to ensure a consistent hash.
        Pickler.save(self, _ConsistentSet(set_items))

    dispatch[type(set())] = save_set
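    # Illustrative note (not part of the original source): because set
    # items are sorted before pickling, the hash does not depend on the
    # set's internal iteration order, which for string elements varies
    # across interpreter runs under hash randomization (PYTHONHASHSEED).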

class NumpyHasher(Hasher):
    """Special case the hasher for when numpy is loaded."""

    def __init__(self, hash_name="md5", coerce_mmap=False):
        """
        Parameters
        ----------
        hash_name: string
            The hash algorithm to be used.
        coerce_mmap: boolean
            When True, np.memmap and np.ndarray objects are hashed
            identically.
        """
        self.coerce_mmap = coerce_mmap
        Hasher.__init__(self, hash_name=hash_name)
        # Delayed import of numpy, to avoid tight coupling.
        import numpy as np

        self.np = np
        if hasattr(np, "getbuffer"):
            self._getbuffer = np.getbuffer
        else:
            self._getbuffer = memoryview

    def save(self, obj):
        """Subclass the save method, to hash ndarray subclasses rather
        than pickling them. Of course, this is a total abuse of the
        Pickler class.
        """
        if isinstance(obj, self.np.ndarray) and not obj.dtype.hasobject:
            # Compute a hash of the object.
            # The update function of the hash requires a C-contiguous
            # buffer.
            if obj.shape == ():
                # 0d arrays need to be flattened because viewing them as
                # bytes raises a ValueError exception.
                obj_c_contiguous = obj.flatten()
            elif obj.flags.c_contiguous:
                obj_c_contiguous = obj
            elif obj.flags.f_contiguous:
                obj_c_contiguous = obj.T
            else:
                # Cater for non-single-segment arrays: this creates a
                # copy, and thus alleviates this issue.
                # XXX: There might be a more efficient way of doing this.
                obj_c_contiguous = obj.flatten()

            # memoryview is not supported for some dtypes, e.g.
            # datetime64; see https://github.com/numpy/numpy/issues/4983.
            # The workaround is to view the array as bytes before taking
            # the memoryview.
            self._hash.update(
                self._getbuffer(obj_c_contiguous.view(self.np.uint8))
            )
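
            # Illustrative note (not part of the original source): a
            # Fortran-ordered array is C-contiguous once transposed, e.g.
            #     a = np.asfortranarray(np.arange(6).reshape(2, 3))
            #     a.T.flags.c_contiguous   # True
            # so the f_contiguous branch above avoids a copy.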

            # We store the class, to be able to distinguish between
            # objects with the same binary content but different classes.
            if self.coerce_mmap and isinstance(obj, self.np.memmap):
                # We don't make the difference between memmap and normal
                # ndarrays, to be able to reload previously computed
                # results with memmap.
                klass = self.np.ndarray
            else:
                klass = obj.__class__
            # We also include the dtype and the shape, to distinguish
            # different views on the same data with different dtypes.

            # This tuple is then pickled by the pickler and folded into
            # the hash at the end.
            obj = (klass, ("HASHED", obj.dtype, obj.shape, obj.strides))
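            # Illustrative note (not part of the original source): since
            # dtype and shape enter the hashed tuple, two views of the
            # same buffer hash differently, e.g. for
            #     x = np.arange(4, dtype=np.int32)
            # NumpyHasher().hash(x) differs from
            # NumpyHasher().hash(x.view(np.uint8)).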
        elif isinstance(obj, self.np.dtype):
            # Consistent hashing of numpy.dtype objects is tricky to get
            # right. This comes from the fact that atomic np.dtype
            # objects are interned: ``np.dtype('f4') is np.dtype('f4')``.
            # The situation is complicated by the fact that this
            # interning does not survive a simple pickle dump/load
            # roundtrip:
            # ``pickle.loads(pickle.dumps(np.dtype('f4'))) is not
            # np.dtype('f4')``. Because pickle relies on memoization
            # during pickling, it is easy to produce different hashes
            # for seemingly identical objects, such as
            # ``[np.dtype('f4'), np.dtype('f4')]``
            # and ``[np.dtype('f4'), pickle.loads(pickle.dumps('f4'))]``.
            # To prevent memoization from interfering with hashing, we
            # isolate the serialization (and thus the pickle memoization)
            # of each dtype by using a separate ``pickle.dumps`` call,
            # unrelated to the current Hasher instance, each time.
            self._hash.update("_HASHED_DTYPE".encode("utf-8"))
            self._hash.update(pickle.dumps(obj))
            return
        Hasher.save(self, obj)
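
# Illustrative note (not part of the original source): the isolated
# pickle.dumps() call gives every dtype an identical byte stream, so e.g.
#     import numpy as np
#     hash([np.dtype('f4'), np.dtype('f4')])
# equals
#     hash([np.dtype('f4'), pickle.loads(pickle.dumps(np.dtype('f4')))])
# even though the second list defeats dtype interning.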

def hash(obj, hash_name="md5", coerce_mmap=False):
    """Quick calculation of a hash to uniquely identify Python objects
    containing numpy arrays.

    Parameters
    ----------
    hash_name: 'md5' or 'sha1'
        Hashing algorithm used. sha1 is supposedly safer, but md5 is
        faster.
    coerce_mmap: boolean
        Make no difference between np.memmap and np.ndarray.
    """
    valid_hash_names = ("md5", "sha1")
    if hash_name not in valid_hash_names:
        raise ValueError(
            "Valid options for 'hash_name' are {}. Got hash_name={!r} instead.".format(
                valid_hash_names, hash_name
            )
        )
    if "numpy" in sys.modules:
        hasher = NumpyHasher(hash_name=hash_name, coerce_mmap=coerce_mmap)
    else:
        hasher = Hasher(hash_name=hash_name)
    return hasher.hash(obj)
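
# Illustrative usage sketch (not part of the original source):
#     from joblib.hashing import hash
#     digest = hash({"weights": [0.1, 0.2], "name": "model"})
#     # digest is a 32-character md5 hex string; equal values produce
#     # the same digest, regardless of object identity.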