Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/joblib/hashing.py: 27%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

115 statements  

1""" 

2Fast cryptographic hash of Python objects, with a special case for fast 

3hashing of numpy arrays. 

4""" 

5 

6# Author: Gael Varoquaux <gael dot varoquaux at normalesup dot org> 

7# Copyright (c) 2009 Gael Varoquaux 

8# License: BSD Style, 3 clauses. 

9 

10import decimal 

11import hashlib 

12import io 

13import pickle 

14import struct 

15import sys 

16import types 

17 

18Pickler = pickle._Pickler 

19 

20 

21class _ConsistentSet(object): 

22 """Class used to ensure the hash of Sets is preserved 

23 whatever the order of its items. 

24 """ 

25 

26 def __init__(self, set_sequence): 

27 # Forces order of elements in set to ensure consistent hash. 

28 try: 

29 # Trying first to order the set assuming the type of elements is 

30 # consistent and orderable. 

31 # This fails on python 3 when elements are unorderable 

32 # but we keep it in a try as it's faster. 

33 self._sequence = sorted(set_sequence) 

34 except (TypeError, decimal.InvalidOperation): 

35 # If elements are unorderable, sorting them using their hash. 

36 # This is slower but works in any case. 

37 self._sequence = sorted((hash(e) for e in set_sequence)) 

38 

39 

40class _MyHash(object): 

41 """Class used to hash objects that won't normally pickle""" 

42 

43 def __init__(self, *args): 

44 self.args = args 

45 

46 

class Hasher(Pickler):
    """A subclass of pickler, to do cryptographic hashing, rather than
    pickling. This is used to produce a unique hash of the given
    Python object that is not necessarily cryptographically secure.
    """

    def __init__(self, hash_name="md5"):
        # All pickled bytes accumulate here; hash() below folds them
        # into the digest.
        self.stream = io.BytesIO()
        # By default we want a pickle protocol that only changes with
        # the major python version and not the minor one
        protocol = 3
        Pickler.__init__(self, self.stream, protocol=protocol)
        # Initialise the hash obj
        self._hash = hashlib.new(hash_name, usedforsecurity=False)

    def hash(self, obj, return_digest=True):
        """Pickle `obj` into the internal stream, feed the resulting
        bytes to the digest, and return the hex digest (unless
        `return_digest` is False, in which case None is returned)."""
        try:
            self.dump(obj)
        except pickle.PicklingError as e:
            # Enrich the error with the object that failed to hash
            # before re-raising it unchanged in type.
            e.args += ("PicklingError while hashing %r: %r" % (obj, e),)
            raise
        dumps = self.stream.getvalue()
        self._hash.update(dumps)
        if return_digest:
            return self._hash.hexdigest()

    def save(self, obj):
        """Intercept bound and builtin methods, which the stock pickler
        cannot serialize, and substitute a picklable _MyHash surrogate
        built from their identifying components."""
        if isinstance(obj, (types.MethodType, type({}.pop))):
            # the Pickler cannot pickle instance methods; here we decompose
            # them into components that make them uniquely identifiable
            if hasattr(obj, "__func__"):
                func_name = obj.__func__.__name__
            else:
                func_name = obj.__name__
            inst = obj.__self__
            if type(inst) is type(pickle):
                # Method bound to a module: identify it by module name.
                obj = _MyHash(func_name, inst.__name__)
            elif inst is None:
                # type(None) or type(module) do not pickle
                obj = _MyHash(func_name, inst)
            else:
                # Regular bound method: name + instance + class.
                cls = obj.__self__.__class__
                obj = _MyHash(func_name, inst, cls)
        Pickler.save(self, obj)

    def memoize(self, obj):
        """Skip pickle memoization for strings so that equal string
        values always pickle identically, regardless of identity."""
        # We want hashing to be sensitive to value instead of reference.
        # For example we want ['aa', 'aa'] and ['aa', 'aaZ'[:2]]
        # to hash to the same value and that's why we disable memoization
        # for strings
        if isinstance(obj, (bytes, str)):
            return
        Pickler.memoize(self, obj)

    # The dispatch table of the pickler is not accessible in Python
    # 3, as these lines are only bugware for IPython, we skip them.
    def save_global(self, obj, name=None, pack=struct.pack):
        # We have to override this method in order to deal with objects
        # defined interactively in IPython that are not injected in
        # __main__
        kwargs = dict(name=name, pack=pack)
        # `pack` is accepted for signature compatibility only; the
        # base-class call does not take it.
        del kwargs["pack"]
        try:
            Pickler.save_global(self, obj, **kwargs)
        except pickle.PicklingError:
            # NOTE(review): this retries the exact same call before
            # patching __main__ below — presumably relying on pickler
            # state left by the first attempt; confirm against upstream
            # history before changing.
            Pickler.save_global(self, obj, **kwargs)
            module = getattr(obj, "__module__", None)
            if module == "__main__":
                my_name = name
                if my_name is None:
                    my_name = obj.__name__
                mod = sys.modules[module]
                if not hasattr(mod, my_name):
                    # IPython doesn't inject the variables define
                    # interactively in __main__
                    setattr(mod, my_name, obj)

    # Route builtins, types, classes and plain functions through the
    # save_global override above.
    dispatch = Pickler.dispatch.copy()
    # builtin
    dispatch[type(len)] = save_global
    # type
    dispatch[type(object)] = save_global
    # classobj
    dispatch[type(Pickler)] = save_global
    # function
    dispatch[type(pickle.dump)] = save_global

    # We use *args in _batch_setitems signature because _batch_setitems has an
    # additional 'obj' argument in Python 3.14
    def _batch_setitems(self, items, *args):
        # forces order of keys in dict to ensure consistent hash.
        try:
            # Trying first to compare dict assuming the type of keys is
            # consistent and orderable.
            # This fails on python 3 when keys are unorderable
            # but we keep it in a try as it's faster.
            Pickler._batch_setitems(self, iter(sorted(items)), *args)
        except TypeError:
            # If keys are unorderable, sorting them using their hash. This is
            # slower but works in any case.
            Pickler._batch_setitems(
                self, iter(sorted((hash(k), v) for k, v in items)), *args
            )

    def save_set(self, set_items):
        # forces order of items in Set to ensure consistent hash
        Pickler.save(self, _ConsistentSet(set_items))

    dispatch[type(set())] = save_set

156 

157 

class NumpyHasher(Hasher):
    """Special case the hasher for when numpy is loaded."""

    def __init__(self, hash_name="md5", coerce_mmap=False):
        """
        Parameters
        ----------
        hash_name: string
            The hash algorithm to be used
        coerce_mmap: boolean
            Make no difference between np.memmap and np.ndarray
            objects.
        """
        self.coerce_mmap = coerce_mmap
        Hasher.__init__(self, hash_name=hash_name)
        # delayed import of numpy, to avoid tight coupling
        import numpy as np

        self.np = np
        # np.getbuffer only exists on old numpy builds; modern versions
        # fall back to the builtin memoryview.
        if hasattr(np, "getbuffer"):
            self._getbuffer = np.getbuffer
        else:
            self._getbuffer = memoryview

    def save(self, obj):
        """Subclass the save method, to hash ndarray subclass, rather
        than pickling them. Off course, this is a total abuse of
        the Pickler class.
        """
        if isinstance(obj, self.np.ndarray) and not obj.dtype.hasobject:
            # Compute a hash of the object
            # The update function of the hash requires a c_contiguous buffer.
            if obj.shape == ():
                # 0d arrays need to be flattened because viewing them as bytes
                # raises a ValueError exception.
                obj_c_contiguous = obj.flatten()
            elif obj.flags.c_contiguous:
                obj_c_contiguous = obj
            elif obj.flags.f_contiguous:
                # The transpose of a Fortran-contiguous array is
                # C-contiguous, so no copy is needed here.
                obj_c_contiguous = obj.T
            else:
                # Cater for non-single-segment arrays: this creates a
                # copy, and thus alleviates this issue.
                # XXX: There might be a more efficient way of doing this
                obj_c_contiguous = obj.flatten()

            # memoryview is not supported for some dtypes, e.g. datetime64, see
            # https://github.com/numpy/numpy/issues/4983. The
            # workaround is to view the array as bytes before
            # taking the memoryview.
            self._hash.update(self._getbuffer(obj_c_contiguous.view(self.np.uint8)))

            # We store the class, to be able to distinguish between
            # Objects with the same binary content, but different
            # classes.
            if self.coerce_mmap and isinstance(obj, self.np.memmap):
                # We don't make the difference between memmap and
                # normal ndarrays, to be able to reload previously
                # computed results with memmap.
                klass = self.np.ndarray
            else:
                klass = obj.__class__
            # We also return the dtype and the shape, to distinguish
            # different views on the same data with different dtypes.

            # The object will be pickled by the pickler hashed at the end.
            obj = (klass, ("HASHED", obj.dtype, obj.shape, obj.strides))
        elif isinstance(obj, self.np.dtype):
            # numpy.dtype consistent hashing is tricky to get right. This comes
            # from the fact that atomic np.dtype objects are interned:
            # ``np.dtype('f4') is np.dtype('f4')``. The situation is
            # complicated by the fact that this interning does not resist a
            # simple pickle.load/dump roundtrip:
            # ``pickle.loads(pickle.dumps(np.dtype('f4'))) is not
            # np.dtype('f4') Because pickle relies on memoization during
            # pickling, it is easy to
            # produce different hashes for seemingly identical objects, such as
            # ``[np.dtype('f4'), np.dtype('f4')]``
            # and ``[np.dtype('f4'), pickle.loads(pickle.dumps('f4'))]``.
            # To prevent memoization from interfering with hashing, we isolate
            # the serialization (and thus the pickle memoization) of each dtype
            # using each time a different ``pickle.dumps`` call unrelated to
            # the current Hasher instance.
            self._hash.update("_HASHED_DTYPE".encode("utf-8"))
            self._hash.update(pickle.dumps(obj))
            return
        # Anything else (including the surrogate tuple built above) goes
        # through the regular hashing pickler.
        Hasher.save(self, obj)

245 

246 

def hash(obj, hash_name="md5", coerce_mmap=False):
    """Quick calculation of a hash to identify uniquely Python objects
    containing numpy arrays.

    Parameters
    ----------
    hash_name: 'md5' or 'sha1'
        Hashing algorithm used. sha1 is supposedly safer, but md5 is
        faster.
    coerce_mmap: boolean
        Make no difference between np.memmap and np.ndarray

    Returns
    -------
    The hex digest (string) of the hash of `obj`.

    Raises
    ------
    ValueError
        If `hash_name` is not one of the supported algorithms.
    """
    allowed = ("md5", "sha1")
    if hash_name not in allowed:
        msg = "Valid options for 'hash_name' are {}. Got hash_name={!r} instead.".format(
            allowed, hash_name
        )
        raise ValueError(msg)
    # Use the numpy-aware hasher only when numpy has already been
    # imported by the caller, to avoid importing it ourselves.
    if "numpy" not in sys.modules:
        return Hasher(hash_name=hash_name).hash(obj)
    return NumpyHasher(hash_name=hash_name, coerce_mmap=coerce_mmap).hash(obj)