Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/joblib/hashing.py: 27%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

115 statements  

1""" 

2Fast cryptographic hash of Python objects, with a special case for fast 

3hashing of numpy arrays. 

4""" 

5 

6# Author: Gael Varoquaux <gael dot varoquaux at normalesup dot org> 

7# Copyright (c) 2009 Gael Varoquaux 

8# License: BSD Style, 3 clauses. 

9 

10import decimal 

11import hashlib 

12import io 

13import pickle 

14import struct 

15import sys 

16import types 

17 

18Pickler = pickle._Pickler 

19 

20 

21class _ConsistentSet(object): 

22 """Class used to ensure the hash of Sets is preserved 

23 whatever the order of its items. 

24 """ 

25 

26 def __init__(self, set_sequence): 

27 # Forces order of elements in set to ensure consistent hash. 

28 try: 

29 # Trying first to order the set assuming the type of elements is 

30 # consistent and orderable. 

31 # This fails on python 3 when elements are unorderable 

32 # but we keep it in a try as it's faster. 

33 self._sequence = sorted(set_sequence) 

34 except (TypeError, decimal.InvalidOperation): 

35 # If elements are unorderable, sorting them using their hash. 

36 # This is slower but works in any case. 

37 self._sequence = sorted((hash(e) for e in set_sequence)) 

38 

39 

40class _MyHash(object): 

41 """Class used to hash objects that won't normally pickle""" 

42 

43 def __init__(self, *args): 

44 self.args = args 

45 

46 

class Hasher(Pickler):
    """A subclass of pickler, to do cryptographic hashing, rather than
    pickling. This is used to produce a unique hash of the given
    Python object that is not necessarily cryptographically secure.
    """

    def __init__(self, hash_name="md5"):
        # All pickled bytes accumulate here; hash() below folds them
        # into the digest.
        self.stream = io.BytesIO()
        # By default we want a pickle protocol that only changes with
        # the major python version and not the minor one
        protocol = 3
        Pickler.__init__(self, self.stream, protocol=protocol)
        # Initialise the hash obj
        self._hash = hashlib.new(hash_name, usedforsecurity=False)

    def hash(self, obj, return_digest=True):
        """Pickle `obj` into the internal stream, feed the resulting
        bytes to the digest, and return the hex digest (unless
        `return_digest` is False, in which case None is returned)."""
        try:
            self.dump(obj)
        except pickle.PicklingError as e:
            # Enrich the error with the object that failed to hash
            # before re-raising it unchanged in type.
            e.args += ("PicklingError while hashing %r: %r" % (obj, e),)
            raise
        dumps = self.stream.getvalue()
        self._hash.update(dumps)
        if return_digest:
            return self._hash.hexdigest()

    def save(self, obj):
        """Intercept bound and builtin methods, which the stock pickler
        cannot serialize, and substitute a picklable _MyHash surrogate
        built from their identifying components."""
        if isinstance(obj, (types.MethodType, type({}.pop))):
            # the Pickler cannot pickle instance methods; here we decompose
            # them into components that make them uniquely identifiable
            if hasattr(obj, "__func__"):
                func_name = obj.__func__.__name__
            else:
                func_name = obj.__name__
            inst = obj.__self__
            if type(inst) is type(pickle):
                # Method bound to a module: identify it by module name.
                obj = _MyHash(func_name, inst.__name__)
            elif inst is None:
                # type(None) or type(module) do not pickle
                obj = _MyHash(func_name, inst)
            else:
                # Regular bound method: name + instance + class.
                cls = obj.__self__.__class__
                obj = _MyHash(func_name, inst, cls)
        Pickler.save(self, obj)

    def memoize(self, obj):
        """Skip pickle memoization for strings so that equal string
        values always pickle identically, regardless of identity."""
        # We want hashing to be sensitive to value instead of reference.
        # For example we want ['aa', 'aa'] and ['aa', 'aaZ'[:2]]
        # to hash to the same value and that's why we disable memoization
        # for strings
        if isinstance(obj, (bytes, str)):
            return
        Pickler.memoize(self, obj)

    # The dispatch table of the pickler is not accessible in Python
    # 3, as these lines are only bugware for IPython, we skip them.
    def save_global(self, obj, name=None, pack=struct.pack):
        # We have to override this method in order to deal with objects
        # defined interactively in IPython that are not injected in
        # __main__
        kwargs = dict(name=name, pack=pack)
        # `pack` is accepted for signature compatibility only; the
        # base-class call does not take it.
        del kwargs["pack"]
        try:
            Pickler.save_global(self, obj, **kwargs)
        except pickle.PicklingError:
            # NOTE(review): this retries the exact same call before
            # patching __main__ below — presumably relying on pickler
            # state left by the first attempt; confirm against upstream
            # history before changing.
            Pickler.save_global(self, obj, **kwargs)
            module = getattr(obj, "__module__", None)
            if module == "__main__":
                my_name = name
                if my_name is None:
                    my_name = obj.__name__
                mod = sys.modules[module]
                if not hasattr(mod, my_name):
                    # IPython doesn't inject the variables define
                    # interactively in __main__
                    setattr(mod, my_name, obj)

    # Route builtins, types, classes and plain functions through the
    # save_global override above.
    dispatch = Pickler.dispatch.copy()
    # builtin
    dispatch[type(len)] = save_global
    # type
    dispatch[type(object)] = save_global
    # classobj
    dispatch[type(Pickler)] = save_global
    # function
    dispatch[type(pickle.dump)] = save_global

    # We use *args in _batch_setitems signature because _batch_setitems has an
    # additional 'obj' argument in Python 3.14
    def _batch_setitems(self, items, *args):
        # forces order of keys in dict to ensure consistent hash.
        try:
            # Trying first to compare dict assuming the type of keys is
            # consistent and orderable.
            # This fails on python 3 when keys are unorderable
            # but we keep it in a try as it's faster.
            Pickler._batch_setitems(self, iter(sorted(items)), *args)
        except TypeError:
            # If keys are unorderable, sorting them using their hash. This is
            # slower but works in any case.
            Pickler._batch_setitems(
                self, iter(sorted((hash(k), v) for k, v in items)), *args
            )

    def save_set(self, set_items):
        # forces order of items in Set to ensure consistent hash
        Pickler.save(self, _ConsistentSet(set_items))

    dispatch[type(set())] = save_set

156 

157 

class NumpyHasher(Hasher):
    """Special case the hasher for when numpy is loaded."""

    def __init__(self, hash_name="md5", coerce_mmap=False):
        """
        Parameters
        ----------
        hash_name: string
            The hash algorithm to be used
        coerce_mmap: boolean
            Make no difference between np.memmap and np.ndarray
            objects.
        """
        self.coerce_mmap = coerce_mmap
        Hasher.__init__(self, hash_name=hash_name)
        # delayed import of numpy, to avoid tight coupling
        import numpy as np

        self.np = np
        # np.getbuffer only exists on old numpy builds; modern versions
        # fall back to the builtin memoryview.
        if hasattr(np, "getbuffer"):
            self._getbuffer = np.getbuffer
        else:
            self._getbuffer = memoryview

    def save(self, obj):
        """Subclass the save method, to hash ndarray subclass, rather
        than pickling them. Off course, this is a total abuse of
        the Pickler class.
        """
        if isinstance(obj, self.np.ndarray) and not obj.dtype.hasobject:
            # Compute a hash of the object
            # The update function of the hash requires a c_contiguous buffer.
            if obj.shape == ():
                # 0d arrays need to be flattened because viewing them as bytes
                # raises a ValueError exception.
                obj_c_contiguous = obj.flatten()
            elif obj.flags.c_contiguous:
                obj_c_contiguous = obj
            elif obj.flags.f_contiguous:
                # The transpose of a Fortran-contiguous array is
                # C-contiguous, so no copy is needed here.
                obj_c_contiguous = obj.T
            else:
                # Cater for non-single-segment arrays: this creates a
                # copy, and thus alleviates this issue.
                # XXX: There might be a more efficient way of doing this
                obj_c_contiguous = obj.flatten()

            # memoryview is not supported for some dtypes, e.g. datetime64, see
            # https://github.com/numpy/numpy/issues/4983. The
            # workaround is to view the array as bytes before
            # taking the memoryview.
            self._hash.update(self._getbuffer(obj_c_contiguous.view(self.np.uint8)))

            # We store the class, to be able to distinguish between
            # Objects with the same binary content, but different
            # classes.
            if self.coerce_mmap and isinstance(obj, self.np.memmap):
                # We don't make the difference between memmap and
                # normal ndarrays, to be able to reload previously
                # computed results with memmap.
                klass = self.np.ndarray
            else:
                klass = obj.__class__
            # We also return the dtype and the shape, to distinguish
            # different views on the same data with different dtypes.

            # The object will be pickled by the pickler hashed at the end.
            obj = (klass, ("HASHED", obj.dtype, obj.shape, obj.strides))
        elif isinstance(obj, self.np.dtype):
            # numpy.dtype consistent hashing is tricky to get right. This comes
            # from the fact that atomic np.dtype objects are interned:
            # ``np.dtype('f4') is np.dtype('f4')``. The situation is
            # complicated by the fact that this interning does not resist a
            # simple pickle.load/dump roundtrip:
            # ``pickle.loads(pickle.dumps(np.dtype('f4'))) is not
            # np.dtype('f4') Because pickle relies on memoization during
            # pickling, it is easy to
            # produce different hashes for seemingly identical objects, such as
            # ``[np.dtype('f4'), np.dtype('f4')]``
            # and ``[np.dtype('f4'), pickle.loads(pickle.dumps('f4'))]``.
            # To prevent memoization from interfering with hashing, we isolate
            # the serialization (and thus the pickle memoization) of each dtype
            # using each time a different ``pickle.dumps`` call unrelated to
            # the current Hasher instance.
            self._hash.update("_HASHED_DTYPE".encode("utf-8"))
            self._hash.update(pickle.dumps(obj))
            return
        # Anything else (including the surrogate tuple built above) goes
        # through the regular hashing pickler.
        Hasher.save(self, obj)

245 

246 

def hash(obj, hash_name="md5", coerce_mmap=False):
    """Quick calculation of a hash to identify uniquely Python objects
    containing numpy arrays.

    Parameters
    ----------
    hash_name: 'md5' or 'sha1'
        Hashing algorithm used. sha1 is supposedly safer, but md5 is
        faster.
    coerce_mmap: boolean
        Make no difference between np.memmap and np.ndarray

    Returns
    -------
    The hex digest (string) of the hash of `obj`.

    Raises
    ------
    ValueError
        If `hash_name` is not one of the supported algorithms.
    """
    allowed = ("md5", "sha1")
    if hash_name not in allowed:
        msg = "Valid options for 'hash_name' are {}. Got hash_name={!r} instead.".format(
            allowed, hash_name
        )
        raise ValueError(msg)
    # Use the numpy-aware hasher only when numpy has already been
    # imported by the caller, to avoid importing it ourselves.
    if "numpy" not in sys.modules:
        return Hasher(hash_name=hash_name).hash(obj)
    return NumpyHasher(hash_name=hash_name, coerce_mmap=coerce_mmap).hash(obj)