Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/joblib/hashing.py: 26%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

114 statements  

1""" 

2Fast cryptographic hash of Python objects, with a special case for fast 

3hashing of numpy arrays. 

4""" 

5 

6# Author: Gael Varoquaux <gael dot varoquaux at normalesup dot org> 

7# Copyright (c) 2009 Gael Varoquaux 

8# License: BSD Style, 3 clauses. 

9 

10import pickle 

11import hashlib 

12import sys 

13import types 

14import struct 

15import io 

16import decimal 

17 

18 

19Pickler = pickle._Pickler 

20 

21 

22class _ConsistentSet(object): 

23 """ Class used to ensure the hash of Sets is preserved 

24 whatever the order of its items. 

25 """ 

26 def __init__(self, set_sequence): 

27 # Forces order of elements in set to ensure consistent hash. 

28 try: 

29 # Trying first to order the set assuming the type of elements is 

30 # consistent and orderable. 

31 # This fails on python 3 when elements are unorderable 

32 # but we keep it in a try as it's faster. 

33 self._sequence = sorted(set_sequence) 

34 except (TypeError, decimal.InvalidOperation): 

35 # If elements are unorderable, sorting them using their hash. 

36 # This is slower but works in any case. 

37 self._sequence = sorted((hash(e) for e in set_sequence)) 

38 

39 

40class _MyHash(object): 

41 """ Class used to hash objects that won't normally pickle """ 

42 

43 def __init__(self, *args): 

44 self.args = args 

45 

46 

class Hasher(Pickler):
    """ A subclass of pickler, to do cryptographic hashing, rather than
        pickling.  Objects are serialized into an in-memory stream and
        the resulting bytes are fed to a hashlib digest.
    """

    def __init__(self, hash_name='md5'):
        # Buffer receiving the pickled bytes before they are hashed.
        self.stream = io.BytesIO()
        # By default we want a pickle protocol that only changes with
        # the major python version and not the minor one
        protocol = 3
        Pickler.__init__(self, self.stream, protocol=protocol)
        # Initialise the hash obj
        self._hash = hashlib.new(hash_name)

    def hash(self, obj, return_digest=True):
        """ Pickle obj into the internal stream, fold the bytes into the
            hash, and return the hex digest (unless return_digest is
            False).
        """
        try:
            self.dump(obj)
        except pickle.PicklingError as e:
            # Enrich the exception with the offending object before
            # re-raising, to ease debugging of unhashable inputs.
            e.args += ('PicklingError while hashing %r: %r' % (obj, e),)
            raise
        dumps = self.stream.getvalue()
        self._hash.update(dumps)
        if return_digest:
            return self._hash.hexdigest()

    def save(self, obj):
        """ Replace methods (which do not pickle) by a _MyHash surrogate
            built from their identifying components, then delegate to
            the base Pickler.
        """
        if isinstance(obj, (types.MethodType, type({}.pop))):
            # the Pickler cannot pickle instance methods; here we decompose
            # them into components that make them uniquely identifiable
            if hasattr(obj, '__func__'):
                func_name = obj.__func__.__name__
            else:
                func_name = obj.__name__
            inst = obj.__self__
            if type(inst) is type(pickle):
                # Bound to a module: identify by module name.
                obj = _MyHash(func_name, inst.__name__)
            elif inst is None:
                # type(None) or type(module) do not pickle
                obj = _MyHash(func_name, inst)
            else:
                # Bound to an instance: also record its class so that
                # same-named methods of different classes hash apart.
                cls = obj.__self__.__class__
                obj = _MyHash(func_name, inst, cls)
        Pickler.save(self, obj)

    def memoize(self, obj):
        # We want hashing to be sensitive to value instead of reference.
        # For example we want ['aa', 'aa'] and ['aa', 'aaZ'[:2]]
        # to hash to the same value and that's why we disable memoization
        # for strings
        if isinstance(obj, (bytes, str)):
            return
        Pickler.memoize(self, obj)

    # The dispatch table of the pickler is not accessible in Python
    # 3, as these lines are only bugware for IPython, we skip them.
    def save_global(self, obj, name=None, pack=struct.pack):
        # We have to override this method in order to deal with objects
        # defined interactively in IPython that are not injected in
        # __main__
        kwargs = dict(name=name, pack=pack)
        del kwargs['pack']
        try:
            Pickler.save_global(self, obj, **kwargs)
        except pickle.PicklingError:
            # NOTE(review): the retry below runs *before* the object is
            # injected into __main__; if it raises again the setattr
            # fix-up never executes and the error propagates — confirm
            # this ordering is intentional.
            Pickler.save_global(self, obj, **kwargs)
            module = getattr(obj, "__module__", None)
            if module == '__main__':
                my_name = name
                if my_name is None:
                    my_name = obj.__name__
                mod = sys.modules[module]
                if not hasattr(mod, my_name):
                    # IPython doesn't inject the variables define
                    # interactively in __main__
                    setattr(mod, my_name, obj)

    # Copy the base dispatch table and route the types below through
    # save_global (evaluated in the class body, so save_global here is
    # the plain function just defined above).
    dispatch = Pickler.dispatch.copy()
    # builtin
    dispatch[type(len)] = save_global
    # type
    dispatch[type(object)] = save_global
    # classobj
    dispatch[type(Pickler)] = save_global
    # function
    dispatch[type(pickle.dump)] = save_global

    def _batch_setitems(self, items):
        # forces order of keys in dict to ensure consistent hash.
        try:
            # Trying first to compare dict assuming the type of keys is
            # consistent and orderable.
            # This fails on python 3 when keys are unorderable
            # but we keep it in a try as it's faster.
            Pickler._batch_setitems(self, iter(sorted(items)))
        except TypeError:
            # If keys are unorderable, sorting them using their hash. This is
            # slower but works in any case.
            Pickler._batch_setitems(self, iter(sorted((hash(k), v)
                                                      for k, v in items)))

    def save_set(self, set_items):
        # forces order of items in Set to ensure consistent hash
        Pickler.save(self, _ConsistentSet(set_items))

    dispatch[type(set())] = save_set

152 

153 

class NumpyHasher(Hasher):
    """ Special case the hasher for when numpy is loaded.
    """

    def __init__(self, hash_name='md5', coerce_mmap=False):
        """
            Parameters
            ----------
            hash_name: string
                The hash algorithm to be used
            coerce_mmap: boolean
                Make no difference between np.memmap and np.ndarray
                objects.
        """
        self.coerce_mmap = coerce_mmap
        Hasher.__init__(self, hash_name=hash_name)
        # delayed import of numpy, to avoid tight coupling
        import numpy as np
        self.np = np
        # np.getbuffer existed on older numpy/Python 2; fall back to the
        # builtin memoryview when it is absent.
        if hasattr(np, 'getbuffer'):
            self._getbuffer = np.getbuffer
        else:
            self._getbuffer = memoryview

    def save(self, obj):
        """ Subclass the save method, to hash ndarray subclass, rather
            than pickling them. Off course, this is a total abuse of
            the Pickler class.
        """
        if isinstance(obj, self.np.ndarray) and not obj.dtype.hasobject:
            # Compute a hash of the object
            # The update function of the hash requires a c_contiguous buffer.
            if obj.shape == ():
                # 0d arrays need to be flattened because viewing them as bytes
                # raises a ValueError exception.
                obj_c_contiguous = obj.flatten()
            elif obj.flags.c_contiguous:
                obj_c_contiguous = obj
            elif obj.flags.f_contiguous:
                # Transposing a Fortran-ordered array yields a
                # C-contiguous view without copying the data.
                obj_c_contiguous = obj.T
            else:
                # Cater for non-single-segment arrays: this creates a
                # copy, and thus alleviates this issue.
                # XXX: There might be a more efficient way of doing this
                obj_c_contiguous = obj.flatten()

            # memoryview is not supported for some dtypes, e.g. datetime64, see
            # https://github.com/numpy/numpy/issues/4983. The
            # workaround is to view the array as bytes before
            # taking the memoryview.
            self._hash.update(
                self._getbuffer(obj_c_contiguous.view(self.np.uint8)))

            # We store the class, to be able to distinguish between
            # Objects with the same binary content, but different
            # classes.
            if self.coerce_mmap and isinstance(obj, self.np.memmap):
                # We don't make the difference between memmap and
                # normal ndarrays, to be able to reload previously
                # computed results with memmap.
                klass = self.np.ndarray
            else:
                klass = obj.__class__
            # We also return the dtype and the shape, to distinguish
            # different views on the same data with different dtypes.

            # The object will be pickled by the pickler hashed at the end.
            obj = (klass, ('HASHED', obj.dtype, obj.shape, obj.strides))
        elif isinstance(obj, self.np.dtype):
            # numpy.dtype consistent hashing is tricky to get right. This comes
            # from the fact that atomic np.dtype objects are interned:
            # ``np.dtype('f4') is np.dtype('f4')``. The situation is
            # complicated by the fact that this interning does not resist a
            # simple pickle.load/dump roundtrip:
            # ``pickle.loads(pickle.dumps(np.dtype('f4'))) is not
            # np.dtype('f4') Because pickle relies on memoization during
            # pickling, it is easy to
            # produce different hashes for seemingly identical objects, such as
            # ``[np.dtype('f4'), np.dtype('f4')]``
            # and ``[np.dtype('f4'), pickle.loads(pickle.dumps('f4'))]``.
            # To prevent memoization from interfering with hashing, we isolate
            # the serialization (and thus the pickle memoization) of each dtype
            # using each time a different ``pickle.dumps`` call unrelated to
            # the current Hasher instance.
            self._hash.update("_HASHED_DTYPE".encode('utf-8'))
            self._hash.update(pickle.dumps(obj))
            return
        # Everything else (including the surrogate tuple built above)
        # goes through the regular Hasher machinery.
        Hasher.save(self, obj)

242 

243 

def hash(obj, hash_name='md5', coerce_mmap=False):
    """ Quick calculation of a hash to identify uniquely Python objects
        containing numpy arrays.

        Parameters
        ----------
        hash_name: 'md5' or 'sha1'
            Hashing algorithm used. sha1 is supposedly safer, but md5 is
            faster.
        coerce_mmap: boolean
            Make no difference between np.memmap and np.ndarray

        Raises
        ------
        ValueError
            If hash_name is not one of the supported algorithms.
    """
    valid_hash_names = ('md5', 'sha1')
    # Reject unsupported algorithms up front with an explicit message.
    if hash_name not in valid_hash_names:
        raise ValueError("Valid options for 'hash_name' are {}. "
                         "Got hash_name={!r} instead."
                         .format(valid_hash_names, hash_name))
    # Use the numpy-aware hasher only when numpy has already been
    # imported by the application, to avoid importing it needlessly.
    if 'numpy' not in sys.modules:
        hasher = Hasher(hash_name=hash_name)
    else:
        hasher = NumpyHasher(hash_name=hash_name, coerce_mmap=coerce_mmap)
    return hasher.hash(obj)