Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/joblib/hashing.py: 26%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

114 statements  

1""" 

2Fast cryptographic hash of Python objects, with a special case for fast 

3hashing of numpy arrays. 

4""" 

5 

6# Author: Gael Varoquaux <gael dot varoquaux at normalesup dot org> 

7# Copyright (c) 2009 Gael Varoquaux 

8# License: BSD Style, 3 clauses. 

9 

10import pickle 

11import hashlib 

12import sys 

13import types 

14import struct 

15import io 

16import decimal 

17 

18 

19Pickler = pickle._Pickler 

20 

21 

22class _ConsistentSet(object): 

23 """ Class used to ensure the hash of Sets is preserved 

24 whatever the order of its items. 

25 """ 

26 def __init__(self, set_sequence): 

27 # Forces order of elements in set to ensure consistent hash. 

28 try: 

29 # Trying first to order the set assuming the type of elements is 

30 # consistent and orderable. 

31 # This fails on python 3 when elements are unorderable 

32 # but we keep it in a try as it's faster. 

33 self._sequence = sorted(set_sequence) 

34 except (TypeError, decimal.InvalidOperation): 

35 # If elements are unorderable, sorting them using their hash. 

36 # This is slower but works in any case. 

37 self._sequence = sorted((hash(e) for e in set_sequence)) 

38 

39 

40class _MyHash(object): 

41 """ Class used to hash objects that won't normally pickle """ 

42 

43 def __init__(self, *args): 

44 self.args = args 

45 

46 

class Hasher(Pickler):
    """ A subclass of pickler, to do cryptographic hashing, rather than
        pickling.  Objects are serialized into an in-memory stream and
        the resulting bytes are fed to a hashlib digest.
    """

    def __init__(self, hash_name='md5'):
        # Buffer receiving the pickled bytes before they are hashed.
        self.stream = io.BytesIO()
        # By default we want a pickle protocol that only changes with
        # the major python version and not the minor one
        protocol = 3
        Pickler.__init__(self, self.stream, protocol=protocol)
        # Initialise the hash obj
        self._hash = hashlib.new(hash_name)

    def hash(self, obj, return_digest=True):
        """ Pickle obj into the internal stream, fold the bytes into the
            hash, and return the hex digest (unless return_digest is
            False).
        """
        try:
            self.dump(obj)
        except pickle.PicklingError as e:
            # Enrich the exception with the offending object before
            # re-raising, to ease debugging of unhashable inputs.
            e.args += ('PicklingError while hashing %r: %r' % (obj, e),)
            raise
        dumps = self.stream.getvalue()
        self._hash.update(dumps)
        if return_digest:
            return self._hash.hexdigest()

    def save(self, obj):
        """ Replace methods (which do not pickle) by a _MyHash surrogate
            built from their identifying components, then delegate to
            the base Pickler.
        """
        if isinstance(obj, (types.MethodType, type({}.pop))):
            # the Pickler cannot pickle instance methods; here we decompose
            # them into components that make them uniquely identifiable
            if hasattr(obj, '__func__'):
                func_name = obj.__func__.__name__
            else:
                func_name = obj.__name__
            inst = obj.__self__
            if type(inst) is type(pickle):
                # Bound to a module: identify by module name.
                obj = _MyHash(func_name, inst.__name__)
            elif inst is None:
                # type(None) or type(module) do not pickle
                obj = _MyHash(func_name, inst)
            else:
                # Bound to an instance: also record its class so that
                # same-named methods of different classes hash apart.
                cls = obj.__self__.__class__
                obj = _MyHash(func_name, inst, cls)
        Pickler.save(self, obj)

    def memoize(self, obj):
        # We want hashing to be sensitive to value instead of reference.
        # For example we want ['aa', 'aa'] and ['aa', 'aaZ'[:2]]
        # to hash to the same value and that's why we disable memoization
        # for strings
        if isinstance(obj, (bytes, str)):
            return
        Pickler.memoize(self, obj)

    # The dispatch table of the pickler is not accessible in Python
    # 3, as these lines are only bugware for IPython, we skip them.
    def save_global(self, obj, name=None, pack=struct.pack):
        # We have to override this method in order to deal with objects
        # defined interactively in IPython that are not injected in
        # __main__
        kwargs = dict(name=name, pack=pack)
        del kwargs['pack']
        try:
            Pickler.save_global(self, obj, **kwargs)
        except pickle.PicklingError:
            # NOTE(review): the retry below runs *before* the object is
            # injected into __main__; if it raises again the setattr
            # fix-up never executes and the error propagates — confirm
            # this ordering is intentional.
            Pickler.save_global(self, obj, **kwargs)
            module = getattr(obj, "__module__", None)
            if module == '__main__':
                my_name = name
                if my_name is None:
                    my_name = obj.__name__
                mod = sys.modules[module]
                if not hasattr(mod, my_name):
                    # IPython doesn't inject the variables define
                    # interactively in __main__
                    setattr(mod, my_name, obj)

    # Copy the base dispatch table and route the types below through
    # save_global (evaluated in the class body, so save_global here is
    # the plain function just defined above).
    dispatch = Pickler.dispatch.copy()
    # builtin
    dispatch[type(len)] = save_global
    # type
    dispatch[type(object)] = save_global
    # classobj
    dispatch[type(Pickler)] = save_global
    # function
    dispatch[type(pickle.dump)] = save_global

    def _batch_setitems(self, items):
        # forces order of keys in dict to ensure consistent hash.
        try:
            # Trying first to compare dict assuming the type of keys is
            # consistent and orderable.
            # This fails on python 3 when keys are unorderable
            # but we keep it in a try as it's faster.
            Pickler._batch_setitems(self, iter(sorted(items)))
        except TypeError:
            # If keys are unorderable, sorting them using their hash. This is
            # slower but works in any case.
            Pickler._batch_setitems(self, iter(sorted((hash(k), v)
                                                      for k, v in items)))

    def save_set(self, set_items):
        # forces order of items in Set to ensure consistent hash
        Pickler.save(self, _ConsistentSet(set_items))

    dispatch[type(set())] = save_set

152 

153 

class NumpyHasher(Hasher):
    """ Special case the hasher for when numpy is loaded.
    """

    def __init__(self, hash_name='md5', coerce_mmap=False):
        """
            Parameters
            ----------
            hash_name: string
                The hash algorithm to be used
            coerce_mmap: boolean
                Make no difference between np.memmap and np.ndarray
                objects.
        """
        self.coerce_mmap = coerce_mmap
        Hasher.__init__(self, hash_name=hash_name)
        # delayed import of numpy, to avoid tight coupling
        import numpy as np
        self.np = np
        # np.getbuffer existed on older numpy/Python 2; fall back to the
        # builtin memoryview when it is absent.
        if hasattr(np, 'getbuffer'):
            self._getbuffer = np.getbuffer
        else:
            self._getbuffer = memoryview

    def save(self, obj):
        """ Subclass the save method, to hash ndarray subclass, rather
            than pickling them. Off course, this is a total abuse of
            the Pickler class.
        """
        if isinstance(obj, self.np.ndarray) and not obj.dtype.hasobject:
            # Compute a hash of the object
            # The update function of the hash requires a c_contiguous buffer.
            if obj.shape == ():
                # 0d arrays need to be flattened because viewing them as bytes
                # raises a ValueError exception.
                obj_c_contiguous = obj.flatten()
            elif obj.flags.c_contiguous:
                obj_c_contiguous = obj
            elif obj.flags.f_contiguous:
                # Transposing a Fortran-ordered array yields a
                # C-contiguous view without copying the data.
                obj_c_contiguous = obj.T
            else:
                # Cater for non-single-segment arrays: this creates a
                # copy, and thus alleviates this issue.
                # XXX: There might be a more efficient way of doing this
                obj_c_contiguous = obj.flatten()

            # memoryview is not supported for some dtypes, e.g. datetime64, see
            # https://github.com/numpy/numpy/issues/4983. The
            # workaround is to view the array as bytes before
            # taking the memoryview.
            self._hash.update(
                self._getbuffer(obj_c_contiguous.view(self.np.uint8)))

            # We store the class, to be able to distinguish between
            # Objects with the same binary content, but different
            # classes.
            if self.coerce_mmap and isinstance(obj, self.np.memmap):
                # We don't make the difference between memmap and
                # normal ndarrays, to be able to reload previously
                # computed results with memmap.
                klass = self.np.ndarray
            else:
                klass = obj.__class__
            # We also return the dtype and the shape, to distinguish
            # different views on the same data with different dtypes.

            # The object will be pickled by the pickler hashed at the end.
            obj = (klass, ('HASHED', obj.dtype, obj.shape, obj.strides))
        elif isinstance(obj, self.np.dtype):
            # numpy.dtype consistent hashing is tricky to get right. This comes
            # from the fact that atomic np.dtype objects are interned:
            # ``np.dtype('f4') is np.dtype('f4')``. The situation is
            # complicated by the fact that this interning does not resist a
            # simple pickle.load/dump roundtrip:
            # ``pickle.loads(pickle.dumps(np.dtype('f4'))) is not
            # np.dtype('f4') Because pickle relies on memoization during
            # pickling, it is easy to
            # produce different hashes for seemingly identical objects, such as
            # ``[np.dtype('f4'), np.dtype('f4')]``
            # and ``[np.dtype('f4'), pickle.loads(pickle.dumps('f4'))]``.
            # To prevent memoization from interfering with hashing, we isolate
            # the serialization (and thus the pickle memoization) of each dtype
            # using each time a different ``pickle.dumps`` call unrelated to
            # the current Hasher instance.
            self._hash.update("_HASHED_DTYPE".encode('utf-8'))
            self._hash.update(pickle.dumps(obj))
            return
        # Everything else (including the surrogate tuple built above)
        # goes through the regular Hasher machinery.
        Hasher.save(self, obj)

242 

243 

def hash(obj, hash_name='md5', coerce_mmap=False):
    """ Quick calculation of a hash to identify uniquely Python objects
        containing numpy arrays.

        Parameters
        ----------
        hash_name: 'md5' or 'sha1'
            Hashing algorithm used. sha1 is supposedly safer, but md5 is
            faster.
        coerce_mmap: boolean
            Make no difference between np.memmap and np.ndarray

        Raises
        ------
        ValueError
            If hash_name is not one of the supported algorithms.
    """
    valid_hash_names = ('md5', 'sha1')
    # Reject unsupported algorithms up front with an explicit message.
    if hash_name not in valid_hash_names:
        raise ValueError("Valid options for 'hash_name' are {}. "
                         "Got hash_name={!r} instead."
                         .format(valid_hash_names, hash_name))
    # Use the numpy-aware hasher only when numpy has already been
    # imported by the application, to avoid importing it needlessly.
    if 'numpy' not in sys.modules:
        hasher = Hasher(hash_name=hash_name)
    else:
        hasher = NumpyHasher(hash_name=hash_name, coerce_mmap=coerce_mmap)
    return hasher.hash(obj)