Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/joblib/numpy_pickle_utils.py: 59%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

88 statements  

1"""Utilities for fast persistence of big data, with optional compression.""" 

2 

3# Author: Gael Varoquaux <gael dot varoquaux at normalesup dot org> 

4# Copyright (c) 2009 Gael Varoquaux 

5# License: BSD Style, 3 clauses. 

6 

7import pickle 

8import io 

9import sys 

10import warnings 

11import contextlib 

12 

13from .compressor import _ZFILE_PREFIX 

14from .compressor import _COMPRESSORS 

15 

# NumPy is an optional dependency: when it is missing, np is set to None
# as a sentinel so array-specific code paths can be skipped.
try:
    import numpy as np
except ImportError:
    np = None

# NOTE(review): the pure-Python (underscored) pickler/unpickler classes are
# aliased here rather than the C-accelerated ones — presumably so joblib can
# subclass them elsewhere; confirm against the callers of these aliases.
Unpickler = pickle._Unpickler
Pickler = pickle._Pickler
# Historical Python 2/3 compatibility alias; on Python 3 it is just range.
xrange = range

24 

25 

try:
    # The python standard library can be built without bz2 so we make bz2
    # usage optional.
    # see https://github.com/scikit-learn/scikit-learn/issues/7526 for more
    # details.
    import bz2
except ImportError:
    # Sentinel checked before any bz2-specific code path is taken.
    bz2 = None

# Buffer size used in io.BufferedReader and io.BufferedWriter (1 MiB).
_IO_BUFFER_SIZE = 1024 ** 2

37 

38 

39def _is_raw_file(fileobj): 

40 """Check if fileobj is a raw file object, e.g created with open.""" 

41 fileobj = getattr(fileobj, 'raw', fileobj) 

42 return isinstance(fileobj, io.FileIO) 

43 

44 

def _get_prefixes_max_len():
    """Return the longest magic-number prefix length among known formats."""
    # Consider every registered compressor plus the legacy zfile prefix.
    lengths = [len(_ZFILE_PREFIX)]
    lengths.extend(len(c.prefix) for c in _COMPRESSORS.values())
    return max(lengths)

50 

51 

52def _is_numpy_array_byte_order_mismatch(array): 

53 """Check if numpy array is having byte order mismatch""" 

54 return ((sys.byteorder == 'big' and 

55 (array.dtype.byteorder == '<' or 

56 (array.dtype.byteorder == '|' and array.dtype.fields and 

57 all(e[0].byteorder == '<' 

58 for e in array.dtype.fields.values())))) or 

59 (sys.byteorder == 'little' and 

60 (array.dtype.byteorder == '>' or 

61 (array.dtype.byteorder == '|' and array.dtype.fields and 

62 all(e[0].byteorder == '>' 

63 for e in array.dtype.fields.values()))))) 

64 

65 

def _ensure_native_byte_order(array):
    """Return *array* in the host byte order, preserving its values.

    The input is handed back unchanged when its byte order already matches
    the system's.
    """
    if not _is_numpy_array_byte_order_mismatch(array):
        return array
    # Swap the raw bytes, then reinterpret through a native-order dtype so
    # the numerical values are unchanged.
    return array.byteswap().view(array.dtype.newbyteorder('='))

74 

75 

76############################################################################### 

77# Cache file utilities 

###############################################################################
# Cache file utilities
def _detect_compressor(fileobj):
    """Return the name of the compressor matching fileobj.

    Parameters
    ----------
    fileobj: file object

    Returns
    -------
    str in {'zlib', 'gzip', 'bz2', 'lzma', 'xz', 'compat', 'not-compressed'}
    """
    # Grab enough leading bytes to compare against every known magic number.
    max_prefix_len = _get_prefixes_max_len()
    if hasattr(fileobj, 'peek'):
        # peek() reads ahead without moving the file cursor.
        first_bytes = fileobj.peek(max_prefix_len)
    else:
        # Not peekable: read the prefix, then rewind so the cursor is
        # left where the caller expects it.
        first_bytes = fileobj.read(max_prefix_len)
        fileobj.seek(0)

    if first_bytes.startswith(_ZFILE_PREFIX):
        # Legacy joblib (< 0.10) zfile format.
        return "compat"

    for name, compressor in _COMPRESSORS.items():
        if first_bytes.startswith(compressor.prefix):
            return name

    return "not-compressed"

108 

109 

def _buffered_read_file(fobj):
    """Wrap the readable raw file *fobj* in a large-buffer reader."""
    buffered = io.BufferedReader(fobj, buffer_size=_IO_BUFFER_SIZE)
    return buffered

113 

114 

def _buffered_write_file(fobj):
    """Wrap the writable raw file *fobj* in a large-buffer writer."""
    buffered = io.BufferedWriter(fobj, buffer_size=_IO_BUFFER_SIZE)
    return buffered

118 

119 

@contextlib.contextmanager
def _read_fileobject(fileobj, filename, mmap_mode=None):
    """Utility function opening the right fileobject from a filename.

    The magic number is used to choose between the type of file object to open:
    * regular file object (default)
    * zlib file object
    * gzip file object
    * bz2 file object
    * lzma file object (for xz and lzma compressor)

    Parameters
    ----------
    fileobj: file object
        Open file object whose leading bytes are inspected to detect the
        compression format.
    filename: str
        filename path corresponding to the fileobj parameter.
    mmap_mode: str
        memory map mode that should be used to open the pickle file. This
        parameter is used only to warn the user when mmap is requested in
        combination with compression or in-memory buffers. Default: None.

    Yields
    ------
    a file like object — or, for legacy ('compat') files, the *filename*
    string itself so the caller can invoke the compatibility loader.
    """
    # Detect if the fileobj contains compressed data.
    compressor = _detect_compressor(fileobj)

    if compressor == 'compat':
        # Compatibility with old pickle mode: simply return the input
        # filename "as-is" and let the compatibility function be called by the
        # caller.
        warnings.warn("The file '%s' has been generated with a joblib "
                      "version less than 0.10. "
                      "Please regenerate this pickle file." % filename,
                      DeprecationWarning, stacklevel=2)
        yield filename
    else:
        if compressor in _COMPRESSORS:
            # based on the compressor detected in the file, we open the
            # correct decompressor file object, wrapped in a buffer.
            compressor_wrapper = _COMPRESSORS[compressor]
            inst = compressor_wrapper.decompressor_file(fileobj)
            fileobj = _buffered_read_file(inst)

        # Checking if incompatible load parameters with the type of file:
        # mmap_mode cannot be used with compressed file or in memory buffers
        # such as io.BytesIO.
        # NOTE: the warning messages below interpolate the local variables
        # mmap_mode / filename / fileobj by name via `% locals()`.
        if mmap_mode is not None:
            if isinstance(fileobj, io.BytesIO):
                warnings.warn('In memory persistence is not compatible with '
                              'mmap_mode "%(mmap_mode)s" flag passed. '
                              'mmap_mode option will be ignored.'
                              % locals(), stacklevel=2)
            elif compressor != 'not-compressed':
                warnings.warn('mmap_mode "%(mmap_mode)s" is not compatible '
                              'with compressed file %(filename)s. '
                              '"%(mmap_mode)s" flag will be ignored.'
                              % locals(), stacklevel=2)
            elif not _is_raw_file(fileobj):
                warnings.warn('"%(fileobj)r" is not a raw file, mmap_mode '
                              '"%(mmap_mode)s" flag will be ignored.'
                              % locals(), stacklevel=2)

        yield fileobj

188 

189 

def _write_fileobject(filename, compress=("zlib", 3)):
    """Return the right compressor file object in write mode.

    Parameters
    ----------
    filename: str
        Path of the file to create.
    compress: tuple (str, int)
        Compressor name and compression level. An unregistered compressor
        name silently falls back to 'zlib' (historical behavior).

    Returns
    -------
    An ``io.BufferedWriter`` wrapping the compressor's file object.
    """
    compressmethod = compress[0]
    compresslevel = compress[1]

    # Both branches of the original code were identical except for the
    # compressor name; collapse them into one path with a zlib fallback.
    if compressmethod not in _COMPRESSORS:
        compressmethod = 'zlib'
    file_instance = _COMPRESSORS[compressmethod].compressor_file(
        filename, compresslevel=compresslevel)
    return _buffered_write_file(file_instance)

203 

204 

# Utility functions/variables from numpy required for writing arrays.
# We need at least the functions introduced in version 1.9 of numpy. Here,
# we use the ones from numpy 1.10.2.
BUFFER_SIZE = 2 ** 18  # size of buffer for reading npz files in bytes (256 KiB)

209 

210 

211def _read_bytes(fp, size, error_template="ran out of data"): 

212 """Read from file-like object until size bytes are read. 

213 

214 TODO python2_drop: is it still needed? The docstring mentions python 2.6 

215 and it looks like this can be at least simplified ... 

216 

217 Raises ValueError if not EOF is encountered before size bytes are read. 

218 Non-blocking objects only supported if they derive from io objects. 

219 

220 Required as e.g. ZipExtFile in python 2.6 can return less data than 

221 requested. 

222 

223 This function was taken from numpy/lib/format.py in version 1.10.2. 

224 

225 Parameters 

226 ---------- 

227 fp: file-like object 

228 size: int 

229 error_template: str 

230 

231 Returns 

232 ------- 

233 a bytes object 

234 The data read in bytes. 

235 

236 """ 

237 data = bytes() 

238 while True: 

239 # io files (default in python3) return None or raise on 

240 # would-block, python2 file will truncate, probably nothing can be 

241 # done about that. note that regular files can't be non-blocking 

242 try: 

243 r = fp.read(size - len(data)) 

244 data += r 

245 if len(r) == 0 or len(data) == size: 

246 break 

247 except io.BlockingIOError: 

248 pass 

249 if len(data) != size: 

250 msg = "EOF: reading %s, expected %d bytes got %d" 

251 raise ValueError(msg % (error_template, size, len(data))) 

252 else: 

253 return data