Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/bz2file.py: 60%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

272 statements  

1"""Module for reading and writing bzip2-compressed files. 

2 

3This module contains a backport of Python 3.4's bz2.open() function and 

4BZ2File class, adapted to work with earlier versions of Python. 

5""" 

6 

7__all__ = ["BZ2File", "open"] 

8 

9__author__ = "Nadeem Vawda <nadeem.vawda@gmail.com>" 

10 

11import io 

12import sys 

13import warnings 

14 

15try: 

16 from threading import RLock 

17except ImportError: 

18 from dummy_threading import RLock 

19 

20from bz2 import BZ2Compressor, BZ2Decompressor 

21 

22 

# Internal stream states stored in BZ2File._mode.
_MODE_CLOSED = 0
_MODE_READ = 1
_MODE_READ_EOF = 2
_MODE_WRITE = 3

# Number of compressed bytes read from the underlying file per read call.
_BUFFER_SIZE = 8192

# Types accepted as a filename: on Python 2 (where str is bytes) this is
# (str, unicode); on Python 3 it is (str, bytes). The unused branch is never
# evaluated, so the name `unicode` is only looked up on Python 2.
_STR_TYPES = (str, unicode) if (str is bytes) else (str, bytes)

# The 'x' mode for open() was introduced in Python 3.3.
_HAS_OPEN_X_MODE = sys.version_info[:2] >= (3, 3)

# Keep a reference to the built-in open(), which the module-level open()
# defined below shadows.
_builtin_open = open

36 

37 

class BZ2File(io.BufferedIOBase):

    """A file object providing transparent bzip2 (de)compression.

    A BZ2File can act as a wrapper for an existing file object, or refer
    directly to a named file on disk.

    Note that BZ2File provides a *binary* file interface - data read is
    returned as bytes, and data to be written should be given as bytes.
    """

    def __init__(self, filename, mode="r", buffering=None, compresslevel=9):
        """Open a bzip2-compressed file.

        If filename is a str, bytes or unicode object, it gives the name
        of the file to be opened. Otherwise, it should be a file object,
        which will be used to read or write the compressed data.

        mode can be 'r' for reading (default), 'w' for (over)writing,
        'x' for creating exclusively, or 'a' for appending. These can
        equivalently be given as 'rb', 'wb', 'xb', and 'ab'.

        buffering is ignored. Its use is deprecated.

        If mode is 'w', 'x' or 'a', compresslevel can be a number between 1
        and 9 specifying the level of compression: 1 produces the least
        compression, and 9 (default) produces the most compression.

        If mode is 'r', the input file may be the concatenation of
        multiple compressed streams.
        """
        # This lock must be recursive, so that BufferedIOBase's
        # readline(), readlines() and writelines() don't deadlock.
        self._lock = RLock()
        self._fp = None
        self._closefp = False
        self._mode = _MODE_CLOSED
        self._pos = 0           # Uncompressed stream position.
        self._size = -1         # Uncompressed size; -1 until EOF is reached.

        if buffering is not None:
            warnings.warn("Use of 'buffering' argument is deprecated",
                          DeprecationWarning)

        if not (1 <= compresslevel <= 9):
            raise ValueError("compresslevel must be between 1 and 9")

        if mode in ("", "r", "rb"):
            mode = "rb"
            mode_code = _MODE_READ
            self._decompressor = BZ2Decompressor()
            self._buffer = b""
            self._buffer_offset = 0
        elif mode in ("w", "wb"):
            mode = "wb"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        elif mode in ("x", "xb") and _HAS_OPEN_X_MODE:
            mode = "xb"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        elif mode in ("a", "ab"):
            mode = "ab"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        else:
            raise ValueError("Invalid mode: %r" % (mode,))

        if isinstance(filename, _STR_TYPES):
            self._fp = _builtin_open(filename, mode)
            self._closefp = True
            self._mode = mode_code
        elif hasattr(filename, "read") or hasattr(filename, "write"):
            # An existing file object; the caller retains ownership, so it is
            # not closed by close() (self._closefp stays False).
            self._fp = filename
            self._mode = mode_code
        else:
            raise TypeError("filename must be a %s or %s object, or a file" %
                            (_STR_TYPES[0].__name__, _STR_TYPES[1].__name__))

    def close(self):
        """Flush and close the file.

        May be called more than once without error. Once the file is
        closed, any other operation on it will raise a ValueError.
        """
        with self._lock:
            if self._mode == _MODE_CLOSED:
                return
            try:
                if self._mode in (_MODE_READ, _MODE_READ_EOF):
                    self._decompressor = None
                elif self._mode == _MODE_WRITE:
                    self._fp.write(self._compressor.flush())
                    self._compressor = None
            finally:
                # Even if flushing fails, make a best effort to close the
                # underlying file and to mark this object as closed.
                try:
                    if self._closefp:
                        self._fp.close()
                finally:
                    self._fp = None
                    self._closefp = False
                    self._mode = _MODE_CLOSED
                    self._buffer = b""
                    self._buffer_offset = 0

    @property
    def closed(self):
        """True if this file is closed."""
        return self._mode == _MODE_CLOSED

    def fileno(self):
        """Return the file descriptor for the underlying file."""
        self._check_not_closed()
        return self._fp.fileno()

    def seekable(self):
        """Return whether the file supports seeking."""
        # Fall back to probing for a seek() method when the wrapped object
        # does not implement the io seekable() protocol.
        return self.readable() and (self._fp.seekable()
                                    if hasattr(self._fp, "seekable")
                                    else hasattr(self._fp, "seek"))

    def readable(self):
        """Return whether the file was opened for reading."""
        self._check_not_closed()
        return self._mode in (_MODE_READ, _MODE_READ_EOF)

    def writable(self):
        """Return whether the file was opened for writing."""
        self._check_not_closed()
        return self._mode == _MODE_WRITE

    # Mode-checking helper functions.

    def _check_not_closed(self):
        # Raise ValueError if the file has been closed.
        if self.closed:
            raise ValueError("I/O operation on closed file")

    def _check_can_read(self):
        if self._mode not in (_MODE_READ, _MODE_READ_EOF):
            self._check_not_closed()
            raise io.UnsupportedOperation("File not open for reading")

    def _check_can_write(self):
        if self._mode != _MODE_WRITE:
            self._check_not_closed()
            raise io.UnsupportedOperation("File not open for writing")

    def _check_can_seek(self):
        if self._mode not in (_MODE_READ, _MODE_READ_EOF):
            self._check_not_closed()
            raise io.UnsupportedOperation("Seeking is only supported "
                                          "on files open for reading")
        if hasattr(self._fp, "seekable") and not self._fp.seekable():
            raise io.UnsupportedOperation("The underlying file object "
                                          "does not support seeking")

    # Fill the readahead buffer if it is empty. Returns False on EOF.
    def _fill_buffer(self):
        if self._mode == _MODE_READ_EOF:
            return False
        # Depending on the input data, our call to the decompressor may not
        # return any data. In this case, try again after reading another block.
        while self._buffer_offset == len(self._buffer):
            # Leftover bytes from a finished stream are consumed before more
            # data is read from the file (multi-stream support).
            rawblock = (self._decompressor.unused_data or
                        self._fp.read(_BUFFER_SIZE))

            if not rawblock:
                try:
                    self._decompressor.decompress(b"")
                except EOFError:
                    # End-of-stream marker and end of file. We're good.
                    self._mode = _MODE_READ_EOF
                    self._size = self._pos
                    return False
                else:
                    # Problem - we were expecting more compressed data.
                    raise EOFError("Compressed file ended before the "
                                   "end-of-stream marker was reached")

            try:
                self._buffer = self._decompressor.decompress(rawblock)
            except EOFError:
                # Continue to next stream.
                self._decompressor = BZ2Decompressor()
                try:
                    self._buffer = self._decompressor.decompress(rawblock)
                except IOError:
                    # Trailing data isn't a valid bzip2 stream. We're done here.
                    self._mode = _MODE_READ_EOF
                    self._size = self._pos
                    return False
        self._buffer_offset = 0
        return True

    # Read data until EOF.
    # If return_data is false, consume the data without returning it.
    def _read_all(self, return_data=True):
        # The loop assumes that _buffer_offset is 0. Ensure that this is true.
        self._buffer = self._buffer[self._buffer_offset:]
        self._buffer_offset = 0

        blocks = []
        while self._fill_buffer():
            if return_data:
                blocks.append(self._buffer)
            self._pos += len(self._buffer)
            self._buffer = b""
        if return_data:
            return b"".join(blocks)

    # Read a block of up to n bytes.
    # If return_data is false, consume the data without returning it.
    def _read_block(self, n, return_data=True):
        # If we have enough data buffered, return immediately.
        end = self._buffer_offset + n
        if end <= len(self._buffer):
            data = self._buffer[self._buffer_offset : end]
            self._buffer_offset = end
            self._pos += len(data)
            return data if return_data else None

        # The loop assumes that _buffer_offset is 0. Ensure that this is true.
        self._buffer = self._buffer[self._buffer_offset:]
        self._buffer_offset = 0

        blocks = []
        while n > 0 and self._fill_buffer():
            if n < len(self._buffer):
                data = self._buffer[:n]
                self._buffer_offset = n
            else:
                data = self._buffer
                self._buffer = b""
            if return_data:
                blocks.append(data)
            self._pos += len(data)
            n -= len(data)
        if return_data:
            return b"".join(blocks)

    def peek(self, n=0):
        """Return buffered data without advancing the file position.

        Always returns at least one byte of data, unless at EOF.
        The exact number of bytes returned is unspecified.
        """
        with self._lock:
            self._check_can_read()
            if not self._fill_buffer():
                return b""
            return self._buffer[self._buffer_offset:]

    def read(self, size=-1):
        """Read up to size uncompressed bytes from the file.

        If size is negative or omitted, read until EOF is reached.
        Returns b'' if the file is already at EOF.
        """
        if size is None:
            raise TypeError()
        with self._lock:
            self._check_can_read()
            if size == 0:
                return b""
            elif size < 0:
                return self._read_all()
            else:
                return self._read_block(size)

    def read1(self, size=-1):
        """Read up to size uncompressed bytes, while trying to avoid
        making multiple reads from the underlying stream.

        Returns b'' if the file is at EOF.
        """
        # Usually, read1() calls _fp.read() at most once. However, sometimes
        # this does not give enough data for the decompressor to make progress.
        # In this case we make multiple reads, to avoid returning b"".
        with self._lock:
            self._check_can_read()
            if (size == 0 or
                # Only call _fill_buffer() if the buffer is actually empty.
                # This gives a significant speedup if *size* is small.
                (self._buffer_offset == len(self._buffer) and not self._fill_buffer())):
                return b""
            if size > 0:
                data = self._buffer[self._buffer_offset :
                                    self._buffer_offset + size]
                self._buffer_offset += len(data)
            else:
                # size < 0: return everything currently buffered.
                data = self._buffer[self._buffer_offset:]
                self._buffer = b""
                self._buffer_offset = 0
            self._pos += len(data)
            return data

    def readinto(self, b):
        """Read up to len(b) bytes into b.

        Returns the number of bytes read (0 for EOF).
        """
        with self._lock:
            return io.BufferedIOBase.readinto(self, b)

    def readline(self, size=-1):
        """Read a line of uncompressed bytes from the file.

        The terminating newline (if present) is retained. If size is
        non-negative, no more than size bytes will be read (in which
        case the line may be incomplete). Returns b'' if already at EOF.
        """
        if not isinstance(size, int):
            if not hasattr(size, "__index__"):
                raise TypeError("Integer argument expected")
            size = size.__index__()
        with self._lock:
            self._check_can_read()
            # Shortcut for the common case - the whole line is in the buffer.
            if size < 0:
                end = self._buffer.find(b"\n", self._buffer_offset) + 1
                if end > 0:
                    line = self._buffer[self._buffer_offset : end]
                    self._buffer_offset = end
                    self._pos += len(line)
                    return line
            return io.BufferedIOBase.readline(self, size)

    def readlines(self, size=-1):
        """Read a list of lines of uncompressed bytes from the file.

        size can be specified to control the number of lines read: no
        further lines will be read once the total size of the lines read
        so far equals or exceeds size.
        """
        if not isinstance(size, int):
            if not hasattr(size, "__index__"):
                raise TypeError("Integer argument expected")
            size = size.__index__()
        with self._lock:
            return io.BufferedIOBase.readlines(self, size)

    def write(self, data):
        """Write a byte string to the file.

        Returns the number of uncompressed bytes written, which is
        always len(data). Note that due to buffering, the file on disk
        may not reflect the data written until close() is called.
        """
        with self._lock:
            self._check_can_write()
            compressed = self._compressor.compress(data)
            self._fp.write(compressed)
            self._pos += len(data)
            return len(data)

    def writelines(self, seq):
        """Write a sequence of byte strings to the file.

        Returns the number of uncompressed bytes written.
        seq can be any iterable yielding byte strings.

        Line separators are not added between the written byte strings.
        """
        with self._lock:
            return io.BufferedIOBase.writelines(self, seq)

    # Rewind the file to the beginning of the data stream.
    def _rewind(self):
        self._fp.seek(0, 0)
        self._mode = _MODE_READ
        self._pos = 0
        self._decompressor = BZ2Decompressor()
        self._buffer = b""
        self._buffer_offset = 0

    def seek(self, offset, whence=0):
        """Change the file position.

        The new position is specified by offset, relative to the
        position indicated by whence. Values for whence are:

        0: start of stream (default); offset must not be negative
        1: current stream position
        2: end of stream; offset must not be positive

        Returns the new file position.

        Note that seeking is emulated, so depending on the parameters,
        this operation may be extremely slow.
        """
        with self._lock:
            self._check_can_seek()

            # Recalculate offset as an absolute file position.
            if whence == 0:
                pass
            elif whence == 1:
                offset = self._pos + offset
            elif whence == 2:
                # Seeking relative to EOF - we need to know the file's size.
                if self._size < 0:
                    self._read_all(return_data=False)
                offset = self._size + offset
            else:
                raise ValueError("Invalid value for whence: %s" % (whence,))

            # Make it so that offset is the number of bytes to skip forward.
            if offset < self._pos:
                # Seeking backwards requires restarting decompression from
                # the beginning of the compressed stream.
                self._rewind()
            else:
                offset -= self._pos

            # Read and discard data until we reach the desired position.
            self._read_block(offset, return_data=False)

            return self._pos

    def tell(self):
        """Return the current file position."""
        with self._lock:
            self._check_not_closed()
            return self._pos

460 

461 

def open(filename, mode="rb", compresslevel=9,
         encoding=None, errors=None, newline=None):
    """Open a bzip2-compressed file in binary or text mode.

    The filename argument can be an actual filename (a str, bytes or unicode
    object), or an existing file object to read from or write to.

    The mode argument can be "r", "rb", "w", "wb", "x", "xb", "a" or
    "ab" for binary mode, or "rt", "wt", "xt" or "at" for text mode.
    The default mode is "rb", and the default compresslevel is 9.

    For binary mode, this function is equivalent to the BZ2File
    constructor: BZ2File(filename, mode, compresslevel). In this case,
    the encoding, errors and newline arguments must not be provided.

    For text mode, a BZ2File object is created, and wrapped in an
    io.TextIOWrapper instance with the specified encoding, error
    handling behavior, and line ending(s).

    """
    text_mode = "t" in mode
    if text_mode:
        # "t" and "b" are mutually exclusive.
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
    else:
        # The text-handling arguments are only meaningful in text mode.
        for arg_name, arg_value in (("encoding", encoding),
                                    ("errors", errors),
                                    ("newline", newline)):
            if arg_value is not None:
                raise ValueError("Argument '%s' not supported in binary mode"
                                 % arg_name)

    # BZ2File only understands binary modes; strip the "t" before passing on.
    binary_file = BZ2File(filename, mode.replace("t", ""),
                          compresslevel=compresslevel)

    if text_mode:
        return io.TextIOWrapper(binary_file, encoding, errors, newline)
    return binary_file