Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/bz2file.py: 60%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

272 statements  

1"""Module for reading and writing bzip2-compressed files. 

2 

3This module contains a backport of Python 3.4's bz2.open() function and 

4BZ2File class, adapted to work with earlier versions of Python. 

5""" 

6 

7__all__ = ["BZ2File", "open"] 

8 

9__author__ = "Nadeem Vawda <nadeem.vawda@gmail.com>" 

10 

11import io 

12import sys 

13import warnings 

14 

15try: 

16 from threading import RLock 

17except ImportError: 

18 from dummy_threading import RLock 

19 

20from bz2 import BZ2Compressor, BZ2Decompressor 

21 

22 

# Internal stream states stored in BZ2File._mode.
_MODE_CLOSED = 0
_MODE_READ = 1
_MODE_READ_EOF = 2
_MODE_WRITE = 3

# Number of compressed bytes read from the underlying file per read call.
_BUFFER_SIZE = 8192

# Types accepted as a filename: on Python 2 (where str is bytes) this is
# (str, unicode); on Python 3 it is (str, bytes). The unused branch is never
# evaluated, so the name `unicode` is only looked up on Python 2.
_STR_TYPES = (str, unicode) if (str is bytes) else (str, bytes)

# The 'x' mode for open() was introduced in Python 3.3.
_HAS_OPEN_X_MODE = sys.version_info[:2] >= (3, 3)

# Keep a reference to the built-in open(), which the module-level open()
# defined below shadows.
_builtin_open = open

36 

37 

class BZ2File(io.BufferedIOBase):

    """A file object providing transparent bzip2 (de)compression.

    A BZ2File can act as a wrapper for an existing file object, or refer
    directly to a named file on disk.

    Note that BZ2File provides a *binary* file interface - data read is
    returned as bytes, and data to be written should be given as bytes.
    """

    def __init__(self, filename, mode="r", buffering=None, compresslevel=9):
        """Open a bzip2-compressed file.

        If filename is a str, bytes or unicode object, it gives the name
        of the file to be opened. Otherwise, it should be a file object,
        which will be used to read or write the compressed data.

        mode can be 'r' for reading (default), 'w' for (over)writing,
        'x' for creating exclusively, or 'a' for appending. These can
        equivalently be given as 'rb', 'wb', 'xb', and 'ab'.

        buffering is ignored. Its use is deprecated.

        If mode is 'w', 'x' or 'a', compresslevel can be a number between 1
        and 9 specifying the level of compression: 1 produces the least
        compression, and 9 (default) produces the most compression.

        If mode is 'r', the input file may be the concatenation of
        multiple compressed streams.
        """
        # This lock must be recursive, so that BufferedIOBase's
        # readline(), readlines() and writelines() don't deadlock.
        self._lock = RLock()
        self._fp = None
        self._closefp = False
        self._mode = _MODE_CLOSED
        self._pos = 0           # Uncompressed stream position.
        self._size = -1         # Uncompressed size; -1 until EOF is reached.

        if buffering is not None:
            warnings.warn("Use of 'buffering' argument is deprecated",
                          DeprecationWarning)

        if not (1 <= compresslevel <= 9):
            raise ValueError("compresslevel must be between 1 and 9")

        if mode in ("", "r", "rb"):
            mode = "rb"
            mode_code = _MODE_READ
            self._decompressor = BZ2Decompressor()
            self._buffer = b""
            self._buffer_offset = 0
        elif mode in ("w", "wb"):
            mode = "wb"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        elif mode in ("x", "xb") and _HAS_OPEN_X_MODE:
            mode = "xb"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        elif mode in ("a", "ab"):
            mode = "ab"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        else:
            raise ValueError("Invalid mode: %r" % (mode,))

        if isinstance(filename, _STR_TYPES):
            self._fp = _builtin_open(filename, mode)
            self._closefp = True
            self._mode = mode_code
        elif hasattr(filename, "read") or hasattr(filename, "write"):
            # An existing file object; the caller retains ownership, so it is
            # not closed by close() (self._closefp stays False).
            self._fp = filename
            self._mode = mode_code
        else:
            raise TypeError("filename must be a %s or %s object, or a file" %
                            (_STR_TYPES[0].__name__, _STR_TYPES[1].__name__))

    def close(self):
        """Flush and close the file.

        May be called more than once without error. Once the file is
        closed, any other operation on it will raise a ValueError.
        """
        with self._lock:
            if self._mode == _MODE_CLOSED:
                return
            try:
                if self._mode in (_MODE_READ, _MODE_READ_EOF):
                    self._decompressor = None
                elif self._mode == _MODE_WRITE:
                    self._fp.write(self._compressor.flush())
                    self._compressor = None
            finally:
                # Even if flushing fails, make a best effort to close the
                # underlying file and to mark this object as closed.
                try:
                    if self._closefp:
                        self._fp.close()
                finally:
                    self._fp = None
                    self._closefp = False
                    self._mode = _MODE_CLOSED
                    self._buffer = b""
                    self._buffer_offset = 0

    @property
    def closed(self):
        """True if this file is closed."""
        return self._mode == _MODE_CLOSED

    def fileno(self):
        """Return the file descriptor for the underlying file."""
        self._check_not_closed()
        return self._fp.fileno()

    def seekable(self):
        """Return whether the file supports seeking."""
        # Fall back to probing for a seek() method when the wrapped object
        # does not implement the io seekable() protocol.
        return self.readable() and (self._fp.seekable()
                                    if hasattr(self._fp, "seekable")
                                    else hasattr(self._fp, "seek"))

    def readable(self):
        """Return whether the file was opened for reading."""
        self._check_not_closed()
        return self._mode in (_MODE_READ, _MODE_READ_EOF)

    def writable(self):
        """Return whether the file was opened for writing."""
        self._check_not_closed()
        return self._mode == _MODE_WRITE

    # Mode-checking helper functions.

    def _check_not_closed(self):
        # Raise ValueError if the file has been closed.
        if self.closed:
            raise ValueError("I/O operation on closed file")

    def _check_can_read(self):
        if self._mode not in (_MODE_READ, _MODE_READ_EOF):
            self._check_not_closed()
            raise io.UnsupportedOperation("File not open for reading")

    def _check_can_write(self):
        if self._mode != _MODE_WRITE:
            self._check_not_closed()
            raise io.UnsupportedOperation("File not open for writing")

    def _check_can_seek(self):
        if self._mode not in (_MODE_READ, _MODE_READ_EOF):
            self._check_not_closed()
            raise io.UnsupportedOperation("Seeking is only supported "
                                          "on files open for reading")
        if hasattr(self._fp, "seekable") and not self._fp.seekable():
            raise io.UnsupportedOperation("The underlying file object "
                                          "does not support seeking")

    # Fill the readahead buffer if it is empty. Returns False on EOF.
    def _fill_buffer(self):
        if self._mode == _MODE_READ_EOF:
            return False
        # Depending on the input data, our call to the decompressor may not
        # return any data. In this case, try again after reading another block.
        while self._buffer_offset == len(self._buffer):
            # Leftover bytes from a finished stream are consumed before more
            # data is read from the file (multi-stream support).
            rawblock = (self._decompressor.unused_data or
                        self._fp.read(_BUFFER_SIZE))

            if not rawblock:
                try:
                    self._decompressor.decompress(b"")
                except EOFError:
                    # End-of-stream marker and end of file. We're good.
                    self._mode = _MODE_READ_EOF
                    self._size = self._pos
                    return False
                else:
                    # Problem - we were expecting more compressed data.
                    raise EOFError("Compressed file ended before the "
                                   "end-of-stream marker was reached")

            try:
                self._buffer = self._decompressor.decompress(rawblock)
            except EOFError:
                # Continue to next stream.
                self._decompressor = BZ2Decompressor()
                try:
                    self._buffer = self._decompressor.decompress(rawblock)
                except IOError:
                    # Trailing data isn't a valid bzip2 stream. We're done here.
                    self._mode = _MODE_READ_EOF
                    self._size = self._pos
                    return False
        self._buffer_offset = 0
        return True

    # Read data until EOF.
    # If return_data is false, consume the data without returning it.
    def _read_all(self, return_data=True):
        # The loop assumes that _buffer_offset is 0. Ensure that this is true.
        self._buffer = self._buffer[self._buffer_offset:]
        self._buffer_offset = 0

        blocks = []
        while self._fill_buffer():
            if return_data:
                blocks.append(self._buffer)
            self._pos += len(self._buffer)
            self._buffer = b""
        if return_data:
            return b"".join(blocks)

    # Read a block of up to n bytes.
    # If return_data is false, consume the data without returning it.
    def _read_block(self, n, return_data=True):
        # If we have enough data buffered, return immediately.
        end = self._buffer_offset + n
        if end <= len(self._buffer):
            data = self._buffer[self._buffer_offset : end]
            self._buffer_offset = end
            self._pos += len(data)
            return data if return_data else None

        # The loop assumes that _buffer_offset is 0. Ensure that this is true.
        self._buffer = self._buffer[self._buffer_offset:]
        self._buffer_offset = 0

        blocks = []
        while n > 0 and self._fill_buffer():
            if n < len(self._buffer):
                data = self._buffer[:n]
                self._buffer_offset = n
            else:
                data = self._buffer
                self._buffer = b""
            if return_data:
                blocks.append(data)
            self._pos += len(data)
            n -= len(data)
        if return_data:
            return b"".join(blocks)

    def peek(self, n=0):
        """Return buffered data without advancing the file position.

        Always returns at least one byte of data, unless at EOF.
        The exact number of bytes returned is unspecified.
        """
        with self._lock:
            self._check_can_read()
            if not self._fill_buffer():
                return b""
            return self._buffer[self._buffer_offset:]

    def read(self, size=-1):
        """Read up to size uncompressed bytes from the file.

        If size is negative or omitted, read until EOF is reached.
        Returns b'' if the file is already at EOF.
        """
        if size is None:
            raise TypeError()
        with self._lock:
            self._check_can_read()
            if size == 0:
                return b""
            elif size < 0:
                return self._read_all()
            else:
                return self._read_block(size)

    def read1(self, size=-1):
        """Read up to size uncompressed bytes, while trying to avoid
        making multiple reads from the underlying stream.

        Returns b'' if the file is at EOF.
        """
        # Usually, read1() calls _fp.read() at most once. However, sometimes
        # this does not give enough data for the decompressor to make progress.
        # In this case we make multiple reads, to avoid returning b"".
        with self._lock:
            self._check_can_read()
            if (size == 0 or
                # Only call _fill_buffer() if the buffer is actually empty.
                # This gives a significant speedup if *size* is small.
                (self._buffer_offset == len(self._buffer) and not self._fill_buffer())):
                return b""
            if size > 0:
                data = self._buffer[self._buffer_offset :
                                    self._buffer_offset + size]
                self._buffer_offset += len(data)
            else:
                # size < 0: return everything currently buffered.
                data = self._buffer[self._buffer_offset:]
                self._buffer = b""
                self._buffer_offset = 0
            self._pos += len(data)
            return data

    def readinto(self, b):
        """Read up to len(b) bytes into b.

        Returns the number of bytes read (0 for EOF).
        """
        with self._lock:
            return io.BufferedIOBase.readinto(self, b)

    def readline(self, size=-1):
        """Read a line of uncompressed bytes from the file.

        The terminating newline (if present) is retained. If size is
        non-negative, no more than size bytes will be read (in which
        case the line may be incomplete). Returns b'' if already at EOF.
        """
        if not isinstance(size, int):
            if not hasattr(size, "__index__"):
                raise TypeError("Integer argument expected")
            size = size.__index__()
        with self._lock:
            self._check_can_read()
            # Shortcut for the common case - the whole line is in the buffer.
            if size < 0:
                end = self._buffer.find(b"\n", self._buffer_offset) + 1
                if end > 0:
                    line = self._buffer[self._buffer_offset : end]
                    self._buffer_offset = end
                    self._pos += len(line)
                    return line
            return io.BufferedIOBase.readline(self, size)

    def readlines(self, size=-1):
        """Read a list of lines of uncompressed bytes from the file.

        size can be specified to control the number of lines read: no
        further lines will be read once the total size of the lines read
        so far equals or exceeds size.
        """
        if not isinstance(size, int):
            if not hasattr(size, "__index__"):
                raise TypeError("Integer argument expected")
            size = size.__index__()
        with self._lock:
            return io.BufferedIOBase.readlines(self, size)

    def write(self, data):
        """Write a byte string to the file.

        Returns the number of uncompressed bytes written, which is
        always len(data). Note that due to buffering, the file on disk
        may not reflect the data written until close() is called.
        """
        with self._lock:
            self._check_can_write()
            compressed = self._compressor.compress(data)
            self._fp.write(compressed)
            self._pos += len(data)
            return len(data)

    def writelines(self, seq):
        """Write a sequence of byte strings to the file.

        Returns the number of uncompressed bytes written.
        seq can be any iterable yielding byte strings.

        Line separators are not added between the written byte strings.
        """
        with self._lock:
            return io.BufferedIOBase.writelines(self, seq)

    # Rewind the file to the beginning of the data stream.
    def _rewind(self):
        self._fp.seek(0, 0)
        self._mode = _MODE_READ
        self._pos = 0
        self._decompressor = BZ2Decompressor()
        self._buffer = b""
        self._buffer_offset = 0

    def seek(self, offset, whence=0):
        """Change the file position.

        The new position is specified by offset, relative to the
        position indicated by whence. Values for whence are:

        0: start of stream (default); offset must not be negative
        1: current stream position
        2: end of stream; offset must not be positive

        Returns the new file position.

        Note that seeking is emulated, so depending on the parameters,
        this operation may be extremely slow.
        """
        with self._lock:
            self._check_can_seek()

            # Recalculate offset as an absolute file position.
            if whence == 0:
                pass
            elif whence == 1:
                offset = self._pos + offset
            elif whence == 2:
                # Seeking relative to EOF - we need to know the file's size.
                if self._size < 0:
                    self._read_all(return_data=False)
                offset = self._size + offset
            else:
                raise ValueError("Invalid value for whence: %s" % (whence,))

            # Make it so that offset is the number of bytes to skip forward.
            if offset < self._pos:
                # Seeking backwards requires restarting decompression from
                # the beginning of the compressed stream.
                self._rewind()
            else:
                offset -= self._pos

            # Read and discard data until we reach the desired position.
            self._read_block(offset, return_data=False)

            return self._pos

    def tell(self):
        """Return the current file position."""
        with self._lock:
            self._check_not_closed()
            return self._pos

460 

461 

def open(filename, mode="rb", compresslevel=9,
         encoding=None, errors=None, newline=None):
    """Open a bzip2-compressed file in binary or text mode.

    The filename argument can be an actual filename (a str, bytes or unicode
    object), or an existing file object to read from or write to.

    The mode argument can be "r", "rb", "w", "wb", "x", "xb", "a" or
    "ab" for binary mode, or "rt", "wt", "xt" or "at" for text mode.
    The default mode is "rb", and the default compresslevel is 9.

    For binary mode, this function is equivalent to the BZ2File
    constructor: BZ2File(filename, mode, compresslevel). In this case,
    the encoding, errors and newline arguments must not be provided.

    For text mode, a BZ2File object is created, and wrapped in an
    io.TextIOWrapper instance with the specified encoding, error
    handling behavior, and line ending(s).

    """
    text_mode = "t" in mode
    if text_mode:
        # "t" and "b" are mutually exclusive.
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
    else:
        # The text-handling arguments are only meaningful in text mode.
        for arg_name, arg_value in (("encoding", encoding),
                                    ("errors", errors),
                                    ("newline", newline)):
            if arg_value is not None:
                raise ValueError("Argument '%s' not supported in binary mode"
                                 % arg_name)

    # BZ2File only understands binary modes; strip the "t" before passing on.
    binary_file = BZ2File(filename, mode.replace("t", ""),
                          compresslevel=compresslevel)

    if text_mode:
        return io.TextIOWrapper(binary_file, encoding, errors, newline)
    return binary_file