"""Unix compress'ed chunk identification.

We identify the end offset of a unix compress'ed chunk by performing
Lempel-Ziv-Welch decompression on the stream, starting from the
identified start offset and ending at the end of the whole file being
analyzed.

If we reach an invalid code, or the stream ends in the middle of a
code, we do not recursively retry the decompression with a size
reduced by one; we just fail the chunk, as we have seen too many false
positives picked up by that heuristic.

If the decompression procedure completes without errors, we have a
valid chunk and can return its current end offset.

We use a small heuristic to return the right end offset: based on the
last decoded code, we decide whether the final byte read still belongs
to the compressed stream. This heuristic tends to work well when the
arbitrary data appended after the stream is made of random bytes (no
repeating letters, no large runs of ASCII letters).

It can obviously be wrong from time to time: we get a compress'ed
chunk that we can decompress, but whose uncompressed data contains
garbage bytes at the end.

Sadly, there is no way to identify the end offset of a compress'ed
stream with byte precision and 100% certainty if it is followed by
other content.

The good news is that, because of this behavior, it is highly unlikely
we will observe compress'ed chunks followed by other chunks in the
wild. The only ones I observed were followed by null byte sentinels,
which helps identify the exact end offset.
"""

import io
from typing import Optional

from structlog import get_logger

from unblob.extractors import Command

from ...file_utils import Endian, InvalidInputFormat, convert_int8, convert_int16
from ...models import (
    File,
    HandlerDoc,
    HandlerType,
    HexString,
    Reference,
    StructHandler,
    ValidChunk,
)

logger = get_logger()


class UnixCompressHandler(StructHandler):
    NAME = "compress"

    PATTERNS = [
        # reference: https://fuchsia.googlesource.com/third_party/wuffs/+/HEAD/std/lzw/README.md
        HexString("1f 9d")
    ]

    C_DEFINITIONS = r"""
        struct compress_header {
            char magic[2];      // compress signature/magic number
            uint8 flags;        // blocks = flags&0x80, bits = flags&0x1f
        };
    """
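    # Example flags byte, purely for illustration (not part of the handler):
    # a flags value of 0x90 means block compression is enabled
    # (0x90 & 0x80) with a maximum code width of 16 bits
    # (0x90 & 0x1f == 0x10).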
    HEADER_STRUCT = "compress_header"

    EXTRACTOR = Command("7z", "x", "-y", "{inpath}", "-so", stdout="lzw.uncompressed")

    DOC = HandlerDoc(
        name=NAME,
        description="Unix compress files use the Lempel-Ziv-Welch (LZW) algorithm for data compression and are identified by a 2-byte magic number (0x1F 0x9D). This format supports optional block compression and variable bit lengths ranging from 9 to 16 bits.",
        handler_type=HandlerType.COMPRESSION,
        vendor=None,
        references=[
            Reference(
                title="Unix Compress File Format Documentation",
                url="https://fuchsia.googlesource.com/third_party/wuffs/+/HEAD/std/lzw/README.md",
            ),
            Reference(
                title="LZW Compression Algorithm",
                url="https://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Welch",
            ),
        ],
        limitations=[],
    )

    def unlzw(self, file: File, start_offset: int, max_len: int) -> int:  # noqa: C901
        """Calculate the end of a unix compress stream.

        It performs decompression on a stream read from <file>, from
        <start_offset> up until <max_len>.

        Adapted from Brandon Owen's work
        (https://github.com/umeat/unlzw).

        Adapted from original work by Mark Adler - original copyright
        notice below.

        Copyright (C) 2014, 2015 Mark Adler. This software is provided
        'as-is', without any express or implied warranty. In no event
        will the authors be held liable for any damages arising from
        the use of this software. Permission is granted to anyone to
        use this software for any purpose, including commercial
        applications, and to alter it and redistribute it freely,
        subject to the following restrictions:

        1. The origin of this software must not be misrepresented;
           you must not claim that you wrote the original
           software. If you use this software in a product, an
           acknowledgment in the product documentation would be
           appreciated but is not required.

        2. Altered source versions must be plainly marked as such,
           and must not be misrepresented as being the original
           software.

        3. This notice may not be removed or altered from any
           source distribution. Mark Adler
           madler@alumni.caltech.edu
        """
        file.seek(start_offset)

        prefix: list[int] = [0] * 65536  # index to LZW prefix string
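        # 65536 entries are enough for the largest possible table at the
        # 16-bit maximum code width.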

        header = self.parse_header(file, Endian.LITTLE)

        if header.flags & 0x60:
            raise InvalidInputFormat("Flag & 0x60")

        max_ = header.flags & 0x1F
        if not (9 <= max_ <= 16):
            raise InvalidInputFormat("Invalid max")

        if max_ == 9:
            max_ = 10  # 9 doesn't really mean 9

        block_compressed = header.flags & 0x80
        end = 256 if block_compressed else 255
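        # (with block compression, code 256 is reserved as the clear code,
        # so the first free table slot is 257)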

        # Clear table, start at nine bits per symbol
        bits_per_symbol = 9
        mask = 0x1FF
        code = 0

        # Set up: get the first 9-bit code, which is the first decompressed byte,
        # but don't create a table entry until the next code

        buf = convert_int16(file.read(2), Endian.LITTLE)
        prev = buf & mask  # code
        buf >>= bits_per_symbol
        left = 16 - bits_per_symbol
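        # The table is still empty at this point, so the first code can only
        # be a literal byte (0-255).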
        if prev > 255:
            raise InvalidInputFormat("Invalid Data: First code must be a literal")

        # Decode codes
        mark = 3  # start of compressed data
        nxt = 5  # consumed five bytes so far
        while nxt < max_len:
            # If the table will be full after this, increment the code size
            if (end >= mask) and (bits_per_symbol < max_):
                # Flush unused input bits and bytes to next 8*bits bit boundary
                # (this is a vestigial aspect of the compressed data format
                # derived from an implementation that made use of a special VAX
                # machine instruction!)
                remaining_bits = (nxt - mark) % bits_per_symbol

                if remaining_bits:
                    remaining_bits = bits_per_symbol - remaining_bits
                    if remaining_bits >= max_len - nxt:
                        break
                    nxt += remaining_bits

                buf = left = 0

                # mark this new location for computing the next flush
                mark = nxt

                # increment the number of bits per symbol
                bits_per_symbol += 1
                mask <<= 1
                mask += 1

            # Get a code of bits_per_symbol bits
            buf += convert_int8(file.read(1), Endian.LITTLE) << left
            nxt += 1
            left += 8
            if left < bits_per_symbol:
                if nxt == max_len:
                    raise InvalidInputFormat(
                        "Invalid Data: Stream ended in the middle of a code",
                    )
                buf += convert_int8(file.read(1), Endian.LITTLE) << left
                nxt += 1

                left += 8
            code = buf & mask
            buf >>= bits_per_symbol
            left -= bits_per_symbol

            # Process clear code (256)
            if (code == 256) and block_compressed:
                # Flush unused input bits and bytes to next 8*bits bit boundary
                remaining_bits = (nxt - mark) % bits_per_symbol
                if remaining_bits:
                    remaining_bits = bits_per_symbol - remaining_bits
                    if remaining_bits > max_len - nxt:
                        break
                    nxt += remaining_bits
                buf = left = 0

                # Mark this location for computing the next flush
                mark = nxt

                # Go back to nine bits per symbol
                bits_per_symbol = 9  # initialize bits and mask
                mask = 0x1FF
                end = 255  # empty table
                continue  # get next code

            # Process LZW code
            temp = code  # save the current code

            # Special code to reuse last match
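            # (the LZW "KwKwK" special case: the encoder may emit a code one
            # past the current table end, which stands for the previous match
            # followed by its own first byte)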
            if code > end:
                # Be picky on the allowed code here, and make sure that the
                # code we drop through (prev) will be a valid index so that
                # random input does not cause an exception
                if (code != end + 1) or (prev > end):
                    raise InvalidInputFormat("Invalid Data: Invalid code detected")
                code = prev

            # Walk through the linked list; a real decompressor would generate
            # the output in reverse order here, we only validate the chain
            while code >= 256:
                code = prefix[code]

            # Link new table entry
            if end < mask:
                end += 1
                prefix[end] = prev

            # Set previous code for next iteration
            prev = temp

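            # End-of-stream heuristic from the module docstring: based on the
            # last decoded code, decide whether the final byte read still
            # belongs to the compressed stream.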
            if code == nxt - 1:
                return file.tell()

        return file.tell() - 1

    def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk]:
        file.seek(0, io.SEEK_END)
        max_len = file.tell()

        end_offset = self.unlzw(file, start_offset, max_len)

        chunk_length = end_offset - start_offset
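        # Five bytes only cover the 3-byte header plus the first 16-bit read,
        # leaving no room for actual compressed data.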
        if chunk_length <= 5:
            raise InvalidInputFormat("Compressed chunk is too short")

        return ValidChunk(
            start_offset=start_offset,
            end_offset=end_offset,
        )
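

# A minimal usage sketch (assumptions: that unblob's File type exposes a
# from_bytes() constructor, as its test suite uses; adjust to the real API
# if it differs):
#
#   handler = UnixCompressHandler()
#   file = File.from_bytes(data_starting_with_1f_9d)
#   chunk = handler.calculate_chunk(file, start_offset=0)
#   # chunk.end_offset is the heuristically detected end of the stream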