Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/unblob/handlers/compression/lz4.py: 98%

1"""LZ4 handler.

3Frame format definition: https://github.com/lz4/lz4/blob/dev/doc/lz4_Frame_format.md.

4"""

6import io

8from lz4.block import LZ4BlockError, decompress

9from structlog import get_logger

11from unblob.extractors import Command

13from ...file_utils import Endian, InvalidInputFormat, convert_int8, convert_int32

14from ...models import (

15 File,

16 Handler,

17 HandlerDoc,

18 HandlerType,

19 HexString,

20 Reference,

21 ValidChunk,

22)

24logger = get_logger()

# Frame magic numbers, expressed as the integer obtained by decoding the
# 4-byte magic as a little-endian value.
FRAME_MAGIC = 0x184D2204
LEGACY_FRAME_MAGIC = 0x184C2102
# Sixteen consecutive magics, 0x184D2A50 through 0x184D2A5F, identify skippable frames.
SKIPPABLE_FRAMES_MAGIC = list(range(0x184D2A50, 0x184D2A50 + 16))
# Every magic that can start a frame: skippable ones first, then regular and legacy.
FRAME_MAGICS = [*SKIPPABLE_FRAMES_MAGIC, FRAME_MAGIC, LEGACY_FRAME_MAGIC]

# Masks that keep the lowest one / two bits of an integer.
_1BIT = 0b1
_2BITS = 0b11

END_MARK = 0x00000000

# Field widths, in bytes.
CONTENT_SIZE_LEN = 8
MAGIC_LEN = 4
BLOCK_SIZE_LEN = 4
FRAME_SIZE_LEN = 4
BLOCK_CHECKSUM_LEN = 4
CONTENT_CHECKSUM_LEN = 4
DICTID_LEN = 4
FLG_LEN = 1
BD_LEN = 1
HC_LEN = 1

MAX_LEGACY_BLOCK_SIZE = 8 << 20  # 8 MB

class FLG:
    """Decoded view of a raw FLG value.

    Each attribute holds the integer value of one bit group taken from the
    raw integer passed to the constructor.
    """

    version: int = 0
    block_independence: int = 0
    block_checksum: int = 0
    content_size: int = 0
    content_checksum: int = 0
    dictid: int = 0

    def __init__(self, raw_flg: int):
        """Split ``raw_flg`` into its individual fields.

        Bits 7-6 form the version; bits 5, 4, 3, 2 and 0 are single-bit flags.
        Bit 1 is not read.
        """

        def single_bit(position: int) -> int:
            # Value (0 or 1) of the bit at ``position``, counting from the LSB.
            return (raw_flg >> position) & _1BIT

        self.version = (raw_flg >> 6) & _2BITS
        self.block_independence = single_bit(5)
        self.block_checksum = single_bit(4)
        self.content_size = single_bit(3)
        self.content_checksum = single_bit(2)
        self.dictid = single_bit(0)

    def as_dict(self) -> dict:
        """Return the decoded fields as a dict keyed by attribute name."""
        field_names = (
            "version",
            "block_independence",
            "block_checksum",
            "content_size",
            "content_checksum",
            "dictid",
        )
        return {name: getattr(self, name) for name in field_names}

class _LZ4HandlerBase(Handler):
    """Shared base class for the LZ4 handlers defined in this module."""

    # Command line for the external ``lz4`` tool.
    # NOTE(review): the ``{inpath}`` / ``{outdir}`` placeholders are presumably
    # substituted by ``Command`` at extraction time -- confirm against its docs.
    EXTRACTOR = Command("lz4", "--decompress", "{inpath}", "{outdir}/lz4.uncompressed")

    def _skip_magic_bytes(self, file: File):
        """Move ``file`` forward by ``MAGIC_LEN`` bytes from its current position."""
        file.seek(MAGIC_LEN, io.SEEK_CUR)

class LegacyFrameHandler(_LZ4HandlerBase):
    """Handler that computes the extent of an LZ4 legacy frame."""

    NAME = "lz4_legacy"
    PATTERNS = [HexString("02 21 4C 18")]

    DOC = HandlerDoc(
        name="LZ4 (legacy)",
        description="LZ4 legacy format is an older framing format used prior to the LZ4 Frame specification, featuring a simpler structure and no support for skippable frames or extensive metadata. Unlike the default LZ4 Frame format, it lacks built-in checksums, versioning, or block independence flags, making it less robust and primarily used for backward compatibility.",
        handler_type=HandlerType.COMPRESSION,
        vendor=None,
        references=[
            Reference(
                title="LZ4 Frame Format Documentation",
                url="https://github.com/lz4/lz4/blob/dev/doc/lz4_Frame_format.md",
            ),
            Reference(
                title="LZ4 Wikipedia",
                url="https://en.wikipedia.org/wiki/LZ4_(compression_algorithm)",
            ),
        ],
        limitations=[],
    )

    def calculate_chunk(self, file: File, start_offset: int) -> ValidChunk | None:
        """Walk the blocks of a legacy frame and return the chunk it occupies.

        ``file`` is expected to be positioned on the frame magic. Blocks are
        consumed one at a time until one of these ends the frame:

        * nothing is left to read (end of file),
        * the next 4-byte value is a known frame magic rather than a block size,
        * a block decompresses to fewer than ``MAX_LEGACY_BLOCK_SIZE`` bytes.

        Returns:
            A ``ValidChunk`` running from ``start_offset`` to the file position
            reached when the walk stopped.

        Raises:
            InvalidInputFormat: when a block cannot be decompressed.
        """
        self._skip_magic_bytes(file)

        # An empty read means end of file, which also terminates the frame.
        while size_field := file.read(BLOCK_SIZE_LEN):
            compressed_len = convert_int32(size_field, Endian.LITTLE)
            if compressed_len in FRAME_MAGICS:
                # These bytes are the magic of the following frame, not a block
                # size: rewind so they are left out of this chunk.
                file.seek(-BLOCK_SIZE_LEN, io.SEEK_CUR)
                break

            payload = file.read(compressed_len)
            try:
                block = decompress(payload, MAX_LEGACY_BLOCK_SIZE)
            except LZ4BlockError:
                raise InvalidInputFormat("Invalid LZ4 legacy frame.") from None

            # Legacy frames use a fixed block size, so only the final block may
            # come out shorter. See 'fixed block size' in
            # https://android.googlesource.com/platform/external/lz4/+/HEAD/doc/lz4_Frame_format.md#legacy-frame
            if len(block) < MAX_LEGACY_BLOCK_SIZE:
                break

        return ValidChunk(start_offset=start_offset, end_offset=file.tell())

132

133

134class SkippableFrameHandler(_LZ4HandlerBase):

135 """Can be anything, basically uncompressed data."""

136

137 NAME = "lz4_skippable"

138 PATTERNS = [HexString("5? 2A 4D 18")]

139

140 DOC = HandlerDoc(

141 name="LZ4 (skippable)",

142 description="LZ4 skippable format is designed to encapsulate arbitrary data within an LZ4 stream allowing compliant parsers to skip over it safely. This format does not contain compressed data itself but is often used for embedding metadata or non-LZ4 content alongside standard frames.",

143 handler_type=HandlerType.COMPRESSION,

144 vendor=None,

145 references=[

146 Reference(

147 title="LZ4 Frame Format Documentation",

148 url="https://github.com/lz4/lz4/blob/dev/doc/lz4_Frame_format.md",

149 ),

150 Reference(

151 title="LZ4 Wikipedia",

152 url="https://en.wikipedia.org/wiki/LZ4_(compression_algorithm)",

153 ),

154 ],