Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/generic/_utils.py: 48%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

126 statements  

1import codecs 

2from typing import Union 

3 

4from .._codecs import _pdfdoc_encoding 

5from .._utils import StreamType, logger_warning, read_non_whitespace 

6from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfStreamError 

7from ._base import ByteStringObject, TextStringObject 

8 

9 

def hex_to_rgb(value: str) -> tuple[float, float, float]:
    """
    Convert a hexadecimal colour string into normalised RGB components.

    Args:
        value: Six hexadecimal digits, optionally preceded by ``#``
            characters (all leading ``#`` are stripped).

    Returns:
        The red, green and blue channels, each divided by 255.0.

    Raises:
        ValueError: If any two-character slice is not valid hexadecimal
            (including when fewer than six digits are supplied).

    """
    digits = value.lstrip("#")
    # Slice the digit string into the three two-character channel fields.
    red, green, blue = (
        int(digits[start : start + 2], 16) / 255.0 for start in (0, 2, 4)
    )
    return red, green, blue

12 

13 

def read_hex_string_from_stream(
    stream: StreamType,
    forced_encoding: Union[None, str, list[str], dict[int, str]] = None,
) -> Union["TextStringObject", "ByteStringObject"]:
    """
    Read a hexadecimal string (terminated by ``>``) from the stream.

    One byte is read and discarded first (presumably the opening ``<``;
    the value is not checked here). Tokens returned by
    ``read_non_whitespace`` are then paired up and each pair is parsed
    as one base-16 byte. A trailing unpaired digit is padded with ``0``.

    Args:
        stream: The stream to read from.
        forced_encoding: Passed through unchanged to
            ``create_string_object``.

    Returns:
        Whatever ``create_string_object`` builds from the decoded bytes.

    Raises:
        PdfStreamError: If the stream ends before ``>`` is found.

    """
    stream.read(1)
    byte_values = []
    pending = b""
    # iter() with a sentinel stops as soon as the closing ">" is read.
    for digit in iter(lambda: read_non_whitespace(stream), b">"):
        if not digit:
            raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
        pending += digit
        if len(pending) == 2:
            byte_values.append(int(pending, base=16))
            pending = b""
    # An odd number of digits: the missing low nibble is taken as zero.
    if len(pending) == 1:
        pending += b"0"
    if pending:
        byte_values.append(int(pending, base=16))
    return create_string_object(bytes(byte_values), forced_encoding)

36 

37 

# Maps the byte that follows a backslash inside a literal string to the
# character code it stands for.
__ESCAPE_DICT__ = {
    # Escapes that denote a control character.
    b"n": 0x0A,  # line feed
    b"r": 0x0D,  # carriage return
    b"t": 0x09,  # horizontal tab
    b"b": 0x08,  # backspace
    b"f": 0x0C,  # form feed
    # Escapes that simply stand for the escaped character itself.
    **{bytes((code,)): code for code in b"()/\\ %<>[]#_&$"},
}
# Character code of the backslash itself.
__BACKSLASH_CODE__ = ord(b"\\")

60 

61 

def read_string_from_stream(
    stream: StreamType,
    forced_encoding: Union[None, str, list[str], dict[int, str]] = None,
) -> Union["TextStringObject", "ByteStringObject"]:
    """
    Read a parenthesised literal string from the stream.

    One byte is read and discarded first (presumably the opening ``(``;
    the value is not checked here). Reading continues until the matching
    closing parenthesis; nested, unescaped parentheses are kept as part
    of the string. Backslash escapes are resolved as follows:

    * a byte listed in ``__ESCAPE_DICT__`` yields the mapped code;
    * one to three octal digits yield that value, unless it exceeds 255,
      in which case a literal backslash is emitted and the digits are
      re-read as ordinary characters;
    * a line break (optionally followed by a second ``\\n``/``\\r``) is
      dropped entirely;
    * anything else triggers a warning and is kept as backslash plus the
      character.

    Args:
        stream: The seekable stream to read from.
        forced_encoding: Passed through unchanged to
            ``create_string_object``.

    Returns:
        Whatever ``create_string_object`` builds from the collected bytes.

    Raises:
        PdfStreamError: If the stream ends before the string is closed.

    """
    stream.read(1)
    depth = 1
    codes = []
    while True:
        char = stream.read(1)
        if not char:
            raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
        if char == b"(":
            depth += 1
        elif char == b")":
            depth -= 1
            if depth == 0:
                break
        elif char == b"\\":
            char = stream.read(1)
            mapped = __ESCAPE_DICT__.get(char)
            if mapped is not None:
                codes.append(mapped)
                continue
            if b"0" <= char <= b"7":
                # "The number ddd may consist of one, two, or three
                # octal digits; high-order overflow shall be ignored.
                # Three octal digits shall be used, with leading zeros
                # as needed, if the next character of the string is also
                # a digit." (PDF reference 7.3.4.2, p 16)
                first_digit_pos = stream.tell() - 1
                for _ in range(2):
                    following = stream.read(1)
                    if not (b"0" <= following <= b"7"):
                        # Not part of the escape: step back so the main
                        # loop analyses this byte.
                        stream.seek(-1, 1)
                        break
                    char += following
                value = int(char, base=8)
                if value <= 255:
                    codes.append(value)
                else:
                    # Does not fit in one byte: keep the backslash and
                    # rewind so the digits are read as plain characters.
                    codes.append(__BACKSLASH_CODE__)
                    stream.seek(first_digit_pos)
                continue
            if char in b"\n\r":
                # Escaped line break. If a second line-break character
                # follows, consume it as well; otherwise put it back.
                char = stream.read(1)
                if char not in b"\n\r":
                    stream.seek(-1, 1)
                # Nothing is added to the string for an escaped line break.
                continue
            logger_warning(
                "Unexpected escaped string: %(token)s",
                source=__name__,
                token=char.decode("utf-8", "ignore"),
            )
            codes.append(__BACKSLASH_CODE__)
        codes.append(ord(char))
    return create_string_object(bytes(codes), forced_encoding)

124 

125 

def create_string_object(
    string: Union[str, bytes],
    forced_encoding: Union[None, str, list[str], dict[int, str]] = None,
) -> Union[TextStringObject, ByteStringObject]:
    """
    Wrap a ``str`` or ``bytes`` value in the matching PDF string object.

    Args:
        string: The data being wrapped.
        forced_encoding: How to interpret ``bytes`` input:

            * a list or dict: each byte is looked up in it; bytes whose
              lookup fails are decoded with the ``charmap`` codec;
            * ``"bytes"``: the data is kept as raw bytes;
            * any other string: used as the codec name for decoding;
            * ``None``: UTF-16 is tried when the data starts with a BOM
              or has a zero byte in first or second position, otherwise
              PDFDocEncoding is tried.

    Returns:
        A TextStringObject for ``str`` input and for ``bytes`` that could
        be decoded (its ``_original_bytes`` is set to the input); a
        ByteStringObject when ``forced_encoding`` is ``"bytes"`` or when
        auto-detected decoding raises ``UnicodeDecodeError``.

    Raises:
        TypeError: If string is not of type str or bytes.

    """
    if isinstance(string, str):
        return TextStringObject(string)
    if not isinstance(string, bytes):
        raise TypeError("create_string_object should have str or unicode arg")

    if isinstance(forced_encoding, (list, dict)):
        decoded = ""
        for code in string:
            try:
                decoded += forced_encoding[code]
            except Exception:
                # No usable entry for this byte: fall back to charmap.
                decoded += bytes((code,)).decode("charmap")
        text_obj = TextStringObject(decoded)
        text_obj._original_bytes = string
        return text_obj

    if isinstance(forced_encoding, str):
        if forced_encoding == "bytes":
            return ByteStringObject(string)
        text_obj = TextStringObject(string.decode(forced_encoding))
        text_obj._original_bytes = string
        return text_obj

    try:
        # Pick the UTF-16 flavour (if any) suggested by the first bytes.
        if string.startswith((codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE)):
            codec, bom = "utf-16", string[:2]
        elif string.startswith(b"\x00"):
            codec, bom = "utf-16be", codecs.BOM_UTF16_BE
        elif string[1:2] == b"\x00":
            codec, bom = "utf-16le", codecs.BOM_UTF16_LE
        else:
            codec, bom = None, None

        if codec is not None:
            text_obj = TextStringObject(string.decode(codec))
            text_obj._original_bytes = string
            text_obj.autodetect_utf16 = True
            text_obj.utf16_bom = bom
            return text_obj

        # Some strings are text, some are just byte arrays; the only way
        # to find out whether a text-aware object is possible is to try
        # the decoding.
        text_obj = TextStringObject(decode_pdfdocencoding(string))
        text_obj._original_bytes = string
        text_obj.autodetect_pdfdocencoding = True
        return text_obj
    except UnicodeDecodeError:
        return ByteStringObject(string)

196 

197 

def decode_pdfdocencoding(byte_array: bytes) -> str:
    """
    Decode bytes to text through the ``_pdfdoc_encoding`` lookup table.

    Args:
        byte_array: The bytes to decode.

    Returns:
        The decoded text, one character per input byte.

    Raises:
        UnicodeDecodeError: If a byte maps to ``"\\u0000"`` in the table.
            The exception carries the input data and the position of the
            offending byte.

    """
    chars = []
    for index, code in enumerate(byte_array):
        char = _pdfdoc_encoding[code]
        if char == "\u0000":
            # Report the real input and position. ``bytearray(code)`` on an
            # int would build ``code`` zero bytes, not the offending byte.
            raise UnicodeDecodeError(
                "pdfdocencoding",
                bytes(byte_array),
                index,
                index + 1,
                "does not exist in translation table",
            )
        chars.append(char)
    # Join once instead of repeated string concatenation.
    return "".join(chars)