Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/generic/_utils.py: 12%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

127 statements  

1import codecs 

2from typing import Dict, List, Tuple, Union 

3 

4from .._codecs import _pdfdoc_encoding 

5from .._utils import StreamType, logger_warning, read_non_whitespace 

6from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfStreamError 

7from ._base import ByteStringObject, TextStringObject 

8 

9 

def hex_to_rgb(value: str) -> Tuple[float, float, float]:
    """
    Convert a hexadecimal color string into an RGB triple.

    Args:
        value: Six hexadecimal digits, optionally preceded by ``#``
            characters (all leading ``#`` are stripped).

    Returns:
        The red, green and blue components, each divided by 255.0.

    Raises:
        ValueError: If one of the two-character slices is not a valid
            hexadecimal number (for example when the input is too short).
    """
    digits = value.lstrip("#")
    red, green, blue = (
        int(digits[start : start + 2], 16) / 255.0 for start in (0, 2, 4)
    )
    return red, green, blue

12 

13 

def read_hex_string_from_stream(
    stream: StreamType,
    forced_encoding: Union[None, str, List[str], Dict[int, str]] = None,
) -> Union["TextStringObject", "ByteStringObject"]:
    """
    Read a hexadecimal string from the stream and turn it into a string object.

    One byte is read and discarded first (presumably the opening ``<``).
    Non-whitespace tokens are then collected until ``>`` is met; every two
    collected hex digits form one byte. A single leftover digit is padded
    on the right with ``0``.

    Args:
        stream: The stream to read from.
        forced_encoding: Passed through unchanged to create_string_object.

    Returns:
        Whatever create_string_object builds from the decoded bytes.

    Raises:
        PdfStreamError: If the stream ends before ``>`` is found.
    """
    stream.read(1)  # discard the leading byte
    byte_values = []
    pending = b""
    while True:
        digit = read_non_whitespace(stream)
        if digit == b">":
            break
        if not digit:
            raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
        pending += digit
        if len(pending) == 2:
            byte_values.append(int(pending, base=16))
            pending = b""
    if pending:
        # An odd number of digits: the missing low nibble counts as zero.
        if len(pending) == 1:
            pending += b"0"
        byte_values.append(int(pending, base=16))
    return create_string_object(bytes(byte_values), forced_encoding)

36 

37 

# Maps the byte that follows a backslash inside a literal string to the
# byte value it stands for.
__ESCAPE_DICT__ = {
    # Escapes that stand for a control character.
    **{
        b"n": 0x0A,  # line feed
        b"r": 0x0D,  # carriage return
        b"t": 0x09,  # horizontal tab
        b"b": 0x08,  # backspace
        b"f": 0x0C,  # form feed
    },
    # Escapes that simply stand for the escaped character itself.
    **{bytes((code,)): code for code in b"()/\\ %<>[]#_&$"},
}
# Byte value of the backslash character.
__BACKSLASH_CODE__ = ord(b"\\")

60 

61 

def read_string_from_stream(
    stream: StreamType,
    forced_encoding: Union[None, str, List[str], Dict[int, str]] = None,
) -> Union["TextStringObject", "ByteStringObject"]:
    """
    Read a parenthesis-delimited literal string from the stream.

    One byte is read and discarded first (presumably the opening ``(``).
    Bytes are then collected until the matching ``)`` is found; nested,
    unescaped parentheses are kept and counted. Backslash escapes are
    resolved through ``__ESCAPE_DICT__``, as octal codes, or as escaped
    line breaks.

    Args:
        stream: The stream to read from; it must support ``tell``/``seek``.
        forced_encoding: Passed through unchanged to create_string_object.

    Returns:
        Whatever create_string_object builds from the collected bytes.

    Raises:
        PdfStreamError: If the stream ends before the closing parenthesis.
    """
    stream.read(1)  # discard the leading byte
    depth = 1
    collected = []
    while True:
        char = stream.read(1)
        if not char:
            raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
        if char == b"\\":
            char = stream.read(1)
            if char in __ESCAPE_DICT__:
                collected.append(__ESCAPE_DICT__[char])
                continue
            if b"0" <= char <= b"7":
                # "The number ddd may consist of one, two, or three
                # octal digits; high-order overflow shall be ignored.
                # Three octal digits shall be used, with leading zeros
                # as needed, if the next character of the string is also
                # a digit." (PDF reference 7.3.4.2, p 16)
                first_digit_pos = stream.tell() - 1
                for _ in range(2):
                    following = stream.read(1)
                    if not (b"0" <= following <= b"7"):
                        # Not an octal digit: push it back so the main
                        # loop analyzes it.
                        stream.seek(-1, 1)
                        break
                    char += following
                value = int(char, base=8)
                if value > 255:
                    # Does not fit in a byte: keep the backslash and rewind
                    # so the digits are read again as ordinary characters.
                    collected.append(__BACKSLASH_CODE__)
                    stream.seek(first_digit_pos)
                else:
                    collected.append(value)
                continue
            if char in b"\n\r":
                # A backslash followed by a line break. If a second
                # line-break character follows, consume it as well; the
                # escaped line break adds nothing to the string.
                # NOTE: an empty read (end of stream) also lands here since
                # b"" is contained in any bytes value; the next iteration
                # then raises the truncation error.
                char = stream.read(1)
                if char not in b"\n\r":
                    stream.seek(-1, 1)
                continue
            # Unknown escape: warn, then keep both the backslash and the
            # character that followed it.
            logger_warning(
                f"Unexpected escaped string: {char.decode('utf-8', 'ignore')}",
                __name__,
            )
            collected.append(__BACKSLASH_CODE__)
        elif char == b"(":
            depth += 1
        elif char == b")":
            depth -= 1
            if depth == 0:
                break
        collected.append(ord(char))
    return create_string_object(bytes(collected), forced_encoding)

121 

122 

def create_string_object(
    string: Union[str, bytes],
    forced_encoding: Union[None, str, List[str], Dict[int, str]] = None,
) -> Union[TextStringObject, ByteStringObject]:
    """
    Wrap string data in a TextStringObject or a ByteStringObject.

    Args:
        string: The data being used.
        forced_encoding: Typically None. A list or dict is used as a
            per-byte lookup table, the string "bytes" forces a
            ByteStringObject, and any other string is used as the codec
            name for decoding.

    Returns:
        A TextStringObject when the data is text or can be decoded,
        otherwise a ByteStringObject. Text objects created from bytes keep
        the raw input in ``_original_bytes``.

    Raises:
        TypeError: If string is not of type str or bytes.

    """
    if isinstance(string, str):
        return TextStringObject(string)
    if not isinstance(string, bytes):
        raise TypeError("create_string_object should have str or unicode arg")

    if isinstance(forced_encoding, (list, dict)):
        # Translate byte by byte; anything the table cannot provide is
        # decoded with the "charmap" codec instead.
        text = ""
        for code in string:
            try:
                text += forced_encoding[code]
            except Exception:
                text += bytes((code,)).decode("charmap")
        result = TextStringObject(text)
        result._original_bytes = string
        return result

    if isinstance(forced_encoding, str):
        if forced_encoding == "bytes":
            return ByteStringObject(string)
        result = TextStringObject(string.decode(forced_encoding))
        result._original_bytes = string
        return result

    # No encoding was forced: guess. Converting to the text-aware object
    # is attempted first, and the only way to know whether that is possible
    # is to try. Some strings are strings, some are just byte arrays.
    try:
        if string.startswith((codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE)):
            codec, bom = "utf-16", string[:2]
        elif string.startswith(b"\x00"):
            codec, bom = "utf-16be", codecs.BOM_UTF16_BE
        elif string[1:2] == b"\x00":
            codec, bom = "utf-16le", codecs.BOM_UTF16_LE
        else:
            codec, bom = None, None

        if codec is None:
            result = TextStringObject(decode_pdfdocencoding(string))
            result._original_bytes = string
            result.autodetect_pdfdocencoding = True
        else:
            result = TextStringObject(string.decode(codec))
            result._original_bytes = string
            result.autodetect_utf16 = True
            result.utf16_bom = bom
        return result
    except UnicodeDecodeError:
        return ByteStringObject(string)

193 

194 

def decode_pdfdocencoding(byte_array: bytes) -> str:
    """
    Decode bytes through the ``_pdfdoc_encoding`` translation table.

    Args:
        byte_array: The bytes to decode; each byte value is used as an
            index into ``_pdfdoc_encoding``.

    Returns:
        The decoded text.

    Raises:
        UnicodeDecodeError: If a byte maps to ``"\\u0000"`` in the table,
            which marks it as having no translation. The exception carries
            the input and the position of the offending byte.
    """
    chars = []
    for position, code in enumerate(byte_array):
        char = _pdfdoc_encoding[code]
        if char == "\u0000":
            # Report the real input and offset. bytearray(code) on an int
            # would only build a zero-filled buffer of that length.
            raise UnicodeDecodeError(
                "pdfdocencoding",
                bytes(byte_array),
                position,
                position + 1,
                "does not exist in translation table",
            )
        chars.append(char)
    return "".join(chars)

208 return retval