Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/generic/

1import codecs

2from typing import Dict, List, Tuple, Union

4from .._codecs import _pdfdoc_encoding

5from .._utils import StreamType, logger_warning, read_non_whitespace

6from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfStreamError

7from ._base import ByteStringObject, TextStringObject

def hex_to_rgb(value: str) -> Tuple[float, float, float]:
    """
    Convert a hexadecimal color string into normalized RGB components.

    Args:
        value: Six hexadecimal digits, optionally preceded by ``#``
            characters (all leading ``#`` are stripped).

    Returns:
        The red, green and blue components, each divided by 255.0.

    Raises:
        ValueError: If any two-character slice is not valid hexadecimal
            (this includes inputs that are too short).

    """
    digits = value.lstrip("#")
    red, green, blue = (
        int(digits[start : start + 2], 16) / 255.0 for start in (0, 2, 4)
    )
    return red, green, blue

def read_hex_string_from_stream(
    stream: StreamType,
    forced_encoding: Union[None, str, List[str], Dict[int, str]] = None,
) -> Union["TextStringObject", "ByteStringObject"]:
    """
    Read a hexadecimal string from a stream and wrap the decoded bytes.

    Non-whitespace tokens are collected two at a time and each pair is
    parsed as one base-16 byte value until a ``>`` token is found. A
    trailing unpaired digit is padded with ``0``.

    Args:
        stream: The stream to read from. Its first byte is read and
            discarded (presumably the opening ``<`` -- confirm with callers).
        forced_encoding: Passed through to ``create_string_object``.

    Returns:
        Whatever ``create_string_object`` builds from the decoded bytes.

    Raises:
        PdfStreamError: If the stream ends before ``>`` is found.
        ValueError: Propagated from ``int`` when a pair is not hexadecimal.

    """
    stream.read(1)
    byte_values = []
    pair = b""
    # iter() stops as soon as the closing ">" is returned by the reader.
    for digit in iter(lambda: read_non_whitespace(stream), b">"):
        if not digit:
            raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
        pair += digit
        if len(pair) == 2:
            byte_values.append(int(pair, base=16))
            pair = b""
    # An odd number of digits: the missing low-order digit counts as zero.
    if len(pair) == 1:
        pair += b"0"
    if pair != b"":
        byte_values.append(int(pair, base=16))
    return create_string_object(bytes(byte_values), forced_encoding)

# Escape sequences recognised after a backslash inside a literal string: the
# key is the byte that follows the backslash, the value is the byte code that
# is emitted in its place.
__ESCAPE_DICT__ = {
    # Escapes that stand for control characters.
    b"n": 0x0A,  # line feed
    b"r": 0x0D,  # carriage return
    b"t": 0x09,  # horizontal tab
    b"b": 0x08,  # backspace
    b"f": 0x0C,  # form feed
    # Characters that stand for themselves when escaped.
    **{bytes((code,)): code for code in b"()/\\ %<>[]#_&$"},
}

# Byte value of the backslash character.
__BACKSLASH_CODE__ = ord(b"\\")

def read_string_from_stream(
    stream: StreamType,
    forced_encoding: Union[None, str, List[str], Dict[int, str]] = None,
) -> Union["TextStringObject", "ByteStringObject"]:
    """
    Read a parenthesised literal string from a stream.

    Bytes are consumed one at a time until the parenthesis that balances
    the opening one is reached. Nested parentheses are kept in the result,
    and backslash escapes are translated.

    Args:
        stream: The stream to read from. Its first byte is read and
            discarded (presumably the opening ``(`` -- confirm with callers).
        forced_encoding: Passed through to ``create_string_object``.

    Returns:
        Whatever ``create_string_object`` builds from the collected bytes.

    Raises:
        PdfStreamError: If the stream ends before the string is closed.

    """
    stream.read(1)
    depth = 1
    collected = []
    while True:
        char = stream.read(1)
        if not char:
            raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
        if char == b"(":
            depth += 1
        elif char == b")":
            depth -= 1
            if depth == 0:
                break
        elif char == b"\\":
            char = stream.read(1)
            escaped = __ESCAPE_DICT__.get(char)
            if escaped is not None:
                collected.append(escaped)
                continue
            if b"0" <= char <= b"7":
                # "The number ddd may consist of one, two, or three
                # octal digits; high-order overflow shall be ignored.
                # Three octal digits shall be used, with leading zeros
                # as needed, if the next character of the string is also
                # a digit." (PDF reference 7.3.4.2, p 16)
                first_digit_pos = stream.tell() - 1
                for _ in range(2):
                    following = stream.read(1)
                    if not (b"0" <= following <= b"7"):
                        # Not an octal digit: step back so the main loop
                        # analyzes it.
                        stream.seek(-1, 1)
                        break
                    char += following
                octal_value = int(char, base=8)
                if octal_value > 255:
                    # Does not fit in one byte: keep the backslash and
                    # rewind so the digits are re-read as plain characters.
                    collected.append(__BACKSLASH_CODE__)
                    stream.seek(first_digit_pos)
                else:
                    collected.append(octal_value)
                continue
            if char in b"\n\r":
                # A backslash followed by a line break. If it's a
                # multi-char EOL, consume the second character.
                # NOTE(review): any second "\n" or "\r" is swallowed, not
                # only the "\n" of a "\r\n" pair -- confirm this is intended.
                char = stream.read(1)
                if char not in b"\n\r":
                    stream.seek(-1, 1)
                # The escaped line break contributes nothing to the string.
                continue
            # Unknown escape: warn, then keep both the backslash and the
            # character that followed it.
            msg = f"Unexpected escaped string: {char.decode('utf-8', 'ignore')}"
            logger_warning(msg, __name__)
            collected.append(__BACKSLASH_CODE__)
        collected.append(ord(char))
    return create_string_object(bytes(collected), forced_encoding)

121

122

def create_string_object(
    string: Union[str, bytes],
    forced_encoding: Union[None, str, List[str], Dict[int, str]] = None,
) -> Union[TextStringObject, ByteStringObject]:
    """
    Wrap a string or byte sequence in a TextStringObject or ByteStringObject.

    Args:
        string: The data being wrapped.
        forced_encoding: ``None`` to auto-detect the encoding, the literal
            ``"bytes"`` to keep the raw bytes, a codec name to decode with,
            or a list/dict translating each byte value to text.

    Returns:
        A TextStringObject when ``string`` is a ``str`` or could be decoded;
        a ByteStringObject when ``forced_encoding`` is ``"bytes"`` or when
        auto-detection raises UnicodeDecodeError.

    Raises:
        TypeError: If string is not of type str or bytes.

    """
    if isinstance(string, str):
        return TextStringObject(string)
    if not isinstance(string, bytes):
        raise TypeError("create_string_object should have str or unicode arg")

    if isinstance(forced_encoding, (list, dict)):
        # Per-byte translation table; any failure for a byte falls back to
        # decoding that single byte with "charmap".
        decoded = ""
        for code in string:
            try:
                decoded += forced_encoding[code]
            except Exception:
                decoded += bytes((code,)).decode("charmap")
        text_obj = TextStringObject(decoded)
        text_obj._original_bytes = string
        return text_obj

    if isinstance(forced_encoding, str):
        if forced_encoding == "bytes":
            return ByteStringObject(string)
        text_obj = TextStringObject(string.decode(forced_encoding))
        text_obj._original_bytes = string
        return text_obj

    # No encoding forced: try UTF-16 (by BOM, then by the position of a
    # leading NUL byte), then PDFDocEncoding, else keep the raw bytes.
    try:
        if string.startswith((codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE)):
            codec, bom = "utf-16", string[:2]
        elif string.startswith(b"\x00"):
            codec, bom = "utf-16be", codecs.BOM_UTF16_BE
        elif string[1:2] == b"\x00":
            codec, bom = "utf-16le", codecs.BOM_UTF16_LE
        else:
            codec, bom = None, None

        if codec is not None:
            text_obj = TextStringObject(string.decode(codec))
            text_obj._original_bytes = string
            text_obj.autodetect_utf16 = True
            text_obj.utf16_bom = bom
            return text_obj

        # Some strings are text and some are just byte arrays; the only way
        # to find out whether a text-aware object is possible is to try.
        text_obj = TextStringObject(decode_pdfdocencoding(string))
        text_obj._original_bytes = string
        text_obj.autodetect_pdfdocencoding = True
        return text_obj
    except UnicodeDecodeError:
        return ByteStringObject(string)

193

194

def decode_pdfdocencoding(byte_array: bytes) -> str:
    """
    Decode bytes to text through the ``_pdfdoc_encoding`` translation table.

    Args:
        byte_array: The bytes to decode; each byte value is used as an
            index into ``_pdfdoc_encoding``.

    Returns:
        The decoded text.

    Raises:
        UnicodeDecodeError: If a byte maps to ``"\\u0000"`` in the table,
            which is treated as "no translation available". The error
            carries the input and the position of the offending byte.

    """
    chars = []
    for position, code in enumerate(byte_array):
        char = _pdfdoc_encoding[code]
        if char == "\u0000":
            # Report the real input and offset: bytearray(<int>) would
            # build a zero-filled buffer of that length instead.
            raise UnicodeDecodeError(
                "pdfdocencoding",
                bytes(byte_array),
                position,
                position + 1,
                "does not exist in translation table",
            )
        chars.append(char)
    return "".join(chars)

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/generic/_utils.py: 12%

127 statements