Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/generic/_utils.py

1import codecs

2from typing import Union

4from .._codecs import _pdfdoc_encoding

5from .._utils import StreamType, logger_warning, read_non_whitespace

6from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfStreamError

7from ._base import ByteStringObject, TextStringObject

def hex_to_rgb(value: str) -> tuple[float, float, float]:
    """
    Convert a six-digit hexadecimal colour string into RGB fractions.

    Args:
        value: Hex colour such as ``"#336699"``; any leading ``#``
            characters are stripped before parsing.

    Returns:
        The red, green and blue components, each divided by 255.0.

    Raises:
        ValueError: If one of the three two-character slices is not a
            valid base-16 number (for example when the string is too short).

    """
    digits = value.lstrip("#")
    red, green, blue = (
        int(digits[start : start + 2], 16) / 255.0 for start in (0, 2, 4)
    )
    return red, green, blue

def read_hex_string_from_stream(
    stream: StreamType,
    forced_encoding: Union[None, str, list[str], dict[int, str]] = None,
) -> Union["TextStringObject", "ByteStringObject"]:
    """
    Read a hexadecimal string from ``stream`` and build a string object.

    One byte is consumed up front (presumably the opening ``<`` -- confirm
    against the caller), then tokens returned by ``read_non_whitespace`` are
    paired up and converted from base 16 until ``>`` is read. A trailing
    unpaired digit is padded on the right with ``0``.

    Args:
        stream: Source to read from.
        forced_encoding: Passed through unchanged to ``create_string_object``.

    Returns:
        Whatever ``create_string_object`` returns for the decoded bytes.

    Raises:
        PdfStreamError: If the stream ends before ``>`` is read.

    """
    stream.read(1)
    byte_values = []
    pending = b""
    while True:
        char = read_non_whitespace(stream)
        if not char:
            raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
        if char == b">":
            break
        pending += char
        if len(pending) == 2:
            byte_values.append(int(pending, base=16))
            pending = b""
    if pending != b"":
        # A single leftover digit is treated as the high nibble: pad with "0".
        byte_values.append(int(pending.ljust(2, b"0"), base=16))
    return create_string_object(bytes(byte_values), forced_encoding)

# Maps the single byte that follows a backslash to the byte value that is
# appended to the decoded string. The first five entries are control
# characters; every other entry maps a character to its own code.
__ESCAPE_DICT__ = {
    b"n": 0x0A,  # line feed
    b"r": 0x0D,  # carriage return
    b"t": 0x09,  # horizontal tab
    b"b": 0x08,  # backspace
    b"f": 0x0C,  # form feed
    **{bytes((code,)): code for code in b"()/\\ %<>[]#_&$"},
}
# Byte value of the backslash character itself.
__BACKSLASH_CODE__ = ord(b"\\")

def read_string_from_stream(
    stream: StreamType,
    forced_encoding: Union[None, str, list[str], dict[int, str]] = None,
) -> Union["TextStringObject", "ByteStringObject"]:
    """
    Read a parenthesised literal string from ``stream``.

    One byte is consumed up front (presumably the opening ``(`` -- confirm
    against the caller). Bytes are then collected until the matching ``)``
    is found; nested, unescaped parentheses are kept and balanced.

    Backslash handling:

    * a byte listed in ``__ESCAPE_DICT__`` is replaced by its mapped value;
    * one to three octal digits are converted to a single byte; if the
      value exceeds 255 a literal backslash is emitted instead and the
      digits are re-read as ordinary characters;
    * a backslash followed by an end-of-line marker (CR, LF or CR LF) is a
      line continuation and contributes nothing to the string;
    * anything else is logged and kept verbatim, backslash included.

    Args:
        stream: Seekable source to read from.
        forced_encoding: Passed through unchanged to ``create_string_object``.

    Returns:
        Whatever ``create_string_object`` returns for the collected bytes.

    Raises:
        PdfStreamError: If the stream ends before the closing parenthesis.

    """
    stream.read(1)
    parens = 1
    txt = []
    while True:
        tok = stream.read(1)
        if not tok:
            raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
        if tok == b"(":
            parens += 1
        elif tok == b")":
            parens -= 1
            if parens == 0:
                break
        elif tok == b"\\":
            tok = stream.read(1)
            if not tok:
                # Nothing follows the backslash: the string cannot be closed.
                raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
            escaped = __ESCAPE_DICT__.get(tok)
            if escaped is not None:
                txt.append(escaped)
                continue
            if b"0" <= tok <= b"7":
                # "The number ddd may consist of one, two, or three
                # octal digits; high-order overflow shall be ignored.
                # Three octal digits shall be used, with leading zeros
                # as needed, if the next character of the string is also
                # a digit." (PDF reference 7.3.4.2, p 16)
                sav = stream.tell() - 1
                for _ in range(2):
                    ntok = stream.read(1)
                    if b"0" <= ntok <= b"7":
                        tok += ntok
                    else:
                        # ntok has to be analyzed by the main loop; only
                        # step back if a byte was actually read (not at EOF).
                        if ntok:
                            stream.seek(-1, 1)
                        break
                i = int(tok, base=8)
                if i > 255:
                    # Does not fit in a byte: keep the backslash and rewind
                    # so the digits are read again as plain characters.
                    txt.append(__BACKSLASH_CODE__)
                    stream.seek(sav)
                else:
                    txt.append(i)
                continue
            if tok in (b"\n", b"\r"):
                # Backslash followed by a line break. Only CR LF is a
                # two-byte end-of-line marker, so a second byte is consumed
                # solely when a CR is immediately followed by an LF; any
                # other following byte belongs to the string.
                if tok == b"\r":
                    ntok = stream.read(1)
                    if ntok and ntok != b"\n":
                        stream.seek(-1, 1)
                # Don't add anything to the actual string, since this
                # line break was escaped:
                continue
            logger_warning(
                "Unexpected escaped string: %(token)s",
                source=__name__,
                token=tok.decode("utf-8", "ignore"),
            )
            txt.append(__BACKSLASH_CODE__)
        txt.append(ord(tok))
    return create_string_object(bytes(txt), forced_encoding)

124

125

def create_string_object(
    string: Union[str, bytes],
    forced_encoding: Union[None, str, list[str], dict[int, str]] = None,
) -> Union[TextStringObject, ByteStringObject]:
    """
    Wrap ``string`` in a TextStringObject or a ByteStringObject.

    Args:
        string: The data being used
        forced_encoding: Typically None. A list or dict is used as a
            per-byte lookup table, ``"bytes"`` forces a ByteStringObject,
            and any other string is used as the codec name for decoding.

    Returns:
        A TextStringObject when the input is text or can be decoded,
        otherwise a ByteStringObject holding the raw bytes.

    Raises:
        TypeError: If string is not of type str or bytes.

    """
    if isinstance(string, str):
        return TextStringObject(string)
    if not isinstance(string, bytes):
        raise TypeError("create_string_object should have str or unicode arg")

    if isinstance(forced_encoding, (list, dict)):
        # Translate byte by byte; anything the table cannot provide falls
        # back to the "charmap" decoding of that single byte.
        decoded = ""
        for byte_value in string:
            try:
                decoded += forced_encoding[byte_value]
            except Exception:
                decoded += bytes((byte_value,)).decode("charmap")
        text_obj = TextStringObject(decoded)
        text_obj._original_bytes = string
        return text_obj

    if isinstance(forced_encoding, str):
        if forced_encoding == "bytes":
            return ByteStringObject(string)
        text_obj = TextStringObject(string.decode(forced_encoding))
        text_obj._original_bytes = string
        return text_obj

    try:
        # Pick a UTF-16 flavour from an explicit BOM, or from a zero byte
        # in the first or second position.
        if string.startswith((codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE)):
            codec, bom = "utf-16", string[:2]
        elif string.startswith(b"\x00"):
            codec, bom = "utf-16be", codecs.BOM_UTF16_BE
        elif string[1:2] == b"\x00":
            codec, bom = "utf-16le", codecs.BOM_UTF16_LE
        else:
            codec = bom = None

        if codec is not None:
            text_obj = TextStringObject(string.decode(codec))
            text_obj._original_bytes = string
            text_obj.autodetect_utf16 = True
            text_obj.utf16_bom = bom
            return text_obj

        # This is probably a big performance hit here, but we need
        # to convert string objects into the text/unicode-aware
        # version if possible... and the only way to check if that's
        # possible is to try.
        # Some strings are strings, some are just byte arrays.
        text_obj = TextStringObject(decode_pdfdocencoding(string))
        text_obj._original_bytes = string
        text_obj.autodetect_pdfdocencoding = True
        return text_obj
    except UnicodeDecodeError:
        # Not decodable as text: hand the raw bytes back untouched.
        return ByteStringObject(string)

196

197

def decode_pdfdocencoding(byte_array: bytes) -> str:
    """
    Decode ``byte_array`` through the ``_pdfdoc_encoding`` lookup table.

    Args:
        byte_array: The bytes to translate, one table lookup per byte.

    Returns:
        The concatenation of the table entries for every byte.

    Raises:
        UnicodeDecodeError: If a byte maps to ``"\\u0000"`` in the table,
            which marks it as having no translation.

    """
    chars = []
    for b in byte_array:
        c = _pdfdoc_encoding[b]
        if c == "\u0000":
            raise UnicodeDecodeError(
                "pdfdocencoding",
                # One-byte buffer holding the offending byte. bytearray(b)
                # with an int would instead allocate b zero bytes.
                bytearray((b,)),
                -1,
                -1,
                "does not exist in translation table",
            )
        chars.append(c)
    return "".join(chars)

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/generic/_utils.py: 48%

126 statements