Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/

1import binascii

2from binascii import Error as BinasciiError

3from binascii import unhexlify

4from math import ceil

5from typing import Any, Union, cast

7from ._codecs import adobe_glyphs, charset_encoding

8from ._utils import logger_error, logger_warning

9from .generic import (

10 DecodedStreamObject,

11 DictionaryObject,

12 NullObject,

13 StreamObject,

14 is_null_or_none,

15)

17_predefined_cmap: dict[str, str] = {

18 "/Identity-H": "utf-16-be",

19 "/Identity-V": "utf-16-be",

20 "/GB-EUC-H": "gbk",

21 "/GB-EUC-V": "gbk",

22 "/GBpc-EUC-H": "gb2312",

23 "/GBpc-EUC-V": "gb2312",

24 "/GBK-EUC-H": "gbk",

25 "/GBK-EUC-V": "gbk",

26 "/GBK2K-H": "gb18030",

27 "/GBK2K-V": "gb18030",

28 "/ETen-B5-H": "cp950",

29 "/ETen-B5-V": "cp950",

30 "/ETenms-B5-H": "cp950",

31 "/ETenms-B5-V": "cp950",

32 "/UniCNS-UTF16-H": "utf-16-be",

33 "/UniCNS-UTF16-V": "utf-16-be",

34 "/UniGB-UTF16-H": "gb18030",

35 "/UniGB-UTF16-V": "gb18030",

36 # UCS2 in code

37}

def get_encoding(
    ft: DictionaryObject
) -> tuple[Union[str, dict[int, str]], dict[Any, Any]]:
    """
    Compute the encoding and the character map of a font dictionary.

    Args:
        ft: Font dictionary to analyse.

    Returns:
        A tuple ``(encoding, map_dict)``: the encoding produced by
        ``_parse_encoding`` (a codec name or a code -> string table) and the
        translation map produced by ``_parse_to_unicode``.

    """
    encoding = _parse_encoding(ft)
    map_dict, int_entry = _parse_to_unicode(ft)

    # A string encoding is expected to be an identity translation already,
    # so there is nothing to patch.
    if not isinstance(encoding, dict):
        return encoding, map_dict

    # PDF ref 1.7 §5.9.1, 1st bullet: when the cmap is not empty the encoding
    # should be discarded. Here this is done by turning the encoding into an
    # identity for every single-byte code the cmap knows about.
    for code in int_entry:
        if code <= 255:
            encoding[code] = chr(code)

    return encoding, map_dict

def _parse_encoding(
    ft: DictionaryObject
) -> Union[str, dict[int, str]]:
    """
    Determine the encoding of a font from its ``/Encoding`` entry.

    Args:
        ft: Font dictionary to analyse.

    Returns:
        Either a string (a codec name, or the unsupported encoding name as
        found in the font) or a dictionary mapping character codes to strings
        taken from ``charset_encoding`` / ``adobe_glyphs``.

    """
    encoding: Union[str, list[str], dict[int, str]] = []

    # No /Encoding at all: use the table registered for the base font when
    # there is one, otherwise fall back to "charmap".
    if "/Encoding" not in ft:
        if "/BaseFont" in ft and cast(str, ft["/BaseFont"]) in charset_encoding:
            encoding = dict(
                zip(range(256), charset_encoding[cast(str, ft["/BaseFont"])])
            )
        else:
            encoding = "charmap"
        return encoding

    enc: Union[str, DictionaryObject, NullObject] = cast(
        Union[str, DictionaryObject, NullObject], ft["/Encoding"].get_object()
    )
    if isinstance(enc, str):
        # "#xx" sequences in the name have already been decoded at this point.
        if enc in charset_encoding:
            encoding = charset_encoding[enc].copy()
        elif enc in _predefined_cmap:
            encoding = _predefined_cmap[enc]
        elif "-UCS2-" in enc:
            encoding = "utf-16-be"
        else:
            logger_error(f"Advanced encoding {enc} not implemented yet", __name__)
            encoding = enc
    elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc:
        base_encoding: Any = None
        try:
            base_encoding = enc["/BaseEncoding"]
            encoding = charset_encoding[cast(str, base_encoding)].copy()
        except Exception:
            # Report the base encoding that was requested (the local
            # ``encoding`` is still its empty initial value here).
            logger_error(
                f"Advanced encoding {base_encoding} not implemented yet",
                __name__,
            )
            encoding = charset_encoding["/StandardEncoding"].copy()
    else:
        encoding = charset_encoding["/StandardEncoding"].copy()

    if isinstance(enc, DictionaryObject) and "/Differences" in enc:
        # /Differences is walked sequentially: an integer sets the current
        # code, each following name is assigned to the current code, which is
        # then advanced by one.
        code: int = 0
        item: Union[int, str]
        for item in cast(DictionaryObject, enc["/Differences"]):
            if isinstance(item, int):
                code = item
            else:  # isinstance(item, str):
                try:
                    if code < len(encoding):
                        encoding[code] = adobe_glyphs[item]  # type: ignore
                except Exception:
                    # Unknown glyph name: keep the raw name as replacement.
                    encoding[code] = item  # type: ignore
                code += 1

    if isinstance(encoding, list):
        encoding = dict(zip(range(256), encoding))
    return encoding

115

116

def _parse_to_unicode(
    ft: DictionaryObject
) -> tuple[dict[Any, Any], list[int]]:
    """
    Build the translation map described by the ``/ToUnicode`` entry of a font.

    Args:
        ft: Font dictionary to analyse.

    Returns:
        A tuple ``(map_dict, int_entry)``. ``map_dict`` holds every
        translation; its ``-1`` key stores the number of bytes to convert.
        ``int_entry`` lists the cmap keys as integers, so that the caller can
        correct the encoding.

    """
    if "/ToUnicode" not in ft:
        # Without a cmap, only Type1 fonts get a second chance through the
        # embedded font program.
        if ft.get("/Subtype", "") == "/Type1":
            return _type1_alternative(ft, {}, [])
        return {}, []

    map_dict: dict[Any, Any] = {}
    int_entry: list[int] = []

    # Parser state threaded through process_cm_line.
    in_bfrange: bool = False
    in_bfchar: bool = False
    # (current_char, remaining size) of a bfrange list continued on the next
    # line; cf #1285 for an example of such a file.
    pending_range: Union[None, tuple[int, int]] = None

    for raw_line in prepare_cm(ft).split(b"\n"):
        in_bfrange, in_bfchar, pending_range = process_cm_line(
            raw_line.strip(b" \t"),
            in_bfrange,
            in_bfchar,
            pending_range,
            map_dict,
            int_entry,
        )

    return map_dict, int_entry

148

149

def prepare_cm(ft: DictionaryObject) -> bytes:
    """
    Return the ``/ToUnicode`` cmap of a font, normalised for line-based parsing.

    Args:
        ft: Font dictionary; must contain a ``/ToUnicode`` entry.

    Returns:
        The cmap as bytes where the ``begin``/``end`` keywords stand on their
        own line, hex strings have lost their angle brackets and inner spaces,
        and square brackets are isolated by spaces.

    """
    tu = ft["/ToUnicode"]
    cm: bytes
    if isinstance(tu, StreamObject):
        cm = cast(DecodedStreamObject, tu).get_data()
    else:
        # Not a stream (e.g. a name such as /Identity-H): substitute a
        # minimal bfrange section.
        # NOTE(review): an earlier comment stated that the full range
        # 0000-FFFF will be processed, but this literal only spans
        # 0000-0001 - confirm where the full range is handled.
        cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange"
    if isinstance(cm, str):
        cm = cm.encode()

    # PDFs printed to PDF from Word may lack line breaks, so force every
    # section keyword onto a line of its own.
    cm = cm.strip()
    for keyword in (b"beginbfchar", b"endbfchar", b"beginbfrange", b"endbfrange"):
        cm = cm.replace(keyword, b"\n" + keyword + b"\n")
    # Text between << and >> is not used; it is rewritten with braces so that
    # it can still be located.
    cm = cm.replace(b"<<", b"\n{\n").replace(b">>", b"\n}\n")

    # Unwrap each <...> hex string.
    chunks = cm.split(b"<")
    for index, chunk in enumerate(chunks):
        closing = chunk.find(b">")
        if closing < 0:
            continue
        if closing == 0:
            # Empty string: leave a "." placeholder so the token count stays
            # right, see https://github.com/py-pdf/pypdf/issues/1111
            hex_digits = b"."
        else:
            hex_digits = chunk[:closing].replace(b" ", b"")
        chunks[index] = hex_digits + b" " + chunk[closing + 1 :]

    joined = b" ".join(chunks)
    joined = joined.replace(b"[", b" [ ")
    joined = joined.replace(b"]", b" ]\n ")
    return joined.replace(b"\r", b"\n")

189

190

def process_cm_line(
    line: bytes,
    process_rg: bool,
    process_char: bool,
    multiline_rg: Union[None, tuple[int, int]],
    map_dict: dict[Any, Any],
    int_entry: list[int],
) -> tuple[bool, bool, Union[None, tuple[int, int]]]:
    """
    Handle one line of a prepared cmap and update the parser state.

    Args:
        line: The cmap line to process.
        process_rg: True while inside a ``beginbfrange``/``endbfrange`` section.
        process_char: True while inside a ``beginbfchar``/``endbfchar`` section.
        multiline_rg: State of a bfrange list continued from a previous line,
            or None.
        map_dict: Translation map, filled in place.
        int_entry: List of the cmap keys as integers, filled in place.

    Returns:
        The updated ``(process_rg, process_char, multiline_rg)`` triple.

    """
    # Empty lines and comment lines (starting with "%") change nothing.
    if not line or line.startswith(b"%"):
        return process_rg, process_char, multiline_rg

    line = line.replace(b"\t", b" ")

    # Section keywords only toggle the matching flag.
    if b"beginbfrange" in line:
        return True, process_char, multiline_rg
    if b"endbfrange" in line:
        return False, process_char, multiline_rg
    if b"beginbfchar" in line:
        return process_rg, True, multiline_rg
    if b"endbfchar" in line:
        return process_rg, False, multiline_rg

    # Anything else is data for the section currently open (range first).
    if process_rg:
        try:
            multiline_rg = parse_bfrange(line, map_dict, int_entry, multiline_rg)
        except binascii.Error as error:
            logger_warning(f"Skipping broken line {line!r}: {error}", __name__)
    elif process_char:
        parse_bfchar(line, map_dict, int_entry)
    return process_rg, process_char, multiline_rg

218

219

def parse_bfrange(
    line: bytes,
    map_dict: dict[Any, Any],
    int_entry: list[int],
    multiline_rg: Union[None, tuple[int, int]],
) -> Union[None, tuple[int, int]]:
    """
    Parse one line of a bfrange section into ``map_dict`` and ``int_entry``.

    Args:
        line: Space separated tokens; hex strings come without angle brackets.
        map_dict: Translation map, filled in place. ``map_dict[-1]`` receives
            the number of bytes per source code when a new range starts.
        int_entry: Receives every mapped source code as an integer.
        multiline_rg: ``(next code, last code)`` when the line continues a
            bracketed list opened on an earlier line, otherwise None.

    Returns:
        None when the range is complete, otherwise ``(next code, last code)``
        to be passed back with the following line.

    """
    tokens = [tok for tok in line.split(b" ") if tok]

    def store(code: int, text: str) -> None:
        # Source codes of one byte are decoded with charmap, others as UTF-16BE.
        codec = "charmap" if map_dict[-1] == 1 else "utf-16-be"
        map_dict[unhexlify(key_fmt % code).decode(codec, "surrogatepass")] = text
        int_entry.append(code)

    if multiline_rg is None:
        a = int(tokens[0], 16)
        b = int(tokens[1], 16)
        # The code size is taken from the longer of the two range bounds.
        map_dict[-1] = ceil(max(len(tokens[0]), len(tokens[1])) / 2)
        key_fmt = b"%%0%dX" % (map_dict[-1] * 2)
        if tokens[2] != b"[":
            # Case without list: consecutive targets starting at tokens[2].
            first_target = int(tokens[2], 16)
            target_fmt = b"%%0%dX" % max(4, len(tokens[2]))
            for step in range(b - a + 1):
                store(
                    a + step,
                    unhexlify(target_fmt % (first_target + step)).decode(
                        "utf-16-be", "surrogatepass"
                    ),
                )
            return None
        targets = tokens[3:]
    else:
        # Continuation line: the range bounds are not in the current line.
        key_fmt = b"%%0%dX" % (map_dict[-1] * 2)
        a = multiline_rg[0]
        b = multiline_rg[1]
        targets = tokens

    # Explicit list of targets, one per code, terminated by "]".
    for target in targets:
        if target == b"]":
            return None
        store(a, unhexlify(target).decode("utf-16-be", "surrogatepass"))
        a += 1
    return (a, b)

278

279

def parse_bfchar(line: bytes, map_dict: dict[Any, Any], int_entry: list[int]) -> None:
    """
    Parse one line of a bfchar section into ``map_dict`` and ``int_entry``.

    Args:
        line: Space separated ``source target`` hex pairs, without angle
            brackets. A target of ``.`` stands for an empty string.
        map_dict: Translation map, filled in place. ``map_dict[-1]`` is set to
            the byte length of the first source code.
        int_entry: Receives every mapped source code as an integer.

    """
    tokens = [tok for tok in line.split(b" ") if tok]
    map_dict[-1] = len(tokens[0]) // 2
    # Source codes of one byte are decoded with charmap, others as UTF-16BE.
    key_codec = "charmap" if map_dict[-1] == 1 else "utf-16-be"

    # Walk the tokens two by two; a trailing unpaired token is ignored.
    for source, target in zip(tokens[::2], tokens[1::2]):
        map_to = ""
        if target != b".":  # "." is the placeholder for an empty string
            value_codec = "charmap" if len(target) < 4 else "utf-16-be"
            try:
                map_to = unhexlify(target).decode(value_codec, "surrogatepass")
            except BinasciiError as exception:
                # Invalid target: warn and map the code to an empty string.
                logger_warning(f"Got invalid hex string: {exception!s} ({target!r})", __name__)
        key = unhexlify(source).decode(key_codec, "surrogatepass")
        map_dict[key] = map_to
        int_entry.append(int(source, 16))

300

301

def _type1_alternative(
    ft: DictionaryObject,
    map_dict: dict[Any, Any],
    int_entry: list[int],
) -> tuple[dict[Any, Any], list[int]]:
    """
    Fill the translation map from the encoding stored in a Type1 font program.

    The clear-text part of the ``/FontFile`` stream is scanned for
    ``dup <code> <glyph name> put`` lines following ``/Encoding``.

    Args:
        ft: Font dictionary to analyse.
        map_dict: Translation map, filled in place.
        int_entry: Receives every mapped code as an integer.

    Returns:
        The ``(map_dict, int_entry)`` pair that was passed in. Both are left
        untouched when the font has no usable embedded encoding.

    """
    if "/FontDescriptor" not in ft:
        return map_dict, int_entry
    ft_desc = cast(DictionaryObject, ft["/FontDescriptor"]).get("/FontFile")
    if is_null_or_none(ft_desc):
        return map_dict, int_entry
    assert ft_desc is not None, "mypy"
    txt = ft_desc.get_object().get_data()
    txt = txt.split(b"eexec\n")[0]  # only clear part
    sections = txt.split(b"/Encoding")
    if len(sections) < 2:
        # The font program does not declare any encoding: nothing to extract.
        return map_dict, int_entry
    txt = sections[1]  # to get the encoding part
    lines = txt.replace(b"\r", b"\n").split(b"\n")
    for li in lines:
        if not li.startswith(b"dup"):
            continue
        words = [_w for _w in li.split(b" ") if _w != b""]
        if len(words) < 3:
            # Truncated entry: the code and/or the glyph name is missing.
            continue
        if len(words) > 3 and words[3] != b"put":
            continue
        try:
            i = int(words[1])
        except ValueError:  # pragma: no cover
            continue
        try:
            v = adobe_glyphs[words[2].decode()]
        except KeyError:
            # Unknown glyph names are only accepted in the "/uniXXXX" form,
            # where the hexadecimal suffix is used as the code point.
            if words[2].startswith(b"/uni"):
                try:
                    v = chr(int(words[2][4:], 16))
                except ValueError:  # pragma: no cover
                    continue
            else:
                continue
        map_dict[chr(i)] = v
        int_entry.append(i)
    return map_dict, int_entry

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_cmap.py: 9%

187 statements