Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_cmap.py: 10%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

202 statements  

1import binascii 

2from binascii import Error as BinasciiError 

3from binascii import unhexlify 

4from math import ceil 

5from typing import Any, Union, cast 

6 

7from ._codecs import adobe_glyphs, charset_encoding 

8from ._utils import logger_error, logger_warning 

9from .errors import LimitReachedError 

10from .generic import ( 

11 DecodedStreamObject, 

12 DictionaryObject, 

13 NullObject, 

14 StreamObject, 

15 is_null_or_none, 

16) 

17 

# Predefined CMap names mapped to the Python codec able to decode them.
# Every predefined CMap exists in a horizontal (-H) and a vertical (-V)
# writing-mode variant sharing the same byte-level encoding, so the pairs
# are generated from a single base table.
_predefined_cmap: dict[str, str] = {
    f"{base}-{mode}": codec
    for base, codec in (
        ("/Identity", "utf-16-be"),
        ("/GB-EUC", "gbk"),
        ("/GBpc-EUC", "gb2312"),
        ("/GBK-EUC", "gbk"),
        ("/GBK2K", "gb18030"),
        ("/ETen-B5", "cp950"),
        ("/ETenms-B5", "cp950"),
        ("/UniCNS-UTF16", "utf-16-be"),
        ("/UniGB-UTF16", "gb18030"),
    )
    for mode in ("H", "V")
}
# UCS2 is handled in code (see _parse_encoding)

39 

40 

def get_encoding(
    ft: DictionaryObject
) -> tuple[Union[str, dict[int, str]], dict[Any, Any]]:
    """
    Compute the character encoding and the /ToUnicode map of a font.

    Args:
        ft: Font dictionary.

    Returns:
        Tuple of (encoding, map_dict): *encoding* is either the name of a
        Python codec or a code->char dict, *map_dict* translates raw
        character codes to unicode strings (map_dict[-1] holds the number
        of bytes per code).
    """
    encoding = _parse_encoding(ft)
    map_dict, int_entry = _parse_to_unicode(ft)

    # PDF ref 1.7 §5.9.1, 1st bullet: when the cmap is not empty, the
    # encoding has to be discarded for the codes it covers — realized here
    # as an identity translation for those characters.
    # When encoding is a string it is expected to already be an identity
    # translation, so nothing to do.
    if isinstance(encoding, dict):
        encoding.update({code: chr(code) for code in int_entry if code <= 255})

    return encoding, map_dict

57 

58 

def _parse_encoding(
    ft: DictionaryObject
) -> Union[str, dict[int, str]]:
    """
    Determine the character encoding of a font.

    Args:
        ft: Font dictionary.

    Returns:
        Either the name of a Python codec (e.g. "utf-16-be", "charmap")
        or a dict mapping the 256 single-byte codes to characters.
    """
    encoding: Union[str, list[str], dict[int, str]] = []
    if "/Encoding" not in ft:
        # No explicit encoding: fall back to the base font's known charset
        # when available, otherwise decode bytes 1:1 with "charmap".
        if "/BaseFont" in ft and cast(str, ft["/BaseFont"]) in charset_encoding:
            encoding = dict(
                zip(range(256), charset_encoding[cast(str, ft["/BaseFont"])])
            )
        else:
            encoding = "charmap"
        return encoding
    enc: Union[str, DictionaryObject, NullObject] = cast(
        Union[str, DictionaryObject, NullObject], ft["/Encoding"].get_object()
    )
    if isinstance(enc, str):
        # /Encoding is a name: a known charset, a predefined CMap, or a
        # UCS2 CMap (decoded as UTF-16-BE).
        try:
            # already done : enc = NameObject.unnumber(enc.encode()).decode()
            # for #xx decoding
            if enc in charset_encoding:
                encoding = charset_encoding[enc].copy()
            elif enc in _predefined_cmap:
                encoding = _predefined_cmap[enc]
            elif "-UCS2-" in enc:
                encoding = "utf-16-be"
            else:
                raise Exception("not found")
        except Exception:
            # Unknown name: keep it as-is so the caller can still report it.
            logger_error(f"Advanced encoding {enc} not implemented yet", __name__)
            encoding = enc
    elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc:
        try:
            encoding = charset_encoding[cast(str, enc["/BaseEncoding"])].copy()
        except Exception:
            logger_error(
                f"Advanced encoding {encoding} not implemented yet",
                __name__,
            )
            encoding = charset_encoding["/StandardEncoding"].copy()
    else:
        # Encoding dictionary without /BaseEncoding (or a null object).
        encoding = charset_encoding["/StandardEncoding"].copy()
    if isinstance(enc, DictionaryObject) and "/Differences" in enc:
        # /Differences alternates an int (the next code to patch) with
        # glyph names replacing the charset entries from that code on.
        # At this point encoding is a 256-entry list (copied above).
        x: int = 0
        o: Union[int, str]
        for o in cast(DictionaryObject, enc["/Differences"]):
            if isinstance(o, int):
                x = o
            else:  # isinstance(o, str):
                try:
                    if x < len(encoding):
                        encoding[x] = adobe_glyphs[o]  # type: ignore
                except Exception:
                    # Glyph name unknown to adobe_glyphs: store the raw name.
                    encoding[x] = o  # type: ignore
                x += 1
    if isinstance(encoding, list):
        # Normalize list form into the code->char dict returned to callers.
        encoding = dict(zip(range(256), encoding))
    return encoding

116 

117 

def _parse_to_unicode(
    ft: DictionaryObject
) -> tuple[dict[Any, Any], list[int]]:
    """
    Extract the /ToUnicode translation map of a font.

    Args:
        ft: Font dictionary.

    Returns:
        Tuple of (map_dict, int_entry): *map_dict* translates character
        codes to unicode strings, with map_dict[-1] holding the number of
        bytes per code; *int_entry* lists the mapped codes as ints.
    """
    # Stores all translation codes; map_dict[-1] will hold the number of
    # bytes to convert per character code.
    map_dict: dict[Any, Any] = {}
    # List of the cmap keys as ints, used later to correct the encoding.
    int_entry: list[int] = []

    if "/ToUnicode" not in ft:
        # Type1 fonts may still expose an encoding in their font program.
        if ft.get("/Subtype", "") == "/Type1":
            return _type1_alternative(ft, map_dict, int_entry)
        return {}, []

    # Parser state: (inside-bfrange, inside-bfchar, open multi-line range),
    # the last element being (current_char, last_char) for a bfrange whose
    # destination list spans several lines; cf #1285 for an example file.
    state: tuple[bool, bool, Union[None, tuple[int, int]]] = (False, False, None)
    for raw_line in prepare_cm(ft).split(b"\n"):
        state = process_cm_line(
            raw_line.strip(b" \t"), *state, map_dict, int_entry
        )

    return map_dict, int_entry

149 

150 

def prepare_cm(ft: DictionaryObject) -> bytes:
    """
    Extract and normalize the /ToUnicode CMap of *ft* so it can be parsed
    line by line: section keywords on their own lines and blank-free hex
    tokens each followed by a single space.

    Args:
        ft: Font dictionary carrying the /ToUnicode entry.

    Returns:
        The normalized CMap content as bytes.
    """
    to_unicode = ft["/ToUnicode"]
    cm: bytes
    if isinstance(to_unicode, StreamObject):
        cm = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data()
    else:  # if (to_unicode is None) or cast(str, to_unicode).startswith("/Identity"):
        # No stream available: process the full range 0000-FFFF as identity.
        cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange"
    if isinstance(cm, str):
        cm = cm.encode()
    # Re-insert the line breaks that PDFs printed to PDF from Word tend to
    # drop, so that every section keyword sits on its own line.
    cm = (
        cm.strip()
        .replace(b"beginbfchar", b"\nbeginbfchar\n")
        .replace(b"endbfchar", b"\nendbfchar\n")
        .replace(b"beginbfrange", b"\nbeginbfrange\n")
        .replace(b"endbfrange", b"\nendbfrange\n")
        .replace(b"<<", b"\n{\n")  # text between << and >> not used but
        .replace(b">>", b"\n}\n")  # some solution to find it back
    )
    segments = cm.split(b"<")
    for idx, segment in enumerate(segments):
        close = segment.find(b">")
        if close < 0:
            continue
        if close == 0:
            # String is empty: stash a placeholder here (consumed later by
            # parse_bfchar); see https://github.com/py-pdf/pypdf/issues/1111
            token = b"."
        else:
            # Remove blanks inside the hex token itself.
            token = segment[:close].replace(b" ", b"")
        segments[idx] = token + b" " + segment[close + 1:]
    return (
        (b" ".join(segments))
        .replace(b"[", b" [ ")
        .replace(b"]", b" ]\n ")
        .replace(b"\r", b"\n")
    )

190 

191 

def process_cm_line(
    line: bytes,
    process_rg: bool,
    process_char: bool,
    multiline_rg: Union[None, tuple[int, int]],
    map_dict: dict[Any, Any],
    int_entry: list[int],
) -> tuple[bool, bool, Union[None, tuple[int, int]]]:
    """
    Process a single normalized CMap line, updating map_dict / int_entry.

    Args:
        line: One line produced by prepare_cm, stripped of blanks.
        process_rg: True while inside a bfrange section.
        process_char: True while inside a bfchar section.
        multiline_rg: Open multi-line range state, or None.
        map_dict: Translation map being filled.
        int_entry: List of mapped codes being filled.

    Returns:
        The updated (process_rg, process_char, multiline_rg) state.
    """
    # Ignore empty lines and comments (37 == ord("%")).
    if line == b"" or line[0] == 37:
        return process_rg, process_char, multiline_rg
    line = line.replace(b"\t", b" ")
    # Section delimiters toggle the parser state.
    if b"beginbfrange" in line:
        return True, process_char, multiline_rg
    if b"endbfrange" in line:
        return False, process_char, multiline_rg
    if b"beginbfchar" in line:
        return process_rg, True, multiline_rg
    if b"endbfchar" in line:
        return process_rg, False, multiline_rg
    if process_rg:
        try:
            multiline_rg = parse_bfrange(line, map_dict, int_entry, multiline_rg)
        except binascii.Error as error:
            # Invalid hex content: keep going with the next line.
            logger_warning(f"Skipping broken line {line!r}: {error}", __name__)
    elif process_char:
        parse_bfchar(line, map_dict, int_entry)
    return process_rg, process_char, multiline_rg

219 

220 

# Usual values should be up to 65_536; anything far beyond that points at a
# corrupt or maliciously crafted /ToUnicode map.
MAPPING_DICTIONARY_SIZE_LIMIT = 100_000


def _check_mapping_size(size: int) -> None:
    """Raise LimitReachedError when *size* exceeds the /ToUnicode limit."""
    if size <= MAPPING_DICTIONARY_SIZE_LIMIT:
        return
    raise LimitReachedError(
        f"Maximum /ToUnicode size limit reached: {size} > {MAPPING_DICTIONARY_SIZE_LIMIT}."
    )

228 

229 

def parse_bfrange(
    line: bytes,
    map_dict: dict[Any, Any],
    int_entry: list[int],
    multiline_rg: Union[None, tuple[int, int]],
) -> Union[None, tuple[int, int]]:
    """
    Parse one line of a bfrange section into map_dict / int_entry.

    Args:
        line: Normalized line (hex tokens already cleaned by prepare_cm).
        map_dict: Translation map being filled; map_dict[-1] holds the
            number of bytes per source character code.
        int_entry: List of the source codes added, as ints.
        multiline_rg: (current_char, last_char) when a bracketed range
            started on a previous line is still open, else None.

    Returns:
        None when the range is complete on this line, otherwise the
        (current_char, last_char) state to pass back with the next line
        (cf #1285 for an example of such a file).

    Raises:
        LimitReachedError: when the map grows beyond
            MAPPING_DICTIONARY_SIZE_LIMIT.
        binascii.Error: on invalid hex content (caught by the caller).
    """
    lst = [x for x in line.split(b" ") if x]
    closure_found = False
    entry_count = len(int_entry)
    _check_mapping_size(entry_count)
    if multiline_rg is not None:
        # Continuation of a bracketed destination list: the <start> <end>
        # bounds are not repeated on the current line.
        fmt = b"%%0%dX" % (map_dict[-1] * 2)  # e.g. b"%04X" for 2-byte codes
        a = multiline_rg[0]  # a, b not in the current line
        b = multiline_rg[1]
        for sq in lst:
            if sq == b"]":
                closure_found = True
                break
            entry_count += 1
            _check_mapping_size(entry_count)
            # 1-byte source codes decode with charmap, wider as UTF-16-BE.
            map_dict[
                unhexlify(fmt % a).decode(
                    "charmap" if map_dict[-1] == 1 else "utf-16-be",
                    "surrogatepass",
                )
            ] = unhexlify(sq).decode("utf-16-be", "surrogatepass")
            int_entry.append(a)
            a += 1
    else:
        a = int(lst[0], 16)
        b = int(lst[1], 16)
        # Bytes per source code, deduced from the widest hex operand and
        # published through map_dict[-1] for the text-extraction stage.
        nbi = max(len(lst[0]), len(lst[1]))
        map_dict[-1] = ceil(nbi / 2)
        fmt = b"%%0%dX" % (map_dict[-1] * 2)
        if lst[2] == b"[":
            # Form: <start> <end> [ <dst> <dst> ... ] — one destination per
            # source code; the list may spill over to the next line(s).
            for sq in lst[3:]:
                if sq == b"]":
                    closure_found = True
                    break
                entry_count += 1
                _check_mapping_size(entry_count)
                map_dict[
                    unhexlify(fmt % a).decode(
                        "charmap" if map_dict[-1] == 1 else "utf-16-be",
                        "surrogatepass",
                    )
                ] = unhexlify(sq).decode("utf-16-be", "surrogatepass")
                int_entry.append(a)
                a += 1
        else:  # case without list
            # Form: <start> <end> <dst> — destinations are the consecutive
            # values starting at <dst>.
            c = int(lst[2], 16)
            fmt2 = b"%%0%dX" % max(4, len(lst[2]))
            closure_found = True
            range_size = max(0, b - a + 1)
            _check_mapping_size(entry_count + range_size)  # This can be checked beforehand.
            while a <= b:
                map_dict[
                    unhexlify(fmt % a).decode(
                        "charmap" if map_dict[-1] == 1 else "utf-16-be",
                        "surrogatepass",
                    )
                ] = unhexlify(fmt2 % c).decode("utf-16-be", "surrogatepass")
                int_entry.append(a)
                a += 1
                c += 1
    return None if closure_found else (a, b)

296 

297 

def parse_bfchar(line: bytes, map_dict: dict[Any, Any], int_entry: list[int]) -> None:
    """
    Parse one line of a bfchar section into map_dict / int_entry.

    Args:
        line: Normalized line of alternating <src> <dst> hex tokens.
        map_dict: Translation map being filled; map_dict[-1] is set to the
            number of bytes per source code.
        int_entry: List of the source codes added, as ints.

    Raises:
        LimitReachedError: when the map grows beyond
            MAPPING_DICTIONARY_SIZE_LIMIT.
    """
    tokens = [t for t in line.split(b" ") if t]
    _check_mapping_size(len(int_entry) + len(tokens) // 2)
    # Bytes per source code, taken from the first hex token of the line.
    map_dict[-1] = len(tokens[0]) // 2
    # Consume the tokens pairwise; a trailing unpaired token is ignored.
    for src, dst in zip(tokens[::2], tokens[1::2]):
        translated = ""
        # The b"." placeholder (inserted by prepare_cm) means empty string.
        if dst != b".":
            try:
                translated = unhexlify(dst).decode(
                    "charmap" if len(dst) < 4 else "utf-16-be", "surrogatepass"
                )
            except BinasciiError as exception:
                logger_warning(f"Got invalid hex string: {exception!s} ({dst!r})", __name__)
        map_dict[
            unhexlify(src).decode(
                "charmap" if map_dict[-1] == 1 else "utf-16-be", "surrogatepass"
            )
        ] = translated
        int_entry.append(int(src, 16))

320 

321 

def _type1_alternative(
    ft: DictionaryObject,
    map_dict: dict[Any, Any],
    int_entry: list[int],
) -> tuple[dict[Any, Any], list[int]]:
    """
    Build a /ToUnicode-like mapping for a Type1 font from the clear-text
    /Encoding array embedded in its font program (/FontFile).

    Args:
        ft: Font dictionary (expected /Subtype /Type1).
        map_dict: Translation map being filled (char -> unicode string).
        int_entry: List of character codes, filled alongside map_dict.

    Returns:
        The (map_dict, int_entry) pair, possibly unchanged when no font
        file or no embedded encoding is available.
    """
    if "/FontDescriptor" not in ft:
        return map_dict, int_entry
    ft_desc = cast(DictionaryObject, ft["/FontDescriptor"]).get("/FontFile")
    if is_null_or_none(ft_desc):
        return map_dict, int_entry
    assert ft_desc is not None, "mypy"
    txt = ft_desc.get_object().get_data()
    txt = txt.split(b"eexec\n")[0]  # only clear part
    if b"/Encoding" not in txt:
        # Font program relies on a predefined encoding (e.g.
        # StandardEncoding): nothing to extract. Without this guard the
        # split below raised IndexError.
        return map_dict, int_entry
    txt = txt.split(b"/Encoding")[1]  # to get the encoding part
    lines = txt.replace(b"\r", b"\n").split(b"\n")
    for li in lines:
        if not li.startswith(b"dup"):
            continue
        words = [_w for _w in li.split(b" ") if _w != b""]
        # Expected shape: dup <code> /<glyphname> put
        # Guard against truncated lines (previously raised IndexError).
        if len(words) < 3 or (len(words) > 3 and words[3] != b"put"):
            continue
        try:
            i = int(words[1])
        except ValueError:  # pragma: no cover
            continue
        try:
            v = adobe_glyphs[words[2].decode()]
        except KeyError:
            # /uniXXXX glyph names carry the code point directly.
            if words[2].startswith(b"/uni"):
                try:
                    v = chr(int(words[2][4:], 16))
                except ValueError:  # pragma: no cover
                    continue
            else:
                continue
        map_dict[chr(i)] = v
        int_entry.append(i)
    return map_dict, int_entry
358 return map_dict, int_entry