Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/

1import binascii

2from binascii import Error as BinasciiError

3from binascii import unhexlify

4from math import ceil

5from typing import Any, Union, cast

7from ._codecs import adobe_glyphs, charset_encoding

8from ._utils import logger_error, logger_warning

9from .errors import LimitReachedError

10from .generic import (

11 DecodedStreamObject,

12 DictionaryObject,

13 NullObject,

14 StreamObject,

15 is_null_or_none,

16)

18_predefined_cmap: dict[str, str] = {

19 "/Identity-H": "utf-16-be",

20 "/Identity-V": "utf-16-be",

21 "/GB-EUC-H": "gbk",

22 "/GB-EUC-V": "gbk",

23 "/GBpc-EUC-H": "gb2312",

24 "/GBpc-EUC-V": "gb2312",

25 "/GBK-EUC-H": "gbk",

26 "/GBK-EUC-V": "gbk",

27 "/GBK2K-H": "gb18030",

28 "/GBK2K-V": "gb18030",

29 "/ETen-B5-H": "cp950",

30 "/ETen-B5-V": "cp950",

31 "/ETenms-B5-H": "cp950",

32 "/ETenms-B5-V": "cp950",

33 "/UniCNS-UTF16-H": "utf-16-be",

34 "/UniCNS-UTF16-V": "utf-16-be",

35 "/UniGB-UTF16-H": "gb18030",

36 "/UniGB-UTF16-V": "gb18030",

37 # Japanese CMaps (PDF Reference 1.7, Appendix H)

38 "/90ms-RKSJ-H": "cp932", # Shift-JIS (JIS X 0208-1990), horizontal

39 "/90ms-RKSJ-V": "cp932", # Shift-JIS (JIS X 0208-1990), vertical

40 "/UniJIS-UTF16-H": "utf-16-be", # Unicode UTF-16BE -> JIS, horizontal

41 "/UniJIS-UTF16-V": "utf-16-be", # Unicode UTF-16BE -> JIS, vertical

42 # UCS2 in code

43}

def get_encoding(
    ft: DictionaryObject
) -> tuple[Union[str, dict[int, str]], dict[Any, Any]]:
    """
    Build the encoding and the /ToUnicode mapping for a font dictionary.

    Args:
        ft: The font dictionary to inspect.

    Returns:
        A tuple ``(encoding, map_dict)``. ``encoding`` is either a codec
        name or a table mapping character codes to strings, as produced by
        ``_parse_encoding``. ``map_dict`` is the translation table produced
        by ``_parse_to_unicode``.

    """
    encoding = _parse_encoding(ft)
    map_dict, int_entry = _parse_to_unicode(ft)

    # A string encoding is expected to be an identity translation already,
    # so only table encodings need patching.
    if not isinstance(encoding, dict):
        return encoding, map_dict

    # PDF ref 1.7 §5.9.1, 1st bullet: when the cmap covers a character, the
    # encoding must be discarded for it. This is done by turning the entry
    # into an identity mapping for every single-byte code known to the cmap.
    for code in int_entry:
        if code > 255:
            continue
        encoding[code] = chr(code)

    return encoding, map_dict

64def _parse_encoding(

65 ft: DictionaryObject

66) -> Union[str, dict[int, str]]:

67 encoding: Union[str, list[str], dict[int, str]] = []

68 if "/Encoding" not in ft:

69 if "/BaseFont" in ft and cast(str, ft["/BaseFont"]) in charset_encoding:

70 encoding = dict(

71 zip(range(256), charset_encoding[cast(str, ft["/BaseFont"])])

72 )

73 else:

74 encoding = "charmap"

75 return encoding

76 enc: Union[str, DictionaryObject, NullObject] = cast(

77 Union[str, DictionaryObject, NullObject], ft["/Encoding"].get_object()

78 )

79 if isinstance(enc, str):

80 try:

81 # already done : enc = NameObject.unnumber(enc.encode()).decode()

82 # for #xx decoding

83 if enc in charset_encoding:

84 encoding = charset_encoding[enc].copy()

85 elif enc in _predefined_cmap:

86 encoding = _predefined_cmap[enc]

87 elif "-UCS2-" in enc:

88 encoding = "utf-16-be"

89 else:

90 raise Exception("not found")

91 except Exception:

92 logger_error("Advanced encoding %(encoding)s not implemented yet", source=__name__, encoding=enc)

93 encoding = enc

94 elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc:

95 try:

96 encoding = charset_encoding[cast(str, enc["/BaseEncoding"])].copy()

97 except Exception:

98 logger_error(

99 "Advanced encoding %(encoding)s not implemented yet",

100 source=__name__, encoding=encoding

101 )

102 encoding = charset_encoding["/StandardEncoding"].copy()

103 else:

104 encoding = charset_encoding["/StandardEncoding"].copy()

105 if isinstance(enc, DictionaryObject) and "/Differences" in enc:

106 x: int = 0

107 o: Union[int, str]

108 for o in cast(DictionaryObject, enc["/Differences"]):

109 if isinstance(o, int):

110 x = o

111 else: # isinstance(o, str):

112 try:

113 if x < len(encoding):

114 encoding[x] = adobe_glyphs[o] # type: ignore[index]

115 except Exception:

116 encoding[x] = o # type: ignore[index]

117 x += 1

118 if isinstance(encoding, list):

119 encoding = dict(zip(range(256), encoding))

120 return encoding

121

122

123def _parse_to_unicode(

124 ft: DictionaryObject

125) -> tuple[dict[Any, Any], list[int]]:

126 # will store all translation code

127 # and map_dict[-1] we will have the number of bytes to convert

128 map_dict: dict[Any, Any] = {}

129

130 # will provide the list of cmap keys as int to correct encoding

131 int_entry: list[int] = []

132

133 if "/ToUnicode" not in ft:

134 if ft.get("/Subtype", "") == "/Type1":

135 return _type1_alternative(ft, map_dict, int_entry)

136 return {}, []

137 process_rg: bool = False

138 process_char: bool = False

139 multiline_rg: Union[

140 None, tuple[int, int]

141 ] = None # tuple = (current_char, remaining size) ; cf #1285 for example of file

142 cm = prepare_cm(ft)

143 for line in cm.split(b"\n"):

144 process_rg, process_char, multiline_rg = process_cm_line(

145 line.strip(b" \t"),

146 process_rg,

147 process_char,

148 multiline_rg,

149 map_dict,

150 int_entry,

151 )

152

153 map_dict.pop(-1, None) # Don't pass the -1 key, we only used it to temporarily store encoding length

154

155 return map_dict, int_entry

156

157

def prepare_cm(ft: DictionaryObject) -> bytes:
    """
    Return the /ToUnicode CMap of a font, normalised for line-based parsing.

    The section keywords are isolated on their own lines, the content of
    every ``<...>`` hex string is stripped of spaces and of its delimiters,
    square brackets are surrounded by spaces and carriage returns become
    line feeds.

    Args:
        ft: The font dictionary; must contain ``/ToUnicode``.

    Returns:
        The prepared CMap data.

    """
    tu = ft["/ToUnicode"]
    cm: bytes
    if isinstance(tu, StreamObject):
        cm = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data()
    else:  # if (tu is None) or cast(str, tu).startswith("/Identity"):
        # the full range 0000-FFFF will be processed
        cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange"
    if isinstance(cm, str):
        cm = cm.encode()

    # PDFs printed from Word may lack line breaks around the keywords, so
    # force each keyword onto its own line before anything else.
    cm = cm.strip()
    for keyword in (b"beginbfchar", b"endbfchar", b"beginbfrange", b"endbfrange"):
        cm = cm.replace(keyword, b"\n" + keyword + b"\n")
    # text between << and >> is not used, but keep a way to find it back
    cm = cm.replace(b"<<", b"\n{\n")
    cm = cm.replace(b">>", b"\n}\n")

    segments = cm.split(b"<")
    for index, segment in enumerate(segments):
        closing = segment.find(b">")
        if closing < 0:
            continue
        if closing == 0:
            # empty hex string: stash a placeholder that parse_bfchar
            # recognises; see https://github.com/py-pdf/pypdf/issues/1111
            hex_digits = b"."
        else:
            hex_digits = segment[:closing].replace(b" ", b"")
        segments[index] = hex_digits + b" " + segment[closing + 1 :]

    cm = b" ".join(segments)
    cm = cm.replace(b"[", b" [ ")
    cm = cm.replace(b"]", b" ]\n ")
    cm = cm.replace(b"\r", b"\n")
    return cm

197

198

def process_cm_line(
    line: bytes,
    process_rg: bool,
    process_char: bool,
    multiline_rg: Union[None, tuple[int, int]],
    map_dict: dict[Any, Any],
    int_entry: list[int],
) -> tuple[bool, bool, Union[None, tuple[int, int]]]:
    """
    Handle one line of a prepared CMap and return the updated parser state.

    Args:
        line: The line to process.
        process_rg: Whether the parser is inside a bfrange section.
        process_char: Whether the parser is inside a bfchar section.
        multiline_rg: State of a bfrange array continued from a previous
            line, or ``None``.
        map_dict: Translation table, updated in place.
        int_entry: Mapped character codes, updated in place.

    Returns:
        The new ``(process_rg, process_char, multiline_rg)`` state.

    """
    # Nothing to do for empty lines and comment lines (starting with "%").
    if not line or line.startswith(b"%"):
        return process_rg, process_char, multiline_rg

    line = line.replace(b"\t", b" ")

    # Section markers only switch the parser state.
    if b"beginbfrange" in line:
        return True, process_char, multiline_rg
    if b"endbfrange" in line:
        return False, process_char, multiline_rg
    if b"beginbfchar" in line:
        return process_rg, True, multiline_rg
    if b"endbfchar" in line:
        return process_rg, False, multiline_rg

    if process_rg:
        try:
            multiline_rg = parse_bfrange(line, map_dict, int_entry, multiline_rg)
        except binascii.Error as error:
            logger_warning("Skipping broken line %(line)r: %(error)s", source=__name__, line=line, error=error)
    elif process_char:
        parse_bfchar(line, map_dict, int_entry)
    return process_rg, process_char, multiline_rg

226

227

228# Usual values should be up to 65_536.

229MAPPING_DICTIONARY_SIZE_LIMIT = 100_000

230

231

232def _check_mapping_size(size: int) -> None:

233 if size > MAPPING_DICTIONARY_SIZE_LIMIT:

234 raise LimitReachedError(f"Maximum /ToUnicode size limit reached: {size} > {MAPPING_DICTIONARY_SIZE_LIMIT}.")

235

236

def parse_bfrange(
    line: bytes,
    map_dict: dict[Any, Any],
    int_entry: list[int],
    multiline_rg: Union[None, tuple[int, int]],
) -> Union[None, tuple[int, int]]:
    """
    Parse one line of a bfrange section into the translation table.

    Three layouts are handled:

    * ``first last target``: consecutive codes map to consecutive targets.
    * ``first last [ t1 t2 ... ]``: each code maps to the listed target;
      the array may continue on following lines.
    * continuation line (``multiline_rg`` is set): further array items,
      possibly ending with ``]``.

    Args:
        line: The prepared line (hex strings without ``<>``).
        map_dict: Translation table, updated in place; ``map_dict[-1]``
            holds the number of bytes per character code.
        int_entry: Mapped character codes, updated in place.
        multiline_rg: ``(next code, last code)`` of an array left open by a
            previous line, or ``None``.

    Returns:
        ``None`` when the range is complete, otherwise
        ``(next code, last code)`` for the array still open.

    Raises:
        LimitReachedError: If the mapping would exceed the size limit.

    """
    tokens = [token for token in line.split(b" ") if token]
    entry_count = len(int_entry)
    _check_mapping_size(entry_count)

    array_items: Union[None, list[bytes]]
    if multiline_rg is not None:
        # first/last code are not on this line: resume the open array
        code_width = map_dict[-1]
        code = multiline_rg[0]
        last = multiline_rg[1]
        array_items = tokens
    else:
        code = int(tokens[0], 16)
        last = int(tokens[1], 16)
        code_width = ceil(max(len(tokens[0]), len(tokens[1])) / 2)
        map_dict[-1] = code_width
        array_items = tokens[3:] if tokens[2] == b"[" else None

    key_format = b"%%0%dX" % (code_width * 2)
    key_codec = "charmap" if code_width == 1 else "utf-16-be"

    def store(source_code: int, target_hex: bytes) -> None:
        # The target is decoded before the key, as in a plain
        # ``map_dict[key] = value`` statement.
        target = unhexlify(target_hex).decode("utf-16-be", "surrogatepass")
        source = unhexlify(key_format % source_code).decode(key_codec, "surrogatepass")
        map_dict[source] = target
        int_entry.append(source_code)

    if array_items is not None:
        for item in array_items:
            if item == b"]":
                return None
            entry_count += 1
            _check_mapping_size(entry_count)
            store(code, item)
            code += 1
        # no closing bracket yet: the array continues on the next line
        return code, last

    # case without list
    target_code = int(tokens[2], 16)
    target_format = b"%%0%dX" % max(4, len(tokens[2]))
    range_size = max(0, last - code + 1)
    _check_mapping_size(entry_count + range_size)  # This can be checked beforehand.
    for offset in range(range_size):
        store(code + offset, target_format % (target_code + offset))
    return None

303

304

def parse_bfchar(line: bytes, map_dict: dict[Any, Any], int_entry: list[int]) -> None:
    """
    Parse one line of a bfchar section into the translation table.

    The line holds ``source target`` pairs of hex strings (without ``<>``);
    a trailing unpaired token is ignored. A target equal to ``b"."`` is the
    placeholder for an empty string. A target that is not valid hex is
    reported with a warning and mapped to the empty string.

    Args:
        line: The prepared line.
        map_dict: Translation table, updated in place; ``map_dict[-1]`` is
            set to the number of bytes per character code.
        int_entry: Mapped character codes, updated in place.

    Raises:
        LimitReachedError: If the mapping would exceed the size limit.

    """
    tokens = [token for token in line.split(b" ") if token]
    _check_mapping_size(len(int_entry) + len(tokens) // 2)  # This can be checked beforehand.
    if not tokens:
        # Whitespace-only line: nothing to map.
        return

    code_width = len(tokens[0]) // 2
    map_dict[-1] = code_width
    key_codec = "charmap" if code_width == 1 else "utf-16-be"

    # Walk the pairs without re-slicing the token list on every step, which
    # would be quadratic for long single-line sections.
    for source_hex, target_hex in zip(tokens[::2], tokens[1::2]):
        map_to = ""
        # placeholder (set by prepare_cm) means empty string
        if target_hex != b".":
            try:
                map_to = unhexlify(target_hex).decode(
                    "charmap" if len(target_hex) < 4 else "utf-16-be", "surrogatepass"
                )
            except BinasciiError as exception:
                logger_warning(
                    "Got invalid hex string: %(exception)s (%(lst_value)r)",
                    source=__name__,
                    exception=exception,
                    lst_value=target_hex,
                )
        map_dict[unhexlify(source_hex).decode(key_codec, "surrogatepass")] = map_to
        int_entry.append(int(source_hex, 16))

332

333

334def _type1_alternative(

335 ft: DictionaryObject,

336 map_dict: dict[Any, Any],

337 int_entry: list[int],

338) -> tuple[dict[Any, Any], list[int]]:

339 if "/FontDescriptor" not in ft:

340 return map_dict, int_entry

341 ft_desc = cast(DictionaryObject, ft["/FontDescriptor"]).get("/FontFile")

342 if is_null_or_none(ft_desc):

343 return map_dict, int_entry

344 assert ft_desc is not None, "mypy"

345 txt = ft_desc.get_object().get_data()

346 txt = txt.split(b"eexec\n")[0] # only clear part

347 txt = txt.split(b"/Encoding")[1] # to get the encoding part

348 lines = txt.replace(b"\r", b"\n").split(b"\n")

349 for li in lines:

350 if li.startswith(b"dup"):

351 words = [_w for _w in li.split(b" ") if _w != b""]

352 if len(words) > 3 and words[3] != b"put":

353 continue

354 try:

355 i = int(words[1])

356 except ValueError: # pragma: no cover

357 continue

358 try:

359 v = adobe_glyphs[words[2].decode()]

360 except KeyError:

361 if words[2].startswith(b"/uni"):

362 try:

363 v = chr(int(words[2][4:], 16))

364 except ValueError: # pragma: no cover

365 continue

366 else:

367 continue

368 map_dict[chr(i)] = v

369 int_entry.append(i)

370 return map_dict, int_entry

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_cmap.py: 10%

203 statements