Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/

1import binascii

2from binascii import Error as BinasciiError

3from binascii import unhexlify

4from math import ceil

5from typing import Any, Union, cast

7from ._codecs import adobe_glyphs, charset_encoding

8from ._codecs.core_fontmetrics import CORE_FONT_METRICS

9from ._utils import logger_error, logger_warning

10from .generic import (

11 ArrayObject,

12 DecodedStreamObject,

13 DictionaryObject,

14 NullObject,

15 StreamObject,

16 is_null_or_none,

17)

20# code freely inspired from @twiggy ; see #711

def build_char_map(
    font_name: str, space_width: float, obj: DictionaryObject
) -> tuple[str, float, Union[str, dict[int, str]], dict[Any, Any], DictionaryObject]:
    """
    Look up a font by name in an object's resources and describe it.

    Args:
        font_name: Key of the font inside ``obj["/Resources"]["/Font"]``.
        space_width: Default space width, used if no data is found.
        obj: XObject or Page holding the ``/Resources`` dictionary.

    Returns:
        A 5-tuple: font sub-type, space-width criteria (50% of width),
        encoding, character map and the font dictionary itself (handed
        back for callers that want to inspect it further).

    """
    font_dict: DictionaryObject = obj["/Resources"]["/Font"][font_name]  # type: ignore
    # Everything except the dictionary is computed by the dict-based variant.
    described = build_char_map_from_dict(space_width, font_dict)
    return (*described, font_dict)

def build_char_map_from_dict(
    space_width: float, ft: DictionaryObject
) -> tuple[str, float, Union[str, dict[int, str]], dict[Any, Any]]:
    """
    Describe a font given its font dictionary.

    Args:
        space_width: Default space width if no data is found
            (normally half the width of a character).
        ft: Font dictionary.

    Returns:
        A 4-tuple: font sub-type, space-width criteria (50% of the space
        width), encoding and character map.

    """
    subtype = cast(str, ft["/Subtype"].get_object())
    encoding, char_map = get_encoding(ft)

    # Find the key under which this font stores the space character.
    space_key = get_actual_str_key(" ", encoding, char_map)
    # ``space_width`` is a half width, so the default full width is twice it.
    widths = build_font_width_map(ft, space_width * 2.0)
    full_space_width = compute_space_width(widths, space_key)

    # Plain tuple return; see https://github.com/python/mypy/issues/4374
    return subtype, full_space_width / 2.0, encoding, char_map

# Fallback used when data is missing, e.g. the font definition is absent.
# It has the same tuple type as the value returned by
# build_char_map_from_dict; every one of the 256 byte codes maps to "�".
unknown_char_map: tuple[str, float, Union[str, dict[int, str]], dict[Any, Any]] = (
    "Unknown",
    9999,
    {code: "�" for code in range(256)},
    {},
)

85_predefined_cmap: dict[str, str] = {

86 "/Identity-H": "utf-16-be",

87 "/Identity-V": "utf-16-be",

88 "/GB-EUC-H": "gbk",

89 "/GB-EUC-V": "gbk",

90 "/GBpc-EUC-H": "gb2312",

91 "/GBpc-EUC-V": "gb2312",

92 "/GBK-EUC-H": "gbk",

93 "/GBK-EUC-V": "gbk",

94 "/GBK2K-H": "gb18030",

95 "/GBK2K-V": "gb18030",

96 "/ETen-B5-H": "cp950",

97 "/ETen-B5-V": "cp950",

98 "/ETenms-B5-H": "cp950",

99 "/ETenms-B5-V": "cp950",

100 "/UniCNS-UTF16-H": "utf-16-be",

101 "/UniCNS-UTF16-V": "utf-16-be",

102 "/UniGB-UTF16-H": "gb18030",

103 "/UniGB-UTF16-V": "gb18030",

104 # UCS2 in code

105}

106

107

def get_encoding(
    ft: DictionaryObject
) -> tuple[Union[str, dict[int, str]], dict[Any, Any]]:
    """
    Return the encoding and the ToUnicode character map of a font.

    Args:
        ft: Font dictionary.

    Returns:
        ``(encoding, map_dict)`` where ``encoding`` is either a codec name
        or a code-to-character dictionary, and ``map_dict`` is the
        character map parsed from the font.

    """
    encoding = _parse_encoding(ft)
    map_dict, int_entry = _parse_to_unicode(ft)

    # A string encoding is expected to be an identity translation already.
    if not isinstance(encoding, dict):
        return encoding, map_dict

    # PDF ref 1.7 §5.9.1, 1st bullet: when the cmap is not empty the encoding
    # should be discarded. Here that is done by turning the encoding into an
    # identity for every single-byte code the cmap covers.
    encoding.update({code: chr(code) for code in int_entry if code <= 255})
    return encoding, map_dict

124

125

def _parse_encoding(
    ft: DictionaryObject
) -> Union[str, dict[int, str]]:
    """
    Resolve the ``/Encoding`` entry of a font dictionary.

    Args:
        ft: Font dictionary.

    Returns:
        Either a codec name (``str``) or a dictionary mapping character
        codes to strings. An unsupported encoding name is logged and
        returned unchanged.

    """
    encoding: Union[str, list[str], dict[int, str]]

    if "/Encoding" not in ft:
        # No explicit encoding: use the table registered for the base font,
        # if there is one, otherwise a plain byte-to-character mapping.
        if "/BaseFont" in ft and cast(str, ft["/BaseFont"]) in charset_encoding:
            return dict(
                zip(range(256), charset_encoding[cast(str, ft["/BaseFont"])])
            )
        return "charmap"

    enc: Union[str, DictionaryObject, NullObject] = cast(
        Union[str, DictionaryObject, NullObject], ft["/Encoding"].get_object()
    )
    if isinstance(enc, str):
        # already done : enc = NameObject.unnumber(enc.encode()).decode()
        # for #xx decoding
        if enc in charset_encoding:
            encoding = charset_encoding[enc].copy()
        elif enc in _predefined_cmap:
            encoding = _predefined_cmap[enc]
        elif "-UCS2-" in enc:
            encoding = "utf-16-be"
        else:
            logger_error(f"Advanced encoding {enc} not implemented yet", __name__)
            encoding = enc
    elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc:
        base_encoding: Any = None
        try:
            base_encoding = enc["/BaseEncoding"]
            encoding = charset_encoding[cast(str, base_encoding)].copy()
        except Exception:
            # Best effort: report the base encoding that could not be used
            # and continue with the standard one.
            logger_error(
                f"Advanced encoding {base_encoding} not implemented yet",
                __name__,
            )
            encoding = charset_encoding["/StandardEncoding"].copy()
    else:
        encoding = charset_encoding["/StandardEncoding"].copy()

    if isinstance(enc, DictionaryObject) and "/Differences" in enc:
        # /Differences alternates between an integer (the code at which the
        # following names start) and glyph names for consecutive codes.
        x: int = 0
        o: Union[int, str]
        for o in cast(DictionaryObject, enc["/Differences"]):
            if isinstance(o, int):
                x = o
            else:  # isinstance(o, str):
                try:
                    if x < len(encoding):
                        encoding[x] = adobe_glyphs[o]  # type: ignore
                except Exception:
                    # Glyph name not translatable: keep the raw name.
                    encoding[x] = o  # type: ignore
                x += 1

    if isinstance(encoding, list):
        encoding = dict(zip(range(256), encoding))
    return encoding

183

184

def _parse_to_unicode(
    ft: DictionaryObject
) -> tuple[dict[Any, Any], list[int]]:
    """
    Parse the ``/ToUnicode`` CMap of a font.

    Args:
        ft: Font dictionary.

    Returns:
        ``(map_dict, int_entry)``. ``map_dict`` stores all translation
        entries, with ``map_dict[-1]`` holding the number of bytes to
        convert. ``int_entry`` lists the cmap keys as integers so that
        the encoding can be corrected afterwards.

    """
    map_dict: dict[Any, Any] = {}
    int_entry: list[int] = []

    if "/ToUnicode" not in ft:
        # Without a CMap, only Type1 fonts offer an alternative source.
        is_type1 = ft.get("/Subtype", "") == "/Type1"
        return _type1_alternative(ft, map_dict, int_entry) if is_type1 else ({}, [])

    in_bfrange: bool = False
    in_bfchar: bool = False
    # (current_char, remaining size) of a bfrange list spanning several
    # lines; cf #1285 for an example of such a file.
    pending_range: Union[None, tuple[int, int]] = None

    for raw_line in prepare_cm(ft).split(b"\n"):
        in_bfrange, in_bfchar, pending_range = process_cm_line(
            raw_line.strip(b" \t"),
            in_bfrange,
            in_bfchar,
            pending_range,
            map_dict,
            int_entry,
        )

    return map_dict, int_entry

216

217

def get_actual_str_key(
    value_char: str, encoding: Union[str, dict[int, str]], map_dict: dict[Any, Any]
) -> str:
    """
    Find the key that a font uses for a given character.

    Args:
        value_char: Character to look for among the mapped values.
        encoding: Codec name, or a code-to-character dictionary. When it is
            a dictionary it is searched and ``map_dict`` is ignored.
        map_dict: Character map, searched when ``encoding`` is a string.

    Returns:
        The key mapped to ``value_char`` (converted with ``chr`` when it
        comes from ``encoding``). If several keys match, the last one in
        iteration order is returned; if none match, ``value_char`` itself.

    """
    if isinstance(encoding, dict):
        candidates = [
            chr(code) for code, char in encoding.items() if char == value_char
        ]
    else:
        candidates = [key for key, char in map_dict.items() if char == value_char]
    return candidates[-1] if candidates else value_char

227

228

def prepare_cm(ft: DictionaryObject) -> bytes:
    """
    Return the ``/ToUnicode`` CMap of a font, normalised for line parsing.

    The markers ``beginbfchar``/``endbfchar``/``beginbfrange``/``endbfrange``
    are put on lines of their own, ``<...>`` hex strings lose their
    brackets and inner spaces, and ``[``/``]`` are surrounded by spaces.

    Args:
        ft: Font dictionary containing a ``/ToUnicode`` entry.

    Returns:
        The prepared CMap as bytes.

    """
    to_unicode = ft["/ToUnicode"]
    cm: bytes
    if isinstance(to_unicode, StreamObject):
        cm = cast(DecodedStreamObject, to_unicode).get_data()
    else:  # if (tu is None) or cast(str, tu).startswith("/Identity"):
        # Not a stream: fall back to this built-in bfrange section.
        cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange"
    if isinstance(cm, str):
        cm = cm.encode()

    # The markers must be isolated beforehand because PDFs printed to PDF
    # from Word can miss the line breaks around them.
    cm = cm.strip()
    for marker in (b"beginbfchar", b"endbfchar", b"beginbfrange", b"endbfrange"):
        cm = cm.replace(marker, b"\n" + marker + b"\n")
    # Text between << and >> is not used, but is kept findable.
    cm = cm.replace(b"<<", b"\n{\n").replace(b">>", b"\n}\n")

    pieces = []
    for piece in cm.split(b"<"):
        closing = piece.find(b">")
        if closing < 0:
            # No hex string ends in this piece: keep it untouched.
            pieces.append(piece)
            continue
        if closing == 0:
            # Empty string: stash a placeholder that parse_bfchar recognises;
            # see https://github.com/py-pdf/pypdf/issues/1111
            hex_digits = b"."
        else:
            hex_digits = piece[:closing].replace(b" ", b"")
        pieces.append(hex_digits + b" " + piece[closing + 1 :])

    joined = b" ".join(pieces)
    return joined.replace(b"[", b" [ ").replace(b"]", b" ]\n ").replace(b"\r", b"\n")

268

269

def process_cm_line(
    line: bytes,
    process_rg: bool,
    process_char: bool,
    multiline_rg: Union[None, tuple[int, int]],
    map_dict: dict[Any, Any],
    int_entry: list[int],
) -> tuple[bool, bool, Union[None, tuple[int, int]]]:
    """
    Handle one line of a prepared CMap and return the updated parser state.

    Args:
        line: Line to process.
        process_rg: Whether the parser is inside a ``bfrange`` section.
        process_char: Whether the parser is inside a ``bfchar`` section.
        multiline_rg: State of a ``bfrange`` list continued from a previous
            line, or ``None``.
        map_dict: Character map, updated in place.
        int_entry: List of integer cmap keys, updated in place.

    Returns:
        The new ``(process_rg, process_char, multiline_rg)``.

    """
    # Empty lines and comment lines (starting with %) change nothing.
    if not line or line.startswith(b"%"):
        return process_rg, process_char, multiline_rg

    line = line.replace(b"\t", b" ")

    # Section markers only toggle the matching flag.
    if b"beginbfrange" in line:
        return True, process_char, multiline_rg
    if b"endbfrange" in line:
        return False, process_char, multiline_rg
    if b"beginbfchar" in line:
        return process_rg, True, multiline_rg
    if b"endbfchar" in line:
        return process_rg, False, multiline_rg

    if process_rg:
        try:
            multiline_rg = parse_bfrange(line, map_dict, int_entry, multiline_rg)
        except binascii.Error as error:
            # A broken hex string only costs this line, not the whole CMap.
            logger_warning(f"Skipping broken line {line!r}: {error}", __name__)
    elif process_char:
        parse_bfchar(line, map_dict, int_entry)
    return process_rg, process_char, multiline_rg

297

298

def parse_bfrange(
    line: bytes,
    map_dict: dict[Any, Any],
    int_entry: list[int],
    multiline_rg: Union[None, tuple[int, int]],
) -> Union[None, tuple[int, int]]:
    """
    Parse one line of a ``bfrange`` section into the character map.

    A line is either ``first last target`` (consecutive targets starting
    at ``target``), ``first last [ t1 t2 ... ]`` (explicit targets), or
    the continuation of such a list started on an earlier line.

    Args:
        line: Prepared line whose tokens are separated by spaces.
        map_dict: Character map, updated in place. ``map_dict[-1]`` is set
            to the number of bytes of a source code; it is only read, not
            set, for a continuation line.
        int_entry: List of integer source codes, appended to in place.
        multiline_rg: ``(next code, last code)`` when the line continues
            a list, otherwise ``None``.

    Returns:
        ``None`` when the range is complete, or ``(next code, last code)``
        when the closing ``]`` has not been found yet.

    """
    tokens = [token for token in line.split(b" ") if token]

    if multiline_rg is not None:
        # Neither boundary is on the current line: resume from the state.
        key_format = b"%%0%dX" % (map_dict[-1] * 2)
        code, last_code = multiline_rg[0], multiline_rg[1]
        has_list = True
        targets = tokens
    else:
        code = int(tokens[0], 16)
        last_code = int(tokens[1], 16)
        map_dict[-1] = ceil(max(len(tokens[0]), len(tokens[1])) / 2)
        key_format = b"%%0%dX" % (map_dict[-1] * 2)
        has_list = tokens[2] == b"["
        targets = tokens[3:]

    def store(source_code: int, target: str) -> None:
        # One-byte codes are decoded with charmap, longer ones as UTF-16-BE.
        key = unhexlify(key_format % source_code).decode(
            "charmap" if map_dict[-1] == 1 else "utf-16-be",
            "surrogatepass",
        )
        map_dict[key] = target
        int_entry.append(source_code)

    if has_list:
        for hex_target in targets:
            if hex_target == b"]":
                return None
            store(code, unhexlify(hex_target).decode("utf-16-be", "surrogatepass"))
            code += 1
        # List not closed on this line: hand the state back to the caller.
        return code, last_code

    # Case without list: targets are consecutive values from the third token.
    first_target = int(tokens[2], 16)
    target_format = b"%%0%dX" % max(4, len(tokens[2]))
    for offset in range(last_code - code + 1):
        store(
            code + offset,
            unhexlify(target_format % (first_target + offset)).decode(
                "utf-16-be", "surrogatepass"
            ),
        )
    return None

357

358

def parse_bfchar(line: bytes, map_dict: dict[Any, Any], int_entry: list[int]) -> None:
    """
    Parse one line of a ``bfchar`` section into the character map.

    The line holds ``source target`` pairs of hex strings separated by
    spaces; a trailing unpaired token is ignored.

    Args:
        line: Prepared line to parse.
        map_dict: Character map, updated in place. ``map_dict[-1]`` is set
            to the byte length of the first source code on the line.
        int_entry: List of integer source codes, appended to in place.

    """
    tokens = [token for token in line.split(b" ") if token]
    map_dict[-1] = len(tokens[0]) // 2

    for source, target in zip(tokens[::2], tokens[1::2]):
        mapped = ""
        # b"." is the placeholder stashed by prepare_cm for an empty string.
        if target != b".":
            try:
                mapped = unhexlify(target).decode(
                    "charmap" if len(target) < 4 else "utf-16-be", "surrogatepass"
                )
            except BinasciiError as exception:
                # Invalid target hex: keep the entry, mapped to "".
                logger_warning(f"Got invalid hex string: {exception!s} ({target!r})", __name__)
        key = unhexlify(source).decode(
            "charmap" if map_dict[-1] == 1 else "utf-16-be", "surrogatepass"
        )
        map_dict[key] = mapped
        int_entry.append(int(source, 16))

379

380

def build_font_width_map(
    ft: DictionaryObject, default_font_width: float
) -> dict[Any, float]:
    """
    Build the mapping from character to glyph width for a font.

    The result is keyed by single-character strings plus the special key
    ``"default"``, which holds the width to use for characters that have
    no entry of their own.

    Args:
        ft: Font dictionary.
        default_font_width: Width stored under ``"default"`` for a font
            with ``/DescendantFonts`` that has no ``/DW`` and whose
            ``/BaseFont`` is not a known core font.

    Returns:
        A dictionary owned by the caller; the shared core font metrics are
        never modified. ``"default"`` is set to 0 if nothing else
        provided it.

    """
    font_width_map: dict[Any, float] = {}
    st: int = 0
    en: int = 0
    if "/DescendantFonts" in ft:  # ft["/Subtype"].startswith("/CIDFontType"):
        # §9.7.4.3 of the 1.7 reference ("Glyph Metrics in CIDFonts")
        # Widths for a CIDFont are defined using the DW and W entries.
        # DW2 and W2 are for vertical use. Vertical type is not implemented.
        ft1 = ft["/DescendantFonts"][0].get_object()  # type: ignore
        if "/DW" in ft1:
            font_width_map["default"] = cast(float, ft1["/DW"].get_object())
        else:
            font_name = str(ft["/BaseFont"]).removeprefix("/")
            if font_name in CORE_FONT_METRICS:
                # This applies to test_tounicode_is_identity, which has a CID CourierNew font that
                # apparently does not specify the width of a space.
                font_width_map["default"] = CORE_FONT_METRICS[font_name].character_widths[" "] * 2
            else:
                font_width_map["default"] = default_font_width
        if "/W" in ft1:
            w = ft1["/W"].get_object()
        else:
            w = []
        while len(w) > 0:
            st = w[0] if isinstance(w[0], int) else w[0].get_object()
            second = w[1].get_object()
            if isinstance(second, int):
                # C_first C_last same_W
                en = second
                width = w[2].get_object()
                if not isinstance(width, (int, float)):
                    logger_warning(f"Expected numeric value for width, got {width}. Ignoring it.", __name__)
                    w = w[3:]
                    continue
                for c_code in range(st, en + 1):
                    font_width_map[chr(c_code)] = width
                w = w[3:]
            elif isinstance(second, list):
                # Starting_C [W1 W2 ... Wn]
                c_code = st
                for ww in second:
                    width = ww.get_object()
                    font_width_map[chr(c_code)] = width
                    c_code += 1
                w = w[2:]
            else:
                logger_warning(
                    "unknown widths : \n" + (ft1["/W"]).__repr__(),
                    __name__,
                )
                break
    elif "/Widths" in ft:
        w = cast(ArrayObject, ft["/Widths"].get_object())
        if "/FontDescriptor" in ft and "/MissingWidth" in cast(
            DictionaryObject, ft["/FontDescriptor"]
        ):
            font_width_map["default"] = ft["/FontDescriptor"]["/MissingWidth"].get_object()  # type: ignore
        else:
            # will consider width of char as avg(width)
            m = 0
            cpt = 0
            for xx in w:
                xx = xx.get_object()
                if xx > 0:
                    m += xx
                    cpt += 1
            font_width_map["default"] = m / max(1, cpt)
        st = cast(int, ft["/FirstChar"])
        en = cast(int, ft["/LastChar"])
        for c_code in range(st, en + 1):
            try:
                width = w[c_code - st].get_object()
                font_width_map[chr(c_code)] = width
            except (IndexError, KeyError):
                # The PDF structure is invalid. The array is too small
                # for the specified font width.
                pass
    else:
        font_name = str(ft["/BaseFont"]).removeprefix("/")
        if font_name in CORE_FONT_METRICS:
            # Copy the widths: writing "default" into the shared core font
            # metrics would change them for every later user.
            font_width_map = dict(
                cast(dict[str, float], CORE_FONT_METRICS[font_name].character_widths)
            )
            font_width_map["default"] = font_width_map[" "] * 2
    if is_null_or_none(font_width_map.get("default")):
        font_width_map["default"] = 0
    return font_width_map

468

469

def compute_space_width(
    font_width_map: dict[Any, float], space_char: str
) -> float:
    """
    Return the width of the space character of a font.

    Args:
        font_width_map: Character widths, including a ``"default"`` entry.
        space_char: Key of the space character in ``font_width_map``.

    Returns:
        The width stored for ``space_char``. If that entry is missing or
        zero, half of the ``"default"`` width is returned instead.

    """
    width = font_width_map.get(space_char, 0)
    if width == 0:
        # When the default is used, a space is taken to be only half size.
        return font_width_map["default"] / 2.0
    return width

483

484

def compute_font_width(
    font_width_map: dict[Any, float],
    char: str
) -> float:
    """
    Return the width of a character in a font.

    Args:
        font_width_map: Character widths, including a ``"default"`` entry.
        char: Character to look up.

    Returns:
        The width stored for ``char``, or the ``"default"`` width when the
        character has no entry.

    """
    if char in font_width_map:
        return font_width_map[char]
    return font_width_map["default"]

498

499

def _type1_alternative(
    ft: DictionaryObject,
    map_dict: dict[Any, Any],
    int_entry: list[int],
) -> tuple[dict[Any, Any], list[int]]:
    """
    Fill the character map from the encoding embedded in a font file.

    The clear-text part of ``/FontDescriptor`` → ``/FontFile`` (everything
    before ``eexec``) is scanned for ``dup <code> <glyph name> put`` lines
    following ``/Encoding``. Lines that cannot be interpreted are skipped.

    Args:
        ft: Font dictionary.
        map_dict: Character map, updated in place.
        int_entry: List of integer codes, appended to in place.

    Returns:
        The same ``map_dict`` and ``int_entry`` objects that were passed
        in. They are returned untouched when the font has no descriptor,
        no font file or no ``/Encoding`` section.

    """
    if "/FontDescriptor" not in ft:
        return map_dict, int_entry
    ft_desc = cast(DictionaryObject, ft["/FontDescriptor"]).get("/FontFile")
    if is_null_or_none(ft_desc):
        return map_dict, int_entry
    assert ft_desc is not None, "mypy"
    txt = ft_desc.get_object().get_data()
    txt = txt.split(b"eexec\n")[0]  # only clear part
    sections = txt.split(b"/Encoding")
    if len(sections) < 2:
        # No /Encoding in the clear part: nothing to extract.
        return map_dict, int_entry
    txt = sections[1]  # to get the encoding part
    lines = txt.replace(b"\r", b"\n").split(b"\n")
    for li in lines:
        if not li.startswith(b"dup"):
            continue
        words = [_w for _w in li.split(b" ") if _w != b""]
        if len(words) < 3:
            # Truncated line: there is no code and glyph name to read.
            continue
        if len(words) > 3 and words[3] != b"put":
            continue
        try:
            i = int(words[1])
        except ValueError:  # pragma: no cover
            continue
        try:
            v = adobe_glyphs[words[2].decode()]
        except (KeyError, UnicodeDecodeError):
            # Unknown or undecodable name: only /uniXXXX names still work.
            if words[2].startswith(b"/uni"):
                try:
                    v = chr(int(words[2][4:], 16))
                except ValueError:  # pragma: no cover
                    continue
            else:
                continue
        map_dict[chr(i)] = v
        int_entry.append(i)
    return map_dict, int_entry

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_cmap.py: 9%

285 statements