Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/

1import binascii

2from binascii import Error as BinasciiError

3from binascii import unhexlify

4from math import ceil

5from typing import Any, Dict, List, Tuple, Union, cast

7from ._codecs import adobe_glyphs, charset_encoding

8from ._utils import logger_error, logger_warning

9from .generic import (

10 ArrayObject,

11 DecodedStreamObject,

12 DictionaryObject,

13 NullObject,

14 StreamObject,

15 is_null_or_none,

16)

19# code freely inspired from @twiggy ; see #711

def build_char_map(
    font_name: str, space_width: float, obj: DictionaryObject
) -> Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any], DictionaryObject]:
    """
    Describe the font registered under ``font_name`` in the resources of ``obj``.

    The font dictionary is read from ``obj["/Resources"]["/Font"][font_name]``
    and analysed by :func:`build_char_map_from_dict`.

    Args:
        font_name: key of the font inside the "/Font" resource dictionary.
        space_width: fallback space width, forwarded unchanged to
            :func:`build_char_map_from_dict`.
        obj: object holding a "/Resources" dictionary (e.g. an XObject or a Page).

    Returns:
        A 5-tuple: font sub-type, half-space width, encoding, character map
        and the font dictionary that was looked up.

    """
    font_dict: DictionaryObject = obj["/Resources"]["/Font"][font_name]  # type: ignore
    subtype, halfspace, encoding, char_map = build_char_map_from_dict(
        space_width, font_dict
    )
    return subtype, halfspace, encoding, char_map, font_dict

def build_char_map_from_dict(
    space_width: float, ft: DictionaryObject
) -> Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any]]:
    """
    Describe a font given its font dictionary.

    Args:
        space_width: fallback space width; it is doubled before being handed
            to :func:`build_font_width_map` as the default character width.
        ft: font dictionary.

    Returns:
        A 4-tuple: font sub-type, half of the computed space width,
        encoding and character map.

    """
    subtype = cast(str, ft["/Subtype"].get_object())
    encoding, map_dict = get_encoding(ft)

    # The width table is looked up with the character that encodes a space
    # in this font, which is not necessarily " " itself.
    space_key = get_actual_str_key(" ", encoding, map_dict)
    widths = build_font_width_map(ft, space_width * 2.0)
    full_space = compute_space_width(widths, space_key)

    # https://github.com/python/mypy/issues/4374
    return subtype, full_space / 2.0, encoding, map_dict

# Fallback description used when font data is missing (e.g. no font definition):
# sub-type "Unknown", a 9999 half-space width, every one-byte code mapped to the
# replacement character, and an empty character map.
unknown_char_map: Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any]] = (
    "Unknown",
    9999,
    {code: "�" for code in range(256)},
    {},
)

# Predefined CMap names mapped to the Python codec used to decode them.
# Every CMap exists in a horizontal ("-H") and a vertical ("-V") flavour
# sharing the same codec.
# Names containing "-UCS2-" are not listed here: they are handled in code.
_predefined_cmap: Dict[str, str] = {
    f"/{base}-{direction}": codec
    for base, codec in (
        ("Identity", "utf-16-be"),
        ("GB-EUC", "gbk"),
        ("GBpc-EUC", "gb2312"),
        ("GBK-EUC", "gbk"),
        ("GBK2K", "gb18030"),
        ("ETen-B5", "cp950"),
        ("ETenms-B5", "cp950"),
        ("UniCNS-UTF16", "utf-16-be"),
        ("UniGB-UTF16", "gb18030"),
    )
    for direction in ("H", "V")
}

105

# Space widths of the standard fonts, grouped by value.
# manually extracted from http://mirrors.ctan.org/fonts/adobe/afm/Adobe-Core35_AFMs-229.tar.gz
_default_fonts_space_width: Dict[str, int] = {
    **dict.fromkeys(
        (
            "/Courier",
            "/Courier-Bold",
            "/Courier-BoldOblique",
            "/Courier-Oblique",
        ),
        600,
    ),
    **dict.fromkeys(
        (
            "/Helvetica",
            "/Helvetica-Bold",
            "/Helvetica-BoldOblique",
            "/Helvetica-Oblique",
        ),
        278,
    ),
    **dict.fromkeys(
        (
            "/Helvetica-Narrow",
            "/Helvetica-NarrowBold",
            "/Helvetica-NarrowBoldOblique",
            "/Helvetica-NarrowOblique",
        ),
        228,
    ),
    **dict.fromkeys(
        (
            "/Times-Roman",
            "/Times-Bold",
            "/Times-BoldItalic",
            "/Times-Italic",
            "/Symbol",
        ),
        250,
    ),
    "/ZapfDingbats": 278,
}

127

128

def get_encoding(
    ft: DictionaryObject
) -> Tuple[Union[str, Dict[int, str]], Dict[Any, Any]]:
    """
    Return the encoding and the ToUnicode character map of a font.

    Args:
        ft: font dictionary.

    Returns:
        ``(encoding, map_dict)`` as produced by :func:`_parse_encoding` and
        :func:`_parse_to_unicode`, after reconciling the two.

    """
    encoding = _parse_encoding(ft)
    map_dict, cmap_codes = _parse_to_unicode(ft)

    # A string encoding is a codec name and is expected to be an identity
    # translation already: nothing to reconcile.
    if not isinstance(encoding, dict):
        return encoding, map_dict

    # Apply rule from PDF ref 1.7 §5.9.1, 1st bullet:
    # if cmap not empty encoding should be discarded
    # (here transformed into identity for those characters)
    for code in cmap_codes:
        if code > 255:
            continue
        encoding[code] = chr(code)

    return encoding, map_dict

145

146

def _parse_encoding(
    ft: DictionaryObject
) -> Union[str, Dict[int, str]]:
    """
    Determine the encoding declared by a font dictionary.

    Args:
        ft: font dictionary.

    Returns:
        Either a string (a codec name such as "charmap" or "utf-16-be", or the
        unrecognised encoding name itself) or a dictionary mapping character
        codes to characters.

    """
    encoding: Union[str, List[str], Dict[int, str]] = []
    if "/Encoding" not in ft:
        # No explicit encoding: use the table registered for the base font,
        # if any, otherwise decode bytes one-to-one.
        if "/BaseFont" in ft and cast(str, ft["/BaseFont"]) in charset_encoding:
            encoding = dict(
                zip(range(256), charset_encoding[cast(str, ft["/BaseFont"])])
            )
        else:
            encoding = "charmap"
        return encoding
    enc: Union[str, DictionaryObject, NullObject] = cast(
        Union[str, DictionaryObject, NullObject], ft["/Encoding"].get_object()
    )
    if isinstance(enc, str):
        # already done : enc = NameObject.unnumber(enc.encode()).decode()
        # for #xx decoding
        if enc in charset_encoding:
            encoding = charset_encoding[enc].copy()
        elif enc in _predefined_cmap:
            encoding = _predefined_cmap[enc]
        elif "-UCS2-" in enc:
            encoding = "utf-16-be"
        else:
            # Unknown name: report it and hand the name back unchanged.
            logger_error(f"Advanced encoding {enc} not implemented yet", __name__)
            encoding = enc
    elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc:
        base_encoding = None
        try:
            base_encoding = cast(str, enc["/BaseEncoding"])
            encoding = charset_encoding[base_encoding].copy()
        except Exception:
            # Report the base encoding that could not be used (not the still
            # empty result) and fall back to the standard table.
            logger_error(
                f"Advanced encoding {base_encoding} not implemented yet",
                __name__,
            )
            encoding = charset_encoding["/StandardEncoding"].copy()
    else:
        encoding = charset_encoding["/StandardEncoding"].copy()
    if isinstance(enc, DictionaryObject) and "/Differences" in enc:
        # /Differences alternates integers (the next code to assign) and
        # glyph names (assigned to consecutive codes from there).
        x: int = 0
        o: Union[int, str]
        for o in cast(DictionaryObject, enc["/Differences"]):
            if isinstance(o, int):
                x = o
            else:  # isinstance(o, str):
                try:
                    if x < len(encoding):
                        encoding[x] = adobe_glyphs[o]  # type: ignore
                except Exception:
                    # Glyph name not translatable: keep the raw name.
                    encoding[x] = o  # type: ignore
                x += 1
    if isinstance(encoding, list):
        encoding = dict(zip(range(256), encoding))
    return encoding

204

205

def _parse_to_unicode(
    ft: DictionaryObject
) -> Tuple[Dict[Any, Any], List[int]]:
    """
    Build the character map described by the "/ToUnicode" entry of a font.

    Args:
        ft: font dictionary.

    Returns:
        ``(map_dict, int_entry)``: ``map_dict`` stores every translation, plus
        the number of bytes per code under the key ``-1``; ``int_entry`` lists
        the mapped codes as integers so that the encoding can be corrected.
        Without "/ToUnicode", Type1 fonts are delegated to
        :func:`_type1_alternative` and any other font yields ``({}, [])``.

    """
    if "/ToUnicode" in ft:
        map_dict: Dict[Any, Any] = {}
        int_entry: List[int] = []
        in_range: bool = False
        in_char: bool = False
        # tuple = (current_char, remaining size) ; cf #1285 for example of file
        pending_range: Union[None, Tuple[int, int]] = None
        for raw_line in prepare_cm(ft).split(b"\n"):
            in_range, in_char, pending_range = process_cm_line(
                raw_line.strip(b" \t"),
                in_range,
                in_char,
                pending_range,
                map_dict,
                int_entry,
            )
        return map_dict, int_entry

    if ft.get("/Subtype", "") == "/Type1":
        return _type1_alternative(ft, {}, [])
    return {}, []

237

238

def get_actual_str_key(
    value_char: str, encoding: Union[str, Dict[int, str]], map_dict: Dict[Any, Any]
) -> str:
    """
    Find the key under which ``value_char`` is stored for a font.

    Args:
        value_char: character to look for among the mapped values.
        encoding: dictionary of code -> character, or a codec name.
        map_dict: character map, only consulted when ``encoding`` is not a
            dictionary.

    Returns:
        With a dictionary encoding, ``chr(code)`` of the last code mapped to
        ``value_char``; otherwise the last key of ``map_dict`` mapped to it.
        ``value_char`` itself when nothing maps to it.

    """
    found = value_char
    if isinstance(encoding, dict):
        for code, char in encoding.items():
            if char == value_char:
                found = chr(code)
    else:
        for key, char in map_dict.items():
            if char == value_char:
                found = key
    return found

248

249

def prepare_cm(ft: DictionaryObject) -> bytes:
    """
    Return the "/ToUnicode" CMap of a font, normalised for line-based parsing.

    Section keywords are put on their own lines, ``<...>`` hex strings lose
    their delimiters and inner spaces, brackets are surrounded by spaces and
    carriage returns become newlines.

    Args:
        ft: font dictionary holding a "/ToUnicode" entry.

    Returns:
        The prepared CMap content.

    """
    tu = ft["/ToUnicode"]
    cm: bytes
    if isinstance(tu, StreamObject):
        cm = cast(DecodedStreamObject, tu).get_data()
    else:  # if (tu is None) or cast(str, tu).startswith("/Identity"):
        # not a stream: use a minimal bfrange written with 4-hex-digit codes
        cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange"
    if isinstance(cm, str):
        cm = cm.encode()

    # we need to prepare cm before due to missing return line in pdf printed
    # to pdf from word
    cm = cm.strip()
    for keyword in (b"beginbfchar", b"endbfchar", b"beginbfrange", b"endbfrange"):
        cm = cm.replace(keyword, b"\n" + keyword + b"\n")
    # text between << and >> not used but some solution to find it back
    cm = cm.replace(b"<<", b"\n{\n").replace(b">>", b"\n}\n")

    pieces = []
    for piece in cm.split(b"<"):
        hex_part, closer, tail = piece.partition(b">")
        if not closer:
            # no hex string ends in this piece: keep it untouched
            pieces.append(piece)
            continue
        if hex_part:
            token = hex_part.replace(b" ", b"")
        else:
            # string is empty: stash a placeholder, recognised by parse_bfchar
            # see https://github.com/py-pdf/pypdf/issues/1111
            token = b"."
        pieces.append(token + b" " + tail)

    joined = b" ".join(pieces)
    joined = joined.replace(b"[", b" [ ")
    joined = joined.replace(b"]", b" ]\n ")
    return joined.replace(b"\r", b"\n")

289

290

def process_cm_line(
    line: bytes,
    process_rg: bool,
    process_char: bool,
    multiline_rg: Union[None, Tuple[int, int]],
    map_dict: Dict[Any, Any],
    int_entry: List[int],
) -> Tuple[bool, bool, Union[None, Tuple[int, int]]]:
    """
    Process one prepared CMap line and update the parsing state.

    Args:
        line: line to process, already stripped of surrounding blanks.
        process_rg: whether the parser is inside a bfrange section.
        process_char: whether the parser is inside a bfchar section.
        multiline_rg: state of a bfrange list continued over several lines,
            or None.
        map_dict: character map, filled in place.
        int_entry: list of mapped codes, filled in place.

    Returns:
        The updated ``(process_rg, process_char, multiline_rg)``.

    """
    # Blank lines and comment lines leave the state untouched.
    if line == b"" or line[0] == 37:  # 37 = %
        return process_rg, process_char, multiline_rg
    line = line.replace(b"\t", b" ")
    if b"beginbfrange" in line:
        process_rg = True
    elif b"endbfrange" in line:
        process_rg = False
    elif b"beginbfchar" in line:
        process_char = True
    elif b"endbfchar" in line:
        process_char = False
    elif process_rg:
        try:
            multiline_rg = parse_bfrange(line, map_dict, int_entry, multiline_rg)
        except binascii.Error as error:
            logger_warning(f"Skipping broken line {line!r}: {error}", __name__)
    elif process_char:
        # Same tolerance as for bfrange lines: a malformed hex code skips the
        # line instead of aborting the whole character map.
        try:
            parse_bfchar(line, map_dict, int_entry)
        except binascii.Error as error:
            logger_warning(f"Skipping broken line {line!r}: {error}", __name__)
    return process_rg, process_char, multiline_rg

318

319

def parse_bfrange(
    line: bytes,
    map_dict: Dict[Any, Any],
    int_entry: List[int],
    multiline_rg: Union[None, Tuple[int, int]],
) -> Union[None, Tuple[int, int]]:
    """
    Parse one line of a bfrange section.

    Args:
        line: prepared line; tokens are separated by spaces.
        map_dict: character map, filled in place. ``map_dict[-1]`` holds the
            number of bytes per source code.
        int_entry: list of mapped source codes, filled in place.
        multiline_rg: ``(next code, last code)`` when the line continues a
            bracketed list started on an earlier line, otherwise None.

    Returns:
        None when the range is complete, otherwise ``(next code, last code)``
        to be passed back with the following line.

    """

    def source_key(fmt: bytes, code: int) -> str:
        # fixed-width hex rendering of the code, decoded as a single byte
        # or as UTF-16BE depending on the code size
        return unhexlify(fmt % code).decode(
            "charmap" if map_dict[-1] == 1 else "utf-16-be",
            "surrogatepass",
        )

    def consume_targets(fmt: bytes, items: List[bytes], code: int) -> Tuple[int, bool]:
        # map consecutive codes to the listed targets; stop at the closing bracket
        for item in items:
            if item == b"]":
                return code, True
            target = unhexlify(item).decode("utf-16-be", "surrogatepass")
            map_dict[source_key(fmt, code)] = target
            int_entry.append(code)
            code += 1
        return code, False

    tokens = [tok for tok in line.split(b" ") if tok]

    if multiline_rg is not None:
        # first and last codes are not in the current line
        key_fmt = b"%%0%dX" % (map_dict[-1] * 2)
        last = multiline_rg[1]
        current, closed = consume_targets(key_fmt, tokens, multiline_rg[0])
        return None if closed else (current, last)

    current = int(tokens[0], 16)
    last = int(tokens[1], 16)
    map_dict[-1] = ceil(max(len(tokens[0]), len(tokens[1])) / 2)
    key_fmt = b"%%0%dX" % (map_dict[-1] * 2)

    if tokens[2] == b"[":
        current, closed = consume_targets(key_fmt, tokens[3:], current)
        return None if closed else (current, last)

    # case without list: targets are consecutive, starting from the third token
    first_target = int(tokens[2], 16)
    target_fmt = b"%%0%dX" % max(4, len(tokens[2]))
    for offset in range(last - current + 1):
        code = current + offset
        target = unhexlify(target_fmt % (first_target + offset)).decode(
            "utf-16-be", "surrogatepass"
        )
        map_dict[source_key(key_fmt, code)] = target
        int_entry.append(code)
    return None

378

379

def parse_bfchar(line: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> None:
    """
    Parse one line of a bfchar section.

    The line holds space-separated (source code, target) pairs of hex tokens;
    a trailing unpaired token is ignored.

    Args:
        line: prepared line.
        map_dict: character map, filled in place. ``map_dict[-1]`` is set to
            the number of bytes of the first source code.
        int_entry: list of mapped source codes, filled in place.

    """
    tokens = [tok for tok in line.split(b" ") if tok]
    code_size = len(tokens[0]) // 2
    map_dict[-1] = code_size
    source_codec = "charmap" if code_size == 1 else "utf-16-be"

    for source, target in zip(tokens[0::2], tokens[1::2]):
        mapped = ""
        # the "." placeholder (set while preparing the cmap) means empty string
        if target != b".":
            try:
                mapped = unhexlify(target).decode(
                    "charmap" if len(target) < 4 else "utf-16-be", "surrogatepass"
                )
            except BinasciiError as exception:
                # invalid target: keep the source code, mapped to nothing
                logger_warning(f"Got invalid hex string: {exception!s} ({target!r})", __name__)
        map_dict[unhexlify(source).decode(source_codec, "surrogatepass")] = mapped
        int_entry.append(int(source, 16))

400

401

def build_font_width_map(
    ft: DictionaryObject, default_font_width: float
) -> Dict[Any, float]:
    """
    Build the table of character widths of a font.

    Args:
        ft: font dictionary.
        default_font_width: width used as "default" when the font provides
            none; replaced by twice the known space width when "/BaseFont"
            is one of the fonts listed in ``_default_fonts_space_width``.

    Returns:
        Dictionary mapping characters to their width, with the fallback width
        stored under the key "default".

    """
    font_width_map: Dict[Any, float] = {}
    st: int = 0
    en: int = 0
    try:
        default_font_width = _default_fonts_space_width[cast(str, ft["/BaseFont"].get_object())] * 2.0
    except KeyError:
        # no "/BaseFont", or not a font with a known space width
        pass
    if "/DescendantFonts" in ft:  # ft["/Subtype"].startswith("/CIDFontType"):
        # §9.7.4.3 of the 1.7 reference ("Glyph Metrics in CIDFonts")
        # Widths for a CIDFont are defined using the DW and W entries.
        # DW2 and W2 are for vertical use. Vertical type is not implemented.
        ft1 = ft["/DescendantFonts"][0].get_object()  # type: ignore
        if "/DW" in ft1:
            font_width_map["default"] = cast(float, ft1["/DW"].get_object())
        else:
            font_width_map["default"] = default_font_width
        if "/W" in ft1:
            w = ft1["/W"].get_object()
        else:
            w = []
        # Walk the array with a cursor rather than re-slicing it per entry.
        pos = 0
        total = len(w)
        while pos < total:
            if total - pos < 2:
                # The PDF structure is invalid: an entry needs at least
                # two items.
                logger_warning(
                    f"Incomplete /W entry, ignoring it: {w[pos:]!r}", __name__
                )
                break
            st = w[pos] if isinstance(w[pos], int) else w[pos].get_object()
            second = w[pos + 1].get_object()
            if isinstance(second, int):
                # C_first C_last same_W
                if total - pos < 3:
                    logger_warning(
                        f"Incomplete /W entry, ignoring it: {w[pos:]!r}", __name__
                    )
                    break
                en = second
                width = w[pos + 2].get_object()
                pos += 3
                if not isinstance(width, (int, float)):
                    logger_warning(f"Expected numeric value for width, got {width}. Ignoring it.", __name__)
                    continue
                for c_code in range(st, en + 1):
                    font_width_map[chr(c_code)] = width
            elif isinstance(second, list):
                # Starting_C [W1 W2 ... Wn]
                c_code = st
                for ww in second:
                    width = ww.get_object()
                    font_width_map[chr(c_code)] = width
                    c_code += 1
                pos += 2
            else:
                logger_warning(
                    "unknown widths : \n" + (ft1["/W"]).__repr__(),
                    __name__,
                )
                break
    elif "/Widths" in ft:
        w = cast(ArrayObject, ft["/Widths"].get_object())
        if "/FontDescriptor" in ft and "/MissingWidth" in cast(
            DictionaryObject, ft["/FontDescriptor"]
        ):
            font_width_map["default"] = ft["/FontDescriptor"]["/MissingWidth"].get_object()  # type: ignore
        else:
            # will consider width of char as avg(width)
            m = 0
            cpt = 0
            for xx in w:
                xx = xx.get_object()
                if xx > 0:
                    m += xx
                    cpt += 1
            font_width_map["default"] = m / max(1, cpt)
        st = cast(int, ft["/FirstChar"])
        en = cast(int, ft["/LastChar"])
        for c_code in range(st, en + 1):
            try:
                width = w[c_code - st].get_object()
                font_width_map[chr(c_code)] = width
            except (IndexError, KeyError):
                # The PDF structure is invalid. The array is too small
                # for the specified font width.
                pass
    if is_null_or_none(font_width_map.get("default")):
        font_width_map["default"] = default_font_width if default_font_width else 0.0
    return font_width_map

482

483

def compute_space_width(
    font_width_map: Dict[Any, float], space_char: str
) -> float:
    """
    Return the width to use for a space.

    Args:
        font_width_map: character widths, with the fallback under "default".
        space_char: character representing a space in the font.

    Returns:
        The width stored for ``space_char``; when it is missing or zero,
        half of the "default" width.

    """
    if space_char in font_width_map:
        width = font_width_map[space_char]
        if width != 0:
            return width
    # if using default we consider space will be only half size
    return font_width_map["default"] / 2.0

497

498

def compute_font_width(
    font_width_map: Dict[Any, float],
    char: str
) -> float:
    """
    Return the width of a character.

    Args:
        font_width_map: character widths, with the fallback under "default".
        char: character to look up.

    Returns:
        The width stored for ``char``, or the "default" width when the
        character has no entry.

    """
    if char in font_width_map:
        return font_width_map[char]
    return font_width_map["default"]

512

513

def _type1_alternative(
    ft: DictionaryObject,
    map_dict: Dict[Any, Any],
    int_entry: List[int],
) -> Tuple[Dict[Any, Any], List[int]]:
    """
    Build a character map from the encoding embedded in a Type1 font file.

    The clear-text part of "/FontFile" (everything before ``eexec``) is
    scanned for ``dup <code> <glyph name> put`` lines following "/Encoding".
    Lines that cannot be interpreted are skipped.

    Args:
        ft: font dictionary.
        map_dict: character map, filled in place.
        int_entry: list of mapped codes, filled in place.

    Returns:
        The same ``map_dict`` and ``int_entry``, left unchanged when the font
        has no descriptor, no font file or no embedded encoding.

    """
    if "/FontDescriptor" not in ft:
        return map_dict, int_entry
    ft_desc = cast(DictionaryObject, ft["/FontDescriptor"]).get("/FontFile")
    if is_null_or_none(ft_desc):
        return map_dict, int_entry
    assert ft_desc is not None, "mypy"
    txt = ft_desc.get_object().get_data()
    txt = txt.split(b"eexec\n")[0]  # only clear part
    sections = txt.split(b"/Encoding")
    if len(sections) < 2:
        # no encoding in the clear part: nothing to extract
        return map_dict, int_entry
    lines = sections[1].replace(b"\r", b"\n").split(b"\n")  # the encoding part
    for li in lines:
        if not li.startswith(b"dup"):
            continue
        words = [_w for _w in li.split(b" ") if _w != b""]
        if len(words) < 3:
            # needs at least "dup", a code and a glyph name
            continue
        if len(words) > 3 and words[3] != b"put":
            continue
        try:
            i = int(words[1])
            key = chr(i)
        except (ValueError, OverflowError):
            # code is not an integer or not a valid code point
            continue
        try:
            v = adobe_glyphs[words[2].decode()]
        except UnicodeDecodeError:
            # glyph name is not decodable text
            continue
        except KeyError:
            # unknown glyph name: accept the "/uniXXXX" form only
            if not words[2].startswith(b"/uni"):
                continue
            try:
                v = chr(int(words[2][4:], 16))
            except (ValueError, OverflowError):
                continue
        map_dict[key] = v
        int_entry.append(i)
    return map_dict, int_entry

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_cmap.py: 9%

282 statements