Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/

1import binascii

2from binascii import unhexlify

3from math import ceil

4from typing import Any, Dict, List, Tuple, Union, cast

6from ._codecs import adobe_glyphs, charset_encoding

7from ._utils import logger_error, logger_warning

8from .generic import (

9 ArrayObject,

10 DecodedStreamObject,

11 DictionaryObject,

12 NullObject,

13 StreamObject,

14 is_null_or_none,

15)

18# code freely inspired from @twiggy ; see #711

def build_char_map(
    font_name: str, space_width: float, obj: DictionaryObject
) -> Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any], DictionaryObject]:
    """
    Look up a font by name and determine its character-mapping information.

    Args:
        font_name: key of the font inside the ``/Font`` resource dictionary.
        space_width: default space width, forwarded to
            ``build_char_map_from_dict`` for use when no data is found.
        obj: object holding a ``/Resources`` dictionary with a ``/Font``
            entry (the original documentation names XObjects and pages).

    Returns:
        A 5-tuple: font sub-type, half space width, encoding,
        character map, and the font dictionary that was looked up.

    """
    fonts = obj["/Resources"]["/Font"]  # type: ignore
    font_dict: DictionaryObject = fonts[font_name]  # type: ignore
    subtype, half_space, encoding, char_map = build_char_map_from_dict(
        space_width, font_dict
    )
    return subtype, half_space, encoding, char_map, font_dict

def build_char_map_from_dict(
    space_width: float, ft: DictionaryObject
) -> Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any]]:
    """
    Determine character-mapping information from a font dictionary.

    Args:
        space_width: default space width used if no width data is found
            (normally half the width of a character). It is doubled before
            being handed to ``build_font_width_map`` as the default width.
        ft: font dictionary.

    Returns:
        A 4-tuple: font sub-type, half of the computed space width,
        encoding, and character map.

    """
    subtype = cast(str, ft["/Subtype"].get_object())
    encoding, char_map = get_encoding(ft)

    # The space character may be stored under a different key than " ",
    # so resolve the key first and only then look up its width.
    space_key = get_actual_str_key(" ", encoding, char_map)
    width_map = build_font_width_map(ft, space_width * 2.0)
    space = compute_space_width(width_map, space_key)

    return subtype, space / 2.0, encoding, char_map

# Fallback result used when data is missing (e.g. the font definition is
# missing): every one of the 256 single-byte codes maps to the same
# replacement character and the character map is empty.
unknown_char_map: Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any]] = (
    "Unknown",
    9999,
    {code: "�" for code in range(256)},
    {},
)

83_predefined_cmap: Dict[str, str] = {

84 "/Identity-H": "utf-16-be",

85 "/Identity-V": "utf-16-be",

86 "/GB-EUC-H": "gbk",

87 "/GB-EUC-V": "gbk",

88 "/GBpc-EUC-H": "gb2312",

89 "/GBpc-EUC-V": "gb2312",

90 "/GBK-EUC-H": "gbk",

91 "/GBK-EUC-V": "gbk",

92 "/GBK2K-H": "gb18030",

93 "/GBK2K-V": "gb18030",

94 "/ETen-B5-H": "cp950",

95 "/ETen-B5-V": "cp950",

96 "/ETenms-B5-H": "cp950",

97 "/ETenms-B5-V": "cp950",

98 "/UniCNS-UTF16-H": "utf-16-be",

99 "/UniCNS-UTF16-V": "utf-16-be",

100 "/UniGB-UTF16-H": "gb18030",

101 "/UniGB-UTF16-V": "gb18030",

102 # UCS2 in code

103}

104

105# manually extracted from http://mirrors.ctan.org/fonts/adobe/afm/Adobe-Core35_AFMs-229.tar.gz

106_default_fonts_space_width: Dict[str, int] = {

107 "/Courier": 600,

108 "/Courier-Bold": 600,

109 "/Courier-BoldOblique": 600,

110 "/Courier-Oblique": 600,

111 "/Helvetica": 278,

112 "/Helvetica-Bold": 278,

113 "/Helvetica-BoldOblique": 278,

114 "/Helvetica-Oblique": 278,

115 "/Helvetica-Narrow": 228,

116 "/Helvetica-NarrowBold": 228,

117 "/Helvetica-NarrowBoldOblique": 228,

118 "/Helvetica-NarrowOblique": 228,

119 "/Times-Roman": 250,

120 "/Times-Bold": 250,

121 "/Times-BoldItalic": 250,

122 "/Times-Italic": 250,

123 "/Symbol": 250,

124 "/ZapfDingbats": 278,

125}

126

127

def get_encoding(
    ft: DictionaryObject
) -> Tuple[Union[str, Dict[int, str]], Dict[Any, Any]]:
    """
    Combine the font's encoding with its character map.

    Args:
        ft: font dictionary.

    Returns:
        A 2-tuple: the encoding (codec name or code -> string table) and
        the character map produced by ``_parse_to_unicode``.

    """
    encoding = _parse_encoding(ft)
    char_map, cmap_codes = _parse_to_unicode(ft)

    # Apply rule from PDF ref 1.7 §5.9.1, 1st bullet:
    # if cmap not empty encoding should be discarded
    # (here transformed into identity for those characters).
    # If encoding is a string, it is expected to be an identity translation.
    if isinstance(encoding, dict):
        encoding.update({code: chr(code) for code in cmap_codes if code <= 255})

    return encoding, char_map

144

145

def _parse_encoding(
    ft: DictionaryObject
) -> Union[str, Dict[int, str]]:
    """
    Determine the encoding of a font from its ``/Encoding`` entry.

    Args:
        ft: font dictionary.

    Returns:
        Either a string (a codec name, or the unrecognised encoding name
        itself) or a table mapping character codes 0..255 to strings.

    """
    encoding: Union[str, List[str], Dict[int, str]] = []
    if "/Encoding" not in ft:
        # No explicit encoding: fall back on a table registered under the
        # base font name, otherwise on the "charmap" codec.
        if "/BaseFont" in ft and cast(str, ft["/BaseFont"]) in charset_encoding:
            encoding = dict(
                zip(range(256), charset_encoding[cast(str, ft["/BaseFont"])])
            )
        else:
            encoding = "charmap"
        return encoding
    enc: Union[str, DictionaryObject, NullObject] = cast(
        Union[str, DictionaryObject, NullObject], ft["/Encoding"].get_object()
    )
    if isinstance(enc, str):
        try:
            # already done : enc = NameObject.unnumber(enc.encode()).decode()
            # for #xx decoding
            if enc in charset_encoding:
                encoding = charset_encoding[enc].copy()
            elif enc in _predefined_cmap:
                encoding = _predefined_cmap[enc]
            elif "-UCS2-" in enc:
                encoding = "utf-16-be"
            else:
                raise Exception("not found")
        except Exception:
            logger_error(f"Advanced encoding {enc} not implemented yet", __name__)
            encoding = enc
    elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc:
        # Remember the requested base encoding so that the error message
        # names it (it used to print the still-empty ``encoding`` list).
        base_encoding: Any = None
        try:
            base_encoding = enc["/BaseEncoding"]
            encoding = charset_encoding[cast(str, base_encoding)].copy()
        except Exception:
            logger_error(
                f"Advanced encoding {base_encoding} not implemented yet",
                __name__,
            )
            encoding = charset_encoding["/StandardEncoding"].copy()
    else:
        encoding = charset_encoding["/StandardEncoding"].copy()
    if isinstance(enc, DictionaryObject) and "/Differences" in enc:
        # /Differences is iterated as a flat sequence: an integer sets the
        # current code, every other item is a glyph name assigned to the
        # current code, which is then incremented.
        x: int = 0
        o: Union[int, str]
        for o in cast(DictionaryObject, enc["/Differences"]):
            if isinstance(o, int):
                x = o
            else:  # isinstance(o, str):
                try:
                    # Codes outside the table are ignored; the lower bound
                    # prevents a negative code from overwriting an entry
                    # counted from the end of the table.
                    if 0 <= x < len(encoding):
                        encoding[x] = adobe_glyphs[o]  # type: ignore
                except Exception:
                    # Unknown glyph name: keep the name itself.
                    encoding[x] = o  # type: ignore
                x += 1
    if isinstance(encoding, list):
        encoding = dict(zip(range(256), encoding))
    return encoding

203

204

def _parse_to_unicode(
    ft: DictionaryObject
) -> Tuple[Dict[Any, Any], List[int]]:
    """
    Build the character map of a font from its ``/ToUnicode`` entry.

    Args:
        ft: font dictionary.

    Returns:
        A 2-tuple: the character map (its ``-1`` key holds the number of
        bytes to convert) and the list of cmap keys as integers, used to
        correct the encoding.

    """
    char_map: Dict[Any, Any] = {}
    cmap_codes: List[int] = []

    if "/ToUnicode" not in ft:
        # Type1 fonts get a second chance through their font descriptor.
        if ft.get("/Subtype", "") == "/Type1":
            return _type1_alternative(ft, char_map, cmap_codes)
        return {}, []

    in_range: bool = False
    in_char: bool = False
    # tuple = (current_char, remaining size) ; cf #1285 for example of file
    pending_range: Union[None, Tuple[int, int]] = None
    for raw_line in prepare_cm(ft).split(b"\n"):
        in_range, in_char, pending_range = process_cm_line(
            raw_line.strip(b" \t"),
            in_range,
            in_char,
            pending_range,
            char_map,
            cmap_codes,
        )

    return char_map, cmap_codes

236

237

def get_actual_str_key(
    value_char: str, encoding: Union[str, Dict[int, str]], map_dict: Dict[Any, Any]
) -> str:
    """
    Find the key under which a character is stored.

    Args:
        value_char: character to search for among the mapped values.
        encoding: when it is a dict, it is searched and the matching
            integer code is returned as ``chr(code)``; otherwise
            ``map_dict`` is searched instead.
        map_dict: character map, searched when ``encoding`` is not a dict;
            the matching key is returned unchanged.

    Returns:
        The key of the last entry whose value equals ``value_char``, or
        ``value_char`` itself when nothing matches.

    """
    if isinstance(encoding, dict):
        matches = [
            chr(code) for code, mapped in encoding.items() if mapped == value_char
        ]
    else:
        matches = [key for key, mapped in map_dict.items() if mapped == value_char]
    # When several keys map to the character, the last one wins.
    return matches[-1] if matches else value_char

247

248

def prepare_cm(ft: DictionaryObject) -> bytes:
    """
    Return the ``/ToUnicode`` cmap of a font, normalised for line parsing.

    Args:
        ft: font dictionary containing a ``/ToUnicode`` entry.

    Returns:
        The cmap bytes with section keywords isolated on their own lines,
        hex strings stripped of their angle brackets and inner spaces, and
        array brackets surrounded by spaces.

    """
    tu = ft["/ToUnicode"]
    cm: bytes
    if isinstance(tu, StreamObject):
        cm = cast(DecodedStreamObject, tu).get_data()
    else:  # if (tu is None) or cast(str, tu).startswith("/Identity"):
        # the full range 0000-FFFF will be processed
        cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange"
    if isinstance(cm, str):
        cm = cm.encode()

    # we need to prepare cm before due to missing return line in pdf printed
    # to pdf from word
    cm = cm.strip()
    for keyword in (b"beginbfchar", b"endbfchar", b"beginbfrange", b"endbfrange"):
        cm = cm.replace(keyword, b"\n" + keyword + b"\n")
    # text between << and >> not used but some solution to find it back
    cm = cm.replace(b"<<", b"\n{\n").replace(b">>", b"\n}\n")

    pieces = []
    for piece in cm.split(b"<"):
        closing = piece.find(b">")
        if closing >= 0:
            if closing == 0:
                # string is empty: stash a placeholder here; parse_bfchar
                # turns it back into an empty string
                # see https://github.com/py-pdf/pypdf/issues/1111
                hex_text = b"."
            else:
                hex_text = piece[:closing].replace(b" ", b"")
            piece = hex_text + b" " + piece[closing + 1 :]
        pieces.append(piece)

    joined = b" ".join(pieces)
    joined = joined.replace(b"[", b" [ ")
    joined = joined.replace(b"]", b" ]\n ")
    return joined.replace(b"\r", b"\n")

288

289

def process_cm_line(
    line: bytes,
    process_rg: bool,
    process_char: bool,
    multiline_rg: Union[None, Tuple[int, int]],
    map_dict: Dict[Any, Any],
    int_entry: List[int],
) -> Tuple[bool, bool, Union[None, Tuple[int, int]]]:
    """
    Process one line of a prepared cmap and return the updated parser state.

    Args:
        line: the line to process.
        process_rg: True while inside a ``beginbfrange``/``endbfrange`` section.
        process_char: True while inside a ``beginbfchar``/``endbfchar`` section.
        multiline_rg: state of a range whose array continues on later
            lines, or None.
        map_dict: character map, filled in place.
        int_entry: list of cmap keys as integers, filled in place.

    Returns:
        The new ``(process_rg, process_char, multiline_rg)`` state.

    """
    # Empty lines and comment lines leave the state untouched.
    if not line or line[0] == 37:  # 37 = %
        return process_rg, process_char, multiline_rg

    line = line.replace(b"\t", b" ")

    # Section keywords only switch the state; they carry no mapping.
    if b"beginbfrange" in line:
        return True, process_char, multiline_rg
    if b"endbfrange" in line:
        return False, process_char, multiline_rg
    if b"beginbfchar" in line:
        return process_rg, True, multiline_rg
    if b"endbfchar" in line:
        return process_rg, False, multiline_rg

    # A range section takes precedence over a char section.
    if process_rg:
        try:
            multiline_rg = parse_bfrange(line, map_dict, int_entry, multiline_rg)
        except binascii.Error as error:
            logger_warning(f"Skipping broken line {line!r}: {error}", __name__)
    elif process_char:
        parse_bfchar(line, map_dict, int_entry)
    return process_rg, process_char, multiline_rg

317

318

def parse_bfrange(
    line: bytes,
    map_dict: Dict[Any, Any],
    int_entry: List[int],
    multiline_rg: Union[None, Tuple[int, int]],
) -> Union[None, Tuple[int, int]]:
    """
    Parse one line of a ``bfrange`` section into the character map.

    Args:
        line: space-separated tokens; hex strings come without angle brackets.
        map_dict: character map, filled in place. ``map_dict[-1]`` holds
            the number of bytes per source code; it is set from the first
            two tokens unless the line continues a previous one.
        int_entry: list receiving every source code as an integer.
        multiline_rg: ``(next source code, last source code)`` when the
            line continues an array opened on an earlier line, else None.

    Returns:
        None when the range is complete, otherwise
        ``(next source code, last source code)`` to pass with the next line.

    """
    tokens = [tok for tok in line.split(b" ") if tok]

    def source_key(code: int) -> str:
        # One-byte codes are decoded with charmap, others as UTF-16-BE.
        return unhexlify(fmt % code).decode(
            "charmap" if map_dict[-1] == 1 else "utf-16-be",
            "surrogatepass",
        )

    if multiline_rg is not None:
        # first/last code are not in the current line
        fmt = b"%%0%dX" % (map_dict[-1] * 2)
        current, last = multiline_rg[0], multiline_rg[1]
        array_items = tokens
    else:
        current = int(tokens[0], 16)
        last = int(tokens[1], 16)
        hex_digits = max(len(tokens[0]), len(tokens[1]))
        map_dict[-1] = ceil(hex_digits / 2)
        fmt = b"%%0%dX" % (map_dict[-1] * 2)
        if tokens[2] != b"[":
            # case without list: consecutive targets from one start value
            target = int(tokens[2], 16)
            target_fmt = b"%%0%dX" % max(4, len(tokens[2]))
            while current <= last:
                map_dict[source_key(current)] = unhexlify(
                    target_fmt % target
                ).decode("utf-16-be", "surrogatepass")
                int_entry.append(current)
                current += 1
                target += 1
            return None
        array_items = tokens[3:]

    # Array form: one explicit target per source code until "]" is met.
    for item in array_items:
        if item == b"]":
            return None
        map_dict[source_key(current)] = unhexlify(item).decode(
            "utf-16-be", "surrogatepass"
        )
        int_entry.append(current)
        current += 1
    # The array is not closed yet: the caller must feed the next line.
    return (current, last)

377

378

def parse_bfchar(line: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> None:
    """
    Parse one line of a ``bfchar`` section into the character map.

    Args:
        line: space-separated ``source target`` hex pairs (without angle
            brackets); a ``.`` target stands for an empty string.
        map_dict: character map, filled in place. ``map_dict[-1]`` is set
            to the number of bytes of the first source code.
        int_entry: list receiving every source code as an integer.

    """
    tokens = [tok for tok in line.split(b" ") if tok]
    map_dict[-1] = len(tokens[0]) // 2
    # Tokens are consumed two by two; a trailing unpaired token is ignored.
    for source, target in zip(tokens[0::2], tokens[1::2]):
        if target == b".":
            # placeholder inserted by prepare_cm: means empty string
            mapped = ""
        else:
            mapped = unhexlify(target).decode(
                "charmap" if len(target) < 4 else "utf-16-be", "surrogatepass"
            )
        map_dict[
            unhexlify(source).decode(
                "charmap" if map_dict[-1] == 1 else "utf-16-be", "surrogatepass"
            )
        ] = mapped
        int_entry.append(int(source, 16))

396

397

def build_font_width_map(
    ft: DictionaryObject, default_font_width: float
) -> Dict[Any, float]:
    """
    Build a map from characters to glyph widths for a font.

    Args:
        ft: font dictionary.
        default_font_width: width to use as ``"default"`` when the font
            provides none; replaced by twice the known space width when
            the base font is listed in ``_default_fonts_space_width``.

    Returns:
        A dict keyed by ``chr(character code)`` plus a ``"default"`` entry.

    """
    widths_by_char: Dict[Any, float] = {}
    first_code: int = 0
    last_code: int = 0
    try:
        default_font_width = _default_fonts_space_width[cast(str, ft["/BaseFont"].get_object())] * 2.0
    except KeyError:
        # no /BaseFont, or not one of the fonts with a known space width
        pass
    if "/DescendantFonts" in ft:  # ft["/Subtype"].startswith("/CIDFontType"):
        # §9.7.4.3 of the 1.7 reference ("Glyph Metrics in CIDFonts")
        # Widths for a CIDFont are defined using the DW and W entries.
        # DW2 and W2 are for vertical use. Vertical type is not implemented.
        cid_font = ft["/DescendantFonts"][0].get_object()  # type: ignore
        if "/DW" in cid_font:
            widths_by_char["default"] = cast(float, cid_font["/DW"].get_object())
        else:
            widths_by_char["default"] = default_font_width
        remaining = cid_font["/W"].get_object() if "/W" in cid_font else []
        while len(remaining) > 0:
            head = remaining[0]
            first_code = head if isinstance(head, int) else head.get_object()
            second = remaining[1].get_object()
            if isinstance(second, int):
                # C_first C_last same_W
                last_code = second
                width = remaining[2].get_object()
                if isinstance(width, (int, float)):
                    for c_code in range(first_code, last_code + 1):
                        widths_by_char[chr(c_code)] = width
                else:
                    logger_warning(f"Expected numeric value for width, got {width}. Ignoring it.", __name__)
                remaining = remaining[3:]
            elif isinstance(second, list):
                # Starting_C [W1 W2 ... Wn]
                for offset, entry in enumerate(second):
                    widths_by_char[chr(first_code + offset)] = entry.get_object()
                remaining = remaining[2:]
            else:
                logger_warning(
                    "unknown widths : \n" + (cid_font["/W"]).__repr__(),
                    __name__,
                )
                break
    elif "/Widths" in ft:
        widths = cast(ArrayObject, ft["/Widths"].get_object())
        if "/FontDescriptor" in ft and "/MissingWidth" in cast(
            DictionaryObject, ft["/FontDescriptor"]
        ):
            widths_by_char["default"] = ft["/FontDescriptor"]["/MissingWidth"].get_object()  # type: ignore
        else:
            # will consider width of char as avg(width), counting only
            # the strictly positive widths
            positive = [
                value
                for value in (entry.get_object() for entry in widths)
                if value > 0
            ]
            widths_by_char["default"] = sum(positive) / max(1, len(positive))
        first_code = cast(int, ft["/FirstChar"])
        last_code = cast(int, ft["/LastChar"])
        for c_code in range(first_code, last_code + 1):
            try:
                widths_by_char[chr(c_code)] = widths[c_code - first_code].get_object()
            except (IndexError, KeyError):
                # The PDF structure is invalid. The array is too small
                # for the specified font width.
                pass
    if is_null_or_none(widths_by_char.get("default")):
        widths_by_char["default"] = default_font_width or 0.0
    return widths_by_char

478

479

def compute_space_width(
    font_width_map: Dict[Any, float], space_char: str
) -> float:
    """
    Return the width to use for the space character.

    Args:
        font_width_map: map from characters to widths; must contain a
            ``"default"`` entry when ``space_char`` is missing or has a
            zero width.
        space_char: key of the space character in ``font_width_map``.

    Returns:
        The stored width of ``space_char``, or half the default width when
        it is absent or zero.

    """
    width = font_width_map.get(space_char, 0)
    if width == 0:
        # if using default we consider space will be only half size
        width = font_width_map["default"] / 2.0
    return width

493

494

def compute_font_width(
    font_width_map: Dict[Any, float],
    char: str
) -> float:
    """
    Return the width of a character.

    Args:
        font_width_map: map from characters to widths; must contain a
            ``"default"`` entry when ``char`` is missing.
        char: character to look up.

    Returns:
        The stored width of ``char``, or the default width when absent.

    """
    if char in font_width_map:
        return font_width_map[char]
    return font_width_map["default"]

508

509

def _type1_alternative(
    ft: DictionaryObject,
    map_dict: Dict[Any, Any],
    int_entry: List[int],
) -> Tuple[Dict[Any, Any], List[int]]:
    """
    Fill the character map from the encoding embedded in a Type1 font file.

    The clear-text part of the ``/FontFile`` stream (everything before
    ``eexec``) is scanned for ``dup <code> <glyph name> put`` lines that
    follow ``/Encoding``. Malformed entries are skipped.

    Args:
        ft: font dictionary.
        map_dict: character map, filled in place with
            ``chr(code) -> character``.
        int_entry: list receiving every mapped code.

    Returns:
        The same ``map_dict`` and ``int_entry`` objects.

    """
    if "/FontDescriptor" not in ft:
        return map_dict, int_entry
    ft_desc = cast(DictionaryObject, ft["/FontDescriptor"]).get("/FontFile")
    if is_null_or_none(ft_desc):
        return map_dict, int_entry
    assert ft_desc is not None, "mypy"
    txt = ft_desc.get_object().get_data()
    txt = txt.split(b"eexec\n")[0]  # only clear part
    sections = txt.split(b"/Encoding")
    if len(sections) < 2:
        # No embedded encoding: nothing to extract (this used to raise
        # IndexError).
        return map_dict, int_entry
    txt = sections[1]  # to get the encoding part
    lines = txt.replace(b"\r", b"\n").split(b"\n")
    for li in lines:
        if not li.startswith(b"dup"):
            continue
        words = [_w for _w in li.split(b" ") if _w != b""]
        if len(words) < 3:
            # code or glyph name missing
            continue
        if len(words) > 3 and words[3] != b"put":
            continue
        try:
            i = int(words[1])
            key = chr(i)
        except (ValueError, OverflowError):  # pragma: no cover
            # not a number, or not a valid character code
            continue
        try:
            v = adobe_glyphs[words[2].decode()]
        except (KeyError, UnicodeDecodeError):
            # Unknown (or undecodable) glyph name: only the "/uniXXXX"
            # form can still be resolved.
            if words[2].startswith(b"/uni"):
                try:
                    v = chr(int(words[2][4:], 16))
                except (ValueError, OverflowError):  # pragma: no cover
                    continue
            else:
                continue
        map_dict[key] = v
        int_entry.append(i)
    return map_dict, int_entry

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_cmap.py: 9%

278 statements