Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/

1from __future__ import annotations

3from collections.abc import Sequence

4from dataclasses import dataclass, field

5from typing import TYPE_CHECKING, Any, cast

7from pypdf.generic import (

8 ArrayObject,

9 DictionaryObject,

10 FloatObject,

11 IndirectObject,

12 NameObject,

13 NumberObject,

14 PdfObject,

15 StreamObject,

16 TextStringObject,

17)

19from ._cmap import get_encoding

20from ._codecs.adobe_glyphs import adobe_glyphs

21from ._utils import logger_warning

22from .constants import FontFlags

23from .errors import PdfReadError

25if TYPE_CHECKING:

26 from io import BytesIO

28 from fontTools.ttLib.tables._h_e_a_d import table__h_e_a_d

29 from fontTools.ttLib.tables._p_o_s_t import table__p_o_s_t

30 from fontTools.ttLib.tables.O_S_2f_2 import table_O_S_2f_2

32 from ._writer import PdfWriter

34try:

35 from fontTools.ttLib import TTFont

36 HAS_FONTTOOLS = True

37except ImportError:

38 HAS_FONTTOOLS = False

# Some constants from truetype font tables that we use:
# Bit mask tested against the ``macStyle`` field of the ``head`` table.
HEADER_MACSTYLE_ITALIC = 0x02
# Bit mask tested against the ``fsSelection`` field of the ``OS/2`` table.
OS2_FSSELECTION_ITALIC = 0x01
# Values compared with ``panose.bFamilyType`` of the ``OS/2`` table.
OS2_PANOSE_BFAMILYTYPE_SCRIPT = 3
OS2_PANOSE_BFAMILYTYPE_DECORATIVE = 4
OS2_PANOSE_BFAMILYTYPE_PICTORIAL = 5
# Value compared with ``panose.bProportion`` of the ``OS/2`` table.
OS2_PANOSE_BPROPORTION_MONOSPACED = 9
# Values compared with the high byte of ``sFamilyClass`` of the ``OS/2`` table.
OS2_SFAMILYSCLASS_SCRIPTS = 10
OS2_SFAMILYSCLASS_SYMBOLIC = 12

@dataclass(frozen=True)
class FontDescriptor:
    """
    Represents the FontDescriptor dictionary as defined in the PDF specification.
    This contains both descriptive and metric information.

    The defaults are derived from the mean values of the 14 core fonts, rounded
    to 100.

    Attributes:
        name: Written as ``/FontName``.
        family: Font family name.
        weight: Font weight.
        ascent: Written as ``/Ascent``.
        descent: Written as ``/Descent``.
        cap_height: Written as ``/CapHeight``.
        x_height: Written as ``/XHeight``.
        italic_angle: Written as ``/ItalicAngle``.
        flags: Written as ``/Flags``.
        bbox: Four values written as ``/FontBBox``.
        font_file: Optional stream with the font program, written as ``/FontFile2``.

    """

    name: str = "Unknown"
    family: str = "Unknown"
    weight: str = "Unknown"

    ascent: float = 700.0
    descent: float = -200.0
    cap_height: float = 600.0
    x_height: float = 500.0
    italic_angle: float = 0.0  # Non-italic
    flags: int = 32  # Non-serif, non-symbolic, not fixed width
    bbox: tuple[float, float, float, float] = field(default_factory=lambda: (-100.0, -200.0, 1000.0, 900.0))
    font_file: StreamObject | None = None

    def as_font_descriptor_resource(self) -> DictionaryObject:
        """
        Build the ``/FontDescriptor`` dictionary for this descriptor.

        Returns:
            A new dictionary holding the name, flags, bounding box and metric
            entries, plus ``/FontFile2`` when a font file stream is set.

        """
        # A name taken over from a parsed descriptor may already carry the
        # leading slash; strip it so that we never emit "//Name".
        font_name = str(self.name).removeprefix("/")
        font_descriptor_resource = DictionaryObject({
            NameObject("/Type"): NameObject("/FontDescriptor"),
            NameObject("/FontName"): NameObject(f"/{font_name}"),
            NameObject("/Flags"): NumberObject(self.flags),
            NameObject("/FontBBox"): ArrayObject([FloatObject(n) for n in self.bbox]),
            NameObject("/ItalicAngle"): FloatObject(self.italic_angle),
            NameObject("/Ascent"): FloatObject(self.ascent),
            NameObject("/Descent"): FloatObject(self.descent),
            NameObject("/CapHeight"): FloatObject(self.cap_height),
            NameObject("/XHeight"): FloatObject(self.x_height),
        })

        # Compare against None instead of relying on truthiness: a stream is
        # dictionary-like, so one without any dictionary keys would be falsy
        # and silently dropped even though it carries font data.
        if self.font_file is not None:
            # Add the stream. For now, we assume a TrueType font (FontFile2)
            font_descriptor_resource[NameObject("/FontFile2")] = self.font_file

        return font_descriptor_resource

@dataclass(frozen=True)
class CoreFontMetrics:
    """Bundles a font descriptor with a width lookup table for one font."""

    # Descriptive and metric information of the font.
    font_descriptor: FontDescriptor
    # Width lookup table keyed by string. Font.from_font_resource queries it
    # with the values of a font's encoding or with chr(code).
    character_widths: dict[str, int]

100

@dataclass
class Font:
    """
    A font object for use during text extraction and for producing
    text appearance streams.

    Attributes:
        name: Font name, derived from font["/BaseFont"]
        character_map: The font's character map
        encoding: Font encoding
        sub_type: The font type, such as Type1, TrueType, or Type3.
        font_descriptor: Descriptive and metric information about the font
            (see FontDescriptor). The character widths are stored separately
            in character_widths.
        character_widths: A mapping of characters to widths. The "default"
            entry is used for characters without an entry of their own.
        space_width: The width of a space, or an approximation
        space_char: The key under which the space character is looked up in
            character_widths.
        interpretable: Default True. If False, the font glyphs cannot
            be translated to characters, e.g. Type3 fonts that do not define
            a '/ToUnicode' mapping.

    """

    name: str
    encoding: str | dict[int, str]
    character_map: dict[Any, Any] = field(default_factory=dict)
    sub_type: str = "Unknown"
    font_descriptor: FontDescriptor = field(default_factory=FontDescriptor)
    character_widths: dict[str, int] = field(default_factory=lambda: {"default": 500})
    space_width: float | int = 250
    space_char: str = " "
    interpretable: bool = True

130

131 @staticmethod

132 def _collect_tt_t1_character_widths(

133 pdf_font_dict: DictionaryObject,

134 char_map: dict[Any, Any],

135 encoding: str | dict[int, str],

136 current_widths: dict[str, int]

137 ) -> None:

138 """Parses a TrueType or Type1 font's /Widths array from a font dictionary and updates character widths"""

139 widths_array = cast(ArrayObject, pdf_font_dict["/Widths"])

140 first_char = pdf_font_dict.get("/FirstChar", 0)

141 for idx, width in enumerate(widths_array):

142 current_widths[chr(idx + first_char)] = int(width)

143

144 @staticmethod

145 def _collect_cid_character_widths(

146 d_font: DictionaryObject, char_map: dict[Any, Any], current_widths: dict[str, int]

147 ) -> None:

148 """Parses the /W array from a DescendantFont dictionary and updates character widths."""

149 # /W width definitions have two valid formats which can be mixed and matched:

150 # (1) A character start index followed by a list of widths, e.g.

151 # `45 [500 600 700]` applies widths 500, 600, 700 to characters 45-47.

152 # (2) A character start index, a character stop index, and a width, e.g.

153 # `45 65 500` applies width 500 to characters 45-65.

154 skip_count = 0

155 _w = d_font.get("/W", [])

156 for idx, w_entry in enumerate(_w):

157 w_entry = w_entry.get_object()

158 if skip_count:

159 skip_count -= 1

160 continue

161 if not isinstance(w_entry, (int, float)):

162 # We should never get here due to skip_count above. But

163 # sometimes we do.

164 logger_warning(

165 "Expected numeric value for width, got %(w_entry)s. Ignoring it.",

166 source=__name__,

167 w_entry=w_entry,

168 )

169 continue

170 # check for format (1): `int [int int int int ...]`

171 w_next_entry = _w[idx + 1].get_object()

172 if isinstance(w_next_entry, Sequence):

173 start_idx, width_list = w_entry, w_next_entry

174 current_widths.update(

175 {

176 chr(_cidx): _width

177 for _cidx, _width in zip(

178 range(

179 cast(int, start_idx),

180 cast(int, start_idx) + len(width_list),

181 1,

182 ),

183 width_list,

184 )

185 }

186 )

187 skip_count = 1

188 # check for format (2): `int int int`

189 elif isinstance(w_next_entry, (int, float)) and isinstance(

190 _w[idx + 2].get_object(), (int, float)

191 ):

192 start_idx, stop_idx, const_width = (

193 w_entry,

194 w_next_entry,

195 _w[idx + 2].get_object(),

196 )

197 current_widths.update(

198 {

199 chr(_cidx): const_width

200 for _cidx in range(

201 cast(int, start_idx), cast(int, stop_idx + 1), 1

202 )

203 }

204 )

205 skip_count = 2

206 else:

207 # This handles the case of out of bounds (reaching the end of the width definitions

208 # while expecting more elements).

209 logger_warning(

210 "Invalid font width definition. Last element: %(w_entry)s.",

211 source=__name__,

212 w_entry=w_entry,

213 )

214

215 @staticmethod

216 def _get_space_char(

217 encoding: str | dict[int, str],

218 character_map: dict[Any, Any],

219 ) -> str:

220 space_char = " "

221 if isinstance(encoding, dict):

222 for char_code, char_str in encoding.items():

223 if char_str == space_char:

224 return chr(char_code)

225

226 for glyph_id, char_str in character_map.items():

227 if char_str == space_char:

228 return str(glyph_id)

229

230 return space_char

231

232 @staticmethod

233 def _add_default_width(current_widths: dict[str, int], flags: int, space_char: str) -> None:

234 if not current_widths:

235 current_widths["default"] = 500

236 return

237

238 if space_char in current_widths and current_widths[space_char] != 0:

239 # Setting default to once or twice the space width, depending on fixed pitch

240 if (flags & FontFlags.FIXED_PITCH) == FontFlags.FIXED_PITCH:

241 current_widths["default"] = current_widths[space_char]

242 return

243

244 current_widths["default"] = int(2 * current_widths[space_char])

245 return

246

247 # Use the average width of existing glyph widths

248 valid_widths = [w for w in current_widths.values() if w > 0]

249 current_widths["default"] = sum(valid_widths) // len(valid_widths) if valid_widths else 500

250

251 @staticmethod

252 def _add_space_width(

253 character_widths: dict[str, int],

254 flags: int,

255 space_char: str

256 ) -> int:

257 space_width = character_widths.get(space_char, 0)

258 if space_width != 0:

259 return space_width

260

261 if (flags & FontFlags.FIXED_PITCH) == FontFlags.FIXED_PITCH:

262 return character_widths["default"]

263

264 return character_widths["default"] // 2

265

266 @staticmethod

267 def _parse_font_descriptor(font_descriptor_obj: DictionaryObject) -> dict[str, Any]:

268 font_descriptor_kwargs: dict[Any, Any] = {}

269 for source_key, target_key in [

270 ("/FontName", "name"),

271 ("/FontFamily", "family"),

272 ("/FontWeight", "weight"),

273 ("/Ascent", "ascent"),

274 ("/Descent", "descent"),

275 ("/CapHeight", "cap_height"),

276 ("/XHeight", "x_height"),

277 ("/ItalicAngle", "italic_angle"),

278 ("/Flags", "flags"),

279 ("/FontBBox", "bbox")

280 ]:

281 if source_key in font_descriptor_obj:

282 font_descriptor_kwargs[target_key] = font_descriptor_obj[source_key]

283 # Handle missing bbox gracefully - PDFs may have fonts without valid bounding boxes

284 if "bbox" in font_descriptor_kwargs:

285 bbox_tuple = tuple(map(float, font_descriptor_kwargs["bbox"]))

286 assert len(bbox_tuple) == 4, bbox_tuple

287 font_descriptor_kwargs["bbox"] = bbox_tuple

288

289 # Find the binary stream for this font if there is one

290 for source_key in ["/FontFile", "/FontFile2", "/FontFile3"]:

291 if source_key in font_descriptor_obj:

292 if "font_file" in font_descriptor_kwargs:

293 raise PdfReadError(f"More than one /FontFile found in {font_descriptor_obj}")

294

295 try:

296 font_file = font_descriptor_obj[source_key].get_object()

297 font_descriptor_kwargs["font_file"] = font_file

298 except PdfReadError as e:

299 logger_warning(

300 "Failed to get %(source_key)r in %(font_descriptor_obj)s: %(error)s",

301 source=__name__,

302 source_key=source_key,

303 font_descriptor_obj=font_descriptor_obj,

304 error=e,

305 )

306 return font_descriptor_kwargs

307

308 @classmethod

309 def from_font_resource(

310 cls,

311 pdf_font_dict: DictionaryObject,

312 ) -> Font:

313 from pypdf._codecs.core_font_metrics import CORE_FONT_METRICS # noqa: PLC0415

314

315 # Can collect base_font, name and encoding directly from font resource

316 name = pdf_font_dict.get("/BaseFont", "Unknown").removeprefix("/")

317 sub_type = pdf_font_dict.get("/Subtype", "Unknown").removeprefix("/")

318 encoding, character_map = get_encoding(pdf_font_dict)

319 font_descriptor = None

320 character_widths: dict[str, int] = {}

321 interpretable = True

322

323 # Deal with fonts by type; Type1, TrueType and certain Type3

324 if pdf_font_dict.get("/Subtype") in ("/Type1", "/MMType1", "/TrueType", "/Type3"):

325 # Type3 fonts that do not specify a "/ToUnicode" mapping cannot be

326 # reliably converted into character codes unless all named chars

327 # in /CharProcs map to a standard adobe glyph. See §9.10.2 of the

328 # PDF 1.7 standard.

329 if sub_type == "Type3" and "/ToUnicode" not in pdf_font_dict:

330 interpretable = all(

331 cname in adobe_glyphs

332 for cname in pdf_font_dict.get("/CharProcs") or []

333 )

334 if interpretable: # Save some overhead if font is not interpretable

335 if "/Widths" in pdf_font_dict:

336 cls._collect_tt_t1_character_widths(

337 pdf_font_dict, character_map, encoding, character_widths

338 )

339

340 elif name in CORE_FONT_METRICS:

341 font_descriptor = CORE_FONT_METRICS[name].font_descriptor

342 if isinstance(encoding, dict):

343 for code, character in encoding.items():

344 # Look up the width using the glyph name from the encoding

345 if character in CORE_FONT_METRICS[name].character_widths:

346 character_widths[chr(code)] = CORE_FONT_METRICS[name].character_widths[character]

347 else:

348 for code in range(256):

349 character = chr(code)

350 if character in CORE_FONT_METRICS[name].character_widths:

351 character_widths[character] = CORE_FONT_METRICS[name].character_widths[character]

352 if "/FontDescriptor" in pdf_font_dict:

353 font_descriptor_obj = pdf_font_dict.get("/FontDescriptor", DictionaryObject()).get_object()

354 if "/MissingWidth" in font_descriptor_obj:

355 character_widths["default"] = cast(int, font_descriptor_obj["/MissingWidth"].get_object())

356 font_descriptor = FontDescriptor(**cls._parse_font_descriptor(font_descriptor_obj))

357 elif "/FontBBox" in pdf_font_dict:

358 # For Type3 without Font Descriptor but with FontBBox, see Table 110 in the PDF specification 2.0

359 bbox_tuple = tuple(map(float, cast(ArrayObject, pdf_font_dict["/FontBBox"])))

360 assert len(bbox_tuple) == 4, bbox_tuple

361 font_descriptor = FontDescriptor(name=name, bbox=bbox_tuple)

362

363 else:

364 # Composite font or CID font - CID fonts have a /W array mapping character codes

365 # to widths stashed in /DescendantFonts. No need to test for /DescendantFonts though,

366 # because all other fonts have already been dealt with.

367 d_font: DictionaryObject

368 for d_font_idx, d_font in enumerate(

369 cast(ArrayObject, pdf_font_dict["/DescendantFonts"])

370 ):

371 d_font = cast(DictionaryObject, d_font.get_object())

372 cast(ArrayObject, pdf_font_dict["/DescendantFonts"])[d_font_idx] = d_font

373 cls._collect_cid_character_widths(

374 d_font, character_map, character_widths

375 )

376 if "/DW" in d_font:

377 character_widths["default"] = cast(int, d_font["/DW"].get_object())

378 font_descriptor_obj = d_font.get("/FontDescriptor", DictionaryObject()).get_object()

379 font_descriptor = FontDescriptor(**cls._parse_font_descriptor(font_descriptor_obj))

380

381 if not font_descriptor:

382 font_descriptor = FontDescriptor(name=name)

383

384 space_char = cls._get_space_char(encoding, character_map)

385 if character_widths.get("default", 0) == 0:

386 cls._add_default_width(character_widths, font_descriptor.flags, space_char)

387 space_width = cls._add_space_width(character_widths, font_descriptor.flags, space_char)

388

389 return cls(

390 name=name,

391 sub_type=sub_type,

392 encoding=encoding,

393 font_descriptor=font_descriptor,

394 character_map=character_map,

395 character_widths=character_widths,

396 space_width=space_width,

397 space_char=space_char,

398 interpretable=interpretable

399 )

400

401 @staticmethod

402 def _font_flags_from_truetype_font_tables(

403 header: table__h_e_a_d,

404 postscript: table__p_o_s_t,

405 os2: table_O_S_2f_2

406 ) -> int:

407 # Get the font flags

408 if os2:

409 panose = os2.panose

410 # sFamilyClass is a two-byte field. The high byte describes the family class, whereas the low

411 # byte only describes the subclass. We only need the high byte, hence the bit shift below:

412 family_class = os2.sFamilyClass >> 8

413 flags: int = 0

414

415 # ITALIC

416 if header.macStyle & HEADER_MACSTYLE_ITALIC or (os2 and os2.fsSelection & OS2_FSSELECTION_ITALIC):

417 flags |= FontFlags.ITALIC

418 if postscript:

419 italic_angle = postscript.italicAngle

420 if italic_angle != 0.0:

421 flags |= FontFlags.ITALIC

422

423 # FIXED_PITCH

424 if (

425 (os2 and panose.bProportion == OS2_PANOSE_BPROPORTION_MONOSPACED) or

426 (postscript and postscript.isFixedPitch > 0) # Actually 1, but originally (older versions of the TTF

427 ): # specification) any non-zero value signified monospace.

428 flags |= FontFlags.FIXED_PITCH

429

430 # SCRIPT

431 if os2 and (

432 family_class == OS2_SFAMILYSCLASS_SCRIPTS or panose.bFamilyType == OS2_PANOSE_BFAMILYTYPE_SCRIPT

433 ):

434 flags |= FontFlags.SCRIPT

435

436 # SERIF

437 if os2 and (

438 2 <= panose.bSerifStyle <= 10

439 or 1 <= family_class <= 5 or family_class == 7 # 6 is reserved, all 8 and above are not serif

440 ):

441 flags |= FontFlags.SERIF

442

443 # SYMBOLIC

444 if os2 and (

445 family_class == OS2_SFAMILYSCLASS_SYMBOLIC or

446 panose.bFamilyType in {OS2_PANOSE_BFAMILYTYPE_DECORATIVE, OS2_PANOSE_BFAMILYTYPE_PICTORIAL}

447 ):

448 flags |= FontFlags.SYMBOLIC

449 else:

450 flags |= FontFlags.NONSYMBOLIC

451

452 return flags

453

454 @classmethod

455 def from_truetype_font_file(cls, font_file: BytesIO) -> Font:

456 if not HAS_FONTTOOLS:

457 raise ImportError("The 'fontTools' library is required to use 'from_truetype_font_file'")

458 with TTFont(font_file) as tt_font_object:

459 # See Chapter 6 of the TrueType reference manual for the definition of the head, OS/2 and post tables:

460 # https://developer.apple.com/fonts/TrueType-Reference-Manual/RM06/Chap6head.html

461 # https://developer.apple.com/fonts/TrueType-Reference-Manual/RM06/Chap6OS2.html

462 # https://developer.apple.com/fonts/TrueType-Reference-Manual/RM06/Chap6post.html

463 header = tt_font_object["head"]

464 horizontal_header = tt_font_object["hhea"]

465 metrics = tt_font_object["hmtx"].metrics

466

467 # Collect additional font tables to derive font information

468 postscript = tt_font_object.get("post", None)

469 os2 = tt_font_object.get("OS/2", None)

470

471 # Get the scaling factor to convert font file's units per em to PDF's 1000 units per em

472 units_per_em = header.unitsPerEm

473 scale_factor = 1000.0 / units_per_em

474

475 # Get the font descriptor

476 font_descriptor_kwargs: dict[Any, Any] = {}

477 names = tt_font_object.get("name", None)

478 if names:

479 font_descriptor_kwargs["name"] = names.getBestFullName()

480 font_descriptor_kwargs["family"] = names.getBestFamilyName()

481 font_descriptor_kwargs["weight"] = names.getBestSubFamilyName()

482 font_descriptor_kwargs["ascent"] = int(round(horizontal_header.ascent * scale_factor, 0))

483 font_descriptor_kwargs["descent"] = int(round(horizontal_header.descent * scale_factor, 0))

484 if os2:

485 try:

486 font_descriptor_kwargs["cap_height"] = int(round(os2.sCapHeight * scale_factor, 0))

487 font_descriptor_kwargs["x_height"] = int(round(os2.sxHeight * scale_factor, 0))

488 except AttributeError:

489 pass

490

491 font_descriptor_kwargs["flags"] = cls._font_flags_from_truetype_font_tables(header, postscript, os2)

492

493 font_descriptor_kwargs["bbox"] = (

494 round(header.xMin * scale_factor, 0),

495 round(header.yMin * scale_factor, 0),

496 round(header.xMax * scale_factor, 0),

497 round(header.yMax * scale_factor, 0)

498 )

499

500 font_file_data = StreamObject()

501 font_file_raw_bytes = font_file.getvalue()

502 font_file_data.set_data(font_file_raw_bytes)

503 font_file_data.update({NameObject("/Length1"): NumberObject(len(font_file_raw_bytes))})

504 font_descriptor_kwargs["font_file"] = font_file_data

505

506 font_descriptor = FontDescriptor(**font_descriptor_kwargs)

507 encoding = "utf_16_be" # Assume unicode

508

509 character_widths: dict[str, int] = {}

510 character_map: dict[str, str] = {}

511

512 glyph_order = tt_font_object.getGlyphOrder()

513 # Note that one glyph can be mapped to multiple unicode code points. However, buildReversedMin()

514 # creates a dictionary mapping glyphs to the minimum Unicode codepoint.

515 tt_font_cmap_table = tt_font_object.get("cmap")

516 if tt_font_cmap_table:

517 reverse_cmap = tt_font_cmap_table.buildReversedMin()

518 for gid, glyph in enumerate(glyph_order):

519 char_code = reverse_cmap.get(glyph)

520 if char_code is None:

521 continue

522 char = chr(char_code)

523 gid = tt_font_object.getGlyphID(glyph)

524 # The following is to comply with how font_glyph_byte_map works in _appearance_stream.py

525 gid_bytes = gid.to_bytes(2, "big")

526 gid_key_string = gid_bytes.decode("utf-16-be", "surrogatepass")

527 character_map[gid_key_string] = char

528 character_widths[gid_key_string] = int(round(metrics[glyph][0] * scale_factor, 0))

529 else:

530 raise PdfReadError("Font file does not have a cmap table")

531

532 space_char = cls._get_space_char(encoding, character_map)

533 cls._add_default_width(character_widths, font_descriptor_kwargs["flags"], space_char)

534 space_width = cls._add_space_width(

535 character_widths, font_descriptor_kwargs["flags"], space_char

536 )

537

538 return cls(

539 name=font_descriptor.name,

540 sub_type="TrueType",

541 encoding=encoding,

542 font_descriptor=font_descriptor,

543 character_map=character_map,

544 character_widths=character_widths,

545 space_width=space_width,

546 space_char=space_char,

547 interpretable=True

548 )

549

550 def _get_typographic_maps(self) -> tuple[dict[str, str], dict[str, bytes]]:

551 """

552 Generates maps to translate input unicode text to bytes in two steps:

553 Unicode code point -> raw_character (reverse cmap) -> PDF bytes (encoding cmap).

554 """

555 reverse_cmap = {}

556 encoding_cmap = {}

557

558 if isinstance(self.encoding, str):

559 for glyph_id, unicode_char in self.character_map.items():

560 glyph_id_str = str(glyph_id)

561 reverse_cmap[unicode_char] = glyph_id_str

562 encoding_cmap[glyph_id_str] = glyph_id_str.encode(self.encoding)

563 else:

564 for character_code, unicode_char in self.encoding.items():

565 character_str = chr(character_code)

566 reverse_cmap[unicode_char] = character_str

567 encoding_cmap[character_str] = bytes((character_code,))

568

569 unicode_to_bytes = {

570 unicode_char: bytes((character_code,)) for character_code, unicode_char in self.encoding.items()

571 }

572 for glyph_id, unicode_char in self.character_map.items(): # This code is not covered nor tested

573 reverse_cmap[unicode_char] = glyph_id

574 encoding_cmap[glyph_id] = unicode_to_bytes.get(unicode_char, bytes((glyph_id,)))

575

576 return reverse_cmap, encoding_cmap

577

578 def _create_widths_list_and_unicode_stream(self) -> tuple[list[PdfObject], StreamObject]:

579 widths_list = []

580 unicode_map = []

581 # In the loop, char is the decoded GID string (the reverse unicode hack)

582 # and character_map[char] is the actual character.

583 # The widths (/W) array can have two formats:

584 # [first_cid [w1 w2 w3]] or [first last width]

585 # Here we choose the first format and simply provide one array with one width for every cid.

586 for gid_str, actual_char in self.character_map.items():

587 uni_point = ord(actual_char)

588 # Only deal with Basic Multilingual Plane characters.

589 # TODO: Add all characters. However, this requires widths reworking first.

590 if uni_point <= 0xFFFF:

591 cid = ord(gid_str)

592 cid_hex = f"{cid:04X}"

593 uni_hex = f"{uni_point:04X}"

594 unicode_map.append(f"<{cid_hex}> <{uni_hex}>")

595

596 width = self.character_widths.get(gid_str, self.character_widths["default"])

597 widths_list.extend([NumberObject(cid), ArrayObject([NumberObject(width)])])

598

599 # Create the /ToUnicode CMap Stream

600 to_unicode_stream = StreamObject()

601 to_unicode_stream.set_data(

602 (

603 "/CIDInit /ProcSet findresource begin\n"

604 "12 dict begin\n"

605 "begincmap\n"

606 "/CIDSystemInfo << /Registry (Adobe) /Ordering (UCS) /Supplement 0 >> def\n"

607 "/CMapName /Adobe-Identity-UCS def\n"

608 "/CMapType 2 def\n"

609 "1 begincodespacerange <0000> <FFFF> endcodespacerange\n"

610 f"{len(unicode_map)} beginbfchar\n"

611 + "\n".join(unicode_map) + "\n"

612 "endbfchar\n"

613 "endcmap\n"

614 "CMapName currentdict /CMap defineresource pop\n"

615 "end end"

616 ).encode("ascii")

617 )

618

619 return widths_list, to_unicode_stream

620

621 def as_font_resource(self) -> DictionaryObject:

622 # If we have an embedded Truetype font, we assume that we need to produce a Type 2 CID font resource.

623 if self.font_descriptor.font_file and self.sub_type == "TrueType":

624 # Begin with creating the widths array (part of the descendant font) and the unicode cmap (part

625 # of the Type 0 font obect).

626 widths_list, to_unicode_stream = self._create_widths_list_and_unicode_stream()

627

628 # Create the descendant font object

629 cid_font = DictionaryObject({

630 NameObject("/Type"): NameObject("/Font"),

631 NameObject("/Subtype"): NameObject("/CIDFontType2"),

632 NameObject("/BaseFont"): NameObject(f"/{self.name}"),

633 NameObject("/CIDSystemInfo"): DictionaryObject({

634 NameObject("/Registry"): TextStringObject("Adobe"),

635 NameObject("/Ordering"): TextStringObject("Identity"),

636 NameObject("/Supplement"): NumberObject(0)

637 }),

638 NameObject("/FontDescriptor"): self.font_descriptor.as_font_descriptor_resource(),

639 NameObject("/W"): ArrayObject(widths_list),

640 NameObject("/DW"): NumberObject(self.character_widths["default"]),

641 NameObject("/CIDToGIDMap"): NameObject("/Identity")

642 })

643

644 # Create the Type 0 font object

645 return DictionaryObject({

646 NameObject("/Type"): NameObject("/Font"),

647 NameObject("/Subtype"): NameObject("/Type0"),

648 NameObject("/BaseFont"): NameObject(f"/{self.name}"),

649 NameObject("/Encoding"): NameObject("/Identity-H"),

650 NameObject("/DescendantFonts"): ArrayObject([cid_font]),

651 NameObject("/ToUnicode"): to_unicode_stream,

652 })

653

654 # Fallback: Return a font resource for one of the 14 Adobe Core fonts.

655 return DictionaryObject({

656 NameObject("/Type"): NameObject("/Font"),

657 NameObject("/Subtype"): NameObject("/Type1"),

658 NameObject("/Name"): NameObject(f"/{self.name}"),

659 NameObject("/BaseFont"): NameObject(f"/{self.name}"),

660 NameObject("/Encoding"): NameObject("/WinAnsiEncoding")

661 })

662

663 def _add_to_writer(

664 self,

665 writer: PdfWriter,

666 target_resource_dict: DictionaryObject,

667 font_resource_name: NameObject

668 ) -> IndirectObject:

669 """

670 Some objects in a font resource need to be indirect objects. This method

671 ensures that ToUnicode, FontDescriptor, FontFile, and, ultimately, the font

672 resource itself, are registered with the PdfWriter instance as indirect objects.

673 """

674 font_resource = self.as_font_resource()

675 if "/ToUnicode" in font_resource:

676 font_resource[NameObject("/ToUnicode")] = writer._add_object(font_resource["/ToUnicode"])

677

678 if "/DescendantFonts" in font_resource:

679 descendant_fonts = cast(ArrayObject, font_resource["/DescendantFonts"])

680 font_resource_dict = cast(DictionaryObject, descendant_fonts[0])

681 else:

682 font_resource_dict = font_resource

683

684 if "/FontDescriptor" in font_resource_dict:

685 font_descriptor_obj = cast(DictionaryObject, font_resource_dict["/FontDescriptor"])

686 for key in ["/FontFile", "/FontFile2", "/FontFile3"]:

687 if key in font_descriptor_obj:

688 font_descriptor_obj[NameObject(key)] = writer._add_object(font_descriptor_obj[key])

689 font_resource_dict[NameObject("/FontDescriptor")] = writer._add_object(

690 font_resource_dict["/FontDescriptor"]

691 )

692 font_resource_ref = writer._add_object(font_resource)

693 target_resource_dict[font_resource_name] = font_resource_ref

694 return font_resource_ref

695

696 def get_text_width(self, text: str = "") -> float:

697 """Sum of character widths specified in PDF font for the supplied text."""

698 return sum(

699 [self.character_widths.get(char, self.character_widths["default"]) for char in text], 0.0

700 )

701

702 def can_encode(self, text: str) -> bool:

703 """Check whether the font is able to encode a text string."""

704 try:

705 if self.character_map:

706 supported_chars = set(self.character_map.values())

707 return all(char in supported_chars for char in text)

708 if isinstance(self.encoding, str):

709 text.encode(self.encoding, "surrogatepass")

710 else:

711 supported_chars = set(self.encoding.values())

712 return all(char in supported_chars for char in text)

713

714 except UnicodeEncodeError:

715 return False

716

717 return True

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_font.py: 22%

338 statements