Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_font.py: 22%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1from __future__ import annotations
3from collections.abc import Sequence
4from dataclasses import dataclass, field
5from typing import TYPE_CHECKING, Any, cast
7from pypdf.generic import (
8 ArrayObject,
9 DictionaryObject,
10 FloatObject,
11 IndirectObject,
12 NameObject,
13 NumberObject,
14 PdfObject,
15 StreamObject,
16 TextStringObject,
17)
19from ._cmap import get_encoding
20from ._codecs.adobe_glyphs import adobe_glyphs
21from ._utils import logger_warning
22from .constants import FontFlags
23from .errors import PdfReadError
25if TYPE_CHECKING:
26 from io import BytesIO
28 from fontTools.ttLib.tables._h_e_a_d import table__h_e_a_d
29 from fontTools.ttLib.tables._p_o_s_t import table__p_o_s_t
30 from fontTools.ttLib.tables.O_S_2f_2 import table_O_S_2f_2
32 from ._writer import PdfWriter
# fontTools is optional: remember whether it could be imported so that code
# depending on it can check the flag before using TTFont.
try:
    from fontTools.ttLib import TTFont
except ImportError:
    HAS_FONTTOOLS = False
else:
    HAS_FONTTOOLS = True


# Constants from TrueType font tables that are used when deriving font flags.
# "head" table, macStyle bit field:
HEADER_MACSTYLE_ITALIC = 0x02
# "OS/2" table, fsSelection bit field:
OS2_FSSELECTION_ITALIC = 0x01
# "OS/2" table, panose.bFamilyType values:
OS2_PANOSE_BFAMILYTYPE_SCRIPT = 3
OS2_PANOSE_BFAMILYTYPE_DECORATIVE = 4
OS2_PANOSE_BFAMILYTYPE_PICTORIAL = 5
# "OS/2" table, panose.bProportion value:
OS2_PANOSE_BPROPORTION_MONOSPACED = 9
# "OS/2" table, high byte of sFamilyClass:
OS2_SFAMILYSCLASS_SCRIPTS = 10
OS2_SFAMILYSCLASS_SYMBOLIC = 12
@dataclass(frozen=True)
class FontDescriptor:
    """
    Immutable counterpart of the FontDescriptor dictionary of the PDF specification.

    It carries descriptive values (name, family, weight) as well as metrics.
    The default metrics are the mean values of the 14 core fonts, rounded to 100.
    """

    name: str = "Unknown"
    family: str = "Unknown"
    weight: str = "Unknown"

    ascent: float = 700.0
    descent: float = -200.0
    cap_height: float = 600.0
    x_height: float = 500.0
    italic_angle: float = 0.0  # Upright, i.e. not italic
    flags: int = 32  # Non-serif, non-symbolic, not fixed width
    bbox: tuple[float, float, float, float] = field(default_factory=lambda: (-100.0, -200.0, 1000.0, 900.0))
    font_file: StreamObject | None = None

    def as_font_descriptor_resource(self) -> DictionaryObject:
        """
        Build the /FontDescriptor dictionary for this descriptor.

        Returns:
            A new dictionary holding the name, flags, bounding box and metrics.
            When a font file is set, it is added as /FontFile2.
        """
        entries: dict[Any, Any] = {
            NameObject("/Type"): NameObject("/FontDescriptor"),
            NameObject("/FontName"): NameObject(f"/{self.name}"),
            NameObject("/Flags"): NumberObject(self.flags),
            NameObject("/FontBBox"): ArrayObject([FloatObject(coordinate) for coordinate in self.bbox]),
        }
        # The remaining metrics are all plain floats; keep this key order.
        for pdf_key, metric in (
            ("/ItalicAngle", self.italic_angle),
            ("/Ascent", self.ascent),
            ("/Descent", self.descent),
            ("/CapHeight", self.cap_height),
            ("/XHeight", self.x_height),
        ):
            entries[NameObject(pdf_key)] = FloatObject(metric)
        resource = DictionaryObject(entries)

        if self.font_file:
            # For now, the stream is assumed to be a TrueType font (FontFile2)
            resource[NameObject("/FontFile2")] = self.font_file

        return resource
@dataclass(frozen=True)
class CoreFontMetrics:
    """Immutable pairing of a font descriptor with a table of character widths."""

    # Descriptor holding the descriptive and metric values of the font
    font_descriptor: FontDescriptor
    # Widths keyed by string
    character_widths: dict[str, int]
@dataclass
class Font:
    """
    Font information used for text extraction and for building text
    appearance streams.

    Attributes:
        name: Font name, derived from font["/BaseFont"]
        encoding: Font encoding; either a codec name or a mapping from
            character code to character
        character_map: The font's character map
        sub_type: The font type, such as Type1, TrueType, or Type3.
        font_descriptor: The font descriptor with descriptive and metric values
        character_widths: A mapping of characters to widths; the "default"
            entry is used for characters without a width of their own
        space_width: The width of a space, or an approximation
        space_char: The key that stands for the space character when looking
            up widths
        interpretable: Default True. If False, the font glyphs cannot
            be translated to characters, e.g. Type3 fonts that do not define
            a '/ToUnicode' mapping.

    """

    name: str
    encoding: str | dict[int, str]
    character_map: dict[Any, Any] = field(default_factory=dict)
    sub_type: str = "Unknown"
    font_descriptor: FontDescriptor = field(default_factory=FontDescriptor)
    character_widths: dict[str, int] = field(default_factory=lambda: {"default": 500})
    space_width: float | int = 250
    space_char: str = " "
    interpretable: bool = True
131 @staticmethod
132 def _collect_tt_t1_character_widths(
133 pdf_font_dict: DictionaryObject,
134 char_map: dict[Any, Any],
135 encoding: str | dict[int, str],
136 current_widths: dict[str, int]
137 ) -> None:
138 """Parses a TrueType or Type1 font's /Widths array from a font dictionary and updates character widths"""
139 widths_array = cast(ArrayObject, pdf_font_dict["/Widths"])
140 first_char = pdf_font_dict.get("/FirstChar", 0)
141 for idx, width in enumerate(widths_array):
142 current_widths[chr(idx + first_char)] = int(width)
144 @staticmethod
145 def _collect_cid_character_widths(
146 d_font: DictionaryObject, char_map: dict[Any, Any], current_widths: dict[str, int]
147 ) -> None:
148 """Parses the /W array from a DescendantFont dictionary and updates character widths."""
149 # /W width definitions have two valid formats which can be mixed and matched:
150 # (1) A character start index followed by a list of widths, e.g.
151 # `45 [500 600 700]` applies widths 500, 600, 700 to characters 45-47.
152 # (2) A character start index, a character stop index, and a width, e.g.
153 # `45 65 500` applies width 500 to characters 45-65.
154 skip_count = 0
155 _w = d_font.get("/W", [])
156 for idx, w_entry in enumerate(_w):
157 w_entry = w_entry.get_object()
158 if skip_count:
159 skip_count -= 1
160 continue
161 if not isinstance(w_entry, (int, float)):
162 # We should never get here due to skip_count above. But
163 # sometimes we do.
164 logger_warning(
165 "Expected numeric value for width, got %(w_entry)s. Ignoring it.",
166 source=__name__,
167 w_entry=w_entry,
168 )
169 continue
170 # check for format (1): `int [int int int int ...]`
171 w_next_entry = _w[idx + 1].get_object()
172 if isinstance(w_next_entry, Sequence):
173 start_idx, width_list = w_entry, w_next_entry
174 current_widths.update(
175 {
176 chr(_cidx): _width
177 for _cidx, _width in zip(
178 range(
179 cast(int, start_idx),
180 cast(int, start_idx) + len(width_list),
181 1,
182 ),
183 width_list,
184 )
185 }
186 )
187 skip_count = 1
188 # check for format (2): `int int int`
189 elif isinstance(w_next_entry, (int, float)) and isinstance(
190 _w[idx + 2].get_object(), (int, float)
191 ):
192 start_idx, stop_idx, const_width = (
193 w_entry,
194 w_next_entry,
195 _w[idx + 2].get_object(),
196 )
197 current_widths.update(
198 {
199 chr(_cidx): const_width
200 for _cidx in range(
201 cast(int, start_idx), cast(int, stop_idx + 1), 1
202 )
203 }
204 )
205 skip_count = 2
206 else:
207 # This handles the case of out of bounds (reaching the end of the width definitions
208 # while expecting more elements).
209 logger_warning(
210 "Invalid font width definition. Last element: %(w_entry)s.",
211 source=__name__,
212 w_entry=w_entry,
213 )
215 @staticmethod
216 def _get_space_char(
217 encoding: str | dict[int, str],
218 character_map: dict[Any, Any],
219 ) -> str:
220 space_char = " "
221 if isinstance(encoding, dict):
222 for char_code, char_str in encoding.items():
223 if char_str == space_char:
224 return chr(char_code)
226 for glyph_id, char_str in character_map.items():
227 if char_str == space_char:
228 return str(glyph_id)
230 return space_char
232 @staticmethod
233 def _add_default_width(current_widths: dict[str, int], flags: int, space_char: str) -> None:
234 if not current_widths:
235 current_widths["default"] = 500
236 return
238 if space_char in current_widths and current_widths[space_char] != 0:
239 # Setting default to once or twice the space width, depending on fixed pitch
240 if (flags & FontFlags.FIXED_PITCH) == FontFlags.FIXED_PITCH:
241 current_widths["default"] = current_widths[space_char]
242 return
244 current_widths["default"] = int(2 * current_widths[space_char])
245 return
247 # Use the average width of existing glyph widths
248 valid_widths = [w for w in current_widths.values() if w > 0]
249 current_widths["default"] = sum(valid_widths) // len(valid_widths) if valid_widths else 500
251 @staticmethod
252 def _add_space_width(
253 character_widths: dict[str, int],
254 flags: int,
255 space_char: str
256 ) -> int:
257 space_width = character_widths.get(space_char, 0)
258 if space_width != 0:
259 return space_width
261 if (flags & FontFlags.FIXED_PITCH) == FontFlags.FIXED_PITCH:
262 return character_widths["default"]
264 return character_widths["default"] // 2
266 @staticmethod
267 def _parse_font_descriptor(font_descriptor_obj: DictionaryObject) -> dict[str, Any]:
268 font_descriptor_kwargs: dict[Any, Any] = {}
269 for source_key, target_key in [
270 ("/FontName", "name"),
271 ("/FontFamily", "family"),
272 ("/FontWeight", "weight"),
273 ("/Ascent", "ascent"),
274 ("/Descent", "descent"),
275 ("/CapHeight", "cap_height"),
276 ("/XHeight", "x_height"),
277 ("/ItalicAngle", "italic_angle"),
278 ("/Flags", "flags"),
279 ("/FontBBox", "bbox")
280 ]:
281 if source_key in font_descriptor_obj:
282 font_descriptor_kwargs[target_key] = font_descriptor_obj[source_key]
283 # Handle missing bbox gracefully - PDFs may have fonts without valid bounding boxes
284 if "bbox" in font_descriptor_kwargs:
285 bbox_tuple = tuple(map(float, font_descriptor_kwargs["bbox"]))
286 assert len(bbox_tuple) == 4, bbox_tuple
287 font_descriptor_kwargs["bbox"] = bbox_tuple
289 # Find the binary stream for this font if there is one
290 for source_key in ["/FontFile", "/FontFile2", "/FontFile3"]:
291 if source_key in font_descriptor_obj:
292 if "font_file" in font_descriptor_kwargs:
293 raise PdfReadError(f"More than one /FontFile found in {font_descriptor_obj}")
295 try:
296 font_file = font_descriptor_obj[source_key].get_object()
297 font_descriptor_kwargs["font_file"] = font_file
298 except PdfReadError as e:
299 logger_warning(
300 "Failed to get %(source_key)r in %(font_descriptor_obj)s: %(error)s",
301 source=__name__,
302 source_key=source_key,
303 font_descriptor_obj=font_descriptor_obj,
304 error=e,
305 )
306 return font_descriptor_kwargs
308 @classmethod
309 def from_font_resource(
310 cls,
311 pdf_font_dict: DictionaryObject,
312 ) -> Font:
313 from pypdf._codecs.core_font_metrics import CORE_FONT_METRICS # noqa: PLC0415
315 # Can collect base_font, name and encoding directly from font resource
316 name = pdf_font_dict.get("/BaseFont", "Unknown").removeprefix("/")
317 sub_type = pdf_font_dict.get("/Subtype", "Unknown").removeprefix("/")
318 encoding, character_map = get_encoding(pdf_font_dict)
319 font_descriptor = None
320 character_widths: dict[str, int] = {}
321 interpretable = True
323 # Deal with fonts by type; Type1, TrueType and certain Type3
324 if pdf_font_dict.get("/Subtype") in ("/Type1", "/MMType1", "/TrueType", "/Type3"):
325 # Type3 fonts that do not specify a "/ToUnicode" mapping cannot be
326 # reliably converted into character codes unless all named chars
327 # in /CharProcs map to a standard adobe glyph. See §9.10.2 of the
328 # PDF 1.7 standard.
329 if sub_type == "Type3" and "/ToUnicode" not in pdf_font_dict:
330 interpretable = all(
331 cname in adobe_glyphs
332 for cname in pdf_font_dict.get("/CharProcs") or []
333 )
334 if interpretable: # Save some overhead if font is not interpretable
335 if "/Widths" in pdf_font_dict:
336 cls._collect_tt_t1_character_widths(
337 pdf_font_dict, character_map, encoding, character_widths
338 )
340 elif name in CORE_FONT_METRICS:
341 font_descriptor = CORE_FONT_METRICS[name].font_descriptor
342 if isinstance(encoding, dict):
343 for code, character in encoding.items():
344 # Look up the width using the glyph name from the encoding
345 if character in CORE_FONT_METRICS[name].character_widths:
346 character_widths[chr(code)] = CORE_FONT_METRICS[name].character_widths[character]
347 else:
348 for code in range(256):
349 character = chr(code)
350 if character in CORE_FONT_METRICS[name].character_widths:
351 character_widths[character] = CORE_FONT_METRICS[name].character_widths[character]
352 if "/FontDescriptor" in pdf_font_dict:
353 font_descriptor_obj = pdf_font_dict.get("/FontDescriptor", DictionaryObject()).get_object()
354 if "/MissingWidth" in font_descriptor_obj:
355 character_widths["default"] = cast(int, font_descriptor_obj["/MissingWidth"].get_object())
356 font_descriptor = FontDescriptor(**cls._parse_font_descriptor(font_descriptor_obj))
357 elif "/FontBBox" in pdf_font_dict:
358 # For Type3 without Font Descriptor but with FontBBox, see Table 110 in the PDF specification 2.0
359 bbox_tuple = tuple(map(float, cast(ArrayObject, pdf_font_dict["/FontBBox"])))
360 assert len(bbox_tuple) == 4, bbox_tuple
361 font_descriptor = FontDescriptor(name=name, bbox=bbox_tuple)
363 else:
364 # Composite font or CID font - CID fonts have a /W array mapping character codes
365 # to widths stashed in /DescendantFonts. No need to test for /DescendantFonts though,
366 # because all other fonts have already been dealt with.
367 d_font: DictionaryObject
368 for d_font_idx, d_font in enumerate(
369 cast(ArrayObject, pdf_font_dict["/DescendantFonts"])
370 ):
371 d_font = cast(DictionaryObject, d_font.get_object())
372 cast(ArrayObject, pdf_font_dict["/DescendantFonts"])[d_font_idx] = d_font
373 cls._collect_cid_character_widths(
374 d_font, character_map, character_widths
375 )
376 if "/DW" in d_font:
377 character_widths["default"] = cast(int, d_font["/DW"].get_object())
378 font_descriptor_obj = d_font.get("/FontDescriptor", DictionaryObject()).get_object()
379 font_descriptor = FontDescriptor(**cls._parse_font_descriptor(font_descriptor_obj))
381 if not font_descriptor:
382 font_descriptor = FontDescriptor(name=name)
384 space_char = cls._get_space_char(encoding, character_map)
385 if character_widths.get("default", 0) == 0:
386 cls._add_default_width(character_widths, font_descriptor.flags, space_char)
387 space_width = cls._add_space_width(character_widths, font_descriptor.flags, space_char)
389 return cls(
390 name=name,
391 sub_type=sub_type,
392 encoding=encoding,
393 font_descriptor=font_descriptor,
394 character_map=character_map,
395 character_widths=character_widths,
396 space_width=space_width,
397 space_char=space_char,
398 interpretable=interpretable
399 )
401 @staticmethod
402 def _font_flags_from_truetype_font_tables(
403 header: table__h_e_a_d,
404 postscript: table__p_o_s_t,
405 os2: table_O_S_2f_2
406 ) -> int:
407 # Get the font flags
408 if os2:
409 panose = os2.panose
410 # sFamilyClass is a two-byte field. The high byte describes the family class, whereas the low
411 # byte only describes the subclass. We only need the high byte, hence the bit shift below:
412 family_class = os2.sFamilyClass >> 8
413 flags: int = 0
415 # ITALIC
416 if header.macStyle & HEADER_MACSTYLE_ITALIC or (os2 and os2.fsSelection & OS2_FSSELECTION_ITALIC):
417 flags |= FontFlags.ITALIC
418 if postscript:
419 italic_angle = postscript.italicAngle
420 if italic_angle != 0.0:
421 flags |= FontFlags.ITALIC
423 # FIXED_PITCH
424 if (
425 (os2 and panose.bProportion == OS2_PANOSE_BPROPORTION_MONOSPACED) or
426 (postscript and postscript.isFixedPitch > 0) # Actually 1, but originally (older versions of the TTF
427 ): # specification) any non-zero value signified monospace.
428 flags |= FontFlags.FIXED_PITCH
430 # SCRIPT
431 if os2 and (
432 family_class == OS2_SFAMILYSCLASS_SCRIPTS or panose.bFamilyType == OS2_PANOSE_BFAMILYTYPE_SCRIPT
433 ):
434 flags |= FontFlags.SCRIPT
436 # SERIF
437 if os2 and (
438 2 <= panose.bSerifStyle <= 10
439 or 1 <= family_class <= 5 or family_class == 7 # 6 is reserved, all 8 and above are not serif
440 ):
441 flags |= FontFlags.SERIF
443 # SYMBOLIC
444 if os2 and (
445 family_class == OS2_SFAMILYSCLASS_SYMBOLIC or
446 panose.bFamilyType in {OS2_PANOSE_BFAMILYTYPE_DECORATIVE, OS2_PANOSE_BFAMILYTYPE_PICTORIAL}
447 ):
448 flags |= FontFlags.SYMBOLIC
449 else:
450 flags |= FontFlags.NONSYMBOLIC
452 return flags
454 @classmethod
455 def from_truetype_font_file(cls, font_file: BytesIO) -> Font:
456 if not HAS_FONTTOOLS:
457 raise ImportError("The 'fontTools' library is required to use 'from_truetype_font_file'")
458 with TTFont(font_file) as tt_font_object:
459 # See Chapter 6 of the TrueType reference manual for the definition of the head, OS/2 and post tables:
460 # https://developer.apple.com/fonts/TrueType-Reference-Manual/RM06/Chap6head.html
461 # https://developer.apple.com/fonts/TrueType-Reference-Manual/RM06/Chap6OS2.html
462 # https://developer.apple.com/fonts/TrueType-Reference-Manual/RM06/Chap6post.html
463 header = tt_font_object["head"]
464 horizontal_header = tt_font_object["hhea"]
465 metrics = tt_font_object["hmtx"].metrics
467 # Collect additional font tables to derive font information
468 postscript = tt_font_object.get("post", None)
469 os2 = tt_font_object.get("OS/2", None)
471 # Get the scaling factor to convert font file's units per em to PDF's 1000 units per em
472 units_per_em = header.unitsPerEm
473 scale_factor = 1000.0 / units_per_em
475 # Get the font descriptor
476 font_descriptor_kwargs: dict[Any, Any] = {}
477 names = tt_font_object.get("name", None)
478 if names:
479 font_descriptor_kwargs["name"] = names.getBestFullName()
480 font_descriptor_kwargs["family"] = names.getBestFamilyName()
481 font_descriptor_kwargs["weight"] = names.getBestSubFamilyName()
482 font_descriptor_kwargs["ascent"] = int(round(horizontal_header.ascent * scale_factor, 0))
483 font_descriptor_kwargs["descent"] = int(round(horizontal_header.descent * scale_factor, 0))
484 if os2:
485 try:
486 font_descriptor_kwargs["cap_height"] = int(round(os2.sCapHeight * scale_factor, 0))
487 font_descriptor_kwargs["x_height"] = int(round(os2.sxHeight * scale_factor, 0))
488 except AttributeError:
489 pass
491 font_descriptor_kwargs["flags"] = cls._font_flags_from_truetype_font_tables(header, postscript, os2)
493 font_descriptor_kwargs["bbox"] = (
494 round(header.xMin * scale_factor, 0),
495 round(header.yMin * scale_factor, 0),
496 round(header.xMax * scale_factor, 0),
497 round(header.yMax * scale_factor, 0)
498 )
500 font_file_data = StreamObject()
501 font_file_raw_bytes = font_file.getvalue()
502 font_file_data.set_data(font_file_raw_bytes)
503 font_file_data.update({NameObject("/Length1"): NumberObject(len(font_file_raw_bytes))})
504 font_descriptor_kwargs["font_file"] = font_file_data
506 font_descriptor = FontDescriptor(**font_descriptor_kwargs)
507 encoding = "utf_16_be" # Assume unicode
509 character_widths: dict[str, int] = {}
510 character_map: dict[str, str] = {}
512 glyph_order = tt_font_object.getGlyphOrder()
513 # Note that one glyph can be mapped to multiple unicode code points. However, buildReversedMin()
514 # creates a dictionary mapping glyphs to the minimum Unicode codepoint.
515 tt_font_cmap_table = tt_font_object.get("cmap")
516 if tt_font_cmap_table:
517 reverse_cmap = tt_font_cmap_table.buildReversedMin()
518 for gid, glyph in enumerate(glyph_order):
519 char_code = reverse_cmap.get(glyph)
520 if char_code is None:
521 continue
522 char = chr(char_code)
523 gid = tt_font_object.getGlyphID(glyph)
524 # The following is to comply with how font_glyph_byte_map works in _appearance_stream.py
525 gid_bytes = gid.to_bytes(2, "big")
526 gid_key_string = gid_bytes.decode("utf-16-be", "surrogatepass")
527 character_map[gid_key_string] = char
528 character_widths[gid_key_string] = int(round(metrics[glyph][0] * scale_factor, 0))
529 else:
530 raise PdfReadError("Font file does not have a cmap table")
532 space_char = cls._get_space_char(encoding, character_map)
533 cls._add_default_width(character_widths, font_descriptor_kwargs["flags"], space_char)
534 space_width = cls._add_space_width(
535 character_widths, font_descriptor_kwargs["flags"], space_char
536 )
538 return cls(
539 name=font_descriptor.name,
540 sub_type="TrueType",
541 encoding=encoding,
542 font_descriptor=font_descriptor,
543 character_map=character_map,
544 character_widths=character_widths,
545 space_width=space_width,
546 space_char=space_char,
547 interpretable=True
548 )
550 def _get_typographic_maps(self) -> tuple[dict[str, str], dict[str, bytes]]:
551 """
552 Generates maps to translate input unicode text to bytes in two steps:
553 Unicode code point -> raw_character (reverse cmap) -> PDF bytes (encoding cmap).
554 """
555 reverse_cmap = {}
556 encoding_cmap = {}
558 if isinstance(self.encoding, str):
559 for glyph_id, unicode_char in self.character_map.items():
560 glyph_id_str = str(glyph_id)
561 reverse_cmap[unicode_char] = glyph_id_str
562 encoding_cmap[glyph_id_str] = glyph_id_str.encode(self.encoding)
563 else:
564 for character_code, unicode_char in self.encoding.items():
565 character_str = chr(character_code)
566 reverse_cmap[unicode_char] = character_str
567 encoding_cmap[character_str] = bytes((character_code,))
569 unicode_to_bytes = {
570 unicode_char: bytes((character_code,)) for character_code, unicode_char in self.encoding.items()
571 }
572 for glyph_id, unicode_char in self.character_map.items(): # This code is not covered nor tested
573 reverse_cmap[unicode_char] = glyph_id
574 encoding_cmap[glyph_id] = unicode_to_bytes.get(unicode_char, bytes((glyph_id,)))
576 return reverse_cmap, encoding_cmap
578 def _create_widths_list_and_unicode_stream(self) -> tuple[list[PdfObject], StreamObject]:
579 widths_list = []
580 unicode_map = []
581 # In the loop, char is the decoded GID string (the reverse unicode hack)
582 # and character_map[char] is the actual character.
583 # The widths (/W) array can have two formats:
584 # [first_cid [w1 w2 w3]] or [first last width]
585 # Here we choose the first format and simply provide one array with one width for every cid.
586 for gid_str, actual_char in self.character_map.items():
587 uni_point = ord(actual_char)
588 # Only deal with Basic Multilingual Plane characters.
589 # TODO: Add all characters. However, this requires widths reworking first.
590 if uni_point <= 0xFFFF:
591 cid = ord(gid_str)
592 cid_hex = f"{cid:04X}"
593 uni_hex = f"{uni_point:04X}"
594 unicode_map.append(f"<{cid_hex}> <{uni_hex}>")
596 width = self.character_widths.get(gid_str, self.character_widths["default"])
597 widths_list.extend([NumberObject(cid), ArrayObject([NumberObject(width)])])
599 # Create the /ToUnicode CMap Stream
600 to_unicode_stream = StreamObject()
601 to_unicode_stream.set_data(
602 (
603 "/CIDInit /ProcSet findresource begin\n"
604 "12 dict begin\n"
605 "begincmap\n"
606 "/CIDSystemInfo << /Registry (Adobe) /Ordering (UCS) /Supplement 0 >> def\n"
607 "/CMapName /Adobe-Identity-UCS def\n"
608 "/CMapType 2 def\n"
609 "1 begincodespacerange <0000> <FFFF> endcodespacerange\n"
610 f"{len(unicode_map)} beginbfchar\n"
611 + "\n".join(unicode_map) + "\n"
612 "endbfchar\n"
613 "endcmap\n"
614 "CMapName currentdict /CMap defineresource pop\n"
615 "end end"
616 ).encode("ascii")
617 )
619 return widths_list, to_unicode_stream
621 def as_font_resource(self) -> DictionaryObject:
622 # If we have an embedded Truetype font, we assume that we need to produce a Type 2 CID font resource.
623 if self.font_descriptor.font_file and self.sub_type == "TrueType":
624 # Begin with creating the widths array (part of the descendant font) and the unicode cmap (part
625 # of the Type 0 font obect).
626 widths_list, to_unicode_stream = self._create_widths_list_and_unicode_stream()
628 # Create the descendant font object
629 cid_font = DictionaryObject({
630 NameObject("/Type"): NameObject("/Font"),
631 NameObject("/Subtype"): NameObject("/CIDFontType2"),
632 NameObject("/BaseFont"): NameObject(f"/{self.name}"),
633 NameObject("/CIDSystemInfo"): DictionaryObject({
634 NameObject("/Registry"): TextStringObject("Adobe"),
635 NameObject("/Ordering"): TextStringObject("Identity"),
636 NameObject("/Supplement"): NumberObject(0)
637 }),
638 NameObject("/FontDescriptor"): self.font_descriptor.as_font_descriptor_resource(),
639 NameObject("/W"): ArrayObject(widths_list),
640 NameObject("/DW"): NumberObject(self.character_widths["default"]),
641 NameObject("/CIDToGIDMap"): NameObject("/Identity")
642 })
644 # Create the Type 0 font object
645 return DictionaryObject({
646 NameObject("/Type"): NameObject("/Font"),
647 NameObject("/Subtype"): NameObject("/Type0"),
648 NameObject("/BaseFont"): NameObject(f"/{self.name}"),
649 NameObject("/Encoding"): NameObject("/Identity-H"),
650 NameObject("/DescendantFonts"): ArrayObject([cid_font]),
651 NameObject("/ToUnicode"): to_unicode_stream,
652 })
654 # Fallback: Return a font resource for one of the 14 Adobe Core fonts.
655 return DictionaryObject({
656 NameObject("/Type"): NameObject("/Font"),
657 NameObject("/Subtype"): NameObject("/Type1"),
658 NameObject("/Name"): NameObject(f"/{self.name}"),
659 NameObject("/BaseFont"): NameObject(f"/{self.name}"),
660 NameObject("/Encoding"): NameObject("/WinAnsiEncoding")
661 })
663 def _add_to_writer(
664 self,
665 writer: PdfWriter,
666 target_resource_dict: DictionaryObject,
667 font_resource_name: NameObject
668 ) -> IndirectObject:
669 """
670 Some objects in a font resource need to be indirect objects. This method
671 ensures that ToUnicode, FontDescriptor, FontFile, and, ultimately, the font
672 resource itself, are registered with the PdfWriter instance as indirect objects.
673 """
674 font_resource = self.as_font_resource()
675 if "/ToUnicode" in font_resource:
676 font_resource[NameObject("/ToUnicode")] = writer._add_object(font_resource["/ToUnicode"])
678 if "/DescendantFonts" in font_resource:
679 descendant_fonts = cast(ArrayObject, font_resource["/DescendantFonts"])
680 font_resource_dict = cast(DictionaryObject, descendant_fonts[0])
681 else:
682 font_resource_dict = font_resource
684 if "/FontDescriptor" in font_resource_dict:
685 font_descriptor_obj = cast(DictionaryObject, font_resource_dict["/FontDescriptor"])
686 for key in ["/FontFile", "/FontFile2", "/FontFile3"]:
687 if key in font_descriptor_obj:
688 font_descriptor_obj[NameObject(key)] = writer._add_object(font_descriptor_obj[key])
689 font_resource_dict[NameObject("/FontDescriptor")] = writer._add_object(
690 font_resource_dict["/FontDescriptor"]
691 )
692 font_resource_ref = writer._add_object(font_resource)
693 target_resource_dict[font_resource_name] = font_resource_ref
694 return font_resource_ref
696 def get_text_width(self, text: str = "") -> float:
697 """Sum of character widths specified in PDF font for the supplied text."""
698 return sum(
699 [self.character_widths.get(char, self.character_widths["default"]) for char in text], 0.0
700 )
702 def can_encode(self, text: str) -> bool:
703 """Check whether the font is able to encode a text string."""
704 try:
705 if self.character_map:
706 supported_chars = set(self.character_map.values())
707 return all(char in supported_chars for char in text)
708 if isinstance(self.encoding, str):
709 text.encode(self.encoding, "surrogatepass")
710 else:
711 supported_chars = set(self.encoding.values())
712 return all(char in supported_chars for char in text)
714 except UnicodeEncodeError:
715 return False
717 return True