Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/

1from __future__ import annotations

3from collections.abc import Sequence

4from dataclasses import dataclass, field

5from typing import TYPE_CHECKING, Any, cast

7from pypdf.generic import ArrayObject, DictionaryObject, NameObject, NumberObject, StreamObject

9from ._cmap import get_encoding

10from ._codecs.adobe_glyphs import adobe_glyphs

11from ._utils import logger_warning

12from .constants import FontFlags

13from .errors import PdfReadError

15if TYPE_CHECKING:

16 from io import BytesIO

18 from fontTools.ttLib.tables._h_e_a_d import table__h_e_a_d

19 from fontTools.ttLib.tables._p_o_s_t import table__p_o_s_t

20 from fontTools.ttLib.tables.O_S_2f_2 import table_O_S_2f_2

22try:

23 from fontTools.ttLib import TTFont

24 HAS_FONTTOOLS = True

25except ImportError:

26 HAS_FONTTOOLS = False

# Bit masks and enumeration values read from TrueType font tables ("head" and "OS/2").
# They are used below to derive the PDF font flags from an embedded font file.

# Bit masks, tested against head.macStyle and OS/2.fsSelection respectively:
HEADER_MACSTYLE_ITALIC = 1 << 1
OS2_FSSELECTION_ITALIC = 1 << 0

# Values compared against OS/2.panose.bFamilyType:
OS2_PANOSE_BFAMILYTYPE_SCRIPT = 3
OS2_PANOSE_BFAMILYTYPE_DECORATIVE = 4
OS2_PANOSE_BFAMILYTYPE_PICTORIAL = 5

# Value compared against OS/2.panose.bProportion:
OS2_PANOSE_BPROPORTION_MONOSPACED = 9

# Values compared against the high byte of OS/2.sFamilyClass:
OS2_SFAMILYSCLASS_SCRIPTS = 10
OS2_SFAMILYSCLASS_SYMBOLIC = 12

@dataclass(frozen=True)
class FontDescriptor:
    """
    Immutable view of a PDF FontDescriptor dictionary.

    Holds the descriptive entries (name, family, weight, flags) together
    with the metric entries (ascent, descent, heights, bounding box) and,
    optionally, the embedded font program.

    Every field has a default. The defaults are derived from the mean
    values of the 14 core fonts, rounded to 100.
    """

    # Descriptive entries
    name: str = "Unknown"
    family: str = "Unknown"
    weight: str = "Unknown"

    # Metric entries
    ascent: float = 700.0
    descent: float = -200.0
    cap_height: float = 600.0
    x_height: float = 500.0
    italic_angle: float = 0.0  # Non-italic
    flags: int = 32  # Non-serif, non-symbolic, not fixed width
    bbox: tuple[float, float, float, float] = field(
        default_factory=lambda: (-100.0, -200.0, 1000.0, 900.0)
    )

    # Embedded font program stream, if the descriptor carries one
    font_file: StreamObject | None = None

@dataclass(frozen=True)
class CoreFontMetrics:
    """Immutable pairing of a font descriptor with a table of character widths."""

    # Descriptive and metric information of the font
    font_descriptor: FontDescriptor
    # Maps a character to its width
    character_widths: dict[str, int]

@dataclass
class Font:
    """
    A font object for use during text extraction and for producing
    text appearance streams.

    Attributes:
        name: Font name, derived from font["/BaseFont"]
        character_map: The font's character map
        encoding: Font encoding
        sub_type: The font type, such as Type1, TrueType, or Type3.
        font_descriptor: Descriptive and metric information about the font
        character_widths: A mapping of characters to widths
        space_width: The width of a space, or an approximation
        interpretable: Default True. If False, the font glyphs cannot
            be translated to characters, e.g. Type3 fonts that do not define
            a '/ToUnicode' mapping.

    """

    name: str
    encoding: str | dict[int, str]
    character_map: dict[Any, Any] = field(default_factory=dict)
    sub_type: str = "Unknown"
    font_descriptor: FontDescriptor = field(default_factory=FontDescriptor)
    character_widths: dict[str, int] = field(default_factory=lambda: {"default": 500})
    space_width: float | int = 250
    interpretable: bool = True

    @staticmethod
    def _collect_tt_t1_character_widths(
        pdf_font_dict: DictionaryObject,
        char_map: dict[Any, Any],
        encoding: str | dict[int, str],
        current_widths: dict[str, int]
    ) -> None:
        """
        Parse a TrueType or Type1 font's /Widths array and update character widths.

        Args:
            pdf_font_dict: Font dictionary; must contain "/Widths". "/FirstChar"
                is the code of the first entry and defaults to 0.
            char_map: Maps raw (decoded) characters to Unicode characters.
            encoding: Either a codec name or a mapping of character codes to
                characters.
            current_widths: Updated in place with the collected widths.

        Entries whose character code does not fit into a single byte, or
        that cannot be decoded with ``encoding``, are skipped. A single
        warning reports how many entries were skipped.
        """
        widths_array = cast(ArrayObject, pdf_font_dict["/Widths"])
        first_char = pdf_font_dict.get("/FirstChar", 0)
        if not isinstance(encoding, str):
            # This means that encoding is a dict
            current_widths.update({
                encoding.get(idx + first_char, chr(idx + first_char)): width
                for idx, width in enumerate(widths_array)
            })
            return

        # We map the character code directly to the character
        # using the string encoding
        skipped = 0
        for idx, width in enumerate(widths_array):
            # Often "idx == 0" will denote the .notdef character, but we add it anyway
            char_code = idx + first_char  # This is a raw code
            if not 0 <= char_code <= 255:
                # bytes([...]) only accepts values in range(256); a malformed
                # /FirstChar or an overlong /Widths array must not abort parsing.
                skipped += 1
                continue
            try:
                # Get the "raw" character or byte representation
                raw_char = bytes([char_code]).decode(encoding, "surrogatepass")
            except UnicodeDecodeError:
                # A single byte is not decodable with every codec
                skipped += 1
                continue
            # Translate raw_char to the REAL Unicode character using the char_map
            unicode_char = char_map.get(raw_char)
            if unicode_char:
                current_widths[unicode_char] = int(width)
            else:
                current_widths[raw_char] = int(width)
        if skipped:
            logger_warning(
                f"Ignored {skipped} /Widths entries that cannot be decoded as "
                f"single-byte character codes using {encoding!r}.",
                __name__
            )

    @staticmethod
    def _collect_cid_character_widths(
        d_font: DictionaryObject, char_map: dict[Any, Any], current_widths: dict[str, int]
    ) -> None:
        """
        Parse the /W array from a DescendantFont dictionary and update character widths.

        Args:
            d_font: Descendant font dictionary; a missing "/W" is treated as empty.
            char_map: Character map. Only its single-character string keys are used.
            current_widths: Updated in place with the collected widths.

        Malformed definitions (non-numeric start values, or an array that
        ends in the middle of a definition) are reported through a warning
        and ignored.
        """
        # Map code points to their target characters. Keys that are not exactly
        # one character long cannot be passed to ord() and are left out.
        ord_map = {
            ord(_target): _surrogate
            for _target, _surrogate in char_map.items()
            if isinstance(_target, str) and len(_target) == 1
        }
        # /W width definitions have two valid formats which can be mixed and matched:
        #   (1) A character start index followed by a list of widths, e.g.
        #       `45 [500 600 700]` applies widths 500, 600, 700 to characters 45-47.
        #   (2) A character start index, a character stop index, and a width, e.g.
        #       `45 65 500` applies width 500 to characters 45-65.
        skip_count = 0
        _w = d_font.get("/W", [])

        def entry_at(position: int) -> Any:
            """Return the resolved /W entry at ``position``, or None past the end."""
            try:
                return _w[position].get_object()
            except IndexError:
                return None

        for idx, w_entry in enumerate(_w):
            w_entry = w_entry.get_object()
            if skip_count:
                skip_count -= 1
                continue
            if not isinstance(w_entry, (int, float)):
                # We should never get here due to skip_count above. But
                # sometimes we do.
                logger_warning(f"Expected numeric value for width, got {w_entry}. Ignoring it.", __name__)
                continue
            w_next_entry = entry_at(idx + 1)
            # check for format (1): `int [int int int int ...]`
            if isinstance(w_next_entry, Sequence):
                # int() because range()/enumerate() reject float start values
                current_widths.update(
                    {
                        ord_map[_cidx]: _width
                        for _cidx, _width in enumerate(w_next_entry, int(w_entry))
                        if _cidx in ord_map
                    }
                )
                skip_count = 1
            # check for format (2): `int int int`
            elif isinstance(w_next_entry, (int, float)) and isinstance(
                const_width := entry_at(idx + 2), (int, float)
            ):
                current_widths.update(
                    {
                        ord_map[_cidx]: const_width
                        for _cidx in range(int(w_entry), int(w_next_entry) + 1)
                        if _cidx in ord_map
                    }
                )
                skip_count = 2
            else:
                # This handles the case of out of bounds (reaching the end of the width definitions
                # while expecting more elements).
                logger_warning(
                    f"Invalid font width definition. Last element: {w_entry}.",
                    __name__
                )

    @staticmethod
    def _add_default_width(current_widths: dict[str, int], flags: int) -> None:
        """
        Set ``current_widths["default"]``, the width used for unknown characters.

        * No widths at all: 500.
        * A non-zero space width is known: once the space width for fixed
          pitch fonts, twice the space width otherwise.
        * Otherwise: the (floored) average of all positive widths, or 500
          if there are none.

        Args:
            current_widths: Updated in place.
            flags: Font flags; only ``FontFlags.FIXED_PITCH`` is evaluated.
        """
        if not current_widths:
            current_widths["default"] = 500
            return

        if " " in current_widths and current_widths[" "] != 0:
            # Setting default to once or twice the space width, depending on fixed pitch
            if (flags & FontFlags.FIXED_PITCH) == FontFlags.FIXED_PITCH:
                current_widths["default"] = current_widths[" "]
                return

            current_widths["default"] = int(2 * current_widths[" "])
            return

        # Use the average width of existing glyph widths
        valid_widths = [w for w in current_widths.values() if w > 0]
        current_widths["default"] = sum(valid_widths) // len(valid_widths) if valid_widths else 500

    @staticmethod
    def _add_space_width(character_widths: dict[str, int], flags: int) -> int:
        """
        Return the width of a space without modifying ``character_widths``.

        A known, non-zero width of " " is returned as is. Otherwise the
        width is approximated from ``character_widths["default"]``, which
        therefore has to be present: the full default width for fixed pitch
        fonts, half of it (floored) otherwise.
        """
        space_width = character_widths.get(" ", 0)
        if space_width != 0:
            return space_width

        if (flags & FontFlags.FIXED_PITCH) == FontFlags.FIXED_PITCH:
            return character_widths["default"]

        return character_widths["default"] // 2

    @staticmethod
    def _parse_bbox(raw_bbox: Any) -> tuple[float, float, float, float] | None:
        """
        Convert a font bounding box into a tuple of four floats.

        Returns:
            The bounding box, or None (after logging a warning) if
            ``raw_bbox`` is not an iterable of exactly four numbers.
        """
        try:
            bbox = tuple(map(float, raw_bbox))
        except (TypeError, ValueError):
            bbox = ()
        if len(bbox) != 4:
            # Explicit check rather than `assert`, which is stripped under -O.
            logger_warning(f"Ignoring invalid font bounding box {raw_bbox}.", __name__)
            return None
        return cast("tuple[float, float, float, float]", bbox)

    @staticmethod
    def _parse_font_descriptor(font_descriptor_obj: DictionaryObject) -> dict[str, Any]:
        """
        Translate a /FontDescriptor dictionary into ``FontDescriptor`` keyword arguments.

        Only entries present in ``font_descriptor_obj`` are returned, so
        the ``FontDescriptor`` defaults apply to everything else. An
        invalid /FontBBox is dropped with a warning.

        Raises:
            PdfReadError: If more than one of /FontFile, /FontFile2 and
                /FontFile3 could be retrieved.
        """
        font_descriptor_kwargs: dict[Any, Any] = {}
        for source_key, target_key in [
            ("/FontName", "name"),
            ("/FontFamily", "family"),
            ("/FontWeight", "weight"),
            ("/Ascent", "ascent"),
            ("/Descent", "descent"),
            ("/CapHeight", "cap_height"),
            ("/XHeight", "x_height"),
            ("/ItalicAngle", "italic_angle"),
            ("/Flags", "flags"),
            ("/FontBBox", "bbox")
        ]:
            if source_key in font_descriptor_obj:
                font_descriptor_kwargs[target_key] = font_descriptor_obj[source_key]
        # Handle missing bbox gracefully - PDFs may have fonts without valid bounding boxes
        if "bbox" in font_descriptor_kwargs:
            bbox_tuple = Font._parse_bbox(font_descriptor_kwargs["bbox"])
            if bbox_tuple is None:
                del font_descriptor_kwargs["bbox"]
            else:
                font_descriptor_kwargs["bbox"] = bbox_tuple

        # Find the binary stream for this font if there is one
        for source_key in ["/FontFile", "/FontFile2", "/FontFile3"]:
            if source_key in font_descriptor_obj:
                if "font_file" in font_descriptor_kwargs:
                    raise PdfReadError(f"More than one /FontFile found in {font_descriptor_obj}")

                try:
                    font_file = font_descriptor_obj[source_key].get_object()
                    font_descriptor_kwargs["font_file"] = font_file
                except PdfReadError as e:
                    logger_warning(f"Failed to get {source_key!r} in {font_descriptor_obj}: {e}", __name__)
        return font_descriptor_kwargs

    @classmethod
    def from_font_resource(
        cls,
        pdf_font_dict: DictionaryObject,
    ) -> Font:
        """
        Create a ``Font`` from a font resource dictionary of a PDF.

        Type1, MMType1, TrueType and Type3 fonts get their widths from
        /Widths, or from the core font metrics if the font name is known
        there. Every other font is handled as a composite font whose widths
        come from the /W arrays of its /DescendantFonts.

        Args:
            pdf_font_dict: The font resource dictionary.

        Returns:
            The font, always with a "default" entry in its character widths.
        """
        from pypdf._codecs.core_font_metrics import CORE_FONT_METRICS  # noqa: PLC0415

        # Can collect base_font, name and encoding directly from font resource
        name = pdf_font_dict.get("/BaseFont", "Unknown").removeprefix("/")
        sub_type = pdf_font_dict.get("/Subtype", "Unknown").removeprefix("/")
        encoding, character_map = get_encoding(pdf_font_dict)
        font_descriptor = None
        character_widths: dict[str, int] = {}
        interpretable = True

        # Deal with fonts by type; Type1, TrueType and certain Type3
        if pdf_font_dict.get("/Subtype") in ("/Type1", "/MMType1", "/TrueType", "/Type3"):
            # Type3 fonts that do not specify a "/ToUnicode" mapping cannot be
            # reliably converted into character codes unless all named chars
            # in /CharProcs map to a standard adobe glyph. See §9.10.2 of the
            # PDF 1.7 standard.
            if sub_type == "Type3" and "/ToUnicode" not in pdf_font_dict:
                interpretable = all(
                    cname in adobe_glyphs
                    for cname in pdf_font_dict.get("/CharProcs") or []
                )
            if interpretable:  # Save some overhead if font is not interpretable
                if "/Widths" in pdf_font_dict:
                    cls._collect_tt_t1_character_widths(
                        pdf_font_dict, character_map, encoding, character_widths
                    )
                elif name in CORE_FONT_METRICS:
                    font_descriptor = CORE_FONT_METRICS[name].font_descriptor
                    # Copy: the widths are modified below ("default" entry) and
                    # the shared core font metrics must stay untouched.
                    character_widths = dict(CORE_FONT_METRICS[name].character_widths)
                if "/FontDescriptor" in pdf_font_dict:
                    font_descriptor_obj = pdf_font_dict.get("/FontDescriptor", DictionaryObject()).get_object()
                    if "/MissingWidth" in font_descriptor_obj:
                        character_widths["default"] = cast(int, font_descriptor_obj["/MissingWidth"].get_object())
                    font_descriptor = FontDescriptor(**cls._parse_font_descriptor(font_descriptor_obj))
                elif "/FontBBox" in pdf_font_dict:
                    # For Type3 without Font Descriptor but with FontBBox, see Table 110 in the PDF specification 2.0
                    bbox_tuple = cls._parse_bbox(pdf_font_dict["/FontBBox"])
                    if bbox_tuple is None:
                        font_descriptor = FontDescriptor(name=name)
                    else:
                        font_descriptor = FontDescriptor(name=name, bbox=bbox_tuple)

        else:
            # Composite font or CID font - CID fonts have a /W array mapping character codes
            # to widths stashed in /DescendantFonts. No need to test for /DescendantFonts though,
            # because all other fonts have already been dealt with.
            d_font: DictionaryObject
            for d_font_idx, d_font in enumerate(
                cast(ArrayObject, pdf_font_dict["/DescendantFonts"])
            ):
                d_font = cast(DictionaryObject, d_font.get_object())
                # Store the resolved object back into the array
                cast(ArrayObject, pdf_font_dict["/DescendantFonts"])[d_font_idx] = d_font
                cls._collect_cid_character_widths(
                    d_font, character_map, character_widths
                )
                if "/DW" in d_font:
                    character_widths["default"] = cast(int, d_font["/DW"].get_object())
                font_descriptor_obj = d_font.get("/FontDescriptor", DictionaryObject()).get_object()
                font_descriptor = FontDescriptor(**cls._parse_font_descriptor(font_descriptor_obj))

        if font_descriptor is None:
            font_descriptor = FontDescriptor(name=name)

        if character_widths.get("default", 0) == 0:
            cls._add_default_width(character_widths, font_descriptor.flags)

        space_width = cls._add_space_width(character_widths, font_descriptor.flags)

        return cls(
            name=name,
            sub_type=sub_type,
            encoding=encoding,
            font_descriptor=font_descriptor,
            character_map=character_map,
            character_widths=character_widths,
            space_width=space_width,
            interpretable=interpretable
        )

    @staticmethod
    def _font_flags_from_truetype_font_tables(
        header: table__h_e_a_d,
        postscript: table__p_o_s_t | None,
        os2: table_O_S_2f_2 | None
    ) -> int:
        """
        Derive the PDF font flags from TrueType font tables.

        Args:
            header: The "head" table.
            postscript: The "post" table, if the font has one.
            os2: The "OS/2" table, if the font has one.

        Returns:
            The combined ``FontFlags`` bits. Exactly one of SYMBOLIC and
            NONSYMBOLIC is always set.
        """
        # Get the font flags
        if os2:
            panose = os2.panose
            # sFamilyClass is a two-byte field. The high byte describes the family class, whereas the low
            # byte only describes the subclass. We only need the high byte, hence the bit shift below:
            family_class = os2.sFamilyClass >> 8
        flags: int = 0

        # ITALIC
        if header.macStyle & HEADER_MACSTYLE_ITALIC or (os2 and os2.fsSelection & OS2_FSSELECTION_ITALIC):
            flags |= FontFlags.ITALIC
        if postscript:
            italic_angle = postscript.italicAngle
            if italic_angle != 0.0:
                flags |= FontFlags.ITALIC

        # FIXED_PITCH
        if (
            (os2 and panose.bProportion == OS2_PANOSE_BPROPORTION_MONOSPACED) or
            (postscript and postscript.isFixedPitch > 0)  # Actually 1, but originally (older versions of the TTF
        ):                                                # specification) any non-zero value signified monospace.
            flags |= FontFlags.FIXED_PITCH

        # SCRIPT
        if os2 and (
            family_class == OS2_SFAMILYSCLASS_SCRIPTS or panose.bFamilyType == OS2_PANOSE_BFAMILYTYPE_SCRIPT
        ):
            flags |= FontFlags.SCRIPT

        # SERIF
        if os2 and (
            2 <= panose.bSerifStyle <= 10
            or 1 <= family_class <= 5 or family_class == 7  # 6 is reserved, all 8 and above are not serif
        ):
            flags |= FontFlags.SERIF

        # SYMBOLIC
        if os2 and (
            family_class == OS2_SFAMILYSCLASS_SYMBOLIC or
            panose.bFamilyType in {OS2_PANOSE_BFAMILYTYPE_DECORATIVE, OS2_PANOSE_BFAMILYTYPE_PICTORIAL}
        ):
            flags |= FontFlags.SYMBOLIC
        else:
            flags |= FontFlags.NONSYMBOLIC

        return flags

    @classmethod
    def from_truetype_font_file(cls, font_file: BytesIO) -> Font:
        """
        Create a ``Font`` from the binary data of a TrueType font file.

        All metrics are scaled from the font's units per em to the 1000
        units per em used by PDF. The keys of ``character_map`` and
        ``character_widths`` are glyph IDs, encoded as two big-endian bytes
        and decoded as UTF-16-BE.

        Args:
            font_file: The font file; its complete content is embedded into
                the font descriptor.

        Raises:
            ImportError: If fontTools is not installed.
            PdfReadError: If the font file has no cmap table.
        """
        if not HAS_FONTTOOLS:
            raise ImportError("The 'fontTools' library is required to use 'from_truetype_font_file'")
        with TTFont(font_file) as tt_font_object:
            # See Chapter 6 of the TrueType reference manual for the definition of the head, OS/2 and post tables:
            # https://developer.apple.com/fonts/TrueType-Reference-Manual/RM06/Chap6head.html
            # https://developer.apple.com/fonts/TrueType-Reference-Manual/RM06/Chap6OS2.html
            # https://developer.apple.com/fonts/TrueType-Reference-Manual/RM06/Chap6post.html
            header = tt_font_object["head"]
            horizontal_header = tt_font_object["hhea"]
            metrics = tt_font_object["hmtx"].metrics

            # Collect additional font tables to derive font information
            postscript = tt_font_object.get("post", None)
            os2 = tt_font_object.get("OS/2", None)

            # Get the scaling factor to convert font file's units per em to PDF's 1000 units per em
            units_per_em = header.unitsPerEm
            scale_factor = 1000.0 / units_per_em

            # Get the font descriptor
            font_descriptor_kwargs: dict[Any, Any] = {}
            names = tt_font_object.get("name", None)
            if names:
                font_descriptor_kwargs["name"] = names.getBestFullName()
                font_descriptor_kwargs["family"] = names.getBestFamilyName()
                font_descriptor_kwargs["weight"] = names.getBestSubFamilyName()
            font_descriptor_kwargs["ascent"] = int(round(horizontal_header.ascent * scale_factor, 0))
            font_descriptor_kwargs["descent"] = int(round(horizontal_header.descent * scale_factor, 0))
            if os2:
                try:
                    font_descriptor_kwargs["cap_height"] = int(round(os2.sCapHeight * scale_factor, 0))
                    font_descriptor_kwargs["x_height"] = int(round(os2.sxHeight * scale_factor, 0))
                except AttributeError:
                    # The table does not provide these values; keep the defaults.
                    pass

            font_descriptor_kwargs["flags"] = cls._font_flags_from_truetype_font_tables(header, postscript, os2)

            font_descriptor_kwargs["bbox"] = (
                round(header.xMin * scale_factor, 0),
                round(header.yMin * scale_factor, 0),
                round(header.xMax * scale_factor, 0),
                round(header.yMax * scale_factor, 0)
            )

            font_file_data = StreamObject()
            font_file_raw_bytes = font_file.getvalue()
            font_file_data.set_data(font_file_raw_bytes)
            font_file_data.update({NameObject("/Length1"): NumberObject(len(font_file_raw_bytes))})
            font_descriptor_kwargs["font_file"] = font_file_data

            font_descriptor = FontDescriptor(**font_descriptor_kwargs)
            encoding = "utf_16_be"  # Assume unicode

            character_widths: dict[str, int] = {}
            character_map: dict[str, str] = {}

            glyph_order = tt_font_object.getGlyphOrder()
            # Note that one glyph can be mapped to multiple unicode code points. However, buildReversedMin()
            # creates a dictionary mapping glyphs to the minimum Unicode codepoint.
            tt_font_cmap_table = tt_font_object.get("cmap")
            if tt_font_cmap_table:
                reverse_cmap = tt_font_cmap_table.buildReversedMin()
                for glyph in glyph_order:
                    char_code = reverse_cmap.get(glyph)
                    if char_code is None:
                        continue
                    char = chr(char_code)
                    gid = tt_font_object.getGlyphID(glyph)
                    # The following is to comply with how font_glyph_byte_map works in _appearance_stream.py
                    gid_bytes = gid.to_bytes(2, "big")
                    gid_key_string = gid_bytes.decode("utf-16-be", "surrogatepass")
                    character_map[gid_key_string] = char
                    character_widths[gid_key_string] = int(round(metrics[glyph][0] * scale_factor, 0))
            else:
                raise PdfReadError("Font file does not have a cmap table")

            # NOTE(review): both helpers look up the key " ", but the keys collected
            # above are glyph-ID strings, so " " refers to glyph ID 32 here rather
            # than to the space character - confirm that this is intended.
            cls._add_default_width(character_widths, font_descriptor_kwargs["flags"])
            space_width = cls._add_space_width(character_widths, font_descriptor_kwargs["flags"])

        return cls(
            name=font_descriptor.name,
            sub_type="TrueType",
            encoding=encoding,
            font_descriptor=font_descriptor,
            character_map=character_map,
            character_widths=character_widths,
            space_width=space_width,
            interpretable=True
        )

    def as_font_resource(self) -> DictionaryObject:
        """
        Return a font resource dictionary for this font.

        The result is always a /Type1 font with /WinAnsiEncoding that
        refers to the font by name only.
        """
        # For now, this returns a font resource that only works with the 14 Adobe Core fonts.
        return (
            DictionaryObject({
                NameObject("/Subtype"): NameObject("/Type1"),
                NameObject("/Name"): NameObject(f"/{self.name}"),
                NameObject("/Type"): NameObject("/Font"),
                NameObject("/BaseFont"): NameObject(f"/{self.name}"),
                NameObject("/Encoding"): NameObject("/WinAnsiEncoding")
            })
        )

    def text_width(self, text: str = "") -> float:
        """
        Sum of character widths specified in PDF font for the supplied text.

        Characters without a known width count with the "default" width.
        An empty text has a width of 0.0.
        """
        return sum(
            (self.character_widths.get(char, self.character_widths["default"]) for char in text), 0.0
        )

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_font.py: 27%

255 statements