Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_font.py: 32%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

144 statements  

1from collections.abc import Sequence 

2from dataclasses import dataclass, field 

3from typing import Any, Union, cast 

4 

5from pypdf.generic import ArrayObject, DictionaryObject, NameObject 

6 

7from ._cmap import get_encoding 

8from ._codecs.adobe_glyphs import adobe_glyphs 

9from ._utils import logger_warning 

10from .constants import FontFlags 

11 

12 

@dataclass(frozen=True)
class FontDescriptor:
    """
    Represents the FontDescriptor dictionary as defined in the PDF specification.
    This contains both descriptive and metric information.

    The defaults are derived from the mean values of the 14 core fonts, rounded
    to 100.

    Instances are frozen: fields cannot be reassigned after construction.
    """

    # Descriptive information
    name: str = "Unknown"
    family: str = "Unknown"
    weight: str = "Unknown"

    # Metric information
    ascent: float = 700.0
    descent: float = -200.0
    cap_height: float = 600.0
    x_height: float = 500.0
    italic_angle: float = 0.0  # Non-italic
    flags: int = 32  # Non-serif, non-symbolic, not fixed width
    # A tuple is immutable, so it is safe as a plain default; no default_factory is needed.
    bbox: tuple[float, float, float, float] = (-100.0, -200.0, 1000.0, 900.0)

34 

35 

@dataclass(frozen=True)
class CoreFontMetrics:
    """
    Immutable pairing of a font descriptor with a table of character widths.

    Attributes:
        font_descriptor: The descriptor holding the font's descriptive and
            metric information.
        character_widths: A mapping of characters to widths.

    """

    font_descriptor: FontDescriptor
    character_widths: dict[str, int]

40 

41 

@dataclass
class Font:
    """
    A font object for use during text extraction and for producing
    text appearance streams.

    Attributes:
        name: Font name, derived from font["/BaseFont"]
        encoding: Font encoding, either a codec name or a mapping of
            character codes to characters
        character_map: The font's character map
        sub_type: The font type, such as Type1, TrueType, or Type3.
        font_descriptor: The font's descriptor (descriptive and metric
            information such as ascent, descent, flags and bounding box)
        character_widths: A mapping of characters to widths. The special key
            "default" holds the width used for characters without an entry.
        space_width: The width of a space, or an approximation
        interpretable: Default True. If False, the font glyphs cannot
            be translated to characters, e.g. Type3 fonts that do not define
            a '/ToUnicode' mapping.

    """

    name: str
    encoding: Union[str, dict[int, str]]
    character_map: dict[Any, Any] = field(default_factory=dict)
    sub_type: str = "Unknown"
    font_descriptor: FontDescriptor = field(default_factory=FontDescriptor)
    character_widths: dict[str, int] = field(default_factory=lambda: {"default": 500})
    space_width: Union[float, int] = 250
    interpretable: bool = True

    @staticmethod
    def _collect_tt_t1_character_widths(
        pdf_font_dict: DictionaryObject,
        char_map: dict[Any, Any],
        encoding: Union[str, dict[int, str]],
        current_widths: dict[str, int]
    ) -> None:
        """
        Parse a TrueType or Type1 font's /Widths array and update character widths.

        Args:
            pdf_font_dict: The font dictionary; must contain "/Widths".
                "/FirstChar" is read if present and defaults to 0.
            char_map: Mapping used to translate a raw decoded character
                into the character that is used as the width key.
            encoding: Either a codec name used to decode each character code,
                or a dict mapping character codes to characters.
            current_widths: Mapping of characters to widths; updated in place.

        """
        widths_array = cast(ArrayObject, pdf_font_dict["/Widths"])
        first_char = pdf_font_dict.get("/FirstChar", 0)
        if not isinstance(encoding, str):
            # This means that encoding is a dict: look the code up directly and
            # fall back to the code point itself for codes without an entry.
            current_widths.update({
                encoding.get(idx + first_char, chr(idx + first_char)): width
                for idx, width in enumerate(widths_array)
            })
            return

        # We map the character code directly to the character
        # using the string encoding
        for idx, width in enumerate(widths_array):
            # Often "idx == 0" will denote the .notdef character, but we add it anyway
            char_code = idx + first_char  # This is a raw code
            # Get the "raw" character or byte representation
            raw_char = bytes([char_code]).decode(encoding, "surrogatepass")
            # Translate raw_char to the REAL Unicode character using the char_map
            unicode_char = char_map.get(raw_char)
            if unicode_char:
                current_widths[unicode_char] = int(width)
            else:
                current_widths[raw_char] = int(width)

    @staticmethod
    def _collect_cid_character_widths(
        d_font: DictionaryObject, char_map: dict[Any, Any], current_widths: dict[str, int]
    ) -> None:
        """
        Parse the /W array from a DescendantFont dictionary and update character widths.

        Malformed definitions (non-numeric start entries, or an array that ends
        while more elements are expected) are reported with a warning and
        skipped instead of raising.

        Args:
            d_font: The descendant font dictionary; "/W" is read if present.
            char_map: Mapping of characters to their replacement characters.
                Only single-character string keys are considered.
            current_widths: Mapping of characters to widths; updated in place.

        """
        # ord() only accepts exactly one character, so any other string key
        # (and every non-string key) is left out of the lookup table.
        ord_map = {
            ord(_target): _surrogate
            for _target, _surrogate in char_map.items()
            if isinstance(_target, str) and len(_target) == 1
        }
        # /W width definitions have two valid formats which can be mixed and matched:
        #   (1) A character start index followed by a list of widths, e.g.
        #       `45 [500 600 700]` applies widths 500, 600, 700 to characters 45-47.
        #   (2) A character start index, a character stop index, and a width, e.g.
        #       `45 65 500` applies width 500 to characters 45-65.
        skip_count = 0
        _w = d_font.get("/W", [])
        for idx, w_entry in enumerate(_w):
            w_entry = w_entry.get_object()
            if skip_count:
                # This entry was already consumed as part of a previous definition.
                skip_count -= 1
                continue
            if not isinstance(w_entry, (int, float)):
                # We should never get here due to skip_count above. But
                # sometimes we do.
                logger_warning(f"Expected numeric value for width, got {w_entry}. Ignoring it.", __name__)
                continue

            # Look ahead one element. A truncated array has nothing after the
            # start index; treat that like any other invalid definition.
            try:
                w_next_entry = _w[idx + 1].get_object()
            except IndexError:
                w_next_entry = None

            # check for format (1): `int [int int int int ...]`
            if isinstance(w_next_entry, Sequence):
                start_idx, width_list = w_entry, w_next_entry
                current_widths.update(
                    {
                        ord_map[_cidx]: _width
                        for _cidx, _width in zip(
                            range(
                                cast(int, start_idx),
                                cast(int, start_idx) + len(width_list),
                                1,
                            ),
                            width_list,
                        )
                        if _cidx in ord_map
                    }
                )
                skip_count = 1
                continue

            # check for format (2): `int int int`
            const_width = None
            if isinstance(w_next_entry, (int, float)):
                try:
                    const_width = _w[idx + 2].get_object()
                except IndexError:
                    const_width = None
            if isinstance(const_width, (int, float)):
                start_idx, stop_idx = w_entry, w_next_entry
                current_widths.update(
                    {
                        ord_map[_cidx]: const_width
                        for _cidx in range(
                            cast(int, start_idx), cast(int, stop_idx + 1), 1
                        )
                        if _cidx in ord_map
                    }
                )
                skip_count = 2
            else:
                # This handles the case of out of bounds (reaching the end of the width definitions
                # while expecting more elements) as well as unexpected element types.
                logger_warning(
                    f"Invalid font width definition. Last element: {w_entry}.",
                    __name__
                )

    @staticmethod
    def _add_default_width(current_widths: dict[str, int], flags: int) -> None:
        """
        Store a fallback width under the "default" key of ``current_widths``.

        The value is 500 if no widths are known; otherwise it is derived from
        a non-zero space width (once for fixed pitch fonts, twice otherwise);
        otherwise it is the integer mean of all positive widths, or 500 if
        there are none.

        Args:
            current_widths: Mapping of characters to widths; updated in place.
            flags: Font flags, tested against ``FontFlags.FIXED_PITCH``.

        """
        if not current_widths:
            current_widths["default"] = 500
            return

        if " " in current_widths and current_widths[" "] != 0:
            # Setting default to once or twice the space width, depending on fixed pitch
            if (flags & FontFlags.FIXED_PITCH) == FontFlags.FIXED_PITCH:
                current_widths["default"] = current_widths[" "]
                return

            current_widths["default"] = int(2 * current_widths[" "])
            return

        # Use the average width of existing glyph widths
        valid_widths = [w for w in current_widths.values() if w > 0]
        current_widths["default"] = sum(valid_widths) // len(valid_widths) if valid_widths else 500

    @staticmethod
    def _parse_font_descriptor(font_descriptor_obj: DictionaryObject) -> dict[str, Any]:
        """
        Translate a PDF font descriptor dictionary into ``FontDescriptor`` keyword arguments.

        Only keys present in ``font_descriptor_obj`` are copied, so missing
        entries fall back to the ``FontDescriptor`` defaults. A present
        "/FontBBox" is converted to a tuple of four floats.

        Args:
            font_descriptor_obj: The font descriptor dictionary.

        Returns:
            A dict mapping ``FontDescriptor`` field names to values.

        """
        font_descriptor_kwargs: dict[Any, Any] = {}
        for source_key, target_key in [
            ("/FontName", "name"),
            ("/FontFamily", "family"),
            ("/FontWeight", "weight"),
            ("/Ascent", "ascent"),
            ("/Descent", "descent"),
            ("/CapHeight", "cap_height"),
            ("/XHeight", "x_height"),
            ("/ItalicAngle", "italic_angle"),
            ("/Flags", "flags"),
            ("/FontBBox", "bbox")
        ]:
            if source_key in font_descriptor_obj:
                font_descriptor_kwargs[target_key] = font_descriptor_obj[source_key]
        # Handle missing bbox gracefully - PDFs may have fonts without valid bounding boxes
        if "bbox" in font_descriptor_kwargs:
            bbox_tuple = tuple(map(float, font_descriptor_kwargs["bbox"]))
            assert len(bbox_tuple) == 4, bbox_tuple
            font_descriptor_kwargs["bbox"] = bbox_tuple
        return font_descriptor_kwargs

    @classmethod
    def from_font_resource(
        cls,
        pdf_font_dict: DictionaryObject,
    ) -> "Font":
        """
        Build a ``Font`` from a PDF font resource dictionary.

        Args:
            pdf_font_dict: The font dictionary. "/BaseFont", "/Subtype",
                "/Widths", "/FontDescriptor", "/FontBBox", "/CharProcs",
                "/ToUnicode" and "/DescendantFonts" are consulted.

        Returns:
            The font, with character widths, a default width, a space width
            and a font descriptor filled in.

        """
        from pypdf._codecs.core_font_metrics import CORE_FONT_METRICS  # noqa: PLC0415

        # Can collect base_font, name and encoding directly from font resource
        name = pdf_font_dict.get("/BaseFont", "Unknown").removeprefix("/")
        sub_type = pdf_font_dict.get("/Subtype", "Unknown").removeprefix("/")
        encoding, character_map = get_encoding(pdf_font_dict)
        font_descriptor = None
        character_widths: dict[str, int] = {}
        interpretable = True

        # Deal with fonts by type; Type1, TrueType and certain Type3
        if pdf_font_dict.get("/Subtype") in ("/Type1", "/MMType1", "/TrueType", "/Type3"):
            # Type3 fonts that do not specify a "/ToUnicode" mapping cannot be
            # reliably converted into character codes unless all named chars
            # in /CharProcs map to a standard adobe glyph. See §9.10.2 of the
            # PDF 1.7 standard.
            if sub_type == "Type3" and "/ToUnicode" not in pdf_font_dict:
                interpretable = all(
                    cname in adobe_glyphs
                    for cname in pdf_font_dict.get("/CharProcs") or []
                )
            if interpretable:  # Save some overhead if font is not interpretable
                if "/Widths" in pdf_font_dict:
                    cls._collect_tt_t1_character_widths(
                        pdf_font_dict, character_map, encoding, character_widths
                    )
                elif name in CORE_FONT_METRICS:
                    font_descriptor = CORE_FONT_METRICS[name].font_descriptor
                    # Copy: "default" may be written below, and that must not
                    # leak into the shared core font metrics table.
                    character_widths = dict(CORE_FONT_METRICS[name].character_widths)
                if "/FontDescriptor" in pdf_font_dict:
                    font_descriptor_obj = pdf_font_dict.get("/FontDescriptor", DictionaryObject()).get_object()
                    if "/MissingWidth" in font_descriptor_obj:
                        character_widths["default"] = cast(int, font_descriptor_obj["/MissingWidth"].get_object())
                    font_descriptor = FontDescriptor(**cls._parse_font_descriptor(font_descriptor_obj))
                elif "/FontBBox" in pdf_font_dict:
                    # For Type3 without Font Descriptor but with FontBBox, see Table 110 in the PDF specification 2.0
                    bbox_tuple = tuple(map(float, cast(ArrayObject, pdf_font_dict["/FontBBox"])))
                    assert len(bbox_tuple) == 4, bbox_tuple
                    font_descriptor = FontDescriptor(name=name, bbox=bbox_tuple)

        else:
            # Composite font or CID font - CID fonts have a /W array mapping character codes
            # to widths stashed in /DescendantFonts. No need to test for /DescendantFonts though,
            # because all other fonts have already been dealt with.
            d_font: DictionaryObject
            for d_font_idx, d_font in enumerate(
                cast(ArrayObject, pdf_font_dict["/DescendantFonts"])
            ):
                d_font = cast(DictionaryObject, d_font.get_object())
                # Store the resolved dictionary back into the array.
                cast(ArrayObject, pdf_font_dict["/DescendantFonts"])[d_font_idx] = d_font
                cls._collect_cid_character_widths(
                    d_font, character_map, character_widths
                )
                if "/DW" in d_font:
                    character_widths["default"] = cast(int, d_font["/DW"].get_object())
                font_descriptor_obj = d_font.get("/FontDescriptor", DictionaryObject()).get_object()
                font_descriptor = FontDescriptor(**cls._parse_font_descriptor(font_descriptor_obj))

        if not font_descriptor:
            font_descriptor = FontDescriptor(name=name)

        # A missing or zero default width is replaced by a derived one.
        if character_widths.get("default", 0) == 0:
            cls._add_default_width(character_widths, font_descriptor.flags)
        space_width = character_widths.get(" ", 0)
        if space_width == 0:
            # Approximate the space from the default width: equal to it for
            # fixed pitch fonts, half of it otherwise.
            if (font_descriptor.flags & FontFlags.FIXED_PITCH) == FontFlags.FIXED_PITCH:
                space_width = character_widths["default"]
            else:
                space_width = character_widths["default"] // 2

        return cls(
            name=name,
            sub_type=sub_type,
            encoding=encoding,
            font_descriptor=font_descriptor,
            character_map=character_map,
            character_widths=character_widths,
            space_width=space_width,
            interpretable=interpretable
        )

    def as_font_resource(self) -> DictionaryObject:
        """Return a Type1 font resource dictionary with WinAnsiEncoding named after this font."""
        # For now, this returns a font resource that only works with the 14 Adobe Core fonts.
        return (
            DictionaryObject({
                NameObject("/Subtype"): NameObject("/Type1"),
                NameObject("/Name"): NameObject(f"/{self.name}"),
                NameObject("/Type"): NameObject("/Font"),
                NameObject("/BaseFont"): NameObject(f"/{self.name}"),
                NameObject("/Encoding"): NameObject("/WinAnsiEncoding")
            })
        )

    def text_width(self, text: str = "") -> float:
        """
        Sum of character widths specified in PDF font for the supplied text.

        Characters without an entry in ``character_widths`` use the "default"
        width. An empty text yields 0.0.
        """
        return sum(
            (self.character_widths.get(char, self.character_widths["default"]) for char in text), 0.0
        )

321 )