Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_font.py: 30%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

142 statements  

1from collections.abc import Sequence 

2from dataclasses import dataclass, field 

3from typing import Any, Optional, Union, cast 

4 

5from pypdf.generic import ArrayObject, DictionaryObject, IndirectObject 

6 

7from ._cmap import get_encoding 

8from ._codecs.adobe_glyphs import adobe_glyphs 

9from ._utils import logger_warning 

10 

11 

@dataclass(frozen=True)
class FontDescriptor:
    """
    Represents the FontDescriptor dictionary as defined in the PDF specification.
    This contains both descriptive and metric information.

    The defaults are derived from the mean values of the 14 core fonts, rounded
    to 100.

    Attributes:
        name: Taken from /FontName when a font descriptor provides it.
        family: Taken from /FontFamily when a font descriptor provides it.
        weight: Taken from /FontWeight when a font descriptor provides it.
        ascent: Taken from /Ascent.
        descent: Taken from /Descent.
        cap_height: Taken from /CapHeight.
        x_height: Taken from /XHeight.
        italic_angle: Taken from /ItalicAngle.
        flags: Taken from /Flags.
        bbox: Taken from /FontBBox, converted to a tuple of four floats.
        character_widths: Mapping of characters to widths. The special key
            ``"default"`` holds the width used for characters without an entry.

    """

    name: str = "Unknown"
    family: str = "Unknown"
    weight: str = "Unknown"

    ascent: float = 700.0
    descent: float = -200.0
    cap_height: float = 600.0
    x_height: float = 500.0
    italic_angle: float = 0.0  # Non-italic
    flags: int = 32  # Non-serif, non-symbolic, not fixed width
    bbox: tuple[float, float, float, float] = field(default_factory=lambda: (-100.0, -200.0, 1000.0, 900.0))

    character_widths: dict[str, int] = field(default_factory=lambda: {"default": 500})

    @staticmethod
    def _parse_font_descriptor(font_kwargs: dict[str, Any], font_descriptor_obj: DictionaryObject) -> dict[str, Any]:
        """
        Copy the supported entries of a /FontDescriptor dictionary into ``font_kwargs``.

        Args:
            font_kwargs: Constructor keyword arguments collected so far; updated in place.
            font_descriptor_obj: The /FontDescriptor dictionary, or an indirect
                reference to it (resolved here).

        Returns:
            The same ``font_kwargs`` dictionary that was passed in.

        """
        font_descriptor_dict: DictionaryObject = (
            font_descriptor_obj.get_object()
            if isinstance(font_descriptor_obj, IndirectObject)
            else font_descriptor_obj
        )
        for source_key, target_key in [
            ("/FontName", "name"),
            ("/FontFamily", "family"),
            ("/FontWeight", "weight"),
            ("/Ascent", "ascent"),
            ("/Descent", "descent"),
            ("/CapHeight", "cap_height"),
            ("/XHeight", "x_height"),
            ("/ItalicAngle", "italic_angle"),
            ("/Flags", "flags"),
            ("/FontBBox", "bbox"),
        ]:
            if source_key in font_descriptor_dict:
                font_kwargs[target_key] = font_descriptor_dict[source_key]

        # A bounding box is optional. When one is present it has to consist of
        # exactly four numbers; anything else comes from a malformed PDF and is
        # dropped so that the dataclass default applies. This is validated
        # explicitly because an `assert` would vanish under `python -O`.
        if "bbox" in font_kwargs:
            raw_bbox = font_kwargs["bbox"]
            try:
                bbox_tuple = tuple(map(float, raw_bbox))
            except (TypeError, ValueError):
                bbox_tuple = ()
            if len(bbox_tuple) == 4:
                font_kwargs["bbox"] = bbox_tuple
            else:
                logger_warning(
                    f"Invalid font bounding box {raw_bbox!r}. Using the default bounding box.",
                    __name__
                )
                del font_kwargs["bbox"]
        return font_kwargs

    @staticmethod
    def _collect_tt_t1_character_widths(
        pdf_font_dict: DictionaryObject,
        char_map: dict[Any, Any],
        encoding: Union[str, dict[int, str]],
        current_widths: dict[str, int]
    ) -> None:
        """
        Parse a TrueType or Type1 font's /Widths array and update ``current_widths``.

        Args:
            pdf_font_dict: Font dictionary; must contain /Widths. /FirstChar is
                the code of the first entry and defaults to 0.
            char_map: Mapping used to translate a decoded raw character to the
                character that is used as key in ``current_widths``.
            encoding: Either a codec name used to decode single-byte character
                codes, or a mapping of character codes to characters.
            current_widths: Mapping of characters to widths; updated in place.

        """
        widths_array = cast(ArrayObject, pdf_font_dict["/Widths"])
        first_char = pdf_font_dict.get("/FirstChar", 0)
        if not isinstance(encoding, str):
            # The encoding is a dict from character code to character. Codes
            # without an entry fall back to chr(code). Widths are stored as
            # found in the array (not coerced to int, unlike below).
            current_widths.update({
                encoding.get(idx + first_char, chr(idx + first_char)): width
                for idx, width in enumerate(widths_array)
            })
            return

        # We map the character code directly to the character
        # using the string encoding
        for idx, width in enumerate(widths_array):
            # Often "idx == 0" will denote the .notdef character, but we add it anyway
            char_code = idx + first_char  # This is a raw code
            if not 0 <= char_code <= 0xFF:
                # bytes([code]) only accepts values in range(256). A /Widths
                # array running past that limit cannot describe a single-byte
                # code, so the entry is skipped instead of raising ValueError.
                continue
            # Get the "raw" character or byte representation
            raw_char = bytes([char_code]).decode(encoding, "surrogatepass")
            # Translate raw_char to the REAL Unicode character using the
            # char_map; keep the raw character if there is no (truthy) mapping.
            unicode_char = char_map.get(raw_char)
            current_widths[unicode_char or raw_char] = int(width)

    @staticmethod
    def _collect_cid_character_widths(
        d_font: DictionaryObject, char_map: dict[Any, Any], current_widths: dict[str, int]
    ) -> None:
        """
        Parse the /W array from a DescendantFont dictionary and update ``current_widths``.

        Malformed or truncated width definitions are reported through
        ``logger_warning`` and skipped.

        Args:
            d_font: Descendant font dictionary; a missing /W is treated as empty.
            char_map: Character map of the font. Only keys that are single
                character strings take part in the lookup.
            current_widths: Mapping of characters to widths; updated in place.

        """
        # ord() is only defined for strings of length one. Other keys (non-string
        # keys, empty or multi-character strings) can never match a single
        # character index and are left out.
        ord_map = {
            ord(_target): _surrogate
            for _target, _surrogate in char_map.items()
            if isinstance(_target, str) and len(_target) == 1
        }
        # /W width definitions have two valid formats which can be mixed and matched:
        #   (1) A character start index followed by a list of widths, e.g.
        #       `45 [500 600 700]` applies widths 500, 600, 700 to characters 45-47.
        #   (2) A character start index, a character stop index, and a width, e.g.
        #       `45 65 500` applies width 500 to characters 45-65.
        skip_count = 0
        _w = d_font.get("/W", [])
        entry_count = len(_w)
        for idx, w_entry in enumerate(_w):
            w_entry = w_entry.get_object()
            if skip_count:
                # This entry was already consumed as part of the previous definition.
                skip_count -= 1
                continue
            if not isinstance(w_entry, (int, float)):
                # We should never get here due to skip_count above. But
                # sometimes we do.
                logger_warning(f"Expected numeric value for width, got {w_entry}. Ignoring it.", __name__)
                continue

            # Look ahead without running off the end of a truncated array;
            # None stands for "no such entry" and matches neither format.
            w_next_entry = _w[idx + 1].get_object() if idx + 1 < entry_count else None

            # check for format (1): `int [int int int int ...]`
            if isinstance(w_next_entry, Sequence):
                # int() because range/enumerate need a real integer start, and
                # numbers in a PDF may be written as reals.
                current_widths.update(
                    {
                        ord_map[_cidx]: _width
                        for _cidx, _width in enumerate(w_next_entry, int(w_entry))
                        if _cidx in ord_map
                    }
                )
                skip_count = 1
                continue

            w_last_entry = _w[idx + 2].get_object() if idx + 2 < entry_count else None

            # check for format (2): `int int int`
            if isinstance(w_next_entry, (int, float)) and isinstance(w_last_entry, (int, float)):
                const_width = w_last_entry
                current_widths.update(
                    {
                        ord_map[_cidx]: const_width
                        for _cidx in range(int(w_entry), int(w_next_entry) + 1)
                        if _cidx in ord_map
                    }
                )
                skip_count = 2
            else:
                # This handles the case of out of bounds (reaching the end of the width definitions
                # while expecting more elements) as well as unexpected element types.
                logger_warning(
                    f"Invalid font width definition. Last element: {w_entry}.",
                    __name__
                )

    @staticmethod
    def _add_default_width(current_widths: dict[str, int]) -> None:
        """
        Ensure that ``current_widths`` has a ``"default"`` entry.

        An existing entry is kept. Otherwise the default is, in this order:
        500 for an empty mapping, twice the width of the space character if
        that is known and non-zero, the integer mean of all positive widths,
        or 500 if there is no positive width.

        Args:
            current_widths: Mapping of characters to widths; updated in place.

        """
        if not current_widths:
            current_widths["default"] = 500
            return

        if "default" in current_widths:
            return

        space_width = current_widths.get(" ", 0)
        if space_width != 0:
            # Setting default to twice the space width
            current_widths["default"] = int(2 * space_width)
            return

        # Use the average width of existing glyph widths
        valid_widths = [w for w in current_widths.values() if w > 0]
        current_widths["default"] = sum(valid_widths) // len(valid_widths) if valid_widths else 500

    @classmethod
    def from_font_resource(
        cls,
        pdf_font_dict: DictionaryObject,
        encoding: Optional[Union[str, dict[int, str]]] = None,
        char_map: Optional[dict[Any, Any]] = None
    ) -> "FontDescriptor":
        """
        Build a font descriptor from a font resource dictionary.

        Fonts with /Subtype /Type1, /MMType1, /TrueType or /Type3 take their
        widths from /Widths. Without /Widths, a font whose /BaseFont name is in
        ``CORE_FONT_METRICS`` gets that entry returned directly. Every other
        font is treated as a composite font, and widths are read from the /W
        arrays of its /DescendantFonts.

        Args:
            pdf_font_dict: The font resource dictionary.
            encoding: Encoding of the font. Determined with ``get_encoding``
                unless both ``encoding`` and ``char_map`` are given (and truthy).
            char_map: Character map of the font; see ``encoding``.

        Returns:
            The font descriptor; its ``character_widths`` include a ``"default"`` entry
            (for composite fonts: provided /DescendantFonts is not empty).

        """
        from pypdf._codecs.core_fontmetrics import CORE_FONT_METRICS  # noqa: PLC0415
        # Prioritize information from the PDF font dictionary
        font_name = pdf_font_dict.get("/BaseFont", "Unknown").removeprefix("/")
        font_kwargs: dict[str, Any] = {"character_widths": {}}
        character_widths = font_kwargs["character_widths"]

        # Deal with fonts by type; Type1, TrueType and certain Type3
        if pdf_font_dict.get("/Subtype") in ("/Type1", "/MMType1", "/TrueType", "/Type3"):
            if "/Widths" in pdf_font_dict:
                if not (encoding and char_map):
                    encoding, char_map = get_encoding(pdf_font_dict)
                cls._collect_tt_t1_character_widths(
                    pdf_font_dict, char_map, encoding, character_widths
                )
            elif font_name in CORE_FONT_METRICS:
                # NOTE: this is the shared entry of CORE_FONT_METRICS itself,
                # not a copy; its widths mapping gets a default added in place.
                font_descriptor = CORE_FONT_METRICS[font_name]
                cls._add_default_width(font_descriptor.character_widths)

                return font_descriptor

            if "/FontDescriptor" in pdf_font_dict:  # TODO: This does not account for some Type3 fonts;
                # see tests/test_cmap.py::test_ascii_charset
                font_descriptor_resource = pdf_font_dict.get("/FontDescriptor", DictionaryObject()).get_object()
                font_descriptor_obj = cast(DictionaryObject, font_descriptor_resource)
                if "/MissingWidth" in font_descriptor_obj:
                    character_widths["default"] = font_descriptor_obj["/MissingWidth"].get_object()
                font_kwargs = cls._parse_font_descriptor(font_kwargs, font_descriptor_obj)
            if "default" not in font_kwargs["character_widths"]:
                cls._add_default_width(font_kwargs["character_widths"])

            return cls(**font_kwargs)

        # Composite font or CID font - CID fonts have a /W array mapping character codes
        # to widths stashed in /DescendantFonts. No need to test for /DescendantFonts though,
        # because all other fonts have already been dealt with.
        if not (encoding and char_map):
            encoding, char_map = get_encoding(pdf_font_dict)
        descendant_fonts = cast(ArrayObject, pdf_font_dict["/DescendantFonts"])
        d_font: DictionaryObject
        for d_font_idx, d_font in enumerate(descendant_fonts):
            d_font = cast(DictionaryObject, d_font.get_object())
            # Store the resolved dictionary back into the array.
            cast(ArrayObject, pdf_font_dict["/DescendantFonts"])[d_font_idx] = d_font
            cls._collect_cid_character_widths(
                d_font, char_map, font_kwargs["character_widths"]
            )
            if "/DW" in d_font:
                font_kwargs["character_widths"]["default"] = d_font["/DW"].get_object()
            else:
                cls._add_default_width(font_kwargs["character_widths"])
            font_kwargs = cls._parse_font_descriptor(
                font_kwargs, d_font.get("/FontDescriptor", DictionaryObject())
            )

        return cls(**font_kwargs)

250 

251 

@dataclass
class Font:
    """
    A font object for use during text extraction and for producing
    text appearance streams.

    Attributes:
        name: Font name, derived from font["/BaseFont"]
        character_map: The font's character map
        encoding: Font encoding
        sub_type: The font type, such as Type1, TrueType, or Type3.
        font_descriptor: Font metrics, including a mapping of characters to widths
        character_widths: A mapping of characters to widths
        space_width: The width of a space, or an approximation
        interpretable: Default True. If False, the font glyphs cannot
            be translated to characters, e.g. Type3 fonts that do not define
            a '/ToUnicode' mapping.

    """

    name: str
    encoding: Union[str, dict[int, str]]
    character_map: dict[Any, Any] = field(default_factory=dict)
    sub_type: str = "Unknown"
    font_descriptor: FontDescriptor = field(default_factory=FontDescriptor)
    character_widths: dict[str, int] = field(default_factory=dict)
    space_width: Union[float, int] = 250
    interpretable: bool = True

    @classmethod
    def from_font_resource(
        cls,
        pdf_font_dict: DictionaryObject,
    ) -> "Font":
        """
        Create a font from a font resource dictionary.

        Args:
            pdf_font_dict: The font resource dictionary.

        Returns:
            The font. Its ``character_widths`` is the very mapping held by its
            ``font_descriptor`` (the same object, not a copy).

        """
        # Can collect base_font, name and encoding directly from font resource
        name = pdf_font_dict.get("/BaseFont", "Unknown").removeprefix("/")
        sub_type = pdf_font_dict.get("/Subtype", "Unknown").removeprefix("/")
        encoding, character_map = get_encoding(pdf_font_dict)

        # Type3 fonts that do not specify a "/ToUnicode" mapping cannot be
        # reliably converted into character codes unless all named chars
        # in /CharProcs map to a standard adobe glyph. See §9.10.2 of the
        # PDF 1.7 standard.
        interpretable = True
        if sub_type == "Type3" and "/ToUnicode" not in pdf_font_dict:
            interpretable = all(
                cname in adobe_glyphs
                for cname in pdf_font_dict.get("/CharProcs") or []
            )

        if interpretable:
            font_descriptor = FontDescriptor.from_font_resource(pdf_font_dict, encoding, character_map)
        else:
            font_descriptor = FontDescriptor()  # Save some overhead if font is not interpretable
        character_widths = font_descriptor.character_widths

        # A missing or zero space width is approximated by half the default width.
        space_width = character_widths.get(" ")
        if not space_width:
            space_width = character_widths["default"] // 2

        return cls(
            name=name,
            sub_type=sub_type,
            encoding=encoding,
            font_descriptor=font_descriptor,
            character_map=character_map,
            character_widths=character_widths,
            space_width=space_width,
            interpretable=interpretable
        )

    def text_width(self, text: str = "") -> float:
        """
        Sum of character widths specified in PDF font for the supplied text.

        Characters without an entry in ``character_widths`` count with the
        ``"default"`` width. If ``character_widths`` has no ``"default"`` entry
        (it is empty unless supplied to the constructor), the default width of
        ``font_descriptor`` is used instead, and 500 as a last resort.

        Args:
            text: The text to measure.

        Returns:
            The total width as float; 0.0 for empty text.

        """
        if not text:
            return 0.0
        widths = self.character_widths
        # Determine the fallback once instead of once per character, and
        # without raising KeyError when there is no "default" entry.
        default_width = widths.get("default")
        if default_width is None:
            default_width = self.font_descriptor.character_widths.get("default", 500)
        return sum((widths.get(char, default_width) for char in text), 0.0)