Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_font.py: 30%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

141 statements  

1from collections.abc import Sequence 

2from dataclasses import dataclass, field 

3from typing import Any, Optional, Union, cast 

4 

5from pypdf.generic import ArrayObject, DictionaryObject, IndirectObject 

6 

7from ._cmap import get_encoding 

8from ._codecs.adobe_glyphs import adobe_glyphs 

9from ._utils import logger_warning 

10 

11 

@dataclass(frozen=True)
class FontDescriptor:
    """
    Represents the FontDescriptor dictionary as defined in the PDF specification.
    This contains both descriptive and metric information.

    The defaults are derived from the mean values of the 14 core fonts, rounded
    to 100.

    Attributes:
        name: Value of /FontName.
        family: Value of /FontFamily.
        weight: Value of /FontWeight.
        ascent: Value of /Ascent.
        descent: Value of /Descent.
        cap_height: Value of /CapHeight.
        x_height: Value of /XHeight.
        italic_angle: Value of /ItalicAngle.
        flags: Value of /Flags.
        bbox: Value of /FontBBox as a tuple of four floats.
        character_widths: Mapping of characters to widths. The key
            ``"default"`` holds the fallback width.

    """

    name: str = "Unknown"
    family: str = "Unknown"
    weight: str = "Unknown"

    ascent: float = 700.0
    descent: float = -200.0
    cap_height: float = 600.0
    x_height: float = 500.0
    italic_angle: float = 0.0  # Non-italic
    flags: int = 32  # Non-serif, non-symbolic, not fixed width
    bbox: tuple[float, float, float, float] = field(default_factory=lambda: (-100.0, -200.0, 1000.0, 900.0))

    character_widths: dict[str, int] = field(default_factory=lambda: {"default": 500})

    @staticmethod
    def _parse_font_descriptor(font_kwargs: dict[str, Any], font_descriptor_obj: DictionaryObject) -> dict[str, Any]:
        """
        Copy the known entries of a PDF font descriptor into ``font_kwargs``.

        Args:
            font_kwargs: Keyword arguments collected for the ``FontDescriptor``
                constructor. Updated in place.
            font_descriptor_obj: The font descriptor dictionary, or an indirect
                reference to it.

        Returns:
            The same ``font_kwargs`` dictionary. ``"bbox"`` is only present if
            a usable /FontBBox has been found, otherwise the dataclass default
            applies.

        """
        # get_object() resolves indirect references and is a no-op for direct objects.
        font_descriptor_dict = cast("DictionaryObject", font_descriptor_obj.get_object())
        for source_key, target_key in (
            ("/FontName", "name"),
            ("/FontFamily", "family"),
            ("/FontWeight", "weight"),
            ("/Ascent", "ascent"),
            ("/Descent", "descent"),
            ("/CapHeight", "cap_height"),
            ("/XHeight", "x_height"),
            ("/ItalicAngle", "italic_angle"),
            ("/Flags", "flags"),
            ("/FontBBox", "bbox"),
        ):
            if source_key in font_descriptor_dict:
                font_kwargs[target_key] = font_descriptor_dict[source_key]

        # /FontBBox is a required key, but callers fall back to an empty
        # descriptor when the font has none at all, so it may be missing here.
        if "bbox" not in font_kwargs:
            return font_kwargs

        raw_bbox = font_kwargs["bbox"]
        try:
            bbox_tuple = tuple(map(float, raw_bbox))
        except (TypeError, ValueError):
            bbox_tuple = ()
        if len(bbox_tuple) == 4:
            font_kwargs["bbox"] = bbox_tuple
        else:
            # Drop the entry so that the dataclass default is used instead.
            logger_warning(f"Invalid /FontBBox {raw_bbox!r}. Using the default instead.", __name__)
            del font_kwargs["bbox"]
        return font_kwargs

    @staticmethod
    def _collect_tt_t1_character_widths(
        pdf_font_dict: DictionaryObject,
        char_map: dict[Any, Any],
        encoding: Union[str, dict[int, str]],
        current_widths: dict[str, int]
    ) -> None:
        """
        Parses a TrueType or Type1 font's /Widths array from a font dictionary and updates character widths.

        Args:
            pdf_font_dict: The font dictionary. Must contain /Widths.
            char_map: Maps the raw decoded characters to the Unicode characters.
            encoding: Either a codec name or a mapping of character codes to characters.
            current_widths: The mapping of characters to widths. Updated in place.

        """
        widths_array = cast("ArrayObject", pdf_font_dict["/Widths"])
        first_char = pdf_font_dict.get("/FirstChar", 0)
        if not isinstance(encoding, str):
            # This means that encoding is a dict
            current_widths.update({
                encoding.get(idx + first_char, chr(idx + first_char)): width
                for idx, width in enumerate(widths_array)
            })
            return

        # We map the character code directly to the character
        # using the string encoding
        for idx, width in enumerate(widths_array):
            # Often "idx == 0" will denote the .notdef character, but we add it anyway
            char_code = idx + first_char  # This is a raw code
            if not 0 <= char_code <= 255:
                # Cannot be expressed as a single byte, thus bytes([char_code]) would raise.
                continue
            # Get the "raw" character or byte representation
            raw_char = bytes([char_code]).decode(encoding, "surrogatepass")
            # Translate raw_char to the REAL Unicode character using the char_map
            unicode_char = char_map.get(raw_char)
            if unicode_char:
                current_widths[unicode_char] = int(width)
            else:
                current_widths[raw_char] = int(width)

    @staticmethod
    def _collect_cid_character_widths(
        d_font: DictionaryObject, char_map: dict[Any, Any], current_widths: dict[str, int]
    ) -> None:
        """
        Parses the /W array from a DescendantFont dictionary and updates character widths.

        Args:
            d_font: The descendant font dictionary.
            char_map: Maps characters to the characters the widths are stored under.
            current_widths: The mapping of characters to widths. Updated in place.

        """
        # Non-string keys and keys with more than one character do not have
        # a single code point, thus ord() cannot be applied to them.
        ord_map = {
            ord(_target): _surrogate
            for _target, _surrogate in char_map.items()
            if isinstance(_target, str) and len(_target) == 1
        }
        # /W width definitions have two valid formats which can be mixed and matched:
        #   (1) A character start index followed by a list of widths, e.g.
        #       `45 [500 600 700]` applies widths 500, 600, 700 to characters 45-47.
        #   (2) A character start index, a character stop index, and a width, e.g.
        #       `45 65 500` applies width 500 to characters 45-65.
        skip_count = 0
        _w = d_font.get("/W", [])
        entry_count = len(_w)
        for idx, w_entry in enumerate(_w):
            if skip_count:
                # Already consumed as part of the previous definition.
                skip_count -= 1
                continue
            w_entry = w_entry.get_object()
            if not isinstance(w_entry, (int, float)):
                # We should never get here due to skip_count above. But
                # sometimes we do.
                logger_warning(f"Expected numeric value for width, got {w_entry}. Ignoring it.", __name__)
                continue

            # Look ahead without running past the end of a truncated array.
            w_next_entry = _w[idx + 1].get_object() if idx + 1 < entry_count else None
            # check for format (1): `int [int int int int ...]`
            if isinstance(w_next_entry, Sequence):
                start_idx = int(w_entry)
                for offset, _width in enumerate(w_next_entry):
                    _cidx = start_idx + offset
                    if _cidx in ord_map:
                        current_widths[ord_map[_cidx]] = _width
                skip_count = 1
                continue

            w_third_entry = _w[idx + 2].get_object() if idx + 2 < entry_count else None
            # check for format (2): `int int int`
            if isinstance(w_next_entry, (int, float)) and isinstance(w_third_entry, (int, float)):
                for _cidx in range(int(w_entry), int(w_next_entry) + 1):
                    if _cidx in ord_map:
                        current_widths[ord_map[_cidx]] = w_third_entry
                skip_count = 2
            else:
                # This handles the case of out of bounds (reaching the end of the width definitions
                # while expecting more elements) as well as unexpected element types.
                logger_warning(
                    f"Invalid font width definition. Last element: {w_entry}.",
                    __name__
                )

    @staticmethod
    def _add_default_width(current_widths: dict[str, int]) -> None:
        """
        Ensure that ``current_widths`` has a ``"default"`` entry.

        An existing entry is kept. Otherwise, the default is 500 for an empty
        mapping, twice the width of a non-zero space width, or the average
        of all positive widths (500 if there are none).

        Args:
            current_widths: The mapping of characters to widths. Updated in place.

        """
        if not current_widths:
            current_widths["default"] = 500
            return

        if "default" in current_widths:
            return

        if " " in current_widths and current_widths[" "] != 0:
            # Setting default to twice the space width
            current_widths["default"] = int(2 * current_widths[" "])
            return

        # Use the average width of existing glyph widths
        valid_widths = [w for w in current_widths.values() if w > 0]
        current_widths["default"] = sum(valid_widths) // len(valid_widths) if valid_widths else 500

    @classmethod
    def from_font_resource(
        cls,
        pdf_font_dict: DictionaryObject,
        encoding: Optional[Union[str, dict[int, str]]] = None,
        char_map: Optional[dict[Any, Any]] = None
    ) -> "FontDescriptor":
        """
        Create a font descriptor from a PDF font dictionary.

        Args:
            pdf_font_dict: The font dictionary.
            encoding: The font encoding. Determined using ``get_encoding``
                if this or ``char_map`` is missing or empty.
            char_map: The character map. Determined using ``get_encoding``
                if this or ``encoding`` is missing or empty.

        Returns:
            The font descriptor. For simple fonts without /Widths whose name
            is part of ``CORE_FONT_METRICS``, the shared entry from there is
            returned. The character widths always have a ``"default"`` entry.

        """
        from pypdf._codecs.core_fontmetrics import CORE_FONT_METRICS  # noqa: PLC0415

        # Prioritize information from the PDF font dictionary
        font_name = pdf_font_dict.get("/BaseFont", "Unknown").removeprefix("/")
        character_widths: dict[str, int] = {}
        font_kwargs: dict[str, Any] = {"character_widths": character_widths}

        # Deal with fonts by type; Type1, TrueType and certain Type3
        if pdf_font_dict.get("/Subtype") in ("/Type1", "/MMType1", "/TrueType", "/Type3"):
            if "/Widths" in pdf_font_dict:
                if not (encoding and char_map):
                    encoding, char_map = get_encoding(pdf_font_dict)
                cls._collect_tt_t1_character_widths(
                    pdf_font_dict, char_map, encoding, character_widths
                )
            elif font_name in CORE_FONT_METRICS:
                font_descriptor = CORE_FONT_METRICS[font_name]
                # This updates the widths of the shared core font entry in place.
                cls._add_default_width(font_descriptor.character_widths)

                return font_descriptor

            if "/FontDescriptor" in pdf_font_dict:  # TODO: This does not account for some Type3 fonts;
                                                    # see tests/test_cmap.py::test_ascii_charset
                # Resolve the descriptor once and reuse it below.
                font_descriptor_obj = cast("DictionaryObject", pdf_font_dict["/FontDescriptor"].get_object())
                if "/MissingWidth" in font_descriptor_obj:
                    character_widths["default"] = font_descriptor_obj["/MissingWidth"].get_object()
                font_kwargs = cls._parse_font_descriptor(font_kwargs, font_descriptor_obj)
            # Does nothing if a default width is already there.
            cls._add_default_width(font_kwargs["character_widths"])

            return cls(**font_kwargs)

        # Composite font or CID font - CID fonts have a /W array mapping character codes
        # to widths stashed in /DescendantFonts. All other fonts have already been dealt
        # with, but broken files might omit /DescendantFonts nevertheless.
        if not (encoding and char_map):
            encoding, char_map = get_encoding(pdf_font_dict)
        descendant_fonts = (
            cast("ArrayObject", pdf_font_dict["/DescendantFonts"])
            if "/DescendantFonts" in pdf_font_dict
            else []
        )
        for d_font_idx, d_font_ref in enumerate(descendant_fonts):
            d_font = cast("DictionaryObject", d_font_ref.get_object())
            # Store the resolved dictionary back into the array.
            descendant_fonts[d_font_idx] = d_font
            cls._collect_cid_character_widths(
                d_font, char_map, font_kwargs["character_widths"]
            )
            if "/DW" in d_font:
                font_kwargs["character_widths"]["default"] = d_font["/DW"].get_object()
            else:
                cls._add_default_width(font_kwargs["character_widths"])
            font_kwargs = cls._parse_font_descriptor(
                font_kwargs, d_font.get("/FontDescriptor", DictionaryObject())
            )
        # Without any descendant font, no default width has been set inside the loop.
        cls._add_default_width(font_kwargs["character_widths"])

        return cls(**font_kwargs)

249 

250 

@dataclass
class Font:
    """
    A font object for use during text extraction and for producing
    text appearance streams.

    Attributes:
        name: Font name, derived from font["/BaseFont"]
        character_map: The font's character map
        encoding: Font encoding
        sub_type: The font type, such as Type1, TrueType, or Type3.
        font_descriptor: Font metrics, including a mapping of characters to widths
        character_widths: A mapping of characters to widths
        space_width: The width of a space, or an approximation
        interpretable: Default True. If False, the font glyphs cannot
            be translated to characters, e.g. Type3 fonts that do not define
            a '/ToUnicode' mapping.

    """

    name: str
    encoding: Union[str, dict[int, str]]
    character_map: dict[Any, Any] = field(default_factory=dict)
    sub_type: str = "Unknown"
    font_descriptor: FontDescriptor = field(default_factory=FontDescriptor)
    character_widths: dict[str, int] = field(default_factory=dict)
    space_width: Union[float, int] = 250
    interpretable: bool = True

    @classmethod
    def from_font_resource(
        cls,
        pdf_font_dict: DictionaryObject,
    ) -> "Font":
        """
        Create a font from a PDF font dictionary.

        Args:
            pdf_font_dict: The font dictionary.

        Returns:
            The font with its encoding, character map and metrics.

        """
        # Can collect base_font, name and encoding directly from font resource
        name = pdf_font_dict.get("/BaseFont", "Unknown").removeprefix("/")
        sub_type = pdf_font_dict.get("/Subtype", "Unknown").removeprefix("/")
        encoding, character_map = get_encoding(pdf_font_dict)

        # Type3 fonts that do not specify a "/ToUnicode" mapping cannot be
        # reliably converted into character codes unless all named chars
        # in /CharProcs map to a standard adobe glyph. See §9.10.2 of the
        # PDF 1.7 standard.
        interpretable = True
        if sub_type == "Type3" and "/ToUnicode" not in pdf_font_dict:
            interpretable = all(
                cname in adobe_glyphs
                for cname in pdf_font_dict.get("/CharProcs") or []
            )

        if interpretable:
            font_descriptor = FontDescriptor.from_font_resource(pdf_font_dict, encoding, character_map)
        else:
            font_descriptor = FontDescriptor()  # Save some overhead if font is not interpretable
        character_widths = font_descriptor.character_widths

        space_width = character_widths.get(" ")
        if not space_width:
            # Missing or zero: approximate with half of the default width.
            # 500 is the default width of an empty FontDescriptor.
            space_width = character_widths.get("default", 500) // 2

        return cls(
            name=name,
            sub_type=sub_type,
            encoding=encoding,
            font_descriptor=font_descriptor,
            character_map=character_map,
            character_widths=character_widths,
            space_width=space_width,
            interpretable=interpretable
        )

    def text_width(self, text: str = "") -> float:
        """
        Sum of character widths specified in PDF font for the supplied text.

        Characters without a width of their own use the ``"default"`` entry of
        the character widths. If there is none, the default width of the font
        descriptor is used, or 500 as a last resort.

        Args:
            text: The text to measure.

        Returns:
            The total width as a float, 0.0 for empty text.

        """
        widths = self.character_widths
        # Determine the fallback once. A font created without character widths
        # has no "default" entry, which must not fail for measurable text.
        default_width = widths.get("default")
        if default_width is None:
            default_width = self.font_descriptor.character_widths.get("default", 500)
        return sum((widths.get(char, default_width) for char in text), 0.0)

325 [self.character_widths.get(char, self.character_widths["default"]) for char in text], 0.0 

326 )