Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_font.py

1from collections.abc import Sequence

2from dataclasses import dataclass, field

3from typing import Any, Union, cast

5from pypdf.generic import ArrayObject, DictionaryObject, NameObject

7from ._cmap import get_encoding

8from ._codecs.adobe_glyphs import adobe_glyphs

9from ._utils import logger_warning

10from .constants import FontFlags

@dataclass(frozen=True)
class FontDescriptor:
    """
    Immutable view of a PDF FontDescriptor dictionary.

    Holds the descriptive entries (name, family, weight) together with the
    metric entries (ascent, descent, heights, italic angle, flags and
    bounding box) of a font.

    Every field has a default. The defaults are derived from the mean values
    of the 14 core fonts, rounded to 100.
    """

    # Descriptive entries
    name: str = "Unknown"
    family: str = "Unknown"
    weight: str = "Unknown"

    # Vertical metrics
    ascent: float = 700.0
    descent: float = -200.0
    cap_height: float = 600.0
    x_height: float = 500.0

    # Slant and classification
    italic_angle: float = 0.0  # Non-italic
    flags: int = 32  # Non-serif, non-symbolic, not fixed width

    # Font bounding box, stored as a 4-tuple of floats
    bbox: tuple[float, float, float, float] = field(
        default_factory=lambda: (-100.0, -200.0, 1000.0, 900.0)
    )

@dataclass(frozen=True)
class CoreFontMetrics:
    """Bundle of a font descriptor and the per-character widths that go with it."""

    # Descriptive and metric information of the font
    font_descriptor: FontDescriptor
    # Mapping of character to glyph width
    character_widths: dict[str, int]

@dataclass
class Font:
    """
    A font object for use during text extraction and for producing
    text appearance streams.

    Attributes:
        name: Font name, derived from font["/BaseFont"]
        character_map: The font's character map
        encoding: Font encoding
        sub_type: The font type, such as Type1, TrueType, or Type3.
        font_descriptor: Descriptive and metric information about the font
        character_widths: A mapping of characters to widths
        space_width: The width of a space, or an approximation
        interpretable: Default True. If False, the font glyphs cannot
            be translated to characters, e.g. Type3 fonts that do not define
            a '/ToUnicode' mapping.

    """

    name: str
    encoding: Union[str, dict[int, str]]
    character_map: dict[Any, Any] = field(default_factory=dict)
    sub_type: str = "Unknown"
    font_descriptor: FontDescriptor = field(default_factory=FontDescriptor)
    character_widths: dict[str, int] = field(default_factory=lambda: {"default": 500})
    space_width: Union[float, int] = 250
    interpretable: bool = True

    @staticmethod
    def _collect_tt_t1_character_widths(
        pdf_font_dict: DictionaryObject,
        char_map: dict[Any, Any],
        encoding: Union[str, dict[int, str]],
        current_widths: dict[str, int]
    ) -> None:
        """
        Parse a TrueType or Type1 font's /Widths array and update character widths.

        Args:
            pdf_font_dict: Font dictionary; must contain "/Widths". "/FirstChar"
                is read with a fallback of 0.
            char_map: Mapping from raw (decoded) characters to Unicode characters.
            encoding: Either a codec name, or a dict mapping character codes
                to characters.
            current_widths: Mapping of character to width; updated in place.

        """
        widths_array = cast(ArrayObject, pdf_font_dict["/Widths"])
        first_char = pdf_font_dict.get("/FirstChar", 0)
        if not isinstance(encoding, str):
            # This means that encoding is a dict
            current_widths.update({
                encoding.get(idx + first_char, chr(idx + first_char)): width
                for idx, width in enumerate(widths_array)
            })
            return

        # We map the character code directly to the character
        # using the string encoding
        for idx, width in enumerate(widths_array):
            # Often "idx == 0" will denote the .notdef character, but we add it anyway
            char_code = idx + first_char  # This is a raw code
            if char_code < 0:
                # bytes([...]) rejects negative values; later codes may still be valid.
                continue
            if char_code > 255:
                # bytes([...]) only accepts 0..255. Codes only grow from here on,
                # so nothing that follows can be represented either.
                logger_warning(
                    f"Width entry for character code {char_code} exceeds the "
                    "single-byte range. Ignoring remaining widths.",
                    __name__
                )
                break
            # Get the "raw" character or byte representation
            raw_char = bytes([char_code]).decode(encoding, "surrogatepass")
            # Translate raw_char to the REAL Unicode character using the char_map;
            # fall back to the raw character if there is no (non-empty) mapping.
            current_widths[char_map.get(raw_char) or raw_char] = int(width)

    @staticmethod
    def _collect_cid_character_widths(
        d_font: DictionaryObject, char_map: dict[Any, Any], current_widths: dict[str, int]
    ) -> None:
        """
        Parse the /W array from a DescendantFont dictionary and update character widths.

        Args:
            d_font: Descendant font dictionary; "/W" is read with a fallback
                of an empty list.
            char_map: Character map. Only single-character string keys are
                used; their code point is matched against the indices in /W.
            current_widths: Mapping of character to width; updated in place.

        """
        # ord() only accepts exactly one character, so other keys are skipped
        # instead of raising TypeError.
        ord_map = {
            ord(_target): _surrogate
            for _target, _surrogate in char_map.items()
            if isinstance(_target, str) and len(_target) == 1
        }
        # /W width definitions have two valid formats which can be mixed and matched:
        #   (1) A character start index followed by a list of widths, e.g.
        #       `45 [500 600 700]` applies widths 500, 600, 700 to characters 45-47.
        #   (2) A character start index, a character stop index, and a width, e.g.
        #       `45 65 500` applies width 500 to characters 45-65.
        skip_count = 0
        _w = d_font.get("/W", [])
        entry_count = len(_w)
        for idx, w_entry in enumerate(_w):
            w_entry = w_entry.get_object()
            if skip_count:
                skip_count -= 1
                continue
            if not isinstance(w_entry, (int, float)):
                # We should never get here due to skip_count above. But
                # sometimes we do.
                logger_warning(f"Expected numeric value for width, got {w_entry}. Ignoring it.", __name__)
                continue

            # Look ahead with bounds checks, so that a truncated array ends up
            # in the warning branch below instead of raising IndexError.
            w_next_entry = _w[idx + 1].get_object() if idx + 1 < entry_count else None
            w_third_entry = None
            if isinstance(w_next_entry, (int, float)) and idx + 2 < entry_count:
                w_third_entry = _w[idx + 2].get_object()

            # check for format (1): `int [int int int int ...]`
            if isinstance(w_next_entry, Sequence):
                # int() because range()/enumerate() reject float start values.
                start_idx = int(w_entry)
                current_widths.update(
                    {
                        ord_map[_cidx]: _width
                        for _cidx, _width in enumerate(w_next_entry, start=start_idx)
                        if _cidx in ord_map
                    }
                )
                skip_count = 1
            # check for format (2): `int int int`
            elif isinstance(w_next_entry, (int, float)) and isinstance(w_third_entry, (int, float)):
                start_idx, stop_idx = int(w_entry), int(w_next_entry)
                current_widths.update(
                    {
                        ord_map[_cidx]: w_third_entry
                        for _cidx in range(start_idx, stop_idx + 1)
                        if _cidx in ord_map
                    }
                )
                skip_count = 2
            else:
                # This handles the case of out of bounds (reaching the end of the width definitions
                # while expecting more elements).
                logger_warning(
                    f"Invalid font width definition. Last element: {w_entry}.",
                    __name__
                )

    @staticmethod
    def _add_default_width(current_widths: dict[str, int], flags: int) -> None:
        """
        Set current_widths["default"] based on the widths that are already known.

        Args:
            current_widths: Mapping of character to width; updated in place.
            flags: Font flags; only the fixed-pitch bit is inspected.

        """
        if not current_widths:
            current_widths["default"] = 500
            return

        if " " in current_widths and current_widths[" "] != 0:
            # Setting default to once or twice the space width, depending on fixed pitch
            if (flags & FontFlags.FIXED_PITCH) == FontFlags.FIXED_PITCH:
                current_widths["default"] = current_widths[" "]
                return

            current_widths["default"] = int(2 * current_widths[" "])
            return

        # Use the average width of existing glyph widths
        valid_widths = [w for w in current_widths.values() if w > 0]
        current_widths["default"] = sum(valid_widths) // len(valid_widths) if valid_widths else 500

    @staticmethod
    def _parse_font_descriptor(font_descriptor_obj: DictionaryObject) -> dict[str, Any]:
        """
        Translate the entries of a PDF font descriptor into FontDescriptor keyword arguments.

        Only keys present in font_descriptor_obj are returned. A bounding box,
        if present, is converted to a tuple of floats.

        Raises:
            AssertionError: If the bounding box does not have exactly four entries.

        """
        font_descriptor_kwargs: dict[Any, Any] = {}
        for source_key, target_key in [
            ("/FontName", "name"),
            ("/FontFamily", "family"),
            ("/FontWeight", "weight"),
            ("/Ascent", "ascent"),
            ("/Descent", "descent"),
            ("/CapHeight", "cap_height"),
            ("/XHeight", "x_height"),
            ("/ItalicAngle", "italic_angle"),
            ("/Flags", "flags"),
            ("/FontBBox", "bbox")
        ]:
            if source_key in font_descriptor_obj:
                font_descriptor_kwargs[target_key] = font_descriptor_obj[source_key]
        # Handle missing bbox gracefully - PDFs may have fonts without valid bounding boxes
        if "bbox" in font_descriptor_kwargs:
            bbox_tuple = tuple(map(float, font_descriptor_kwargs["bbox"]))
            assert len(bbox_tuple) == 4, bbox_tuple
            font_descriptor_kwargs["bbox"] = bbox_tuple
        return font_descriptor_kwargs

    @classmethod
    def from_font_resource(
        cls,
        pdf_font_dict: DictionaryObject,
    ) -> "Font":
        """
        Build a Font from a PDF font resource dictionary.

        Name, sub type, encoding and character map are read from the font
        dictionary itself. Widths and the font descriptor are collected
        depending on the font's /Subtype: Type1, MMType1, TrueType and Type3
        fonts are read from the dictionary directly (or from the core font
        metrics table), everything else is read from "/DescendantFonts".
        Finally, a default width and a space width are derived if missing.

        Args:
            pdf_font_dict: The font resource dictionary.

        Returns:
            The populated Font.

        """
        from pypdf._codecs.core_font_metrics import CORE_FONT_METRICS  # noqa: PLC0415

        # Can collect base_font, name and encoding directly from font resource
        name = pdf_font_dict.get("/BaseFont", "Unknown").removeprefix("/")
        sub_type = pdf_font_dict.get("/Subtype", "Unknown").removeprefix("/")
        encoding, character_map = get_encoding(pdf_font_dict)
        font_descriptor = None
        character_widths: dict[str, int] = {}
        interpretable = True

        # Deal with fonts by type; Type1, TrueType and certain Type3
        if pdf_font_dict.get("/Subtype") in ("/Type1", "/MMType1", "/TrueType", "/Type3"):
            # Type3 fonts that do not specify a "/ToUnicode" mapping cannot be
            # reliably converted into character codes unless all named chars
            # in /CharProcs map to a standard adobe glyph. See §9.10.2 of the
            # PDF 1.7 standard.
            if sub_type == "Type3" and "/ToUnicode" not in pdf_font_dict:
                interpretable = all(
                    cname in adobe_glyphs
                    for cname in pdf_font_dict.get("/CharProcs") or []
                )
            # NOTE(review): the nesting below was reconstructed from a source
            # dump without indentation - confirm that the descriptor handling
            # belongs inside this `if interpretable` block.
            if interpretable:  # Save some overhead if font is not interpretable
                if "/Widths" in pdf_font_dict:
                    cls._collect_tt_t1_character_widths(
                        pdf_font_dict, character_map, encoding, character_widths
                    )
                elif name in CORE_FONT_METRICS:
                    font_descriptor = CORE_FONT_METRICS[name].font_descriptor
                    # Copy: the mapping is written to below ("default"), which
                    # must not leak into the shared core font metrics table.
                    character_widths = dict(CORE_FONT_METRICS[name].character_widths)
                if "/FontDescriptor" in pdf_font_dict:
                    font_descriptor_obj = pdf_font_dict.get("/FontDescriptor", DictionaryObject()).get_object()
                    if "/MissingWidth" in font_descriptor_obj:
                        character_widths["default"] = cast(int, font_descriptor_obj["/MissingWidth"].get_object())
                    font_descriptor = FontDescriptor(**cls._parse_font_descriptor(font_descriptor_obj))
                elif "/FontBBox" in pdf_font_dict:
                    # For Type3 without Font Descriptor but with FontBBox, see Table 110 in the PDF specification 2.0
                    bbox_tuple = tuple(map(float, cast(ArrayObject, pdf_font_dict["/FontBBox"])))
                    assert len(bbox_tuple) == 4, bbox_tuple
                    font_descriptor = FontDescriptor(name=name, bbox=bbox_tuple)

        else:
            # Composite font or CID font - CID fonts have a /W array mapping character codes
            # to widths stashed in /DescendantFonts. No need to test for /DescendantFonts though,
            # because all other fonts have already been dealt with.
            d_font: DictionaryObject
            for d_font_idx, d_font in enumerate(
                cast(ArrayObject, pdf_font_dict["/DescendantFonts"])
            ):
                d_font = cast(DictionaryObject, d_font.get_object())
                # Store the resolved object back into the array.
                cast(ArrayObject, pdf_font_dict["/DescendantFonts"])[d_font_idx] = d_font
                cls._collect_cid_character_widths(
                    d_font, character_map, character_widths
                )
                if "/DW" in d_font:
                    character_widths["default"] = cast(int, d_font["/DW"].get_object())
                font_descriptor_obj = d_font.get("/FontDescriptor", DictionaryObject()).get_object()
                font_descriptor = FontDescriptor(**cls._parse_font_descriptor(font_descriptor_obj))

        if font_descriptor is None:
            font_descriptor = FontDescriptor(name=name)

        # A missing or zero default width is replaced by a derived one.
        if character_widths.get("default", 0) == 0:
            cls._add_default_width(character_widths, font_descriptor.flags)
        space_width = character_widths.get(" ", 0)
        if space_width == 0:
            # Approximate the space by the default width (fixed pitch) or half of it.
            if (font_descriptor.flags & FontFlags.FIXED_PITCH) == FontFlags.FIXED_PITCH:
                space_width = character_widths["default"]
            else:
                space_width = character_widths["default"] // 2

        return cls(
            name=name,
            sub_type=sub_type,
            encoding=encoding,
            font_descriptor=font_descriptor,
            character_map=character_map,
            character_widths=character_widths,
            space_width=space_width,
            interpretable=interpretable
        )

    def as_font_resource(self) -> DictionaryObject:
        """Return a Type1 font resource dictionary with WinAnsiEncoding for this font's name."""
        # For now, this returns a font resource that only works with the 14 Adobe Core fonts.
        return (
            DictionaryObject({
                NameObject("/Subtype"): NameObject("/Type1"),
                NameObject("/Name"): NameObject(f"/{self.name}"),
                NameObject("/Type"): NameObject("/Font"),
                NameObject("/BaseFont"): NameObject(f"/{self.name}"),
                NameObject("/Encoding"): NameObject("/WinAnsiEncoding")
            })
        )

    def text_width(self, text: str = "") -> float:
        """
        Sum of character widths specified in PDF font for the supplied text.

        Characters without an entry in character_widths count with the
        "default" width. Empty text yields 0.0.
        """
        if not text:
            return 0.0
        widths = self.character_widths
        default_width = widths["default"]
        return sum((widths.get(char, default_width) for char in text), 0.0)

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_font.py: 32%

144 statements