Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_font.py: 33%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

75 statements  

1from collections.abc import Sequence 

2from dataclasses import dataclass, field 

3from typing import Any, Optional, Union, cast 

4 

5from pypdf.generic import ArrayObject, DictionaryObject, IndirectObject 

6 

7from .errors import ParseError 

8 

9 

@dataclass(frozen=True)
class FontDescriptor:
    """
    Represents the FontDescriptor dictionary as defined in the PDF specification.
    This contains both descriptive and metric information.

    The defaults are derived from the mean values of the 14 core fonts, rounded
    to 100.
    """

    # Descriptive entries, filled from /FontName, /FontFamily and /FontWeight.
    name: str = "Unknown"
    family: str = "Unknown"
    weight: str = "Unknown"

    # Metric entries, filled from /Ascent, /Descent, /CapHeight, /XHeight,
    # /ItalicAngle, /Flags and /FontBBox.
    ascent: float = 700.0
    descent: float = -200.0
    cap_height: float = 600.0
    x_height: float = 500.0
    italic_angle: float = 0.0  # Non-italic
    flags: int = 32  # Non-serif, non-symbolic, not fixed width
    bbox: tuple[float, float, float, float] = field(default_factory=lambda: (-100.0, -200.0, 1000.0, 900.0))

    # Maps a character to its width. The optional key "default" is the
    # fallback used by text_width() for characters without an entry.
    character_widths: dict[str, int] = field(default_factory=dict)

    @staticmethod
    def _parse_font_descriptor(font_kwargs: dict[str, Any], font_descriptor_obj: DictionaryObject) -> dict[str, Any]:
        """
        Copy the known entries of a font descriptor dictionary into ``font_kwargs``.

        Args:
            font_kwargs: Constructor keyword arguments collected so far; updated in place.
            font_descriptor_obj: The font descriptor dictionary or an indirect reference to it.

        Returns:
            The same ``font_kwargs`` dictionary. When a bounding box is present it is
            normalized to a tuple of four floats.

        """
        font_descriptor_dict: DictionaryObject = (
            font_descriptor_obj.get_object()
            if isinstance(font_descriptor_obj, IndirectObject)
            else font_descriptor_obj
        )
        for source_key, target_key in [
            ("/FontName", "name"),
            ("/FontFamily", "family"),
            ("/FontWeight", "weight"),
            ("/Ascent", "ascent"),
            ("/Descent", "descent"),
            ("/CapHeight", "cap_height"),
            ("/XHeight", "x_height"),
            ("/ItalicAngle", "italic_angle"),
            ("/Flags", "flags"),
            ("/FontBBox", "bbox")
        ]:
            if source_key in font_descriptor_dict:
                font_kwargs[target_key] = font_descriptor_dict[source_key]
        # from_font_resource() passes an empty dictionary when a font has no
        # /FontDescriptor, and a descriptor may lack /FontBBox. In both cases
        # leave "bbox" unset so that the dataclass default applies instead of
        # failing with a KeyError.
        if "bbox" in font_kwargs:
            bbox_tuple = tuple(map(float, font_kwargs["bbox"]))
            assert len(bbox_tuple) == 4, bbox_tuple
            font_kwargs["bbox"] = bbox_tuple
        return font_kwargs

    @staticmethod
    def _collect_cid_character_widths(
        d_font: DictionaryObject, char_map: dict[Any, Any], current_widths: dict[str, int]
    ) -> None:
        """
        Parses the /W array from a DescendantFont dictionary and updates character widths.

        Args:
            d_font: The descendant font dictionary; a missing /W entry is treated as empty.
            char_map: Mapping whose single-character string keys are converted to character
                codes with ``ord()``; the mapped values are used as keys of ``current_widths``.
                All other keys are ignored.
            current_widths: Width mapping to update in place.

        Raises:
            ParseError: If an entry is followed by something that matches neither /W format.
            IndexError: If the array ends while more elements are expected.

        """
        # ord() only accepts single characters, so longer string keys cannot
        # be matched against a character code and are skipped.
        ord_map = {
            ord(_target): _surrogate
            for _target, _surrogate in char_map.items()
            if isinstance(_target, str) and len(_target) == 1
        }
        # Codes outside this span can never match an entry of ord_map.
        lowest_code = min(ord_map, default=0)
        highest_code = max(ord_map, default=-1)
        # /W width definitions have two valid formats which can be mixed and matched:
        #   (1) A character start index followed by a list of widths, e.g.
        #       `45 [500 600 700]` applies widths 500, 600, 700 to characters 45-47.
        #   (2) A character start index, a character stop index, and a width, e.g.
        #       `45 65 500` applies width 500 to characters 45-65.
        skip_count = 0
        _w = d_font.get("/W", [])
        for idx, raw_entry in enumerate(_w):
            w_entry = raw_entry.get_object()
            if skip_count:
                # Element was already consumed as part of the previous definition.
                skip_count -= 1
                continue
            if not isinstance(w_entry, (int, float)):  # pragma: no cover
                # We should never get here due to skip_count above. Add a
                # warning and or use reader's "strict" to force an ex???
                continue
            # Note: running past the end of the array while expecting more
            # elements raises an IndexError, which is sufficient.
            w_next_entry = _w[idx + 1].get_object()
            # check for format (1): `int [int int int int ...]`
            if isinstance(w_next_entry, Sequence):
                # The start index is accepted as int or float above; range-like
                # counting needs a real int.
                for _cidx, _width in enumerate(w_next_entry, int(w_entry)):
                    if _cidx in ord_map:
                        current_widths[ord_map[_cidx]] = _width
                skip_count = 1
                continue
            # check for format (2): `int int int`
            if isinstance(w_next_entry, (int, float)):
                const_width = _w[idx + 2].get_object()
                if isinstance(const_width, (int, float)):
                    # The bounds come from the file; clamp them to the known
                    # codes so that an absurdly large range cannot stall parsing.
                    first_code = max(int(w_entry), lowest_code)
                    last_code = min(int(w_next_entry), highest_code)
                    for _cidx in range(first_code, last_code + 1):
                        if _cidx in ord_map:
                            current_widths[ord_map[_cidx]] = const_width
                    skip_count = 2
                    continue
            raise ParseError(
                f"Invalid font width definition. Next elements: {w_entry}, {w_next_entry}, {_w[idx + 2]}"
            )  # pragma: no cover

    @classmethod
    def from_font_resource(
        cls,
        pdf_font_dict: DictionaryObject,
        encoding: Optional[Union[str, dict[int, str]]] = None,
        char_map: Optional[dict[Any, Any]] = None
    ) -> "FontDescriptor":
        """
        Build a font descriptor from a PDF font resource dictionary.

        Args:
            pdf_font_dict: The font dictionary.
            encoding: Encoding of the font. Looked up with ``get_encoding`` unless both
                ``encoding`` and ``char_map`` are provided and non-empty.
            char_map: Character map of the font; see ``encoding``.

        Returns:
            For /Type1, /MMType1, /TrueType and /Type3 fonts with a /FontDescriptor: a
            descriptor built from that entry and the /Widths array. Otherwise the entry of
            ``CORE_FONT_METRICS`` when /BaseFont names one. Otherwise a descriptor built
            from /DescendantFonts when present, falling back to the defaults.

        """
        from pypdf._cmap import get_encoding  # noqa: PLC0415
        from pypdf._codecs.core_fontmetrics import CORE_FONT_METRICS  # noqa: PLC0415
        # Prioritize information from the PDF font dictionary
        font_name = pdf_font_dict.get("/BaseFont", "Unknown").removeprefix("/")
        font_kwargs: dict[str, Any] = {"character_widths": {}}

        # Deal with fonts by type; Type1, TrueType and certain Type3
        if pdf_font_dict.get("/Subtype") in ("/Type1", "/MMType1", "/TrueType", "/Type3"):
            if "/FontDescriptor" in pdf_font_dict:
                # Collect character widths - TrueType and Type1 fonts
                # have a /Widths array mapping character codes to widths
                if not (encoding and char_map):
                    encoding, char_map = get_encoding(pdf_font_dict)
                if isinstance(encoding, dict) and "/Widths" in pdf_font_dict:
                    first_char = pdf_font_dict.get("/FirstChar", 0)
                    font_kwargs["character_widths"] = {
                        encoding.get(idx + first_char, chr(idx + first_char)): width
                        for idx, width in enumerate(cast(ArrayObject, pdf_font_dict["/Widths"]))
                    }
                # Collect font descriptor
                font_kwargs = cls._parse_font_descriptor(
                    font_kwargs, pdf_font_dict.get("/FontDescriptor", DictionaryObject())
                )
                return cls(**font_kwargs)

            if font_name in CORE_FONT_METRICS:
                return CORE_FONT_METRICS[font_name]

        # Composite font or CID font
        # CID fonts have a /W array mapping character codes to widths stashed in /DescendantFonts
        if "/DescendantFonts" in pdf_font_dict:
            if not (encoding and char_map):
                encoding, char_map = get_encoding(pdf_font_dict)
            descendant_fonts = cast(ArrayObject, pdf_font_dict["/DescendantFonts"])
            d_font: DictionaryObject
            for d_font_idx, d_font in enumerate(descendant_fonts):
                d_font = cast(DictionaryObject, d_font.get_object())
                # Store the resolved dictionary back into the array.
                descendant_fonts[d_font_idx] = d_font
                # Collect character widths
                cls._collect_cid_character_widths(
                    d_font, char_map, font_kwargs["character_widths"]
                )
                # Collect font descriptor
                font_kwargs = cls._parse_font_descriptor(
                    font_kwargs, d_font.get("/FontDescriptor", DictionaryObject())
                )

        return cls(**font_kwargs)

    def text_width(self, text: str) -> float:
        """
        Sum of character widths specified in PDF font for the supplied text.

        Characters without an entry in ``character_widths`` use the width stored
        under the key ``"default"``, or 0 if there is no such key.
        """
        default_width = self.character_widths.get("default", 0)
        return sum((self.character_widths.get(char, default_width) for char in text), 0.0)