Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/

1from collections.abc import Sequence

2from dataclasses import dataclass, field

3from typing import Any, Optional, Union, cast

5from pypdf.generic import ArrayObject, DictionaryObject, IndirectObject

7from .errors import ParseError

@dataclass(frozen=True)
class FontDescriptor:
    """
    Represents the FontDescriptor dictionary as defined in the PDF specification.
    This contains both descriptive and metric information.

    The defaults are derived from the mean values of the 14 core fonts, rounded
    to 100.
    """

    name: str = "Unknown"
    family: str = "Unknown"
    weight: str = "Unknown"

    ascent: float = 700.0
    descent: float = -200.0
    cap_height: float = 600.0
    x_height: float = 500.0
    italic_angle: float = 0.0  # Non-italic
    flags: int = 32  # Non-serif, non-symbolic, not fixed width
    bbox: tuple[float, float, float, float] = field(default_factory=lambda: (-100.0, -200.0, 1000.0, 900.0))

    # Maps a character to its width; text_width() also consults a "default" key
    # as the fallback width for characters that have no entry of their own.
    character_widths: dict[str, int] = field(default_factory=dict)

    @staticmethod
    def _parse_font_descriptor(font_kwargs: dict[str, Any], font_descriptor_obj: DictionaryObject) -> dict[str, Any]:
        """
        Copy the known entries of a font descriptor dictionary into ``font_kwargs``.

        Args:
            font_kwargs: Constructor keyword arguments collected so far.
                Updated in place.
            font_descriptor_obj: The font descriptor dictionary, or an indirect
                reference to it.

        Returns:
            The same ``font_kwargs`` dictionary. If a bounding box was found, it
            is normalized to a tuple of four floats.

        Raises:
            ParseError: If the bounding box does not consist of exactly four values.
        """
        font_descriptor_dict: DictionaryObject = (
            font_descriptor_obj.get_object()
            if isinstance(font_descriptor_obj, IndirectObject)
            else font_descriptor_obj
        )
        for source_key, target_key in [
            ("/FontName", "name"),
            ("/FontFamily", "family"),
            ("/FontWeight", "weight"),
            ("/Ascent", "ascent"),
            ("/Descent", "descent"),
            ("/CapHeight", "cap_height"),
            ("/XHeight", "x_height"),
            ("/ItalicAngle", "italic_angle"),
            ("/Flags", "flags"),
            ("/FontBBox", "bbox"),
        ]:
            if source_key in font_descriptor_dict:
                font_kwargs[target_key] = font_descriptor_dict[source_key]
        # /FontBBox may be absent: from_font_resource() falls back to an empty
        # dictionary when a font has no /FontDescriptor, and malformed files can
        # omit the key. In that case keep the dataclass default instead of
        # failing with a KeyError.
        if "bbox" in font_kwargs:
            bbox_tuple = tuple(map(float, font_kwargs["bbox"]))
            # Explicit check instead of ``assert`` so that the validation is
            # still performed when Python runs with optimizations (-O).
            if len(bbox_tuple) != 4:
                raise ParseError(f"Invalid font bounding box: {bbox_tuple}")
            font_kwargs["bbox"] = bbox_tuple
        return font_kwargs

    @staticmethod
    def _collect_cid_character_widths(
        d_font: DictionaryObject, char_map: dict[Any, Any], current_widths: dict[str, int]
    ) -> None:
        """
        Parses the /W array from a DescendantFont dictionary and updates character widths.

        Args:
            d_font: The descendant font dictionary; its ``/W`` entry is read
                (an empty list is used if it is missing).
            char_map: Character map; only its string keys are considered. The
                ordinal of each key is matched against the character indices
                of ``/W`` and the mapped value becomes the key in ``current_widths``.
            current_widths: Width mapping which is updated in place.

        Raises:
            ParseError: If an entry is followed by neither a list of widths nor
                two numbers.
            IndexError: If the array ends while more elements are expected.
        """
        ord_map = {
            ord(_target): _surrogate
            for _target, _surrogate in char_map.items()
            if isinstance(_target, str)
        }
        # /W width definitions have two valid formats which can be mixed and matched:
        #   (1) A character start index followed by a list of widths, e.g.
        #       `45 [500 600 700]` applies widths 500, 600, 700 to characters 45-47.
        #   (2) A character start index, a character stop index, and a width, e.g.
        #       `45 65 500` applies width 500 to characters 45-65.
        # Number of upcoming entries which belong to the definition that has
        # just been processed and must therefore not be read as a start index.
        skip_count = 0
        _w = d_font.get("/W", [])
        for idx, w_entry in enumerate(_w):
            w_entry = w_entry.get_object()
            if skip_count:
                skip_count -= 1
                continue
            if not isinstance(w_entry, (int, float)):  # pragma: no cover
                # We should never get here due to skip_count above. Add a
                # warning and or use reader's "strict" to force an ex???
                continue
            # check for format (1): `int [int int int int ...]`
            w_next_entry = _w[idx + 1].get_object()
            if isinstance(w_next_entry, Sequence):
                start_idx, width_list = w_entry, w_next_entry
                current_widths.update(
                    {
                        ord_map[_cidx]: _width
                        for _cidx, _width in zip(
                            range(
                                cast(int, start_idx),
                                cast(int, start_idx) + len(width_list),
                                1,
                            ),
                            width_list,
                        )
                        if _cidx in ord_map
                    }
                )
                skip_count = 1
            # check for format (2): `int int int`
            elif isinstance(w_next_entry, (int, float)) and isinstance(
                _w[idx + 2].get_object(), (int, float)
            ):
                start_idx, stop_idx, const_width = (
                    w_entry,
                    w_next_entry,
                    _w[idx + 2].get_object(),
                )
                current_widths.update(
                    {
                        ord_map[_cidx]: const_width
                        for _cidx in range(
                            cast(int, start_idx), cast(int, stop_idx + 1), 1
                        )
                        if _cidx in ord_map
                    }
                )
                skip_count = 2
            else:
                # Note: this doesn't handle the case of out of bounds (reaching the end of the width definitions
                # while expecting more elements). This raises an IndexError which is sufficient.
                raise ParseError(
                    f"Invalid font width definition. Next elements: {w_entry}, {w_next_entry}, {_w[idx + 2]}"
                )  # pragma: no cover

    @classmethod
    def from_font_resource(
        cls,
        pdf_font_dict: DictionaryObject,
        encoding: Optional[Union[str, dict[int, str]]] = None,
        char_map: Optional[dict[Any, Any]] = None
    ) -> "FontDescriptor":
        """
        Create a font descriptor from a PDF font dictionary.

        The sources are tried in this order:

        1. Fonts of subtype /Type1, /MMType1, /TrueType or /Type3 which have a
           /FontDescriptor: the widths are read from /Widths, the remaining
           values from the descriptor.
        2. Fonts whose /BaseFont name is a key of ``CORE_FONT_METRICS``: the
           stored entry is returned as is.
        3. Fonts with /DescendantFonts: the widths are read from the /W array
           and the remaining values from the /FontDescriptor of each
           descendant font.

        If none of these apply, the defaults of this class are used.

        Args:
            pdf_font_dict: The font dictionary. Entries of /DescendantFonts
                are replaced in place by their resolved objects.
            encoding: Encoding of the font. Looked up by ``get_encoding``
                unless both ``encoding`` and ``char_map`` are provided.
            char_map: Character map of the font. Looked up by ``get_encoding``
                unless both ``encoding`` and ``char_map`` are provided.

        Returns:
            The font descriptor for the given font.
        """
        from pypdf._cmap import get_encoding  # noqa: PLC0415
        from pypdf._codecs.core_fontmetrics import CORE_FONT_METRICS  # noqa: PLC0415
        # Prioritize information from the PDF font dictionary
        font_name = pdf_font_dict.get("/BaseFont", "Unknown").removeprefix("/")
        font_kwargs: dict[str, Any] = {"character_widths": {}}

        # Deal with fonts by type; Type1, TrueType and certain Type3
        if pdf_font_dict.get("/Subtype") in ("/Type1", "/MMType1", "/TrueType", "/Type3"):
            if "/FontDescriptor" in pdf_font_dict:
                # Collect character widths - TrueType and Type1 fonts
                # have a /Widths array mapping character codes to widths
                if not (encoding and char_map):
                    encoding, char_map = get_encoding(pdf_font_dict)
                if isinstance(encoding, dict) and "/Widths" in pdf_font_dict:
                    first_char = pdf_font_dict.get("/FirstChar", 0)
                    # /Widths starts at the character code /FirstChar; codes
                    # missing from the encoding fall back to chr(code).
                    font_kwargs["character_widths"] = {
                        encoding.get(idx + first_char, chr(idx + first_char)): width
                        for idx, width in enumerate(cast(ArrayObject, pdf_font_dict["/Widths"]))
                    }
                # Collect font descriptor
                font_kwargs = cls._parse_font_descriptor(
                    font_kwargs, pdf_font_dict.get("/FontDescriptor", DictionaryObject())
                )
                return cls(**font_kwargs)

        if font_name in CORE_FONT_METRICS:
            return CORE_FONT_METRICS[font_name]

        # Composite font or CID font
        # CID fonts have a /W array mapping character codes to widths stashed in /DescendantFonts
        if "/DescendantFonts" in pdf_font_dict:
            if not (encoding and char_map):
                encoding, char_map = get_encoding(pdf_font_dict)
            d_font: DictionaryObject
            for d_font_idx, d_font in enumerate(
                cast(ArrayObject, pdf_font_dict["/DescendantFonts"])
            ):
                d_font = cast(DictionaryObject, d_font.get_object())
                # Store the resolved object back into the array of the font dictionary.
                cast(ArrayObject, pdf_font_dict["/DescendantFonts"])[d_font_idx] = d_font
                # Collect character widths
                cls._collect_cid_character_widths(
                    d_font, char_map, font_kwargs["character_widths"]
                )
                # Collect font descriptor
                font_kwargs = cls._parse_font_descriptor(
                    font_kwargs, d_font.get("/FontDescriptor", DictionaryObject())
                )

        return cls(**font_kwargs)

    def text_width(self, text: str) -> float:
        """
        Sum of character widths specified in PDF font for the supplied text.

        Characters without an entry in ``character_widths`` use the width
        stored under the key ``"default"``, or 0 if there is no such key.
        """
        default_width = self.character_widths.get("default", 0)
        return sum(
            (self.character_widths.get(char, default_width) for char in text), 0.0
        )

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_font.py: 33%

75 statements