Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_text_extraction/_layout_mode/_font.py

1"""Font constants and classes for "layout" mode text operations"""

3from collections.abc import Sequence

4from dataclasses import dataclass, field

5from typing import Any, Union, cast

7from ..._codecs import adobe_glyphs

8from ...errors import ParseError

9from ...generic import IndirectObject

10from ._font_widths import STANDARD_WIDTHS

@dataclass
class Font:
    """
    A font object formatted for use during "layout" mode text extraction

    Attributes:
        subtype (str): font subtype
        space_width (int | float): width of a space character
        encoding (str | Dict[int, str]): font encoding
        char_map (dict): character map
        font_dictionary (dict): font dictionary
        width_map (Dict[str, int]): mapping of characters to widths. Not an
            ``__init__`` argument; it is filled in by ``__post_init__``.
        interpretable (bool): Default True. If False, the font glyphs cannot
            be translated to characters, e.g. Type3 fonts that do not define
            a '/ToUnicode' mapping.

    """

    subtype: str
    space_width: Union[int, float]
    encoding: Union[str, dict[int, str]]
    char_map: dict[Any, Any]
    font_dictionary: dict[Any, Any]
    width_map: dict[str, int] = field(default_factory=dict, init=False)
    interpretable: bool = True

    def __post_init__(self) -> None:
        """
        Settle ``interpretable`` and populate ``width_map``.

        Width sources are consulted in this order: the ``/Widths`` array
        (only when ``encoding`` is a dict), then the ``/W`` arrays of any
        ``/DescendantFonts``, and finally, only if nothing was found, the
        ``STANDARD_WIDTHS`` table matched against ``/BaseFont``.

        Raises:
            ParseError: if a ``/W`` array entry matches neither valid format.
            IndexError: if a ``/W`` array ends in the middle of an entry.

        """
        # Type3 fonts that do not specify a "/ToUnicode" mapping cannot be
        # reliably converted into character codes unless all named chars
        # in /CharProcs map to a standard adobe glyph. See §9.10.2 of the
        # PDF 1.7 standard.
        if self.subtype == "/Type3" and "/ToUnicode" not in self.font_dictionary:
            self.interpretable = all(
                cname in adobe_glyphs
                for cname in self.font_dictionary.get("/CharProcs") or []
            )

        if not self.interpretable:  # save some overhead if font is not interpretable
            return

        # TrueType fonts have a /Widths array mapping character codes to widths
        if isinstance(self.encoding, dict) and "/Widths" in self.font_dictionary:
            first_char = self.font_dictionary.get("/FirstChar", 0)
            # Codes missing from the encoding fall back to chr(code).
            self.width_map = {
                self.encoding.get(idx + first_char, chr(idx + first_char)): width
                for idx, width in enumerate(self.font_dictionary["/Widths"])
            }

        # CID fonts have a /W array mapping character codes to widths stashed in /DescendantFonts
        if "/DescendantFonts" in self.font_dictionary:
            self._add_descendant_font_widths()

        # Last resort: use the first STANDARD_WIDTHS table whose key prefixes /BaseFont.
        if not self.width_map and "/BaseFont" in self.font_dictionary:
            for key in STANDARD_WIDTHS:
                if self.font_dictionary["/BaseFont"].startswith(f"/{key}"):
                    self.width_map = STANDARD_WIDTHS[key]
                    break

    def _add_descendant_font_widths(self) -> None:
        """Resolve each entry of ``/DescendantFonts`` and merge its ``/W`` widths."""
        d_font: dict[Any, Any]
        descendants = self.font_dictionary["/DescendantFonts"]
        for d_font_idx, d_font in enumerate(descendants):
            while isinstance(d_font, IndirectObject):
                d_font = d_font.get_object()
            # Store the resolved object back so the font dictionary holds it directly.
            descendants[d_font_idx] = d_font
            # Code point of each string key in char_map -> its mapped value.
            ord_map = {
                ord(_target): _surrogate
                for _target, _surrogate in self.char_map.items()
                if isinstance(_target, str)
            }
            self._add_cid_widths(d_font.get("/W", []), ord_map)

    def _add_cid_widths(self, w_array: Any, ord_map: dict[int, Any]) -> None:
        """
        Merge the widths described by one ``/W`` array into ``width_map``.

        /W width definitions have two valid formats which can be mixed and matched:
          (1) A character start index followed by a list of widths, e.g.
              `45 [500 600 700]` applies widths 500, 600, 700 to characters 45-47.
          (2) A character start index, a character stop index, and a width, e.g.
              `45 65 500` applies width 500 to characters 45-65.

        Only character indices present in ``ord_map`` are recorded.

        Raises:
            ParseError: if an entry matches neither format.
            IndexError: if the array ends while more elements are expected.

        """
        skip_count = 0  # trailing elements of the entry just consumed
        for idx, w_entry in enumerate(w_array):
            w_entry = w_entry.get_object()
            if skip_count:
                skip_count -= 1
                continue
            if not isinstance(w_entry, (int, float)):  # pragma: no cover
                # We should never get here due to skip_count above. Add a
                # warning and or use reader's "strict" to force an ex???
                continue
            w_next_entry = w_array[idx + 1].get_object()
            # check for format (1): `int [int int int int ...]`
            if isinstance(w_next_entry, Sequence):
                self.width_map.update(
                    {
                        ord_map[_cidx]: _width
                        for _cidx, _width in enumerate(
                            w_next_entry, cast(int, w_entry)
                        )
                        if _cidx in ord_map
                    }
                )
                skip_count = 1
            # check for format (2): `int int int`
            elif isinstance(w_next_entry, (int, float)) and isinstance(
                (const_width := w_array[idx + 2].get_object()), (int, float)
            ):
                self.width_map.update(
                    {
                        ord_map[_cidx]: const_width
                        for _cidx in range(
                            cast(int, w_entry), cast(int, w_next_entry + 1), 1
                        )
                        if _cidx in ord_map
                    }
                )
                skip_count = 2
            else:
                # Note: this doesn't handle the case of out of bounds (reaching the end of the width definitions
                # while expecting more elements). This raises an IndexError which is sufficient.
                raise ParseError(
                    f"Invalid font width definition. Next elements: {w_entry}, {w_next_entry}, {w_array[idx + 2]}"
                )  # pragma: no cover

    def word_width(self, word: str) -> float:
        """Sum of character widths specified in PDF font for the supplied word"""
        # Characters without a known width count as two space widths.
        fallback_width = self.space_width * 2
        return sum(
            (self.width_map.get(char, fallback_width) for char in word), 0.0
        )

    @staticmethod
    def to_dict(font_instance: "Font") -> dict[str, Any]:
        """Dataclass to dict for json.dumps serialization."""
        return {
            k: getattr(font_instance, k) for k in font_instance.__dataclass_fields__
        }

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_text_extraction/_layout_mode/_font.py: 37%

57 statements