Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_text_extraction/_layout_mode/

1"""Font constants and classes for "layout" mode text operations"""

3from dataclasses import dataclass, field

4from typing import Any, Dict, Sequence, Union, cast

6from ..._codecs import adobe_glyphs

7from ...errors import ParseError

8from ...generic import IndirectObject

9from ._font_widths import STANDARD_WIDTHS

@dataclass
class Font:
    """
    A font object formatted for use during "layout" mode text extraction

    Attributes:
        subtype (str): font subtype
        space_width (int | float): width of a space character
        encoding (str | Dict[int, str]): font encoding
        char_map (dict): character map
        font_dictionary (dict): font dictionary
        width_map (Dict[str, int]): mapping of characters to widths. Not an
            ``__init__`` argument; it is filled in by ``__post_init__``.
        interpretable (bool): Default True. If False, the font glyphs cannot
            be translated to characters, e.g. Type3 fonts that do not define
            a '/ToUnicode' mapping.

    """

    subtype: str
    space_width: Union[int, float]
    encoding: Union[str, Dict[int, str]]
    char_map: Dict[Any, Any]
    font_dictionary: Dict[Any, Any]
    width_map: Dict[str, int] = field(default_factory=dict, init=False)
    interpretable: bool = True

    def __post_init__(self) -> None:
        """
        Derive ``interpretable`` and populate ``width_map`` from the font dictionary.

        Widths are taken, in order, from "/Widths", then from the "/W" arrays
        of "/DescendantFonts"; if ``width_map`` is still empty, the first
        ``STANDARD_WIDTHS`` entry whose key prefixes "/BaseFont" is used.

        Raises:
            ParseError: if a "/W" entry matches neither supported layout.
            IndexError: if a "/W" array ends in the middle of an entry.

        """
        # Type3 fonts that do not specify a "/ToUnicode" mapping cannot be
        # reliably converted into character codes unless all named chars
        # in /CharProcs map to a standard adobe glyph. See §9.10.2 of the
        # PDF 1.7 standard.
        if self.subtype == "/Type3" and "/ToUnicode" not in self.font_dictionary:
            self.interpretable = all(
                cname in adobe_glyphs
                for cname in self.font_dictionary.get("/CharProcs") or []
            )

        if not self.interpretable:  # save some overhead if font is not interpretable
            return

        # TrueType fonts have a /Widths array mapping character codes to widths
        if isinstance(self.encoding, dict) and "/Widths" in self.font_dictionary:
            first_char = self.font_dictionary.get("/FirstChar", 0)
            self.width_map = {
                self.encoding.get(idx + first_char, chr(idx + first_char)): width
                for idx, width in enumerate(self.font_dictionary["/Widths"])
            }

        # CID fonts have a /W array mapping character codes to widths stashed in /DescendantFonts
        if "/DescendantFonts" in self.font_dictionary:
            # The code -> character lookup depends only on char_map, so build it
            # once instead of once per descendant font. ord() only accepts
            # one-character strings, so any other key is skipped rather than
            # allowed to raise TypeError.
            ord_map = {
                ord(_target): _surrogate
                for _target, _surrogate in self.char_map.items()
                if isinstance(_target, str) and len(_target) == 1
            }
            d_font: Dict[Any, Any]
            for d_font_idx, d_font in enumerate(
                self.font_dictionary["/DescendantFonts"]
            ):
                while isinstance(d_font, IndirectObject):
                    d_font = d_font.get_object()
                # store the resolved object back into the font dictionary
                self.font_dictionary["/DescendantFonts"][d_font_idx] = d_font
                self._add_cid_widths(d_font.get("/W", []), ord_map)

        if not self.width_map and "/BaseFont" in self.font_dictionary:
            for key in STANDARD_WIDTHS:
                if self.font_dictionary["/BaseFont"].startswith(f"/{key}"):
                    self.width_map = STANDARD_WIDTHS[key]
                    break

    def _add_cid_widths(self, w_array: Any, ord_map: Dict[int, Any]) -> None:
        """
        Merge the widths described by one "/W" array into ``width_map``.

        /W width definitions have two valid formats which can be mixed and matched:
          (1) A character start index followed by a list of widths, e.g.
              `45 [500 600 700]` applies widths 500, 600, 700 to characters 45-47.
          (2) A character start index, a character stop index, and a width, e.g.
              `45 65 500` applies width 500 to characters 45-65.

        Args:
            w_array: the "/W" array; every element must provide ``get_object()``.
            ord_map: mapping of character code to the ``width_map`` key to use.
                Codes missing from this mapping are ignored.

        Raises:
            ParseError: if an entry matches neither format.
            IndexError: if the array ends while more elements are expected.

        """
        skip_count = 0
        for idx, w_entry in enumerate(w_array):
            w_entry = w_entry.get_object()
            if skip_count:
                # this element was already consumed as part of the previous entry
                skip_count -= 1
                continue
            if not isinstance(w_entry, (int, float)):  # pragma: no cover
                # We should never get here due to skip_count above. Add a
                # warning and or use reader's "strict" to force an ex???
                continue
            # check for format (1): `int [int int int int ...]`
            w_next_entry = w_array[idx + 1].get_object()
            if isinstance(w_next_entry, Sequence):
                # int() rather than typing.cast(): cast() does nothing at
                # runtime, so a float index would otherwise reach range() /
                # enumerate() unchanged and raise TypeError.
                first_code = int(w_entry)
                self.width_map.update(
                    {
                        ord_map[_cidx]: _width
                        for _cidx, _width in enumerate(w_next_entry, start=first_code)
                        if _cidx in ord_map
                    }
                )
                skip_count = 1
            # check for format (2): `int int int`
            elif isinstance(w_next_entry, (int, float)) and isinstance(
                w_array[idx + 2].get_object(), (int, float)
            ):
                const_width = w_array[idx + 2].get_object()
                self.width_map.update(
                    {
                        ord_map[_cidx]: const_width
                        for _cidx in range(int(w_entry), int(w_next_entry) + 1)
                        if _cidx in ord_map
                    }
                )
                skip_count = 2
            else:
                # Note: this doesn't handle the case of out of bounds (reaching the end of the width definitions
                # while expecting more elements). This raises an IndexError which is sufficient.
                raise ParseError(
                    f"Invalid font width definition. Next elements: {w_entry}, {w_next_entry}, {w_array[idx + 2]}"
                )  # pragma: no cover

    def word_width(self, word: str) -> float:
        """Sum of character widths specified in PDF font for the supplied word"""
        # characters without a known width count as two space widths
        fallback_width = self.space_width * 2
        return sum(
            (self.width_map.get(char, fallback_width) for char in word), 0.0
        )

    @staticmethod
    def to_dict(font_instance: "Font") -> Dict[str, Any]:
        """Dataclass to dict for json.dumps serialization."""
        return {
            k: getattr(font_instance, k) for k in font_instance.__dataclass_fields__
        }

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_text_extraction/_layout_mode/_font.py: 36%

56 statements