Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_text_extraction/_layout_mode/_font.py: 36%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

56 statements  

1"""Font constants and classes for "layout" mode text operations""" 

2 

3from dataclasses import dataclass, field 

4from typing import Any, Dict, Sequence, Union, cast 

5 

6from ..._codecs import adobe_glyphs 

7from ...errors import ParseError 

8from ...generic import IndirectObject 

9from ._font_widths import STANDARD_WIDTHS 

10 

11 

@dataclass
class Font:
    """
    A font object formatted for use during "layout" mode text extraction

    On construction, ``__post_init__`` derives ``width_map`` from the font
    dictionary, trying in order: the simple-font ``/Widths`` array, the
    ``/W`` arrays of any ``/DescendantFonts``, and finally the built-in
    ``STANDARD_WIDTHS`` table matched on ``/BaseFont``.

    Attributes:
        subtype (str): font subtype
        space_width (int | float): width of a space character
        encoding (str | Dict[int, str]): font encoding
        char_map (dict): character map
        font_dictionary (dict): font dictionary
        width_map (Dict[str, int]): mapping of characters to widths
        interpretable (bool): Default True. If False, the font glyphs cannot
            be translated to characters, e.g. Type3 fonts that do not define
            a '/ToUnicode' mapping.

    """

    subtype: str
    space_width: Union[int, float]
    encoding: Union[str, Dict[int, str]]
    char_map: Dict[Any, Any]
    font_dictionary: Dict[Any, Any]
    # Populated by __post_init__; not accepted as a constructor argument.
    width_map: Dict[str, int] = field(default_factory=dict, init=False)
    interpretable: bool = True

    def __post_init__(self) -> None:
        """
        Decide whether the font is interpretable and populate ``width_map``.

        Raises:
            ParseError: if a ``/W`` entry matches neither supported format.
            IndexError: if a ``/W`` array ends while more elements are expected.

        """
        # Type3 fonts that do not specify a "/ToUnicode" mapping cannot be
        # reliably converted into character codes unless all named chars
        # in /CharProcs map to a standard adobe glyph. See §9.10.2 of the
        # PDF 1.7 standard.
        if self.subtype == "/Type3" and "/ToUnicode" not in self.font_dictionary:
            self.interpretable = all(
                cname in adobe_glyphs
                for cname in self.font_dictionary.get("/CharProcs") or []
            )

        if not self.interpretable:  # save some overhead if font is not interpretable
            return

        # TrueType fonts have a /Widths array mapping character codes to widths
        if isinstance(self.encoding, dict) and "/Widths" in self.font_dictionary:
            first_char = self.font_dictionary.get("/FirstChar", 0)
            self.width_map = {
                self.encoding.get(idx + first_char, chr(idx + first_char)): width
                for idx, width in enumerate(self.font_dictionary["/Widths"])
            }

        # CID fonts have a /W array mapping character codes to widths stashed in /DescendantFonts
        if "/DescendantFonts" in self.font_dictionary:
            d_font: Dict[Any, Any]
            for d_font_idx, d_font in enumerate(
                self.font_dictionary["/DescendantFonts"]
            ):
                # Resolve indirect references and store the resolved object
                # back into the font dictionary.
                while isinstance(d_font, IndirectObject):
                    d_font = d_font.get_object()
                self.font_dictionary["/DescendantFonts"][d_font_idx] = d_font
                # Code point -> mapped character. ord() raises TypeError for
                # anything but a one-character string, so keys that are empty
                # or longer than one character are skipped instead of aborting
                # font construction.
                ord_map = {
                    ord(_target): _surrogate
                    for _target, _surrogate in self.char_map.items()
                    if isinstance(_target, str) and len(_target) == 1
                }
                # /W width definitions have two valid formats which can be mixed and matched:
                #   (1) A character start index followed by a list of widths, e.g.
                #       `45 [500 600 700]` applies widths 500, 600, 700 to characters 45-47.
                #   (2) A character start index, a character stop index, and a width, e.g.
                #       `45 65 500` applies width 500 to characters 45-65.
                skip_count = 0
                _w = d_font.get("/W", [])
                for idx, w_entry in enumerate(_w):
                    w_entry = w_entry.get_object()
                    # Entries already consumed as part of the previous
                    # definition are skipped here.
                    if skip_count:
                        skip_count -= 1
                        continue
                    if not isinstance(w_entry, (int, float)):  # pragma: no cover
                        # We should never get here due to skip_count above. Add a
                        # warning and or use reader's "strict" to force an ex???
                        continue
                    # check for format (1): `int [int int int int ...]`
                    w_next_entry = _w[idx + 1].get_object()
                    if isinstance(w_next_entry, Sequence):
                        start_idx, width_list = w_entry, w_next_entry
                        self.width_map.update(
                            {
                                ord_map[_cidx]: _width
                                for _cidx, _width in zip(
                                    range(
                                        cast(int, start_idx),
                                        cast(int, start_idx) + len(width_list),
                                        1,
                                    ),
                                    width_list,
                                )
                                if _cidx in ord_map
                            }
                        )
                        skip_count = 1
                    # check for format (2): `int int int`
                    elif isinstance(w_next_entry, (int, float)) and isinstance(
                        _w[idx + 2].get_object(), (int, float)
                    ):
                        start_idx, stop_idx, const_width = (
                            w_entry,
                            w_next_entry,
                            _w[idx + 2].get_object(),
                        )
                        self.width_map.update(
                            {
                                ord_map[_cidx]: const_width
                                for _cidx in range(
                                    cast(int, start_idx), cast(int, stop_idx + 1), 1
                                )
                                if _cidx in ord_map
                            }
                        )
                        skip_count = 2
                    else:
                        # Note: this doesn't handle the case of out of bounds (reaching the end of the width definitions
                        # while expecting more elements). This raises an IndexError which is sufficient.
                        raise ParseError(
                            f"Invalid font width definition. Next elements: {w_entry}, {w_next_entry}, {_w[idx + 2]}"
                        )  # pragma: no cover

        # Fall back to the built-in width table when the font dictionary
        # itself yielded no widths. The table is referenced, not copied.
        if not self.width_map and "/BaseFont" in self.font_dictionary:
            for key in STANDARD_WIDTHS:
                if self.font_dictionary["/BaseFont"].startswith(f"/{key}"):
                    self.width_map = STANDARD_WIDTHS[key]
                    break

    def word_width(self, word: str) -> float:
        """
        Sum of character widths specified in PDF font for the supplied word

        Characters missing from ``width_map`` count as twice ``space_width``.
        An empty word yields ``0.0``.
        """
        return sum(
            (self.width_map.get(char, self.space_width * 2) for char in word), 0.0
        )

    @staticmethod
    def to_dict(font_instance: "Font") -> Dict[str, Any]:
        """
        Dataclass to dict for json.dumps serialization.

        The result has one entry per dataclass field, in declaration order;
        values are the instance's own attribute values (no copies are made).
        """
        return {
            k: getattr(font_instance, k) for k in font_instance.__dataclass_fields__
        }

151 k: getattr(font_instance, k) for k in font_instance.__dataclass_fields__ 

152 }