Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_text_extraction/_layout_mode/_font.py: 37%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

57 statements  

1"""Font constants and classes for "layout" mode text operations""" 

2 

3from collections.abc import Sequence 

4from dataclasses import dataclass, field 

5from typing import Any, Union, cast 

6 

7from ..._codecs import adobe_glyphs 

8from ...errors import ParseError 

9from ...generic import IndirectObject 

10from ._font_widths import STANDARD_WIDTHS 

11 

12 

@dataclass
class Font:
    """
    A font object formatted for use during "layout" mode text extraction

    Attributes:
        subtype (str): font subtype
        space_width (int | float): width of a space character
        encoding (str | dict[int, str]): font encoding
        char_map (dict): character map
        font_dictionary (dict): font dictionary
        width_map (dict[str, int]): mapping of characters to widths. Not an
            ``__init__`` argument; populated by ``__post_init__``.
        interpretable (bool): Default True. If False, the font glyphs cannot
            be translated to characters, e.g. Type3 fonts that do not define
            a '/ToUnicode' mapping.

    """

    subtype: str
    space_width: Union[int, float]
    encoding: Union[str, dict[int, str]]
    char_map: dict[Any, Any]
    font_dictionary: dict[Any, Any]
    width_map: dict[str, int] = field(default_factory=dict, init=False)
    interpretable: bool = True

    def __post_init__(self) -> None:
        """
        Determine interpretability and populate ``width_map``.

        Widths are taken, in order, from the font's /Widths array, from the
        /W array of the (last) descendant font, and finally -- only if nothing
        was found -- from the standard width table matching /BaseFont.

        Raises:
            ParseError: if a /W entry matches neither supported format.
            IndexError: if the /W array ends while more elements are expected.

        """
        # Type3 fonts that do not specify a "/ToUnicode" mapping cannot be
        # reliably converted into character codes unless all named chars
        # in /CharProcs map to a standard adobe glyph. See §9.10.2 of the
        # PDF 1.7 standard.
        if self.subtype == "/Type3" and "/ToUnicode" not in self.font_dictionary:
            self.interpretable = all(
                cname in adobe_glyphs
                for cname in self.font_dictionary.get("/CharProcs") or []
            )

        if not self.interpretable:  # save some overhead if font is not interpretable
            return

        # TrueType fonts have a /Widths array mapping character codes to widths
        if isinstance(self.encoding, dict) and "/Widths" in self.font_dictionary:
            first_char = self.font_dictionary.get("/FirstChar", 0)
            self.width_map = {
                self.encoding.get(idx + first_char, chr(idx + first_char)): width
                for idx, width in enumerate(self.font_dictionary["/Widths"])
            }

        # CID fonts have a /W array mapping character codes to widths stashed in /DescendantFonts
        if "/DescendantFonts" in self.font_dictionary:
            # Start from None so that an empty /DescendantFonts array (malformed
            # PDF) leaves a defined value instead of an unbound local.
            d_font: Any = None
            for d_font_idx, d_font in enumerate(
                self.font_dictionary["/DescendantFonts"]
            ):
                # Resolve indirect references and cache the resolved object
                # back into the font dictionary.
                while isinstance(d_font, IndirectObject):
                    d_font = d_font.get_object()
                self.font_dictionary["/DescendantFonts"][d_font_idx] = d_font
            # Only string keys of char_map can be passed to ord(); other keys
            # are ignored.
            ord_map = {
                ord(_target): _surrogate
                for _target, _surrogate in self.char_map.items()
                if isinstance(_target, str)
            }
            # /W width definitions have two valid formats which can be mixed and matched:
            #   (1) A character start index followed by a list of widths, e.g.
            #       `45 [500 600 700]` applies widths 500, 600, 700 to characters 45-47.
            #   (2) A character start index, a character stop index, and a width, e.g.
            #       `45 65 500` applies width 500 to characters 45-65.
            skip_count = 0
            # No descendant font means there are no /W definitions to read.
            _w = d_font.get("/W", []) if d_font is not None else []
            for idx, w_entry in enumerate(_w):
                w_entry = w_entry.get_object()
                if skip_count:
                    # This element was already consumed as part of a previous
                    # format (1) or format (2) definition.
                    skip_count -= 1
                    continue
                if not isinstance(w_entry, (int, float)):  # pragma: no cover
                    # We should never get here due to skip_count above. Add a
                    # warning and or use reader's "strict" to force an ex???
                    continue
                # check for format (1): `int [int int int int ...]`
                w_next_entry = _w[idx + 1].get_object()
                if isinstance(w_next_entry, Sequence):
                    start_idx, width_list = w_entry, w_next_entry
                    self.width_map.update(
                        {
                            ord_map[_cidx]: _width
                            for _cidx, _width in zip(
                                range(
                                    cast(int, start_idx),
                                    cast(int, start_idx) + len(width_list),
                                    1,
                                ),
                                width_list,
                            )
                            if _cidx in ord_map
                        }
                    )
                    skip_count = 1
                # check for format (2): `int int int`
                elif isinstance(w_next_entry, (int, float)) and isinstance(
                    _w[idx + 2].get_object(), (int, float)
                ):
                    start_idx, stop_idx, const_width = (
                        w_entry,
                        w_next_entry,
                        _w[idx + 2].get_object(),
                    )
                    self.width_map.update(
                        {
                            ord_map[_cidx]: const_width
                            for _cidx in range(
                                cast(int, start_idx), cast(int, stop_idx + 1), 1
                            )
                            if _cidx in ord_map
                        }
                    )
                    skip_count = 2
                else:
                    # Note: this doesn't handle the case of out of bounds (reaching the end of the width definitions
                    # while expecting more elements). This raises an IndexError which is sufficient.
                    raise ParseError(
                        f"Invalid font width definition. Next elements: {w_entry}, {w_next_entry}, {_w[idx + 2]}"
                    )  # pragma: no cover

        # Fall back to a standard width table when the font itself supplied
        # no widths; the first key that prefixes /BaseFont wins.
        if not self.width_map and "/BaseFont" in self.font_dictionary:
            for key in STANDARD_WIDTHS:
                if self.font_dictionary["/BaseFont"].startswith(f"/{key}"):
                    self.width_map = STANDARD_WIDTHS[key]
                    break

    def word_width(self, word: str) -> float:
        """
        Sum of character widths specified in PDF font for the supplied word

        Characters missing from ``width_map`` are counted as twice
        ``space_width``. An empty word has width 0.0.
        """
        fallback = self.space_width * 2
        return sum((self.width_map.get(char, fallback) for char in word), 0.0)

    @staticmethod
    def to_dict(font_instance: "Font") -> dict[str, Any]:
        """Dataclass to dict for json.dumps serialization."""
        # Shallow mapping of field name -> current attribute value (values are
        # not copied or converted).
        return {
            k: getattr(font_instance, k) for k in font_instance.__dataclass_fields__
        }

71 ord(_target): _surrogate 

72 for _target, _surrogate in self.char_map.items() 

73 if isinstance(_target, str) 

74 } 

75 # /W width definitions have two valid formats which can be mixed and matched: 

76 # (1) A character start index followed by a list of widths, e.g. 

77 # `45 [500 600 700]` applies widths 500, 600, 700 to characters 45-47. 

78 # (2) A character start index, a character stop index, and a width, e.g. 

79 # `45 65 500` applies width 500 to characters 45-65. 

80 skip_count = 0 

81 _w = d_font.get("/W", []) 

82 for idx, w_entry in enumerate(_w): 

83 w_entry = w_entry.get_object() 

84 if skip_count: 

85 skip_count -= 1 

86 continue 

87 if not isinstance(w_entry, (int, float)): # pragma: no cover 

88 # We should never get here due to skip_count above. Add a 

89 # warning and or use reader's "strict" to force an ex??? 

90 continue 

91 # check for format (1): `int [int int int int ...]` 

92 w_next_entry = _w[idx + 1].get_object() 

93 if isinstance(w_next_entry, Sequence): 

94 start_idx, width_list = w_entry, w_next_entry 

95 self.width_map.update( 

96 { 

97 ord_map[_cidx]: _width 

98 for _cidx, _width in zip( 

99 range( 

100 cast(int, start_idx), 

101 cast(int, start_idx) + len(width_list), 

102 1, 

103 ), 

104 width_list, 

105 ) 

106 if _cidx in ord_map 

107 } 

108 ) 

109 skip_count = 1 

110 # check for format (2): `int int int` 

111 elif isinstance(w_next_entry, (int, float)) and isinstance( 

112 _w[idx + 2].get_object(), (int, float) 

113 ): 

114 start_idx, stop_idx, const_width = ( 

115 w_entry, 

116 w_next_entry, 

117 _w[idx + 2].get_object(), 

118 ) 

119 self.width_map.update( 

120 { 

121 ord_map[_cidx]: const_width 

122 for _cidx in range( 

123 cast(int, start_idx), cast(int, stop_idx + 1), 1 

124 ) 

125 if _cidx in ord_map 

126 } 

127 ) 

128 skip_count = 2 

129 else: 

130 # Note: this doesn't handle the case of out of bounds (reaching the end of the width definitions 

131 # while expecting more elements). This raises an IndexError which is sufficient. 

132 raise ParseError( 

133 f"Invalid font width definition. Next elements: {w_entry}, {w_next_entry}, {_w[idx + 2]}" 

134 ) # pragma: no cover 

135 

136 if not self.width_map and "/BaseFont" in self.font_dictionary: 

137 for key in STANDARD_WIDTHS: 

138 if self.font_dictionary["/BaseFont"].startswith(f"/{key}"): 

139 self.width_map = STANDARD_WIDTHS[key] 

140 break 

141 

142 def word_width(self, word: str) -> float: 

143 """Sum of character widths specified in PDF font for the supplied word""" 

144 return sum( 

145 [self.width_map.get(char, self.space_width * 2) for char in word], 0.0 

146 ) 

147 

148 @staticmethod 

149 def to_dict(font_instance: "Font") -> dict[str, Any]: 

150 """Dataclass to dict for json.dumps serialization.""" 

151 return { 

152 k: getattr(font_instance, k) for k in font_instance.__dataclass_fields__ 

153 }