Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_text_extraction/_layout_mode/_text_state_manager.py: 34%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

88 statements  

1"""manage the PDF transform stack during "layout" mode text extraction""" 

2 

3from collections import ChainMap, Counter 

4from typing import Any, Dict, List, MutableMapping, Tuple, Union 

5from typing import ChainMap as ChainMapType 

6from typing import Counter as CounterType 

7 

8from ...errors import PdfReadError 

9from .. import mult 

10from ._font import Font 

11from ._text_state_params import TextStateParams 

12 

# A transform mapping uses int keys 0-5 for the a/b/c/d/e/f matrix entries and
# the str keys "is_text" / "is_render" for its boolean flags.
_TransformKey = Union[int, str]
_TransformValue = Union[float, bool]

TextStateManagerChainMapType = ChainMapType[_TransformKey, _TransformValue]
TextStateManagerDictType = MutableMapping[_TransformKey, _TransformValue]

15 

16 

class TextStateManager:
    """
    Tracks the current text state including cm/tm/trm transformation matrices.

    Attributes:
        transform_stack (ChainMap): ChainMap of cm/tm transformation matrices
        q_queue (Counter[int]): number of cm operators recorded per q nesting
            level (keyed by the level stored in ``q_depth``)
        q_depth (List[int]): list of q operator nesting levels
        font_stack (List[Tuple[Font | None, int | float]]): (font, font_size)
            pairs saved by ``add_q`` and restored by ``remove_q``
        Tc (float): character spacing
        Tw (float): word spacing
        Tz (float): horizontal scaling
        TL (float): leading
        Ts (float): text rise
        font (Font | None): font object
        font_size (int | float): font size

    """

    def __init__(self) -> None:
        # The bottom-most map is an identity matrix that is never flagged as
        # a text/render transform, so reset_tm()/reset_trm() always stop there.
        self.transform_stack: TextStateManagerChainMapType = ChainMap(
            self.new_transform()
        )
        self.q_queue: CounterType[int] = Counter()
        self.q_depth: List[int] = [0]
        self.Tc: float = 0.0
        self.Tw: float = 0.0
        self.Tz: float = 100.0
        self.TL: float = 0.0
        self.Ts: float = 0.0
        self.font_stack: List[Tuple[Union[Font, None], Union[int, float]]] = []
        self.font: Union[Font, None] = None
        self.font_size: Union[int, float] = 0

    def set_state_param(self, op: bytes, value: Union[float, List[Any]]) -> None:
        """
        Set a text state parameter. Supports Tc, Tz, Tw, TL, and Ts operators.

        Args:
            op: operator read from PDF stream as bytes. No action is taken
                for unsupported operators (see supported operators above).
            value (float | List[Any]): new parameter value. If a list,
                value[0] is used. An empty list leaves the parameter
                unchanged.

        """
        if op not in (b"Tc", b"Tz", b"Tw", b"TL", b"Ts"):
            return
        if isinstance(value, list):
            if not value:
                # operator without operands: nothing to assign
                return
            new_value = value[0]
        else:
            new_value = value
        # the attribute name is the decoded operator (e.g. b"Tc" -> self.Tc)
        setattr(self, op.decode(), new_value)

    def set_font(self, font: Font, size: float) -> None:
        """
        Set the current font and font_size.

        Args:
            font (Font): a layout mode Font
            size (float): font size

        """
        self.font = font
        self.font_size = size

    def text_state_params(self, value: Union[bytes, str] = "") -> TextStateParams:
        """
        Create a TextStateParams instance to display a text string. Type[bytes] values
        will be decoded implicitly.

        Args:
            value (str | bytes): text to associate with the captured state.

        Raises:
            PdfReadError: if font not set (no Tf operator in incoming pdf content stream)

        Returns:
            TextStateParams: current text state parameters

        """
        if not isinstance(self.font, Font):
            raise PdfReadError(
                "font not set: is PDF missing a Tf operator?"
            )  # pragma: no cover
        if isinstance(value, bytes):
            try:
                if isinstance(self.font.encoding, str):
                    # encoding is a codec name
                    txt = value.decode(self.font.encoding, "surrogatepass")
                else:
                    # encoding is a lookup keyed by byte value. The fallback
                    # decode is only evaluated for bytes missing from the
                    # lookup, so a conditional is used rather than .get().
                    txt = "".join(
                        self.font.encoding[x]
                        if x in self.font.encoding
                        else bytes((x,)).decode()
                        for x in value
                    )
            except (UnicodeEncodeError, UnicodeDecodeError):
                # best effort: never fail extraction on undecodable bytes
                txt = value.decode("utf-8", "replace")
            # translate decoded characters through the font's char_map;
            # characters without an entry are kept as they are
            txt = "".join(self.font.char_map.get(x, x) for x in txt)
        else:
            txt = value
        return TextStateParams(
            txt,
            self.font,
            self.font_size,
            self.Tc,
            self.Tw,
            self.Tz,
            self.TL,
            self.Ts,
            self.effective_transform,
        )

    @staticmethod
    def raw_transform(
        _a: float = 1.0,
        _b: float = 0.0,
        _c: float = 0.0,
        _d: float = 1.0,
        _e: float = 0.0,
        _f: float = 0.0,
    ) -> Dict[int, float]:
        """Only a/b/c/d/e/f matrix params, stored as floats under keys 0-5"""
        return dict(zip(range(6), map(float, (_a, _b, _c, _d, _e, _f))))

    @staticmethod
    def new_transform(
        _a: float = 1.0,
        _b: float = 0.0,
        _c: float = 0.0,
        _d: float = 1.0,
        _e: float = 0.0,
        _f: float = 0.0,
        is_text: bool = False,
        is_render: bool = False,
    ) -> TextStateManagerDictType:
        """Standard a/b/c/d/e/f matrix params + 'is_text' and 'is_render' keys"""
        result: Any = TextStateManager.raw_transform(_a, _b, _c, _d, _e, _f)
        result.update({"is_text": is_text, "is_render": is_render})
        return result

    def reset_tm(self) -> TextStateManagerChainMapType:
        """Clear all transforms from chainmap having is_text==True or is_render==True"""
        while (
            self.transform_stack.maps[0]["is_text"]
            or self.transform_stack.maps[0]["is_render"]
        ):
            self.transform_stack = self.transform_stack.parents
        return self.transform_stack

    def reset_trm(self) -> TextStateManagerChainMapType:
        """Clear all transforms from chainmap having is_render==True"""
        while self.transform_stack.maps[0]["is_render"]:
            self.transform_stack = self.transform_stack.parents
        return self.transform_stack

    def remove_q(self) -> TextStateManagerChainMapType:
        """
        Rewind to stack prior state after closing a 'q' with internal 'cm' ops.

        Restores the font and font size saved by the matching ``add_q`` and
        drops the cm transforms that were added at that nesting level. If no
        ``add_q`` is pending (unbalanced 'Q' in the content stream), the call
        is ignored and the state is left untouched instead of raising
        IndexError.

        Returns:
            the (possibly rewound) transform stack

        """
        if not self.font_stack:
            # unbalanced Q: there is no saved state to restore
            return self.transform_stack
        self.font, self.font_size = self.font_stack.pop()
        self.transform_stack = self.reset_tm()
        # number of cm transforms recorded at the nesting level being closed
        closed_level = self.q_depth.pop()
        cm_count = self.q_queue.pop(closed_level, 0)
        self.transform_stack.maps = self.transform_stack.maps[cm_count:]
        return self.transform_stack

    def add_q(self) -> None:
        """Add another level to q_queue, saving the current font and font size"""
        self.font_stack.append((self.font, self.font_size))
        self.q_depth.append(len(self.q_depth))

    def add_cm(self, *args: Any) -> TextStateManagerChainMapType:
        """Concatenate an additional transform matrix"""
        self.transform_stack = self.reset_tm()
        # count this cm against the current q nesting level so that
        # remove_q() knows how many transforms to drop
        self.q_queue.update(self.q_depth[-1:])
        self.transform_stack = self.transform_stack.new_child(self.new_transform(*args))
        return self.transform_stack

    def _complete_matrix(self, operands: List[float]) -> List[float]:
        """Adds a, b, c, and d to an "e/f only" operand set (e.g Td)"""
        if len(operands) == 2:  # this is a Td operator or equivalent
            operands = [1.0, 0.0, 0.0, 1.0, *operands]
        return operands

    def add_tm(self, operands: List[float]) -> TextStateManagerChainMapType:
        """Append a text transform matrix"""
        self.transform_stack = self.transform_stack.new_child(
            self.new_transform(  # type: ignore[misc]
                *self._complete_matrix(operands), is_text=True  # type: ignore[arg-type]
            )
        )
        return self.transform_stack

    def add_trm(self, operands: List[float]) -> TextStateManagerChainMapType:
        """Append a text rendering transform matrix"""
        self.transform_stack = self.transform_stack.new_child(
            self.new_transform(  # type: ignore[misc]
                *self._complete_matrix(operands), is_text=True, is_render=True  # type: ignore[arg-type]
            )
        )
        return self.transform_stack

    @property
    def effective_transform(self) -> List[float]:
        """
        Current effective transform accounting for cm, tm, and trm transforms.

        Returns:
            the a/b/c/d/e/f entries of the newest transform multiplied, in
            order, by every older transform on the stack

        """
        newest, *older = self.transform_stack.maps
        # Read only the matrix entries (keys 0-5) so the "is_text" /
        # "is_render" flags never end up in the returned matrix.
        eff_transform: List[float] = [float(newest[idx]) for idx in range(6)]
        for transform in older:
            eff_transform = mult(eff_transform, transform)  # type: ignore[arg-type]  # dict has int keys 0-5
        return eff_transform