Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_text_extraction/_layout_mode/_text_state_manager.py: 39%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

80 statements  

1"""manage the PDF transform stack during "layout" mode text extraction""" 

2 

3from collections import ChainMap, Counter 

4from collections import ChainMap as ChainMapType 

5from collections import Counter as CounterType 

6from collections.abc import MutableMapping 

7from typing import Any, Union 

8 

9from ..._font import Font 

10from ...errors import PdfReadError 

11from .. import mult 

12from ._text_state_params import TextStateParams 

13 

# Layered cm/tm/trm transform stack: integer keys 0-5 hold the a/b/c/d/e/f
# matrix values; the "is_text" and "is_render" string keys hold boolean flags.
TextStateManagerChainMapType = ChainMapType[Union[int, str], Union[float, bool]]

# A single transform layer using the same key/value layout as the stack above.
TextStateManagerDictType = MutableMapping[Union[int, str], Union[float, bool]]

16 

17 

class TextStateManager:
    """
    Tracks the current text state including cm/tm/trm transformation matrices.

    Attributes:
        transform_stack (ChainMap): ChainMap of cm/tm/trm transformation
            matrices; the most recently added transform is ``maps[0]``.
        q_queue (Counter[int]): number of cm operators concatenated at each
            q nesting level (keyed by the level recorded in ``q_depth``).
        q_depth (list[int]): list of q operator nesting levels
        Tc (float): character spacing
        Tw (float): word spacing
        Tz (float): horizontal scaling (initialised to 100.0)
        TL (float): leading
        Ts (float): text rise
        font_stack (list[tuple[Font | None, int | float]]): ``(font, font_size)``
            pairs saved by ``add_q`` and restored by ``remove_q``.
        font (Font | None): font object
        font_size (int | float): font size

    """

    def __init__(self) -> None:
        # The stack always starts with a single identity "cm" layer.
        self.transform_stack: TextStateManagerChainMapType = ChainMap(
            self.new_transform()
        )
        self.q_queue: CounterType[int] = Counter()
        self.q_depth = [0]
        self.Tc: float = 0.0
        self.Tw: float = 0.0
        self.Tz: float = 100.0
        self.TL: float = 0.0
        self.Ts: float = 0.0
        self.font_stack: list[tuple[Union[Font, None], Union[int, float]]] = []
        self.font: Union[Font, None] = None
        self.font_size: Union[int, float] = 0

    def set_state_param(self, op: bytes, value: Union[float, list[Any]]) -> None:
        """
        Set a text state parameter. Supports Tc, Tz, Tw, TL, and Ts operators.

        Args:
            op: operator read from PDF stream as bytes. No action is taken
                for unsupported operators (see supported operators above).
            value (float | List[Any]): new parameter value. If a list,
                value[0] is used.

        """
        if op not in (b"Tc", b"Tz", b"Tw", b"TL", b"Ts"):
            return
        # The decoded operator name is also the name of the attribute it sets.
        setattr(self, op.decode(), value[0] if isinstance(value, list) else value)

    def set_font(self, font: Font, size: float) -> None:
        """
        Set the current font and font_size.

        Args:
            font (Font): a layout mode Font
            size (float): font size

        """
        self.font = font
        self.font_size = size

    def text_state_params(self, value: Union[bytes, str] = "") -> TextStateParams:
        """
        Create a TextStateParams instance to display a text string. Type[bytes] values
        will be decoded implicitly.

        Args:
            value (str | bytes): text to associate with the captured state.

        Raises:
            PdfReadError: if font not set (no Tf operator in incoming pdf content stream)

        Returns:
            TextStateParams: current text state parameters

        """
        if not isinstance(self.font, Font):
            raise PdfReadError(
                "font not set: is PDF missing a Tf operator?"
            )  # pragma: no cover
        return TextStateParams(
            value,
            self.font,
            self.font_size,
            self.Tc,
            self.Tw,
            self.Tz,
            self.TL,
            self.Ts,
            self.effective_transform,
        )

    @staticmethod
    def raw_transform(
        _a: float = 1.0,
        _b: float = 0.0,
        _c: float = 0.0,
        _d: float = 1.0,
        _e: float = 0.0,
        _f: float = 0.0,
    ) -> TextStateManagerDictType:
        """Only a/b/c/d/e/f matrix params, coerced to float and keyed 0-5"""
        return dict(zip(range(6), map(float, (_a, _b, _c, _d, _e, _f))))

    @staticmethod
    def new_transform(
        _a: float = 1.0,
        _b: float = 0.0,
        _c: float = 0.0,
        _d: float = 1.0,
        _e: float = 0.0,
        _f: float = 0.0,
        is_text: bool = False,
        is_render: bool = False,
    ) -> TextStateManagerDictType:
        """Standard a/b/c/d/e/f matrix params + 'is_text' and 'is_render' keys"""
        result = TextStateManager.raw_transform(_a, _b, _c, _d, _e, _f)
        result.update({"is_text": is_text, "is_render": is_render})
        return result

    def reset_tm(self) -> TextStateManagerChainMapType:
        """Clear all transforms from chainmap having is_text==True or is_render==True"""
        # Drop layers from the top until the newest one is a plain cm layer.
        while (
            self.transform_stack.maps[0]["is_text"]
            or self.transform_stack.maps[0]["is_render"]
        ):
            self.transform_stack = self.transform_stack.parents
        return self.transform_stack

    def reset_trm(self) -> TextStateManagerChainMapType:
        """Clear all transforms from chainmap having is_render==True"""
        while self.transform_stack.maps[0]["is_render"]:
            self.transform_stack = self.transform_stack.parents
        return self.transform_stack

    def remove_q(self) -> TextStateManagerChainMapType:
        """
        Rewind to stack prior state after closing a 'q' with internal 'cm' ops

        Restores the font and font size saved by the matching ``add_q`` and
        drops the cm layers that were concatenated at that nesting level.
        A call with no saved state (no preceding ``add_q``) is ignored and
        leaves every attribute untouched.

        Returns:
            The (possibly shortened) transform stack.

        """
        if not self.font_stack:
            # Unbalanced 'Q': nothing was saved, so there is nothing to restore.
            # Ignore it rather than fail with "pop from empty list".
            return self.transform_stack
        self.font, self.font_size = self.font_stack.pop()
        self.transform_stack = self.reset_tm()
        # Number of cm layers pushed at the nesting level being closed.
        cm_count = self.q_queue.pop(self.q_depth.pop(), 0)
        self.transform_stack.maps = self.transform_stack.maps[cm_count:]
        return self.transform_stack

    def add_q(self) -> None:
        """Add another level to q_queue, saving the current font and font size"""
        self.font_stack.append((self.font, self.font_size))
        self.q_depth.append(len(self.q_depth))

    def add_cm(self, *args: Any) -> TextStateManagerChainMapType:
        """Concatenate an additional transform matrix"""
        self.transform_stack = self.reset_tm()
        # Count this cm against the current q nesting level so that remove_q
        # knows how many layers to discard.
        self.q_queue.update(self.q_depth[-1:])
        self.transform_stack = self.transform_stack.new_child(self.new_transform(*args))
        return self.transform_stack

    def _complete_matrix(self, operands: list[float]) -> list[float]:
        """Adds a, b, c, and d to an "e/f only" operand set (e.g Td)"""
        if len(operands) == 2:  # this is a Td operator or equivalent
            operands = [1.0, 0.0, 0.0, 1.0, *operands]
        return operands

    def add_tm(self, operands: list[float]) -> TextStateManagerChainMapType:
        """Append a text transform matrix"""
        self.transform_stack = self.transform_stack.new_child(
            self.new_transform(  # type: ignore[misc]
                *self._complete_matrix(operands), is_text=True  # type: ignore[arg-type]
            )
        )
        return self.transform_stack

    def add_trm(self, operands: list[float]) -> TextStateManagerChainMapType:
        """Append a text rendering transform matrix"""
        self.transform_stack = self.transform_stack.new_child(
            self.new_transform(  # type: ignore[misc]
                *self._complete_matrix(operands), is_text=True, is_render=True  # type: ignore[arg-type]
            )
        )
        return self.transform_stack

    @property
    def effective_transform(self) -> list[float]:
        """
        Current effective transform accounting for cm, tm, and trm transforms

        Returns:
            The six a/b/c/d/e/f values obtained by starting from the newest
            layer and multiplying through each older layer in turn.

        """
        newest, *older = self.transform_stack.maps
        # Take only keys 0-5 so the "is_text"/"is_render" flags never leak
        # into the matrix (they would when the stack holds a single layer).
        eff_transform = [newest[idx] for idx in range(6)]
        for transform in older:
            eff_transform = mult(eff_transform, transform)  # type: ignore[arg-type]  # dict has int keys 0-5
        return eff_transform