Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_text_extraction/_layout_mode/_text_state_params.py: 48%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

71 statements  

1"""A dataclass that captures the CTM and Text State for a tj operation""" 

2 

3import math 

4from dataclasses import dataclass, field 

5from typing import Any, Union 

6 

7from ..._font import Font 

8from .. import mult, orient 

9 

10 

@dataclass
class TextStateParams:
    """
    Text state parameters and operator values for a single text value in a
    TJ or Tj PDF operation.

    Attributes:
        value (bytes | str): the raw text to be rendered.
        font (Font): font object
        font_size (int | float): font size
        Tc (float): character spacing. Defaults to 0.0.
        Tw (float): word spacing. Defaults to 0.0.
        Tz (float): horizontal scaling. Defaults to 100.0.
        TL (float): leading, vertical displacement between text lines. Defaults to 0.0.
        Ts (float): text rise. Used for super/subscripts. Defaults to 0.0.
        transform (list[float]): effective transformation matrix.
        tx (float): x coord of rendered text, i.e. self.transform[4]
        ty (float): y coord of rendered text. May differ from self.transform[5] per self.Ts.
        displaced_tx (float): x coord immediately following rendered text
        space_tx (float): tx for a space character
        font_height (float): effective font height accounting for CTM
        flip_vertical (bool): True if y axis has been inverted (i.e. if self.transform[3] < 0.)
        rotated (bool): True if the text orientation is rotated with respect to the page.
        text (str): ``value`` as a str. Byte values are decoded with the font
            encoding and then mapped through ``font.character_map``. Always
            recomputed by ``__post_init__``.

    """

    value: Union[bytes, str]
    font: Font
    font_size: Union[int, float]
    Tc: float = 0.0
    Tw: float = 0.0
    Tz: float = 100.0
    TL: float = 0.0
    Ts: float = 0.0
    transform: list[float] = field(
        default_factory=lambda: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
    )
    tx: float = field(default=0.0, init=False)
    ty: float = field(default=0.0, init=False)
    displaced_tx: float = field(default=0.0, init=False)
    space_tx: float = field(default=0.0, init=False)
    font_height: float = field(default=0.0, init=False)
    flip_vertical: bool = field(default=False, init=False)
    rotated: bool = field(default=False, init=False)
    text: str = ""
    # ``value`` decoded with the font encoding, *before* font.character_map is
    # applied. Only assigned by __post_init__ when ``value`` is bytes.
    _decoded_value: str = ""

    def __post_init__(self) -> None:
        """Decode ``value`` and derive the positional fields from ``transform``."""
        if isinstance(self.value, bytes):
            self._decoded_value = self._decode(self.value)
            self.text = "".join(
                self.font.character_map.get(x, x) for x in self._decoded_value
            )
        else:
            self.text = self.value

        if orient(self.transform) in (90, 270):
            # NOTE(review): presumably counter-rotates the matrix so that the
            # remaining calculations can treat the text as horizontal --
            # confirm against the orient()/mult() helpers.
            self.transform = mult(
                [1.0, -self.transform[1], -self.transform[2], 1.0, 0.0, 0.0],
                self.transform,
            )
            self.rotated = True
        # self.transform[0] AND self.transform[3] < 0 indicates true rotation.
        # If only self.transform[3] < 0, the y coords are simply inverted.
        if orient(self.transform) == 180 and self.transform[0] < -1e-6:
            self.transform = mult([-1.0, 0.0, 0.0, -1.0, 0.0, 0.0], self.transform)
            self.rotated = True
        self.displaced_tx = self.displaced_transform()[4]
        self.tx = self.transform[4]
        self.ty = self.render_transform()[5]
        self.space_tx = round(self.word_tx(self.font.space_char), 3)
        if self.space_tx < 1e-6:
            # if the " " char is assigned 0 width (e.g. for fine tuned spacing
            # with TJ int operators a la crazyones.pdf), calculate space_tx as
            # a td_offset of -1 * font.space_width where font.space_width is
            # the space_width calculated in _font.py.
            self.space_tx = round(self.word_tx("", -self.font.space_width), 3)
        self.font_height = self.font_size * math.sqrt(
            self.transform[1] ** 2 + self.transform[3] ** 2
        )
        # flip_vertical handles PDFs generated by Microsoft Word's "publish" command.
        self.flip_vertical = self.transform[3] < -1e-6  # inverts y axis

    def _decode(self, raw: bytes) -> str:
        """
        Decode raw bytes to str using the font encoding.

        ``font.encoding`` is used as a codec name when it is a str; otherwise
        it is indexed by byte value, and byte values it does not contain are
        decoded individually with the default codec.

        Args:
            raw (bytes): byte string to decode.

        Returns:
            str: the decoded text, before font.character_map is applied.

        """
        encoding = self.font.encoding
        try:
            if isinstance(encoding, str):
                return raw.decode(encoding, "surrogatepass")
            return "".join(
                encoding[code] if code in encoding else bytes((code,)).decode()
                for code in raw
            )
        except UnicodeDecodeError:
            # Best effort: never fail on undecodable bytes, substitute
            # replacement characters instead.
            return raw.decode("utf-8", "replace")

    def font_size_matrix(self) -> list[float]:
        """Font size matrix: scales by font_size (x also by Tz / 100) and rises by Ts."""
        return [
            self.font_size * (self.Tz / 100.0),
            0.0,
            0.0,
            self.font_size,
            0.0,
            self.Ts,
        ]

    def displaced_transform(self) -> list[float]:
        """Effective transform matrix after text has been rendered."""
        return mult(self.displacement_matrix(), self.transform)

    def render_transform(self) -> list[float]:
        """Effective transform matrix accounting for font size, Tz, and Ts."""
        return mult(self.font_size_matrix(), self.transform)

    def displacement_matrix(
        self, word: Union[bytes, str, None] = None, td_offset: float = 0.0
    ) -> list[float]:
        """
        Text displacement matrix

        Args:
            word (bytes | str, optional): Defaults to None in which case the
                displacement of self.value is returned.
            td_offset (float, optional): translation applied by TD operator. Defaults to 0.0.

        Returns:
            list[float]: a pure x translation by ``word_tx(word, td_offset)``.

        """
        word = word if word is not None else self.value
        return [1.0, 0.0, 0.0, 1.0, self.word_tx(word, td_offset), 0.0]

    def word_tx(self, word: Union[bytes, str], td_offset: float = 0.0) -> float:
        """
        Horizontal text displacement for any word according this text state

        Args:
            word (bytes | str): the word to measure. Bytes are decoded with
                the font encoding before being measured.
            td_offset (float, optional): subtracted from the summed glyph
                widths before scaling. Defaults to 0.0.

        Returns:
            float: ``(font_size * (width - td_offset) / 1000 + Tc
            + n_spaces * Tw) * Tz / 100``

        """
        width: float = 0.0

        if isinstance(word, bytes):
            if isinstance(self.value, bytes) and word == self.value:
                # Our own raw value was already decoded by __post_init__.
                word = self._decoded_value
            else:
                # Measure the word that was actually passed in rather than
                # silently substituting the decoding of self.value.
                word = self._decode(word)

        for char in word:
            if char == self.font.space_char:
                width += self.font.space_width
            else:
                width += self.font.get_text_width(char)

        return (
            (self.font_size * ((width - td_offset) / 1000.0))
            + self.Tc
            + word.count(self.font.space_char) * self.Tw
        ) * (self.Tz / 100.0)

    @staticmethod
    def to_dict(inst: "TextStateParams") -> dict[str, Any]:
        """Dataclass to dict for json.dumps serialization (the font is omitted)"""
        return {k: getattr(inst, k) for k in inst.__dataclass_fields__ if k != "font"}