Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_text_extraction/_layout_mode/_text_state_params.py: 60%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

53 statements  

1"""A dataclass that captures the CTM and Text State for a tj operation""" 

2 

3import math 

4from dataclasses import dataclass, field 

5from typing import Any, Dict, List, Union 

6 

7from .. import mult, orient 

8from ._font import Font 

9 

10 

@dataclass
class TextStateParams:
    """
    Text state parameters and operator values captured for one text value
    of a TJ or Tj PDF operation.

    The first group of attributes is supplied by the caller; the second
    group (``tx`` onwards) is derived in ``__post_init__`` and cannot be
    passed to the constructor.

    Attributes:
        txt (str): the text to be rendered.
        font (Font): font object
        font_size (int | float): font size
        Tc (float): character spacing. Defaults to 0.0.
        Tw (float): word spacing. Defaults to 0.0.
        Tz (float): horizontal scaling. Defaults to 100.0.
        TL (float): leading, vertical displacement between text lines.
            Defaults to 0.0.
        Ts (float): text rise. Used for super/subscripts. Defaults to 0.0.
        transform (List[float]): effective transformation matrix.
        tx (float): x coord of rendered text, i.e. self.transform[4]
        ty (float): y coord of rendered text. May differ from
            self.transform[5] per self.Ts.
        displaced_tx (float): x coord immediately following rendered text
        space_tx (float): tx for a space character
        font_height (float): effective font height accounting for CTM
        flip_vertical (bool): True if y axis has been inverted (i.e. if
            self.transform[3] < 0.)
        rotated (bool): True if the text orientation is rotated with
            respect to the page.

    """

    # --- values supplied by the caller ---
    txt: str
    font: Font
    font_size: Union[int, float]
    Tc: float = 0.0
    Tw: float = 0.0
    Tz: float = 100.0
    TL: float = 0.0
    Ts: float = 0.0
    transform: List[float] = field(
        default_factory=lambda: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
    )

    # --- values derived in __post_init__ (excluded from __init__) ---
    tx: float = field(default=0.0, init=False)
    ty: float = field(default=0.0, init=False)
    displaced_tx: float = field(default=0.0, init=False)
    space_tx: float = field(default=0.0, init=False)
    font_height: float = field(default=0.0, init=False)
    flip_vertical: bool = field(default=False, init=False)
    rotated: bool = field(default=False, init=False)

    def __post_init__(self) -> None:
        """Adjust rotated transforms, then fill in every derived attribute."""
        orientation = orient(self.transform)
        if orientation in (90, 270):
            # Pre-multiply by a matrix built from the negated [1] and [2]
            # components of the current transform and flag the rotation.
            b, c = self.transform[1], self.transform[2]
            self.transform = mult([1.0, -b, -c, 1.0, 0.0, 0.0], self.transform)
            self.rotated = True

        # Orientation is re-evaluated because the transform may have just
        # been replaced. Both self.transform[0] and self.transform[3] being
        # negative indicates a true rotation; if only self.transform[3] is
        # negative, the y coords are simply inverted.
        if orient(self.transform) == 180:
            if self.transform[0] < -1e-6:
                half_turn = [-1.0, 0.0, 0.0, -1.0, 0.0, 0.0]
                self.transform = mult(half_turn, self.transform)
                self.rotated = True

        self.tx = self.transform[4]
        self.displaced_tx = self.displaced_transform()[4]
        self.ty = self.render_transform()[5]

        space_advance = round(self.word_tx(" "), 3)
        if space_advance < 1e-6:
            # The " " char has been assigned 0 width (e.g. for fine tuned
            # spacing with TJ int operators a la crazyones.pdf). Fall back
            # to a TD_offset of -2 * font.space_width, where
            # font.space_width is the space_width calculated in _cmap.py.
            fallback_offset = self.font.space_width * -2
            space_advance = round(self.word_tx("", fallback_offset), 3)
        self.space_tx = space_advance

        b, d = self.transform[1], self.transform[3]
        self.font_height = self.font_size * math.sqrt(b**2 + d**2)
        # flip_vertical handles PDFs generated by Microsoft Word's "publish"
        # command, where the y axis is inverted.
        self.flip_vertical = d < -1e-6

    def font_size_matrix(self) -> List[float]:
        """Return the font size matrix built from font_size, Tz and Ts."""
        horizontal_size = self.font_size * (self.Tz / 100.0)
        return [horizontal_size, 0.0, 0.0, self.font_size, 0.0, self.Ts]

    def displaced_transform(self) -> List[float]:
        """Return the effective transform matrix after text has been rendered."""
        displacement = self.displacement_matrix()
        return mult(displacement, self.transform)

    def render_transform(self) -> List[float]:
        """Return the effective transform accounting for font size, Tz, and Ts."""
        size_matrix = self.font_size_matrix()
        return mult(size_matrix, self.transform)

    def displacement_matrix(
        self, word: Union[str, None] = None, TD_offset: float = 0.0
    ) -> List[float]:
        """
        Return the text displacement matrix.

        Args:
            word (str, optional): text whose displacement is wanted. Defaults
                to None, in which case the displacement of self.txt is
                returned.
            TD_offset (float, optional): translation applied by TD operator.
                Defaults to 0.0.

        Returns:
            A translation matrix whose x offset is ``self.word_tx(word,
            TD_offset)`` and whose y offset is 0.0.

        """
        if word is None:
            word = self.txt
        horizontal_shift = self.word_tx(word, TD_offset)
        return [1.0, 0.0, 0.0, 1.0, horizontal_shift, 0.0]

    def word_tx(self, word: str, TD_offset: float = 0.0) -> float:
        """Return the horizontal displacement of a word under this text state."""
        glyph_advance = self.font_size * (
            (self.font.word_width(word) - TD_offset) / 1000.0
        )
        # Tc is added once and Tw once per space before Tz scaling is applied.
        unscaled = glyph_advance + self.Tc + word.count(" ") * self.Tw
        return unscaled * (self.Tz / 100.0)

    @staticmethod
    def to_dict(inst: "TextStateParams") -> Dict[str, Any]:
        """Convert an instance to a dict for json.dumps, omitting ``font``."""
        serializable: Dict[str, Any] = {}
        for name in inst.__dataclass_fields__:
            if name == "font":
                continue
            serializable[name] = getattr(inst, name)
        return serializable