Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/generic/_appearance_stream.py: 11%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

185 statements  

1import re 

2from enum import IntEnum 

3from typing import Any, Optional, Union, cast 

4 

5from .._cmap import build_char_map_from_dict 

6from .._codecs.core_fontmetrics import CORE_FONT_METRICS 

7from .._font import FontDescriptor 

8from .._utils import logger_warning 

9from ..constants import AnnotationDictionaryAttributes, FieldDictionaryAttributes 

10from ..generic import ( 

11 DecodedStreamObject, 

12 DictionaryObject, 

13 NameObject, 

14 NumberObject, 

15 RectangleObject, 

16) 

17from ..generic._base import ByteStringObject, TextStringObject, is_null_or_none 

18 

# Font size (in points) that multiline fields start from when the requested
# font size is 0 ("auto"); the text may still be scaled down from this value.
DEFAULT_FONT_SIZE_IN_MULTILINE = 12

20 

21 

class TextAlignment(IntEnum):
    """
    Horizontal alignment of the text drawn in a form field's appearance stream.

    The members are plain integers, so the value read from a field's ``/Q``
    entry can be compared against them directly.
    """

    # Text starts at the left edge of the field (used when no alignment is given).
    LEFT = 0
    # Text is centred between the left and right edges of the field.
    CENTER = 1
    # Text ends at the right edge of the field.
    RIGHT = 2

28 

29 

30class TextStreamAppearance(DecodedStreamObject): 

31 """ 

32 A class representing the appearance stream for a text-based form field. 

33 

34 This class generates the content stream (the `ap_stream_data`) that dictates 

35 how text is rendered within a form field's bounding box. It handles properties 

36 like font, font size, color, multiline text, and text selection highlighting. 

37 """ 

38 

39 def _scale_text( 

40 self, 

41 font_descriptor: FontDescriptor, 

42 font_size: float, 

43 field_width: float, 

44 field_height: float, 

45 text: str, 

46 is_multiline: bool, 

47 min_font_size: float = 4.0, # Minimum font size to attempt 

48 font_size_step: float = 0.2 # How much to decrease font size by each step 

49 ) -> tuple[list[tuple[float, str]], float]: 

50 """ 

51 Takes a piece of text and scales it to field_width or field_height, given font_name 

52 and font_size. For multiline fields, adds newlines to wrap the text. 

53 

54 Args: 

55 font_descriptor: A FontDescriptor for the font to be used. 

56 font_size: The font size in points. 

57 field_width: The width of the field in which to fit the text. 

58 field_height: The height of the field in which to fit the text. 

59 text: The text to fit with the field. 

60 is_multiline: Whether to scale and wrap the text, or only to scale. 

61 min_font_size: The minimum font size at which to scale the text. 

62 font_size_step: The amount by which to decrement font size per step while scaling. 

63 

64 Returns: 

65 The text in the form of list of tuples, each tuple containing the length of a line 

66 and its contents, and the font_size for these lines and lengths. 

67 """ 

68 # Single line: 

69 if not is_multiline: 

70 test_width = font_descriptor.text_width(text) * font_size / 1000 

71 if test_width > field_width or font_size > field_height: 

72 new_font_size = font_size - font_size_step 

73 if new_font_size >= min_font_size: 

74 # Text overflows height; Retry with smaller font size. 

75 return self._scale_text( 

76 font_descriptor, 

77 round(new_font_size, 1), 

78 field_width, 

79 field_height, 

80 text, 

81 is_multiline, 

82 min_font_size, 

83 font_size_step 

84 ) 

85 return [(test_width, text)], font_size 

86 # Multiline: 

87 orig_text = text 

88 paragraphs = text.replace("\n", "\r").split("\r") 

89 wrapped_lines = [] 

90 current_line_words: list[str] = [] 

91 current_line_width: float = 0 

92 space_width = font_descriptor.text_width(" ") * font_size / 1000 

93 for paragraph in paragraphs: 

94 if not paragraph.strip(): 

95 wrapped_lines.append((0.0, "")) 

96 continue 

97 words = paragraph.split(" ") 

98 for i, word in enumerate(words): 

99 word_width = font_descriptor.text_width(word) * font_size / 1000 

100 test_width = current_line_width + word_width + (space_width if i else 0) 

101 if test_width > field_width and current_line_words: 

102 wrapped_lines.append((current_line_width, " ".join(current_line_words))) 

103 current_line_words = [word] 

104 current_line_width = word_width 

105 elif not current_line_words and word_width > field_width: 

106 wrapped_lines.append((word_width, word)) 

107 current_line_words = [] 

108 current_line_width = 0 

109 else: 

110 if current_line_words: 

111 current_line_width += space_width 

112 current_line_words.append(word) 

113 current_line_width += word_width 

114 if current_line_words: 

115 wrapped_lines.append((current_line_width, " ".join(current_line_words))) 

116 current_line_words = [] 

117 current_line_width = 0 

118 # Estimate total height. 

119 # Assumes line spacing of 1.4 

120 estimated_total_height = font_size + (len(wrapped_lines) - 1) * 1.4 * font_size 

121 if estimated_total_height > field_height: 

122 # Text overflows height; Retry with smaller font size. 

123 new_font_size = font_size - font_size_step 

124 if new_font_size >= min_font_size: 

125 return self._scale_text( 

126 font_descriptor, 

127 round(new_font_size, 1), 

128 field_width, 

129 field_height, 

130 orig_text, 

131 is_multiline, 

132 min_font_size, 

133 font_size_step 

134 ) 

135 return wrapped_lines, font_size 

136 

137 def _generate_appearance_stream_data( 

138 self, 

139 text: str = "", 

140 selection: Optional[list[str]] = None, 

141 rectangle: Union[RectangleObject, tuple[float, float, float, float]] = (0.0, 0.0, 0.0, 0.0), 

142 font_descriptor: Optional[FontDescriptor] = None, 

143 font_glyph_byte_map: Optional[dict[str, bytes]] = None, 

144 font_name: str = "/Helv", 

145 font_size: float = 0.0, 

146 font_color: str = "0 g", 

147 is_multiline: bool = False, 

148 alignment: TextAlignment = TextAlignment.LEFT, 

149 is_comb: bool = False, 

150 max_length: Optional[int] = None 

151 ) -> bytes: 

152 """ 

153 Generates the raw bytes of the PDF appearance stream for a text field. 

154 

155 This private method assembles the PDF content stream operators to draw 

156 the provided text within the specified rectangle. It handles text positioning, 

157 font application, color, and special formatting like selected text. 

158 

159 Args: 

160 text: The text to be rendered in the form field. 

161 selection: An optional list of strings that should be highlighted as selected. 

162 font_glyph_byte_map: An optional dictionary mapping characters to their 

163 byte representation for glyph encoding. 

164 rect: The bounding box of the form field. Can be a `RectangleObject` 

165 or a tuple of four floats (x1, y1, x2, y2). 

166 font_name: The name of the font resource to use (e.g., "/Helv"). 

167 font_size: The font size. If 0, it is automatically calculated 

168 based on whether the field is multiline or not. 

169 font_color: The color to apply to the font, represented as a PDF 

170 graphics state string (e.g., "0 g" for black). 

171 is_multiline: A boolean indicating if the text field is multiline. 

172 alignment: Text alignment, can be TextAlignment.LEFT, .RIGHT, or .CENTER. 

173 is_comb: Boolean that designates fixed-length fields, where every character 

174 fills one "cell", such as in a postcode. 

175 max_length: Used if is_comb is set. The maximum number of characters for a fixed- 

176 length field. 

177 

178 Returns: 

179 A byte string containing the PDF content stream data. 

180 

181 """ 

182 font_glyph_byte_map = font_glyph_byte_map or {} 

183 if isinstance(rectangle, tuple): 

184 rectangle = RectangleObject(rectangle) 

185 font_descriptor = cast(FontDescriptor, font_descriptor) 

186 

187 # If font_size is 0, apply the logic for multiline or large-as-possible font 

188 if font_size == 0: 

189 if selection: # Don't wrap text when dealing with a /Ch field, in order to prevent problems 

190 is_multiline = False # with matching "selection" with "line" later on. 

191 if is_multiline: 

192 font_size = DEFAULT_FONT_SIZE_IN_MULTILINE 

193 else: 

194 font_size = rectangle.height - 2 

195 lines, font_size = self._scale_text( 

196 font_descriptor, 

197 font_size, 

198 rectangle.width - 3, # One point margin left and right, and an additional point because the first 

199 # offset takes one extra point (see below, "desired_abs_x_start") 

200 rectangle.height - 3, # One point margin for top and bottom, one point extra for the first line 

201 # (see y_offset) 

202 text, 

203 is_multiline, 

204 ) 

205 elif is_comb: 

206 if max_length and len(text) > max_length: 

207 logger_warning ( 

208 f"Length of text {text} exceeds maximum length ({max_length}) of field, input truncated.", 

209 __name__ 

210 ) 

211 # We act as if each character is one line, because we draw it separately later on 

212 lines = [( 

213 font_descriptor.text_width(char) * font_size / 1000, 

214 char 

215 ) for index, char in enumerate(text) if index < (max_length or len(text))] 

216 else: 

217 lines = [( 

218 font_descriptor.text_width(line) * font_size / 1000, 

219 line 

220 ) for line in text.replace("\n", "\r").split("\r")] 

221 

222 # Set the vertical offset 

223 y_offset = rectangle.height - 1 - font_size 

224 default_appearance = f"{font_name} {font_size} Tf {font_color}" 

225 

226 ap_stream = ( 

227 f"q\n/Tx BMC \nq\n1 1 {rectangle.width - 1} {rectangle.height - 1} " 

228 f"re\nW\nBT\n{default_appearance}\n" 

229 ).encode() 

230 current_x_pos: float = 0 # Initial virtual position within the text object. 

231 

232 for line_number, (line_width, line) in enumerate(lines): 

233 if selection and line in selection: 

234 # Might be improved, but cannot find how to get fill working => replaced with lined box 

235 ap_stream += ( 

236 f"1 {y_offset - (line_number * font_size * 1.4) - 1} {rectangle.width - 2} {font_size + 2} re\n" 

237 f"0.5 0.5 0.5 rg s\n{default_appearance}\n" 

238 ).encode() 

239 

240 # Calculate the desired absolute starting X for the current line 

241 desired_abs_x_start: float = 0 

242 if is_comb and max_length: 

243 # Calculate the width of a cell for one character 

244 cell_width = rectangle.width / max_length 

245 # Space from the left edge of the cell to the character's baseline start 

246 # line_width here is the *actual* character width in points for the single character 'line' 

247 centering_offset_in_cell = (cell_width - line_width) / 2 

248 # Absolute start X = (Cell Index, i.e., line_number * Cell Width) + Centering Offset 

249 desired_abs_x_start = (line_number * cell_width) + centering_offset_in_cell 

250 elif alignment == TextAlignment.RIGHT: 

251 desired_abs_x_start = rectangle.width - 2 - line_width 

252 elif alignment == TextAlignment.CENTER: 

253 desired_abs_x_start = (rectangle.width - line_width) / 2 

254 else: # Left aligned; default 

255 desired_abs_x_start = 2 

256 # Calculate x_rel_offset: how much to move from the current_x_pos 

257 # to reach the desired_abs_x_start. 

258 x_rel_offset = desired_abs_x_start - current_x_pos 

259 

260 # Y-offset: 

261 y_rel_offset: float = 0 

262 if line_number == 0: 

263 y_rel_offset = y_offset # Initial vertical position 

264 elif is_comb: 

265 y_rel_offset = 0.0 # DO NOT move vertically for subsequent characters 

266 else: 

267 y_rel_offset = - font_size * 1.4 # Move down by line height 

268 

269 # Td is a relative translation (Tx and Ty). 

270 # It updates the current text position. 

271 ap_stream += f"{x_rel_offset} {y_rel_offset} Td\n".encode() 

272 # Update current_x_pos based on the Td operation for the next iteration. 

273 # This is the X position where the *current line* will start. 

274 current_x_pos = desired_abs_x_start 

275 

276 encoded_line: list[bytes] = [ 

277 font_glyph_byte_map.get(c, c.encode("utf-16-be")) for c in line 

278 ] 

279 if any(len(c) >= 2 for c in encoded_line): 

280 ap_stream += b"<" + (b"".join(encoded_line)).hex().encode() + b"> Tj\n" 

281 else: 

282 ap_stream += b"(" + b"".join(encoded_line) + b") Tj\n" 

283 ap_stream += b"ET\nQ\nEMC\nQ\n" 

284 return ap_stream 

285 

286 def __init__( 

287 self, 

288 text: str = "", 

289 selection: Optional[list[str]] = None, 

290 rectangle: Union[RectangleObject, tuple[float, float, float, float]] = (0.0, 0.0, 0.0, 0.0), 

291 font_resource: Optional[DictionaryObject] = None, 

292 font_name: str = "/Helv", 

293 font_size: float = 0.0, 

294 font_color: str = "0 g", 

295 is_multiline: bool = False, 

296 alignment: TextAlignment = TextAlignment.LEFT, 

297 is_comb: bool = False, 

298 max_length: Optional[int] = None 

299 ) -> None: 

300 """ 

301 Initializes a TextStreamAppearance object. 

302 

303 This constructor creates a new PDF stream object configured as an XObject 

304 of subtype Form. It uses the `_appearance_stream_data` method to generate 

305 the content for the stream. 

306 

307 Args: 

308 text: The text to be rendered in the form field. 

309 selection: An optional list of strings that should be highlighted as selected. 

310 rect: The bounding box of the form field. Can be a `RectangleObject` 

311 or a tuple of four floats (x1, y1, x2, y2). 

312 font_resource: An optional variable that represents a PDF font dictionary. 

313 font_name: The name of the font resource, e.g., "/Helv". 

314 font_size: The font size. If 0, it's auto-calculated. 

315 font_color: The font color string. 

316 is_multiline: A boolean indicating if the text field is multiline. 

317 alignment: Text alignment, can be TextAlignment.LEFT, .RIGHT, or .CENTER. 

318 is_comb: Boolean that designates fixed-length fields, where every character 

319 fills one "cell", such as in a postcode. 

320 max_length: Used if is_comb is set. The maximum number of characters for a fixed- 

321 length field. 

322 

323 """ 

324 super().__init__() 

325 

326 # If a font resource was added, get the font character map 

327 if font_resource: 

328 font_resource = cast(DictionaryObject, font_resource.get_object()) 

329 font_descriptor = FontDescriptor.from_font_resource(font_resource) 

330 else: 

331 logger_warning(f"Font dictionary for {font_name} not found; defaulting to Helvetica.", __name__) 

332 font_name = "/Helv" 

333 font_resource = DictionaryObject({ 

334 NameObject("/Subtype"): NameObject("/Type1"), 

335 NameObject("/Name"): NameObject("/Helv"), 

336 NameObject("/Type"): NameObject("/Font"), 

337 NameObject("/BaseFont"): NameObject("/Helvetica"), 

338 NameObject("/Encoding"): NameObject("/WinAnsiEncoding") 

339 }) 

340 font_descriptor = CORE_FONT_METRICS["Helvetica"] 

341 

342 # Get the font glyph data 

343 _font_subtype, _, font_encoding, font_map = build_char_map_from_dict( 

344 200, font_resource 

345 ) 

346 try: # remove width stored in -1 key 

347 del font_map[-1] 

348 except KeyError: 

349 pass 

350 font_glyph_byte_map: dict[str, bytes] 

351 if isinstance(font_encoding, str): 

352 font_glyph_byte_map = { 

353 v: k.encode(font_encoding) for k, v in font_map.items() 

354 } 

355 else: 

356 font_glyph_byte_map = {v: bytes((k,)) for k, v in font_encoding.items()} 

357 font_encoding_rev = {v: bytes((k,)) for k, v in font_encoding.items()} 

358 for key, value in font_map.items(): 

359 font_glyph_byte_map[value] = font_encoding_rev.get(key, key) 

360 

361 ap_stream_data = self._generate_appearance_stream_data( 

362 text, 

363 selection, 

364 rectangle, 

365 font_descriptor, 

366 font_glyph_byte_map, 

367 font_name=font_name, 

368 font_size=font_size, 

369 font_color=font_color, 

370 is_multiline=is_multiline, 

371 alignment=alignment, 

372 is_comb=is_comb, 

373 max_length=max_length 

374 ) 

375 

376 self[NameObject("/Type")] = NameObject("/XObject") 

377 self[NameObject("/Subtype")] = NameObject("/Form") 

378 self[NameObject("/BBox")] = RectangleObject(rectangle) 

379 self.set_data(ByteStringObject(ap_stream_data)) 

380 self[NameObject("/Length")] = NumberObject(len(ap_stream_data)) 

381 # Update Resources with font information 

382 self[NameObject("/Resources")] = DictionaryObject({ 

383 NameObject("/Font"): DictionaryObject({ 

384 NameObject(font_name): getattr(font_resource, "indirect_reference", font_resource) 

385 }) 

386 }) 

387 

388 @classmethod 

389 def from_text_annotation( 

390 cls, 

391 acro_form: DictionaryObject, # _root_object[CatalogDictionary.ACRO_FORM]) 

392 field: DictionaryObject, 

393 annotation: DictionaryObject, 

394 user_font_name: str = "", 

395 user_font_size: float = -1, 

396 ) -> "TextStreamAppearance": 

397 """ 

398 Creates a TextStreamAppearance object from a text field annotation. 

399 

400 This class method is a factory for creating a `TextStreamAppearance` 

401 instance by extracting all necessary information (bounding box, font, 

402 text content, etc.) from the PDF field and annotation dictionaries. 

403 It respects inheritance for properties like default appearance (`/DA`). 

404 

405 Args: 

406 acro_form: The root AcroForm dictionary from the PDF catalog. 

407 field: The field dictionary object. 

408 annotation: The widget annotation dictionary object associated with the field. 

409 user_font_name: An optional user-provided font name to override the 

410 default. Defaults to an empty string. 

411 user_font_size: An optional user-provided font size to override the 

412 default. A value of -1 indicates no override. 

413 

414 Returns: 

415 A new `TextStreamAppearance` instance configured for the given field. 

416 

417 """ 

418 # Calculate rectangle dimensions 

419 _rectangle = cast(RectangleObject, annotation[AnnotationDictionaryAttributes.Rect]) 

420 rectangle = RectangleObject((0, 0, abs(_rectangle[2] - _rectangle[0]), abs(_rectangle[3] - _rectangle[1]))) 

421 

422 # Get default appearance dictionary from annotation 

423 default_appearance = annotation.get_inherited( 

424 AnnotationDictionaryAttributes.DA, 

425 acro_form.get(AnnotationDictionaryAttributes.DA, None), 

426 ) 

427 if not default_appearance: 

428 # Create a default appearance if none was found in the annotation 

429 default_appearance = TextStringObject("/Helv 0 Tf 0 g") 

430 else: 

431 default_appearance = default_appearance.get_object() 

432 

433 # Derive font name, size and color from the default appearance. Also set 

434 # user-provided font name and font size in the default appearance, if given. 

435 # For a font name, this presumes that we can find an associated font resource 

436 # dictionary. Uses the variable font_properties as an intermediate. 

437 # As per the PDF spec: 

438 # "At a minimum, the string [that is, default_appearance] shall include a Tf (text 

439 # font) operator along with its two operands, font and size" (Section 12.7.4.3 

440 # "Variable text" of the PDF 2.0 specification). 

441 font_properties = [prop for prop in re.split(r"\s", default_appearance) if prop] 

442 font_name = font_properties.pop(font_properties.index("Tf") - 2) 

443 font_size = float(font_properties.pop(font_properties.index("Tf") - 1)) 

444 font_properties.remove("Tf") 

445 font_color = " ".join(font_properties) 

446 # Determine the font name to use, prioritizing the user's input 

447 if user_font_name: 

448 font_name = user_font_name 

449 # Determine the font size to use, prioritizing the user's input 

450 if user_font_size > 0: 

451 font_size = user_font_size 

452 

453 # Try to find a resource dictionary for the font 

454 document_resources: Any = cast( 

455 DictionaryObject, 

456 cast( 

457 DictionaryObject, 

458 annotation.get_inherited( 

459 "/DR", 

460 acro_form.get("/DR", DictionaryObject()), 

461 ), 

462 ).get_object(), 

463 ) 

464 document_font_resources = document_resources.get("/Font", DictionaryObject()).get_object() 

465 # CORE_FONT_METRICS is the dict with Standard font metrics 

466 if font_name not in document_font_resources and font_name.removeprefix("/") not in CORE_FONT_METRICS: 

467 # ...or AcroForm dictionary 

468 document_resources = cast( 

469 dict[Any, Any], 

470 acro_form.get("/DR", {}), 

471 ) 

472 document_font_resources = document_resources.get_object().get("/Font", DictionaryObject()).get_object() 

473 font_resource = document_font_resources.get(font_name, None) 

474 if not is_null_or_none(font_resource): 

475 font_resource = cast(DictionaryObject, font_resource.get_object()) 

476 

477 # Retrieve field text and selected values 

478 field_flags = field.get(FieldDictionaryAttributes.Ff, 0) 

479 if ( 

480 field.get(FieldDictionaryAttributes.FT, "/Tx") == "/Ch" and 

481 field_flags & FieldDictionaryAttributes.FfBits.Combo == 0 

482 ): 

483 text = "\n".join(annotation.get_inherited(FieldDictionaryAttributes.Opt, [])) 

484 selection = field.get("/V", []) 

485 if not isinstance(selection, list): 

486 selection = [selection] 

487 else: # /Tx 

488 text = field.get("/V", "") 

489 selection = [] 

490 

491 # Escape parentheses (PDF 1.7 reference, table 3.2, Literal Strings) 

492 text = text.replace("\\", "\\\\").replace("(", r"\(").replace(")", r"\)") 

493 

494 # Retrieve formatting information 

495 is_comb = False 

496 max_length = None 

497 if field_flags & FieldDictionaryAttributes.FfBits.Comb: 

498 is_comb = True 

499 max_length = annotation.get("/MaxLen") 

500 is_multiline = False 

501 if field_flags & FieldDictionaryAttributes.FfBits.Multiline: 

502 is_multiline = True 

503 alignment = field.get("/Q", TextAlignment.LEFT) 

504 

505 # Create the TextStreamAppearance instance 

506 new_appearance_stream = cls( 

507 text, 

508 selection, 

509 rectangle, 

510 font_resource, 

511 font_name=font_name, 

512 font_size=font_size, 

513 font_color=font_color, 

514 is_multiline=is_multiline, 

515 alignment=alignment, 

516 is_comb=is_comb, 

517 max_length=max_length 

518 ) 

519 if AnnotationDictionaryAttributes.AP in annotation: 

520 for key, value in ( 

521 cast(DictionaryObject, annotation[AnnotationDictionaryAttributes.AP]).get("/N", {}).items() 

522 ): 

523 if key not in {"/BBox", "/Length", "/Subtype", "/Type", "/Filter"}: 

524 new_appearance_stream[key] = value 

525 

526 return new_appearance_stream