Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/generic/_appearance_stream.py: 14%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

207 statements  

1import re 

2from dataclasses import dataclass 

3from enum import IntEnum 

4from typing import Any, Optional, Union, cast 

5 

6from .._cmap import build_char_map_from_dict 

7from .._codecs.core_fontmetrics import CORE_FONT_METRICS 

8from .._font import FontDescriptor 

9from .._utils import logger_warning 

10from ..constants import AnnotationDictionaryAttributes, BorderStyles, FieldDictionaryAttributes 

11from ..generic import ( 

12 DecodedStreamObject, 

13 DictionaryObject, 

14 NameObject, 

15 NumberObject, 

16 RectangleObject, 

17) 

18from ..generic._base import ByteStringObject, TextStringObject, is_null_or_none 

19 

20DEFAULT_FONT_SIZE_IN_MULTILINE = 12 

21 

22 

@dataclass
class BaseStreamConfig:
    """
    Basic layout parameters shared by appearance streams.

    The three fields describe the box an appearance stream is drawn into and
    the border that surrounds it.
    """

    # Bounding box, either as a RectangleObject or as a plain 4-tuple of floats.
    rectangle: Union[RectangleObject, tuple[float, float, float, float]] = (0.0, 0.0, 0.0, 0.0)
    # The width of the border in points.
    border_width: int = 1
    # Border style name; defaults to the solid style.
    border_style: str = BorderStyles.SOLID

29 

30 

class BaseStreamAppearance(DecodedStreamObject):
    """The very base of an appearance stream: a form XObject with a bounding box and a border layout."""

    def __init__(self, layout: Optional[BaseStreamConfig] = None) -> None:
        """
        Set up the stream dictionary from the given layout.

        Args:
            layout: The basic layout parameters. When omitted (or falsy), a
                default BaseStreamConfig is used.
        """
        super().__init__()
        if not layout:
            layout = BaseStreamConfig()
        self._layout = layout

        # Entries that mark this stream as a form XObject with the layout's
        # rectangle as its bounding box.
        bounding_box = RectangleObject(self._layout.rectangle)
        for key, value in (
            ("/Type", NameObject("/XObject")),
            ("/Subtype", NameObject("/Form")),
            ("/BBox", bounding_box),
        ):
            self[NameObject(key)] = value

46 

47 

class TextAlignment(IntEnum):
    """
    Horizontal alignment options for text within a form field's appearance stream.

    Being an IntEnum, the members compare equal to the plain integers 0, 1 and 2.
    """

    LEFT = 0    # Text starts at the left edge (the default).
    CENTER = 1  # Text is centered in the field.
    RIGHT = 2   # Text ends at the right edge.

54 

55 

class TextStreamAppearance(BaseStreamAppearance):
    """
    A class representing the appearance stream for a text-based form field.

    This class generates the content stream (the `ap_stream_data`) that dictates
    how text is rendered within a form field's bounding box. It handles properties
    like font, font size, color, multiline text, and text selection highlighting.
    """

    def _scale_text(
        self,
        font_descriptor: FontDescriptor,
        font_size: float,
        leading_factor: float,
        field_width: float,
        field_height: float,
        text: str,
        min_font_size: float,
        font_size_step: float = 0.2
    ) -> tuple[list[tuple[float, str]], float]:
        """
        Word-wrap a piece of text to field_width and shrink it to fit field_height.

        The text is wrapped at the given font size. If the wrapped text is
        estimated to be taller than the field, the method calls itself again
        with the font size reduced by font_size_step, unless that would drop
        the size below min_font_size.

        Args:
            font_descriptor: A FontDescriptor for the font to be used.
            font_size: The font size in points.
            leading_factor: The line distance, as a multiple of the font size.
            field_width: The width of the field in which to fit the text.
            field_height: The height of the field in which to fit the text.
            text: The text to fit with the field.
            min_font_size: The minimum font size at which to scale the text.
            font_size_step: The amount by which to decrement font size per step while scaling.

        Returns:
            A tuple of the wrapped lines and the font size (rounded to one
            decimal) they were measured at. Each line is itself a tuple of the
            line's width and its contents.
        """
        # Both "\n" and "\r" separate paragraphs.
        paragraphs = text.replace("\n", "\r").split("\r")
        wrapped_lines: list[tuple[float, str]] = []
        current_line_words: list[str] = []
        current_line_width: float = 0
        space_width = font_descriptor.text_width(" ") * font_size / 1000
        for paragraph in paragraphs:
            if not paragraph.strip():
                # Keep blank paragraphs as empty lines.
                wrapped_lines.append((0.0, ""))
                continue
            words = paragraph.split(" ")
            for i, word in enumerate(words):
                word_width = font_descriptor.text_width(word) * font_size / 1000
                test_width = current_line_width + word_width + (space_width if i else 0)
                if test_width > field_width and current_line_words:
                    # The word does not fit anymore: close the line and start a new one with it.
                    wrapped_lines.append((current_line_width, " ".join(current_line_words)))
                    current_line_words = [word]
                    current_line_width = word_width
                elif not current_line_words and word_width > field_width:
                    # A single word wider than the field gets a line of its own.
                    wrapped_lines.append((word_width, word))
                    current_line_words = []
                    current_line_width = 0
                else:
                    if current_line_words:
                        current_line_width += space_width
                    current_line_words.append(word)
                    current_line_width += word_width
            if current_line_words:
                # Flush what is left of the paragraph.
                wrapped_lines.append((current_line_width, " ".join(current_line_words)))
                current_line_words = []
                current_line_width = 0
        # Estimate total height.
        estimated_total_height = font_size + (len(wrapped_lines) - 1) * leading_factor * font_size
        if estimated_total_height > field_height:
            # Text overflows height; Retry with smaller font size.
            new_font_size = font_size - font_size_step
            if new_font_size >= min_font_size:
                return self._scale_text(
                    font_descriptor,
                    new_font_size,
                    leading_factor,
                    field_width,
                    field_height,
                    text,
                    min_font_size,
                    font_size_step
                )
        return wrapped_lines, round(font_size, 1)

    def _generate_appearance_stream_data(
        self,
        text: str = "",
        selection: Optional[list[str]] = None,
        font_descriptor: Optional[FontDescriptor] = None,
        font_glyph_byte_map: Optional[dict[str, bytes]] = None,
        font_name: str = "/Helv",
        font_size: float = 0.0,
        font_color: str = "0 g",
        is_multiline: bool = False,
        alignment: TextAlignment = TextAlignment.LEFT,
        is_comb: bool = False,
        max_length: Optional[int] = None
    ) -> bytes:
        """
        Generates the raw bytes of the PDF appearance stream for a text field.

        This private method assembles the PDF content stream operators to draw
        the provided text within the specified rectangle. It handles text positioning,
        font application, color, and special formatting like selected text.

        Args:
            text: The text to be rendered in the form field.
            selection: An optional list of strings that should be highlighted as selected.
            font_descriptor: The FontDescriptor used to measure the text. Although
                the parameter is optional in the signature, it is used
                unconditionally, so callers have to supply it.
            font_glyph_byte_map: An optional dictionary mapping characters to their
                byte representation for glyph encoding.
            font_name: The name of the font resource to use (e.g., "/Helv").
            font_size: The font size. If 0, it is automatically calculated
                based on whether the field is multiline or not.
            font_color: The color to apply to the font, represented as a PDF
                graphics state string (e.g., "0 g" for black).
            is_multiline: A boolean indicating if the text field is multiline.
            alignment: Text alignment, can be TextAlignment.LEFT, .RIGHT, or .CENTER.
            is_comb: Boolean that designates fixed-length fields, where every character
                fills one "cell", such as in a postcode.
            max_length: Used if is_comb is set. The maximum number of characters for a fixed-
                length field.

        Returns:
            A byte string containing the PDF content stream data.

        """
        rectangle = self._layout.rectangle
        font_glyph_byte_map = font_glyph_byte_map or {}
        if isinstance(rectangle, tuple):
            rectangle = RectangleObject(rectangle)
        font_descriptor = cast(FontDescriptor, font_descriptor)
        leading_factor = (font_descriptor.bbox[3] - font_descriptor.bbox[1]) / 1000.0

        # Set margins based on border width and style, but never less than 1 point
        factor = 2 if self._layout.border_style in {"/B", "/I"} else 1
        margin = max(self._layout.border_width * factor, 1)
        field_height = rectangle.height - 2 * margin
        field_width = rectangle.width - 4 * margin

        # If font_size is 0, apply the logic for multiline or large-as-possible font
        if font_size == 0:
            min_font_size = 4.0  # The minimum font size
            if selection:  # Don't wrap text when dealing with a /Ch field, in order to prevent problems
                is_multiline = False  # with matching "selection" with "line" later on.
            if is_multiline:
                font_size = DEFAULT_FONT_SIZE_IN_MULTILINE
                lines, font_size = self._scale_text(
                    font_descriptor,
                    font_size,
                    leading_factor,
                    field_width,
                    field_height,
                    text,
                    min_font_size
                )
            else:
                # Largest size that fits both vertically and horizontally,
                # but not smaller than the minimum font size.
                max_vertical_size = field_height / leading_factor
                text_width_unscaled = font_descriptor.text_width(text) / 1000
                max_horizontal_size = field_width / (text_width_unscaled or 1)
                font_size = round(max(min(max_vertical_size, max_horizontal_size), min_font_size), 1)
                lines = [(text_width_unscaled * font_size, text)]
        elif is_comb:
            if max_length and len(text) > max_length:
                logger_warning(
                    f"Length of text {text} exceeds maximum length ({max_length}) of field, input truncated.",
                    __name__
                )
            # We act as if each character is one line, because we draw it separately later on
            lines = [(
                font_descriptor.text_width(char) * font_size / 1000,
                char
            ) for index, char in enumerate(text) if index < (max_length or len(text))]
        else:
            lines = [(
                font_descriptor.text_width(line) * font_size / 1000,
                line
            ) for line in text.replace("\n", "\r").split("\r")]

        # Set the vertical offset
        if is_multiline:
            y_offset = rectangle.height + margin - font_descriptor.bbox[3] * font_size / 1000.0
        else:
            y_offset = margin + ((field_height - font_descriptor.ascent * font_size / 1000) / 2)
        default_appearance = f"{font_name} {font_size} Tf {font_color}"

        # The stream is collected as a list of byte chunks and joined once at the end.
        chunks: list[bytes] = [
            (
                f"q\n/Tx BMC \nq\n{2 * margin} {margin} {field_width} {field_height} "
                f"re\nW\nBT\n{default_appearance}\n"
            ).encode()
        ]
        current_x_pos: float = 0  # Initial virtual position within the text object.

        for line_number, (line_width, line) in enumerate(lines):
            if selection and line in selection:
                # Might be improved, but cannot find how to get fill working => replaced with lined box
                chunks.append(
                    (
                        f"1 {y_offset - (line_number * font_size * leading_factor) - 1} "
                        f"{rectangle.width - 2} {font_size + 2} re\n"
                        f"0.5 0.5 0.5 rg s\n{default_appearance}\n"
                    ).encode()
                )

            # Calculate the desired absolute starting X for the current line
            desired_abs_x_start: float = 0
            if is_comb and max_length:
                # Calculate the width of a cell for one character
                cell_width = rectangle.width / max_length
                # Space from the left edge of the cell to the character's baseline start
                # line_width here is the *actual* character width in points for the single character 'line'
                centering_offset_in_cell = (cell_width - line_width) / 2
                # Absolute start X = (Cell Index, i.e., line_number * Cell Width) + Centering Offset
                desired_abs_x_start = (line_number * cell_width) + centering_offset_in_cell
            elif alignment == TextAlignment.RIGHT:
                desired_abs_x_start = rectangle.width - margin * 2 - line_width
            elif alignment == TextAlignment.CENTER:
                desired_abs_x_start = (rectangle.width - line_width) / 2
            else:  # Left aligned; default
                desired_abs_x_start = margin * 2
            # Calculate x_rel_offset: how much to move from the current_x_pos
            # to reach the desired_abs_x_start.
            x_rel_offset = desired_abs_x_start - current_x_pos

            # Y-offset:
            y_rel_offset: float = 0
            if line_number == 0:
                y_rel_offset = y_offset  # Initial vertical position
            elif is_comb:
                y_rel_offset = 0.0  # DO NOT move vertically for subsequent characters
            else:
                y_rel_offset = - font_size * leading_factor  # Move down by line height

            # Td is a relative translation (Tx and Ty).
            # It updates the current text position.
            chunks.append(f"{x_rel_offset} {y_rel_offset} Td\n".encode())
            # Update current_x_pos based on the Td operation for the next iteration.
            # This is the X position where the *current line* will start.
            current_x_pos = desired_abs_x_start

            # Characters without an entry in the glyph map fall back to UTF-16BE.
            encoded_line: list[bytes] = [
                font_glyph_byte_map.get(c, c.encode("utf-16-be")) for c in line
            ]
            if any(len(c) >= 2 for c in encoded_line):
                # At least one multi-byte code: write the whole line as a hex string.
                chunks.append(b"<" + (b"".join(encoded_line)).hex().encode() + b"> Tj\n")
            else:
                chunks.append(b"(" + b"".join(encoded_line) + b") Tj\n")
        chunks.append(b"ET\nQ\nEMC\nQ\n")
        return b"".join(chunks)

    def __init__(
        self,
        layout: Optional[BaseStreamConfig] = None,
        text: str = "",
        selection: Optional[list[str]] = None,
        font_resource: Optional[DictionaryObject] = None,
        font_name: str = "/Helv",
        font_size: float = 0.0,
        font_color: str = "0 g",
        is_multiline: bool = False,
        alignment: TextAlignment = TextAlignment.LEFT,
        is_comb: bool = False,
        max_length: Optional[int] = None
    ) -> None:
        """
        Initializes a TextStreamAppearance object.

        This constructor creates a new PDF stream object configured as an XObject
        of subtype Form. It uses the `_generate_appearance_stream_data` method to
        generate the content for the stream.

        Args:
            layout: The basic layout parameters.
            text: The text to be rendered in the form field.
            selection: An optional list of strings that should be highlighted as selected.
            font_resource: An optional variable that represents a PDF font dictionary.
                If it is missing (or falsy), a warning is logged and a built-in
                Helvetica font dictionary is used under the name "/Helv".
            font_name: The name of the font resource, e.g., "/Helv".
            font_size: The font size. If 0, it's auto-calculated.
            font_color: The font color string.
            is_multiline: A boolean indicating if the text field is multiline.
            alignment: Text alignment, can be TextAlignment.LEFT, .RIGHT, or .CENTER.
            is_comb: Boolean that designates fixed-length fields, where every character
                fills one "cell", such as in a postcode.
            max_length: Used if is_comb is set. The maximum number of characters for a fixed-
                length field.

        """
        super().__init__(layout)

        # If a font resource was added, get the font character map
        if font_resource:
            font_resource = cast(DictionaryObject, font_resource.get_object())
            font_descriptor = FontDescriptor.from_font_resource(font_resource)
        else:
            logger_warning(f"Font dictionary for {font_name} not found; defaulting to Helvetica.", __name__)
            font_name = "/Helv"
            font_resource = DictionaryObject({
                NameObject("/Subtype"): NameObject("/Type1"),
                NameObject("/Name"): NameObject("/Helv"),
                NameObject("/Type"): NameObject("/Font"),
                NameObject("/BaseFont"): NameObject("/Helvetica"),
                NameObject("/Encoding"): NameObject("/WinAnsiEncoding")
            })
            font_descriptor = CORE_FONT_METRICS["Helvetica"]

        # Get the font glyph data
        _font_subtype, _, font_encoding, font_map = build_char_map_from_dict(
            200, font_resource
        )
        # Remove the width stored in the -1 key, if there is one.
        font_map.pop(-1, None)
        font_glyph_byte_map: dict[str, bytes]
        if isinstance(font_encoding, str):
            # Named encoding: encode each mapped code with it.
            font_glyph_byte_map = {
                v: k.encode(font_encoding) for k, v in font_map.items()
            }
        else:
            # Encoding given as a table of byte value -> character: reverse it.
            font_encoding_rev = {v: bytes((k,)) for k, v in font_encoding.items()}
            font_glyph_byte_map = dict(font_encoding_rev)
            for key, value in font_map.items():
                # NOTE(review): when `key` is missing from the reversed encoding,
                # the fallback stores `key` itself, which looks like a str rather
                # than bytes - confirm this is intended.
                font_glyph_byte_map[value] = font_encoding_rev.get(key, key)

        ap_stream_data = self._generate_appearance_stream_data(
            text,
            selection,
            font_descriptor,
            font_glyph_byte_map,
            font_name=font_name,
            font_size=font_size,
            font_color=font_color,
            is_multiline=is_multiline,
            alignment=alignment,
            is_comb=is_comb,
            max_length=max_length
        )

        self.set_data(ByteStringObject(ap_stream_data))
        self[NameObject("/Length")] = NumberObject(len(ap_stream_data))
        # Update Resources with font information; prefer the font's indirect
        # reference when it has one, otherwise embed the dictionary itself.
        self[NameObject("/Resources")] = DictionaryObject({
            NameObject("/Font"): DictionaryObject({
                NameObject(font_name): getattr(font_resource, "indirect_reference", font_resource)
            })
        })

    @classmethod
    def from_text_annotation(
        cls,
        acro_form: DictionaryObject,  # _root_object[CatalogDictionary.ACRO_FORM])
        field: DictionaryObject,
        annotation: DictionaryObject,
        user_font_name: str = "",
        user_font_size: float = -1,
    ) -> "TextStreamAppearance":
        """
        Creates a TextStreamAppearance object from a text field annotation.

        This class method is a factory for creating a `TextStreamAppearance`
        instance by extracting all necessary information (bounding box, font,
        text content, etc.) from the PDF field and annotation dictionaries.
        It respects inheritance for properties like default appearance (`/DA`).

        Args:
            acro_form: The root AcroForm dictionary from the PDF catalog.
            field: The field dictionary object.
            annotation: The widget annotation dictionary object associated with the field.
            user_font_name: An optional user-provided font name to override the
                default. Defaults to an empty string.
            user_font_size: An optional user-provided font size to override the
                default. Only values greater than 0 override; the default of -1
                indicates no override.

        Returns:
            A new `TextStreamAppearance` instance configured for the given field.

        Raises:
            ValueError: If the default appearance string contains no "Tf" operator.

        """
        # Calculate rectangle dimensions
        _rectangle = cast(RectangleObject, annotation[AnnotationDictionaryAttributes.Rect])
        rectangle = RectangleObject((0, 0, abs(_rectangle[2] - _rectangle[0]), abs(_rectangle[3] - _rectangle[1])))

        # Get default appearance dictionary from annotation
        default_appearance = annotation.get_inherited(
            AnnotationDictionaryAttributes.DA,
            acro_form.get(AnnotationDictionaryAttributes.DA, None),
        )
        if not default_appearance:
            # Create a default appearance if none was found in the annotation
            default_appearance = TextStringObject("/Helv 0 Tf 0 g")
        else:
            default_appearance = default_appearance.get_object()

        # Derive font name, size and color from the default appearance. Also set
        # user-provided font name and font size in the default appearance, if given.
        # For a font name, this presumes that we can find an associated font resource
        # dictionary. Uses the variable font_properties as an intermediate.
        # As per the PDF spec:
        # "At a minimum, the string [that is, default_appearance] shall include a Tf (text
        # font) operator along with its two operands, font and size" (Section 12.7.4.3
        # "Variable text" of the PDF 2.0 specification).
        font_properties = [prop for prop in re.split(r"\s", default_appearance) if prop]
        font_name = font_properties.pop(font_properties.index("Tf") - 2)
        font_size = float(font_properties.pop(font_properties.index("Tf") - 1))
        font_properties.remove("Tf")
        # Whatever remains after removing the font operands is used as the color.
        font_color = " ".join(font_properties)
        # Determine the font name to use, prioritizing the user's input
        if user_font_name:
            font_name = user_font_name
        # Determine the font size to use, prioritizing the user's input
        if user_font_size > 0:
            font_size = user_font_size

        # Try to find a resource dictionary for the font
        document_resources: Any = cast(
            DictionaryObject,
            cast(
                DictionaryObject,
                annotation.get_inherited(
                    "/DR",
                    acro_form.get("/DR", DictionaryObject()),
                ),
            ).get_object(),
        )
        document_font_resources = document_resources.get("/Font", DictionaryObject()).get_object()
        # CORE_FONT_METRICS is the dict with Standard font metrics
        if font_name not in document_font_resources and font_name.removeprefix("/") not in CORE_FONT_METRICS:
            # ...or AcroForm dictionary
            # The fallback has to be a DictionaryObject: a plain dict has no
            # get_object(), so a missing /DR entry used to raise AttributeError here.
            document_resources = cast(
                DictionaryObject,
                acro_form.get("/DR", DictionaryObject()),
            )
            document_font_resources = document_resources.get_object().get("/Font", DictionaryObject()).get_object()
        font_resource = document_font_resources.get(font_name, None)
        if not is_null_or_none(font_resource):
            font_resource = cast(DictionaryObject, font_resource.get_object())

        # Retrieve field text and selected values
        field_flags = field.get(FieldDictionaryAttributes.Ff, 0)
        if (
            field.get(FieldDictionaryAttributes.FT, "/Tx") == "/Ch" and
            field_flags & FieldDictionaryAttributes.FfBits.Combo == 0
        ):
            # Choice field without the Combo flag: show all options, one per line.
            text = "\n".join(annotation.get_inherited(FieldDictionaryAttributes.Opt, []))
            selection = field.get("/V", [])
            if not isinstance(selection, list):
                selection = [selection]
        else:  # /Tx
            text = field.get("/V", "")
            selection = []

        # Escape parentheses (PDF 1.7 reference, table 3.2, Literal Strings)
        text = text.replace("\\", "\\\\").replace("(", r"\(").replace(")", r"\)")

        # Retrieve formatting information
        is_comb = False
        max_length = None
        if field_flags & FieldDictionaryAttributes.FfBits.Comb:
            is_comb = True
            max_length = annotation.get("/MaxLen")
        is_multiline = False
        if field_flags & FieldDictionaryAttributes.FfBits.Multiline:
            is_multiline = True
        alignment = field.get("/Q", TextAlignment.LEFT)
        border_width = 1
        border_style = BorderStyles.SOLID
        if "/BS" in field:
            border_width = cast(DictionaryObject, field["/BS"]).get("/W", border_width)
            border_style = cast(DictionaryObject, field["/BS"]).get("/S", border_style)

        # Create the TextStreamAppearance instance
        layout = BaseStreamConfig(rectangle=rectangle, border_width=border_width, border_style=border_style)
        new_appearance_stream = cls(
            layout,
            text,
            selection,
            font_resource,
            font_name=font_name,
            font_size=font_size,
            font_color=font_color,
            is_multiline=is_multiline,
            alignment=alignment,
            is_comb=is_comb,
            max_length=max_length
        )
        if AnnotationDictionaryAttributes.AP in annotation:
            # Carry over entries of the existing normal appearance, except the
            # ones that describe the stream itself.
            for key, value in (
                cast(DictionaryObject, annotation[AnnotationDictionaryAttributes.AP]).get("/N", {}).items()
            ):
                if key not in {"/BBox", "/Length", "/Subtype", "/Type", "/Filter"}:
                    new_appearance_stream[key] = value

        return new_appearance_stream