1import re
2from dataclasses import dataclass
3from enum import IntEnum
4from typing import Any, Optional, Union, cast
5
6from .._codecs import fill_from_encoding
7from .._codecs.core_font_metrics import CORE_FONT_METRICS
8from .._font import Font
9from .._utils import logger_warning
10from ..constants import AnnotationDictionaryAttributes, BorderStyles, FieldDictionaryAttributes
11from ..generic import (
12 DecodedStreamObject,
13 DictionaryObject,
14 NameObject,
15 NumberObject,
16 RectangleObject,
17)
18from ..generic._base import ByteStringObject, TextStringObject, is_null_or_none
19
# Font size (in points) that auto-sized multiline text starts from; the text
# scaling logic shrinks it from this value until the text fits the field.
DEFAULT_FONT_SIZE_IN_MULTILINE = 12
21
22
@dataclass
class BaseStreamConfig:
    """
    A container representing the basic layout of an appearance stream.

    Attributes:
        rectangle: The bounding box of the appearance stream, either as a
            RectangleObject or as a 4-tuple of numbers.
        border_width: The width of the border in points. Annotated as float
            because the value may be taken as-is from a /BS dictionary's /W
            entry, which is not restricted to integers.
        border_style: The border style name; defaults to BorderStyles.SOLID.
    """

    rectangle: Union[RectangleObject, tuple[float, float, float, float]] = (0.0, 0.0, 0.0, 0.0)
    border_width: float = 1  # The width of the border in points
    border_style: str = BorderStyles.SOLID
29
30
class BaseStreamAppearance(DecodedStreamObject):
    """The common foundation of every appearance stream: a rectangle plus a border layout."""

    def __init__(self, layout: Optional[BaseStreamConfig] = None) -> None:
        """
        Set up the stream dictionary from the given layout.

        Args:
            layout: The basic layout parameters. A default BaseStreamConfig is
                used when nothing is passed.
        """
        super().__init__()
        if not layout:
            layout = BaseStreamConfig()
        self._layout = layout

        # Mark the stream as a form XObject whose bounding box is the layout rectangle.
        stream_entries = (
            ("/Type", NameObject("/XObject")),
            ("/Subtype", NameObject("/Form")),
            ("/BBox", RectangleObject(self._layout.rectangle)),
        )
        for entry_name, entry_value in stream_entries:
            self[NameObject(entry_name)] = entry_value
46
47
class TextAlignment(IntEnum):
    """
    Horizontal alignment choices for the text of a form field's appearance stream.

    Being an IntEnum, the members compare equal to the plain integers 0, 1 and 2.
    """

    LEFT = 0  # Text starts at the left edge of the field (the default).
    CENTER = 1  # Text is centered within the field.
    RIGHT = 2  # Text ends at the right edge of the field.
54
55
class TextStreamAppearance(BaseStreamAppearance):
    """
    A class representing the appearance stream for a text-based form field.

    This class generates the content stream (the `ap_stream_data`) that dictates
    how text is rendered within a form field's bounding box. It handles properties
    like font, font size, color, multiline text, and text selection highlighting.
    """

    @staticmethod
    def _build_core_font(core_font_name: str) -> Font:
        """
        Builds a Font from the bundled core font metrics.

        Args:
            core_font_name: A key of CORE_FONT_METRICS (no leading slash), e.g. "Helvetica".

        Returns:
            A Type1 Font using a cp1252-based (WinAnsiEncoding) encoding table.
        """
        core_font_metrics = CORE_FONT_METRICS[core_font_name]
        return Font(
            name=core_font_name,
            character_map={},
            encoding=dict(zip(range(256), fill_from_encoding("cp1252"))),  # WinAnsiEncoding
            sub_type="Type1",
            font_descriptor=core_font_metrics.font_descriptor,
            character_widths=core_font_metrics.character_widths,
        )

    @staticmethod
    def _character_code_to_bytes(code: Union[str, bytes]) -> bytes:
        """
        Converts a character code taken from a font character map to bytes.

        NOTE(review): assumes the character map keys are character codes held as
        str, where code points below 256 stand for single-byte codes and anything
        else for two-byte codes; confirm against the Font implementation.
        """
        if isinstance(code, bytes):
            return code
        try:
            return code.encode("latin-1")
        except UnicodeEncodeError:
            return code.encode("utf-16-be", "surrogatepass")

    @staticmethod
    def _parse_default_appearance(default_appearance: str) -> tuple[str, float, str]:
        """
        Splits a default appearance (/DA) string into font name, font size and the rest.

        The two tokens in front of the first "Tf" operator are taken as font name
        and font size; all other tokens are joined and returned as the color part.
        If there is no "Tf" operator with two operands, or the size is not a
        number, "/Helv" with font size 0 (automatic) is used and a warning is logged.

        Args:
            default_appearance: The default appearance string.

        Returns:
            A tuple of font name, font size and the remaining operators.
        """
        tokens = [prop for prop in re.split(r"\s", default_appearance) if prop]
        if "Tf" in tokens:
            tf_index = tokens.index("Tf")
            operands_start = max(tf_index - 2, 0)
            operands = tokens[operands_start:tf_index]
            remaining = tokens[:operands_start] + tokens[tf_index + 1:]
        else:
            operands = []
            remaining = tokens

        font_name, font_size = "/Helv", 0.0
        try:
            name_token, size_token = operands  # Raises ValueError unless there are exactly two operands
            font_size = float(size_token)
            font_name = name_token
        except ValueError:
            logger_warning(
                f"Could not find a valid font name and size in default appearance '{default_appearance}'; "
                "defaulting to /Helv with automatic font size.",
                __name__
            )
        return font_name, font_size, " ".join(remaining)

    @staticmethod
    def _wrap_text(
        font: Font,
        font_size: float,
        field_width: float,
        text: str
    ) -> list[tuple[float, str]]:
        """
        Wraps text at spaces so that lines do not exceed field_width where possible.

        Paragraphs are separated by "\\n" or "\\r". A blank paragraph becomes an
        empty line. A single word wider than the field is kept on a line of its own.

        Args:
            font: The font to be used.
            font_size: The font size in points.
            field_width: The width of the field in which to fit the text.
            text: The text to wrap.

        Returns:
            A list of tuples, each containing the width of a line and its contents.
        """
        space_width = font.space_width * font_size / 1000
        wrapped_lines: list[tuple[float, str]] = []
        for paragraph in text.replace("\n", "\r").split("\r"):
            if not paragraph.strip():
                wrapped_lines.append((0.0, ""))
                continue
            current_line_words: list[str] = []
            current_line_width: float = 0
            for word in paragraph.split(" "):
                word_width = font.text_width(word) * font_size / 1000
                if current_line_words:
                    if current_line_width + word_width + space_width > field_width:
                        # The word does not fit anymore; start a new line with it.
                        wrapped_lines.append((current_line_width, " ".join(current_line_words)))
                        current_line_words = [word]
                        current_line_width = word_width
                    else:
                        current_line_width += space_width
                        current_line_words.append(word)
                        current_line_width += word_width
                elif word_width > field_width:
                    # A single word that is too wide gets a line of its own.
                    wrapped_lines.append((word_width, word))
                else:
                    current_line_words.append(word)
                    current_line_width += word_width
            if current_line_words:
                wrapped_lines.append((current_line_width, " ".join(current_line_words)))
        return wrapped_lines

    def _scale_text(
        self,
        font: Font,
        font_size: float,
        leading_factor: float,
        field_width: float,
        field_height: float,
        text: str,
        min_font_size: float,
        font_size_step: float = 0.2
    ) -> tuple[list[tuple[float, str]], float]:
        """
        Takes a piece of text and scales it to field_width or field_height, given font
        and font_size. Wraps text where necessary.

        The text is wrapped at the current font size. While the estimated height
        exceeds field_height, the font size is reduced by font_size_step and the
        text is wrapped again, as long as the size stays at or above min_font_size.

        Args:
            font: The font to be used.
            font_size: The font size in points.
            leading_factor: The line distance.
            field_width: The width of the field in which to fit the text.
            field_height: The height of the field in which to fit the text.
            text: The text to fit with the field.
            min_font_size: The minimum font size at which to scale the text.
            font_size_step: The amount by which to decrement font size per step while scaling.
                With a non-positive step, no scaling takes place.

        Returns:
            The text in the form of list of tuples, each tuple containing the length of a line
            and its contents, and the font_size for these lines and lengths.
        """
        while True:
            wrapped_lines = self._wrap_text(font, font_size, field_width, text)
            estimated_total_height = font_size + (len(wrapped_lines) - 1) * leading_factor * font_size
            # A non-positive step could never make the text fit, so don't loop on it.
            if estimated_total_height > field_height and font_size_step > 0:
                new_font_size = font_size - font_size_step
                if new_font_size >= min_font_size:
                    # Text overflows height; retry with smaller font size.
                    font_size = new_font_size
                    continue
            return wrapped_lines, round(font_size, 1)

    def _generate_appearance_stream_data(
        self,
        text: str,
        selection: Union[list[str], None],
        font: Font,
        font_glyph_byte_map: Optional[dict[str, bytes]] = None,
        font_name: str = "/Helv",
        font_size: float = 0.0,
        font_color: str = "0 g",
        is_multiline: bool = False,
        alignment: TextAlignment = TextAlignment.LEFT,
        is_comb: bool = False,
        max_length: Optional[int] = None
    ) -> bytes:
        """
        Generates the raw bytes of the PDF appearance stream for a text field.

        This private method assembles the PDF content stream operators to draw
        the provided text within the specified rectangle. It handles text positioning,
        font application, color, and special formatting like selected text.

        Args:
            text: The text to be rendered in the form field. Parentheses and
                backslashes are written to the stream as given.
            selection: An optional list of strings that should be highlighted as selected.
            font: The font to use.
            font_glyph_byte_map: An optional dictionary mapping characters to their
                byte representation for glyph encoding.
            font_name: The name of the font resource to use (e.g., "/Helv").
            font_size: The font size. If 0, it is automatically calculated
                based on whether the field is multiline or not.
            font_color: The color to apply to the font, represented as a PDF
                graphics state string (e.g., "0 g" for black).
            is_multiline: A boolean indicating if the text field is multiline.
            alignment: Text alignment, can be TextAlignment.LEFT, .RIGHT, or .CENTER.
            is_comb: Boolean that designates fixed-length fields, where every character
                fills one "cell", such as in a postcode. Only applied if max_length
                is set and the field is not multiline.
            max_length: Used if is_comb is set. The maximum number of characters for a fixed-
                length field.

        Returns:
            A byte string containing the PDF content stream data.

        """
        rectangle = self._layout.rectangle
        if isinstance(rectangle, tuple):
            rectangle = RectangleObject(rectangle)
        font_glyph_byte_map = font_glyph_byte_map or {}
        font_bbox = font.font_descriptor.bbox
        leading_factor = (font_bbox[3] - font_bbox[1]) / 1000.0

        # Set margins based on border width and style, but never less than 1 point
        factor = 2 if self._layout.border_style in {"/B", "/I"} else 1
        margin = max(self._layout.border_width * factor, 1)
        field_height = rectangle.height - 2 * margin
        field_width = rectangle.width - 4 * margin

        # A comb layout needs a maximum length to derive the cell width from, and it
        # cannot be combined with multiline text. Without those conditions, every
        # character (or line) would be drawn at the position of the first cell.
        comb_cells: int = max_length if (is_comb and max_length and not is_multiline) else 0
        use_comb = comb_cells > 0
        if use_comb and len(text) > comb_cells:
            logger_warning(
                f"Length of text {text} exceeds maximum length ({max_length}) of field, input truncated.",
                __name__
            )
            text = text[:comb_cells]

        lines: list[tuple[float, str]] = []
        auto_size = font_size == 0
        if auto_size:
            # Apply the logic for multiline or large-as-possible font
            min_font_size = 4.0  # The minimum font size
            if selection:  # Don't wrap text when dealing with a /Ch field, in order to prevent problems
                is_multiline = False  # with matching "selection" with "line" later on.
            if is_multiline:
                lines, font_size = self._scale_text(
                    font,
                    DEFAULT_FONT_SIZE_IN_MULTILINE,
                    leading_factor,
                    field_width,
                    field_height,
                    text,
                    min_font_size
                )
            else:
                max_vertical_size = field_height / leading_factor
                if use_comb:
                    # Each character has to fit into a cell of its own.
                    available_width = rectangle.width / comb_cells
                    text_width_unscaled = max((font.text_width(char) for char in text), default=0) / 1000
                else:
                    available_width = field_width
                    text_width_unscaled = font.text_width(text) / 1000
                max_horizontal_size = available_width / (text_width_unscaled or 1)
                font_size = round(max(min(max_vertical_size, max_horizontal_size), min_font_size), 1)
                lines = [(text_width_unscaled * font_size, text)]

        if use_comb:
            # We act as if each character is one line, because we draw it separately later on
            lines = [(font.text_width(char) * font_size / 1000, char) for char in text]
        elif not auto_size:
            lines = [
                (font.text_width(line) * font_size / 1000, line)
                for line in text.replace("\n", "\r").split("\r")
            ]

        # Set the vertical offset
        if is_multiline:
            y_offset = rectangle.height + margin - font_bbox[3] * font_size / 1000.0
        else:
            y_offset = margin + ((field_height - font.font_descriptor.ascent * font_size / 1000) / 2)
        default_appearance = f"{font_name} {font_size} Tf {font_color}"

        # Collect the pieces of the stream and join them once at the end.
        stream_parts: list[bytes] = [
            (
                f"q\n/Tx BMC \nq\n{2 * margin} {margin} {field_width} {field_height} "
                f"re\nW\nBT\n{default_appearance}\n"
            ).encode()
        ]
        # Width of a cell for one character; only used for comb fields
        cell_width = rectangle.width / comb_cells if use_comb else 0.0
        current_x_pos: float = 0  # Initial virtual position within the text object.

        for line_number, (line_width, line) in enumerate(lines):
            if selection and line in selection:
                # Might be improved, but cannot find how to get fill working => replaced with lined box
                stream_parts.append(
                    (
                        f"1 {y_offset - (line_number * font_size * leading_factor) - 1} "
                        f"{rectangle.width - 2} {font_size + 2} re\n"
                        f"0.5 0.5 0.5 rg s\n{default_appearance}\n"
                    ).encode()
                )

            # Calculate the desired absolute starting X for the current line
            desired_abs_x_start: float = 0
            if use_comb:
                # line_width here is the *actual* character width in points for the single
                # character 'line'; center it within its cell.
                centering_offset_in_cell = (cell_width - line_width) / 2
                # Absolute start X = (Cell Index, i.e., line_number * Cell Width) + Centering Offset
                desired_abs_x_start = (line_number * cell_width) + centering_offset_in_cell
            elif alignment == TextAlignment.RIGHT:
                desired_abs_x_start = rectangle.width - margin * 2 - line_width
            elif alignment == TextAlignment.CENTER:
                desired_abs_x_start = (rectangle.width - line_width) / 2
            else:  # Left aligned; default
                desired_abs_x_start = margin * 2
            # Calculate x_rel_offset: how much to move from the current_x_pos
            # to reach the desired_abs_x_start.
            x_rel_offset = desired_abs_x_start - current_x_pos

            # Y-offset:
            y_rel_offset: float = 0
            if line_number == 0:
                y_rel_offset = y_offset  # Initial vertical position
            elif use_comb:
                y_rel_offset = 0.0  # DO NOT move vertically for subsequent characters
            else:
                y_rel_offset = - font_size * leading_factor  # Move down by line height

            # Td is a relative translation (Tx and Ty).
            # It updates the current text position.
            stream_parts.append(f"{x_rel_offset} {y_rel_offset} Td\n".encode())
            # Update current_x_pos based on the Td operation for the next iteration.
            # This is the X position where the *current line* will start.
            current_x_pos = desired_abs_x_start

            encoded_line: list[bytes] = [
                font_glyph_byte_map.get(c, c.encode("utf-16-be")) for c in line
            ]
            if any(len(c) >= 2 for c in encoded_line):
                # Multi-byte codes are written as a hexadecimal string
                stream_parts.append(b"<" + (b"".join(encoded_line)).hex().encode() + b"> Tj\n")
            else:
                stream_parts.append(b"(" + b"".join(encoded_line) + b") Tj\n")
        stream_parts.append(b"ET\nQ\nEMC\nQ\n")
        return b"".join(stream_parts)

    def __init__(
        self,
        layout: Optional[BaseStreamConfig] = None,
        text: str = "",
        selection: Optional[list[str]] = None,
        font_resource: Optional[DictionaryObject] = None,
        font_name: str = "/Helv",
        font_size: float = 0.0,
        font_color: str = "0 g",
        is_multiline: bool = False,
        alignment: TextAlignment = TextAlignment.LEFT,
        is_comb: bool = False,
        max_length: Optional[int] = None
    ) -> None:
        """
        Initializes a TextStreamAppearance object.

        This constructor creates a new PDF stream object configured as an XObject
        of subtype Form. It uses the `_generate_appearance_stream_data` method to
        generate the content for the stream.

        Args:
            layout: The basic layout parameters.
            text: The text to be rendered in the form field.
            selection: An optional list of strings that should be highlighted as selected.
            font_resource: An optional variable that represents a PDF font dictionary.
                If not given, Helvetica is used under the name "/Helv".
            font_name: The name of the font resource, e.g., "/Helv".
            font_size: The font size. If 0, it's auto-calculated.
            font_color: The font color string.
            is_multiline: A boolean indicating if the text field is multiline.
            alignment: Text alignment, can be TextAlignment.LEFT, .RIGHT, or .CENTER.
            is_comb: Boolean that designates fixed-length fields, where every character
                fills one "cell", such as in a postcode.
            max_length: Used if is_comb is set. The maximum number of characters for a fixed-
                length field.

        """
        super().__init__(layout)

        # If a font resource was added, get the font character map
        if font_resource:
            font = Font.from_font_resource(font_resource)
        else:
            logger_warning(f"Font dictionary for {font_name} not found; defaulting to Helvetica.", __name__)
            font_name = "/Helv"
            font = self._build_core_font("Helvetica")
            font_resource = font.as_font_resource()

        # Check whether the font resource is able to encode the text value.
        if isinstance(font.encoding, str):
            try:
                text.encode(font.encoding, "surrogatepass")
                encodable = True
            except UnicodeEncodeError:
                encodable = False
        else:
            supported_chars = set(font.encoding.values())
            encodable = all(char in supported_chars for char in text)
            # We should add a final check against the character_map (CMap) of the font,
            # but we don't appear to have PDF forms with such fonts, so we skip this for
            # now.

        if not encodable:
            logger_warning(
                f"Text string '{text}' contains characters not supported by font encoding. "
                "This may result in text corruption. "
                "Consider calling writer.update_page_form_field_values with auto_regenerate=True.",
                __name__
            )

        # Map every character to the bytes that select its glyph in the font
        font_glyph_byte_map: dict[str, bytes]
        if isinstance(font.encoding, str):
            font_glyph_byte_map = {
                v: k.encode(font.encoding) for k, v in font.character_map.items()
            }
        else:
            font_encoding_rev = {v: bytes((k,)) for k, v in font.encoding.items()}
            font_glyph_byte_map = dict(font_encoding_rev)
            for key, value in font.character_map.items():
                if key in font_encoding_rev:
                    font_glyph_byte_map[value] = font_encoding_rev[key]
                else:
                    # The map must only hold bytes: the values get joined into the
                    # content stream later on, which fails for str values.
                    font_glyph_byte_map[value] = self._character_code_to_bytes(key)

        ap_stream_data = self._generate_appearance_stream_data(
            text,
            selection,
            font,
            font_glyph_byte_map,
            font_name=font_name,
            font_size=font_size,
            font_color=font_color,
            is_multiline=is_multiline,
            alignment=alignment,
            is_comb=is_comb,
            max_length=max_length
        )

        self.set_data(ByteStringObject(ap_stream_data))
        self[NameObject("/Length")] = NumberObject(len(ap_stream_data))
        # Update Resources with font information
        self[NameObject("/Resources")] = DictionaryObject({
            NameObject("/Font"): DictionaryObject({
                NameObject(font_name): getattr(font_resource, "indirect_reference", font_resource)
            })
        })

    @staticmethod
    def _find_annotation_font_resource(
        font_name: str,
        annotation: DictionaryObject,
        acro_form: DictionaryObject
    ) -> tuple[str, DictionaryObject]:
        """
        Finds, or constructs, the font resource dictionary for a font name.

        The font is looked up in the /Font entry of the (inherited) /DR resources
        of the annotation, with the /DR of the AcroForm as the default. If it is not
        found there, a font resource is constructed from the core font metrics,
        defaulting to Helvetica if the font name is not among them.

        Args:
            font_name: The name of the font resource, e.g., "/Helv".
            annotation: The widget annotation dictionary object.
            acro_form: The root AcroForm dictionary from the PDF catalog.

        Returns:
            A tuple of the font name that is actually used and its font resource.
        """
        # Try to find a resource dictionary for the font by examining the annotation and, if that fails,
        # the AcroForm resources dictionary
        acro_form_resources: Any = cast(
            DictionaryObject,
            annotation.get_inherited(
                "/DR",
                acro_form.get("/DR", DictionaryObject()),
            ),
        )
        acro_form_font_resources = acro_form_resources.get("/Font", DictionaryObject())
        font_resource = acro_form_font_resources.get(font_name, None)

        # Normally, we should have found a font resource by now. However, when a user has provided a specific
        # font name, we may not have found the associated font resource among the AcroForm resources. Also, in
        # case of the 14 Adobe Core fonts, we may be expected to construct a font resource ourselves.
        if is_null_or_none(font_resource):
            core_font_name = font_name.removeprefix("/")
            if core_font_name not in CORE_FONT_METRICS:
                # Default to Helvetica if we haven't found a font resource and cannot construct one ourselves.
                logger_warning(f"Font dictionary for {font_name} not found; defaulting to Helvetica.", __name__)
                font_name = "/Helvetica"
                core_font_name = "Helvetica"
            font_resource = TextStreamAppearance._build_core_font(core_font_name).as_font_resource()

        return font_name, font_resource

    @classmethod
    def from_text_annotation(
        cls,
        acro_form: DictionaryObject,  # _root_object[CatalogDictionary.ACRO_FORM])
        field: DictionaryObject,
        annotation: DictionaryObject,
        user_font_name: str = "",
        user_font_size: float = -1,
    ) -> "TextStreamAppearance":
        """
        Creates a TextStreamAppearance object from a text field annotation.

        This class method is a factory for creating a `TextStreamAppearance`
        instance by extracting all necessary information (bounding box, font,
        text content, etc.) from the PDF field and annotation dictionaries.
        It respects inheritance for properties like default appearance (`/DA`),
        alignment (`/Q`) and maximum length (`/MaxLen`).

        Args:
            acro_form: The root AcroForm dictionary from the PDF catalog.
            field: The field dictionary object.
            annotation: The widget annotation dictionary object associated with the field.
            user_font_name: An optional user-provided font name to override the
                default. Defaults to an empty string.
            user_font_size: An optional user-provided font size to override the
                default. A value of -1 indicates no override.

        Returns:
            A new `TextStreamAppearance` instance configured for the given field.

        """
        # Calculate rectangle dimensions
        _rectangle = cast(RectangleObject, annotation[AnnotationDictionaryAttributes.Rect])
        rectangle = RectangleObject((0, 0, abs(_rectangle[2] - _rectangle[0]), abs(_rectangle[3] - _rectangle[1])))

        # Get default appearance dictionary from annotation
        default_appearance = annotation.get_inherited(
            AnnotationDictionaryAttributes.DA,
            acro_form.get(AnnotationDictionaryAttributes.DA, None),
        )
        if not default_appearance:
            # Create a default appearance if none was found in the annotation
            default_appearance = TextStringObject("/Helv 0 Tf 0 g")
        else:
            default_appearance = default_appearance.get_object()

        # Retrieve field text and selected values
        field_flags = field.get(FieldDictionaryAttributes.Ff, 0)
        if (
            field.get(FieldDictionaryAttributes.FT, "/Tx") == "/Ch" and
            field_flags & FieldDictionaryAttributes.FfBits.Combo == 0
        ):
            text = "\n".join(annotation.get_inherited(FieldDictionaryAttributes.Opt, []))
            selection = field.get("/V", [])
            if not isinstance(selection, list):
                selection = [selection]
        else:  # /Tx
            text = field.get("/V", "")
            selection = []

        # Escape parentheses (PDF 1.7 reference, table 3.2, Literal Strings)
        text = text.replace("\\", "\\\\").replace("(", r"\(").replace(")", r"\)")

        # Derive font name, size and color from the default appearance.
        # As per the PDF spec:
        # "At a minimum, the string [that is, default_appearance] shall include a Tf (text
        # font) operator along with its two operands, font and size" (Section 12.7.4.3
        # "Variable text" of the PDF 2.0 specification).
        # A default appearance that doesn't comply falls back to /Helv with automatic size.
        font_name, font_size, font_color = cls._parse_default_appearance(default_appearance)
        # Determine the font name to use, prioritizing the user's input
        if user_font_name:
            font_name = user_font_name
        # Determine the font size to use, prioritizing the user's input
        if user_font_size > 0:
            font_size = user_font_size

        font_name, font_resource = cls._find_annotation_font_resource(font_name, annotation, acro_form)

        # Retrieve formatting information
        is_comb = False
        max_length = None
        if field_flags & FieldDictionaryAttributes.FfBits.Comb:
            is_comb = True
            # /MaxLen is inheritable, so it may be found on a parent of the widget annotation
            max_length = annotation.get_inherited("/MaxLen", field.get("/MaxLen"))
        is_multiline = bool(field_flags & FieldDictionaryAttributes.FfBits.Multiline)
        # /Q is inheritable as well, and the AcroForm dictionary may provide a document-wide default
        alignment = field.get_inherited("/Q", acro_form.get("/Q", TextAlignment.LEFT))
        border_width = 1
        border_style = BorderStyles.SOLID
        if "/BS" in field:
            border_style_dictionary = cast(DictionaryObject, field["/BS"])
            border_width = border_style_dictionary.get("/W", border_width)
            border_style = border_style_dictionary.get("/S", border_style)

        # Create the TextStreamAppearance instance
        layout = BaseStreamConfig(rectangle=rectangle, border_width=border_width, border_style=border_style)
        new_appearance_stream = cls(
            layout,
            text,
            selection,
            font_resource,
            font_name=font_name,
            font_size=font_size,
            font_color=font_color,
            is_multiline=is_multiline,
            alignment=alignment,
            is_comb=is_comb,
            max_length=max_length
        )

        # Carry over the entries of an existing normal appearance stream, except
        # for those that describe the stream that has just been generated.
        if AnnotationDictionaryAttributes.AP in annotation:
            for key, value in (
                cast(DictionaryObject, annotation[AnnotationDictionaryAttributes.AP]).get("/N", {}).items()
            ):
                if key in {"/BBox", "/Length", "/Subtype", "/Type", "/Filter"}:
                    continue
                # Don't overwrite font resources added by TextStreamAppearance.__init__
                if key == "/Resources":
                    if "/Font" not in value:
                        value.get_object()[NameObject("/Font")] = DictionaryObject()
                    value["/Font"].get_object()[NameObject(font_name)] = getattr(
                        font_resource, "indirect_reference", font_resource
                    )
                else:
                    new_appearance_stream[key] = value

        return new_appearance_stream