Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pikepdf/models/_content_stream.py: 36%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

42 statements  

1# SPDX-FileCopyrightText: 2022 James R. Barlow 

2# SPDX-License-Identifier: MPL-2.0 

3 

4"""Content stream parsing.""" 

5 

6from __future__ import annotations 

7 

8from collections.abc import Collection 

9from typing import TYPE_CHECKING, Union, cast 

10 

11from pikepdf._core import ( 

12 ContentStreamInlineImage, 

13 ContentStreamInstruction, 

14 Object, 

15 ObjectType, 

16 Page, 

17 PdfError, 

18 _unparse_content_stream, 

19) 

20from pikepdf.objects import Operator 

21 

22if TYPE_CHECKING: 

23 from pikepdf.models.image import PdfInlineImage 

24 

# Legacy instruction shape: an (operands, operator) pair.
# The operands are a collection whose items are either pikepdf Objects or
# PdfInlineImage (referenced by name because it is imported only under
# TYPE_CHECKING).
_OldContentStreamOperands = Collection[Union[Object, 'PdfInlineImage']]
_OldContentStreamInstructions = tuple[_OldContentStreamOperands, Operator]

# Instruction types as declared in the return annotation of
# parse_content_stream().
ContentStreamInstructions = Union[ContentStreamInstruction, ContentStreamInlineImage]

# Anything unparse_content_stream() is annotated to accept: either the
# instruction objects above or the legacy (operands, operator) tuples.
UnparseableContentStreamInstructions = Union[
    ContentStreamInstructions, _OldContentStreamInstructions
]

34 

35 

class PdfParsingError(Exception):
    """Exception raised when a PDF content stream cannot be processed.

    Args:
        message: Text of the error. When omitted or otherwise falsy, a
            default message that mentions ``line`` is generated.
        line: Line of the content stream associated with the error, if any.
    """

    def __init__(self, message=None, line=None):
        # Any falsy message (None, empty string) selects the generated default.
        text = message if message else f"Error encoding content stream at line {line}"
        super().__init__(text)
        # Kept on the instance so handlers can inspect the offending line.
        self.line = line

44 

45 

def parse_content_stream(
    page_or_stream: Object | Page, operators: str = ''
) -> list[ContentStreamInstructions]:
    """Parse a PDF content stream into a sequence of instructions.

    A PDF content stream is a list of instructions that describe where to render
    the text and graphics in a PDF. This is the starting point for analyzing
    PDFs.

    If the input is a page and page.Contents is an array, then the content
    stream is automatically treated as one coalesced stream.

    Each instruction contains at least one operator and zero or more operands.

    This function does not have anything to do with opening a PDF file itself or
    processing data from a whole PDF. It is for processing a specific object inside
    a PDF that is already opened.

    Args:
        page_or_stream: A page object, or the content
            stream attached to another object such as a Form XObject.
        operators: A space-separated string of operators to whitelist.
            For example 'q Q cm Do' will return only operators
            that pertain to drawing images. Use 'BI ID EI' for inline images.
            All other operators and associated tokens are ignored. If blank,
            all tokens are accepted.

    Returns:
        The list of parsed instructions.

    Raises:
        TypeError: If ``page_or_stream`` is not a ``pikepdf.Object`` or
            ``pikepdf.Page``; if it is an ``Object`` that is neither a stream
            nor has ``/Type`` equal to ``/Page``; or if the underlying parser
            reports that the contents are not a stream or an array.
        PdfError: Any other parser error is re-raised unchanged.

    Example:
        >>> with pikepdf.Pdf.open("../tests/resources/pal-1bit-trivial.pdf") as pdf:
        ...     page = pdf.pages[0]
        ...     for operands, command in pikepdf.parse_content_stream(page):
        ...         print(command)
        q
        cm
        Do
        Q

    .. versionchanged:: 3.0
        Returns a list of ``ContentStreamInstructions`` instead of a list
        of (operand, operator) tuples. The returned items are duck-type compatible
        with the previous returned items.
    """
    if not isinstance(page_or_stream, (Object, Page)):
        raise TypeError("stream must be a pikepdf.Object or pikepdf.Page")

    if (
        isinstance(page_or_stream, Object)
        and page_or_stream._type_code != ObjectType.stream
        and page_or_stream.get('/Type') != '/Page'
    ):
        # The object is neither a stream nor a page dictionary.
        raise TypeError(
            "parse_content_stream must be called on page or stream object"
        )

    # From here on, work with the underlying Object in both cases.
    if isinstance(page_or_stream, Page):
        page_or_stream = page_or_stream.obj

    try:
        if page_or_stream.get('/Type') == '/Page':
            page = page_or_stream
            instructions = cast(
                list[ContentStreamInstructions],
                page._parse_page_contents_grouped(operators),
            )
        else:
            stream = page_or_stream
            instructions = cast(
                list[ContentStreamInstructions],
                Object._parse_stream_grouped(stream, operators),
            )
    except PdfError as e:
        # Translate this specific parser complaint into a TypeError; the
        # original PdfError stays attached as the cause.
        if 'supposed to be a stream or an array' in str(e):
            raise TypeError("parse_content_stream called on non-stream Object") from e
        # Re-raise anything else untouched, without making the exception
        # its own __cause__.
        raise

    return instructions

120 

121 

def unparse_content_stream(
    instructions: Collection[UnparseableContentStreamInstructions],
) -> bytes:
    """Serialize a collection of instructions into content stream bytes.

    Takes parsed instructions (or operand-operator pairs) and produces the
    binary form used inside a PDF, where each operator is written after its
    operands.

    Args:
        instructions: collection of instructions such as is returned
            by :func:`parse_content_stream()`

    Returns:
        A binary content stream, suitable for attaching to a Pdf.
        To attach to a Pdf, use :meth:`Pdf.make_stream()`.

    Raises:
        PdfParsingError: If the underlying serializer raises ``ValueError``,
            ``TypeError`` or ``RuntimeError``; the original exception is
            attached as the cause.

    .. versionchanged:: 3.0
        Now accept collections that contain any mixture of
        ``ContentStreamInstruction``, ``ContentStreamInlineImage``, and the older
        operand-operator tuples from pikepdf 2.x.
    """
    try:
        encoded = _unparse_content_stream(instructions)
    except (ValueError, TypeError, RuntimeError) as exc:
        raise PdfParsingError(
            "While unparsing a content stream, an error occurred"
        ) from exc
    return encoded