Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pikepdf/models/_content_stream.py

1# SPDX-FileCopyrightText: 2022 James R. Barlow

2# SPDX-License-Identifier: MPL-2.0

4"""Content stream parsing."""

6from __future__ import annotations

8from collections.abc import Collection

9from typing import TYPE_CHECKING, Union, cast

11from pikepdf._core import (

12 ContentStreamInlineImage,

13 ContentStreamInstruction,

14 Object,

15 ObjectType,

16 Page,

17 PdfError,

18 _unparse_content_stream,

19)

20from pikepdf.objects import Operator

22if TYPE_CHECKING:

23 from pikepdf.models.image import PdfInlineImage

# Legacy instruction shape (pikepdf 2.x): an (operands, operator) pair, where
# the operands are a collection of objects and/or inline images.
_OldContentStreamOperands = Collection[Union[Object, 'PdfInlineImage']]
_OldContentStreamInstructions = tuple[_OldContentStreamOperands, Operator]

# Element type of the list returned by parse_content_stream().
ContentStreamInstructions = Union[ContentStreamInstruction, ContentStreamInlineImage]

# Element type accepted by unparse_content_stream(): either the current
# instruction objects or the legacy (operands, operator) tuples.
UnparseableContentStreamInstructions = Union[
    ContentStreamInstructions, _OldContentStreamInstructions
]

class PdfParsingError(Exception):
    """Error when parsing a PDF content stream.

    Attributes:
        line: The line number passed to the constructor, or ``None`` if no
            line was given.
    """

    def __init__(self, message=None, line=None):
        """Create the error.

        Args:
            message: Text of the error. If omitted (or otherwise falsy), a
                default message is generated.
            line: Optional line number associated with the error. It is
                included in the generated default message when provided and
                is always stored on the ``line`` attribute.
        """
        if not message:
            message = "Error encoding content stream"
            # Only mention a line when one is known; otherwise the message
            # would read "... at line None".
            if line is not None:
                message = f"{message} at line {line}"
        super().__init__(message)
        self.line = line

def parse_content_stream(
    page_or_stream: Object | Page, operators: str = ''
) -> list[ContentStreamInstructions]:
    """Parse a PDF content stream into a sequence of instructions.

    A PDF content stream is a list of instructions that describe where to render
    the text and graphics in a PDF. This is the starting point for analyzing
    PDFs.

    If the input is a page and page.Contents is an array, then the content
    stream is automatically treated as one coalesced stream.

    Each instruction contains at least one operator and zero or more operands.

    This function does not have anything to do with opening a PDF file itself or
    processing data from a whole PDF. It is for processing a specific object inside
    a PDF that is already opened.

    Args:
        page_or_stream: A page object, or the content
            stream attached to another object such as a Form XObject.
        operators: A space-separated string of operators to whitelist.
            For example 'q Q cm Do' will return only operators
            that pertain to drawing images. Use 'BI ID EI' for inline images.
            All other operators and associated tokens are ignored. If blank,
            all tokens are accepted.

    Returns:
        The parsed instructions, in the order produced by the parser.

    Raises:
        TypeError: If ``page_or_stream`` is not a ``pikepdf.Object`` or
            ``pikepdf.Page``, if it is an object that is neither a stream
            nor a page, or if the parser reports that it was not given a
            stream or an array.
        PdfError: Any other parser error is re-raised unchanged.

    Example:
        >>> with pikepdf.Pdf.open("../tests/resources/pal-1bit-trivial.pdf") as pdf:
        ...     page = pdf.pages[0]
        ...     for operands, command in pikepdf.parse_content_stream(page):
        ...         print(command)
        q
        cm
        Do
        Q

    .. versionchanged:: 3.0
        Returns a list of ``ContentStreamInstructions`` instead of a list
        of (operand, operator) tuples. The returned items are duck-type compatible
        with the previous returned items.
    """
    if not isinstance(page_or_stream, (Object, Page)):
        raise TypeError("stream must be a pikepdf.Object or pikepdf.Page")

    # Reject objects that are neither a stream nor a page dictionary. (The
    # message text is kept as-is so callers matching on it keep working.)
    if (
        isinstance(page_or_stream, Object)
        and page_or_stream._type_code != ObjectType.stream
        and page_or_stream.get('/Type') != '/Page'
    ):
        raise TypeError("parse_content_stream called on page or stream object")

    # From here on, work with the underlying object rather than the Page wrapper.
    if isinstance(page_or_stream, Page):
        page_or_stream = page_or_stream.obj

    try:
        if page_or_stream.get('/Type') == '/Page':
            page = page_or_stream
            instructions = cast(
                list[ContentStreamInstructions],
                page._parse_page_contents_grouped(operators),
            )
        else:
            stream = page_or_stream
            instructions = cast(
                list[ContentStreamInstructions],
                Object._parse_stream_grouped(stream, operators),
            )
    except PdfError as e:
        # Translate this specific parser complaint into a TypeError, matching
        # the argument checks above.
        if 'supposed to be a stream or an array' in str(e):
            raise TypeError("parse_content_stream called on non-stream Object") from e
        # Bare raise keeps the original exception and traceback intact;
        # ``raise e from e`` would set the exception as its own cause.
        raise

    return instructions

120

121

def unparse_content_stream(
    instructions: Collection[UnparseableContentStreamInstructions],
) -> bytes:
    """Serialize a collection of instructions to content stream bytes.

    Takes parsed instructions (or operand-operator tuples) and produces bytes
    suitable for embedding in a PDF. In PDF the operator always follows the
    operands.

    Args:
        instructions: collection of instructions such as is returned
            by :func:`parse_content_stream()`

    Returns:
        A binary content stream, suitable for attaching to a Pdf.
        To attach to a Pdf, use :meth:`Pdf.make_stream()`.

    Raises:
        PdfParsingError: If the underlying serializer raises ``ValueError``,
            ``TypeError`` or ``RuntimeError``; the original exception is
            chained as the cause.

    .. versionchanged:: 3.0
        Now accept collections that contain any mixture of
        ``ContentStreamInstruction``, ``ContentStreamInlineImage``, and the older
        operand-operator tuples from pikepdf 2.x.
    """
    try:
        encoded = _unparse_content_stream(instructions)
    except (ValueError, TypeError, RuntimeError) as exc:
        raise PdfParsingError(
            "While unparsing a content stream, an error occurred"
        ) from exc
    return encoded

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pikepdf/models/_content_stream.py: 36%

42 statements