Coverage Report

Created: 2026-02-26 06:36

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/qpdf/libqpdf/qpdf/QPDFParser.hh
Line
Count
Source
1
#ifndef QPDFPARSER_HH
2
#define QPDFPARSER_HH
3
4
#include <qpdf/InputSource_private.hh>
5
#include <qpdf/QPDFObjectHandle_private.hh>
6
#include <qpdf/QPDFObject_private.hh>
7
#include <qpdf/QPDFTokenizer_private.hh>
8
#include <qpdf/global_private.hh>
9
10
#include <memory>
11
#include <string>
12
13
using namespace qpdf;
14
using namespace qpdf::global;
15
16
namespace qpdf::impl
17
{
18
    /// @class  Parser
19
    /// @brief  Internal parser for PDF objects and content streams.
20
    /// @par
21
    ///         The Parser class provides static methods for parsing PDF objects from input sources.
22
    ///         It handles tokenization, error recovery, and object construction with proper offset
23
    ///         tracking and description for error reporting.
24
    class Parser
25
    {
26
      public:
27
        /// @brief Exception thrown when parser encounters an unrecoverable error.
        /// @par
        ///         The class adds no state or message of its own; what() is the one inherited
        ///         from std::exception.
        class Error: public std::exception
        {
          public:
            Error() = default;
            // `override` (instead of a bare `virtual`) makes the compiler verify that this really
            // overrides the virtual destructor of std::exception.
            ~Error() noexcept override = default;
        };
34
35
        /// @brief Parse a PDF object from an input source.
36
        /// @param input The input source to read from.
37
        /// @param object_description Description of the object for error messages.
38
        /// @param context The QPDF context, or nullptr if parsing standalone.
39
        /// @return The parsed QPDFObjectHandle, or null if parsing fails.
40
        static QPDFObjectHandle
41
        parse(InputSource& input, std::string const& object_description, QPDF* context);
42
43
        /// @brief Parse a content stream from an input source.
44
        /// @param input The input source to read from.
45
        /// @param sp_description Shared pointer to object description.
46
        /// @param tokenizer The tokenizer to use for parsing.
47
        /// @param context The QPDF context.
48
        /// @return The parsed QPDFObjectHandle, or uninitialized handle on EOF.
49
        static QPDFObjectHandle parse_content(
50
            InputSource& input,
51
            std::shared_ptr<QPDFObject::Description> sp_description,
52
            qpdf::Tokenizer& tokenizer,
53
            QPDF* context);
54
55
        /// @brief Parse a PDF object (interface for deprecated QPDFObjectHandle::parse).
56
        /// @param input The input source to read from.
57
        /// @param object_description Description of the object for error messages.
58
        /// @param tokenizer The tokenizer to use for parsing.
59
        /// @param empty Output parameter indicating if object was empty.
60
        /// @param decrypter String decrypter for encrypted strings, or nullptr.
61
        /// @param context The QPDF context, or nullptr if parsing standalone.
62
        /// @return The parsed QPDFObjectHandle.
63
        static QPDFObjectHandle parse(
64
            InputSource& input,
65
            std::string const& object_description,
66
            QPDFTokenizer& tokenizer,
67
            bool& empty,
68
            QPDFObjectHandle::StringDecrypter* decrypter,
69
            QPDF* context);
70
71
        /// @brief Parse a PDF object for use by QPDF.
72
        /// @param input The input source to read from.
73
        /// @param object_description Description of the object for error messages.
74
        /// @param tokenizer The tokenizer to use for parsing.
75
        /// @param decrypter String decrypter for encrypted strings, or nullptr.
76
        /// @param context The QPDF context.
77
        /// @param sanity_checks Enable additional sanity checks during parsing.
78
        /// @return The parsed QPDFObjectHandle.
79
        static QPDFObjectHandle parse(
80
            InputSource& input,
81
            std::string const& object_description,
82
            qpdf::Tokenizer& tokenizer,
83
            QPDFObjectHandle::StringDecrypter* decrypter,
84
            QPDF& context,
85
            bool sanity_checks);
86
87
        /// @brief Parse an object from an object stream.
88
        /// @param input The offset buffer containing the object data.
89
        /// @param stream_id The object stream number.
90
        /// @param obj_id The object ID within the stream.
91
        /// @param tokenizer The tokenizer to use for parsing.
92
        /// @param context The QPDF context.
93
        /// @return The parsed QPDFObjectHandle.
94
        static QPDFObjectHandle parse(
95
            qpdf::is::OffsetBuffer& input,
96
            int stream_id,
97
            int obj_id,
98
            qpdf::Tokenizer& tokenizer,
99
            QPDF& context);
100
101
        /// @brief Create a description for a parsed object.
102
        /// @param input_name The name of the input source.
103
        /// @param object_description Description of the object being parsed.
104
        /// @return Shared pointer to object description with offset placeholder.
105
        static std::shared_ptr<QPDFObject::Description>
106
        make_description(std::string const& input_name, std::string const& object_description)
107
183k
        {
108
183k
            using namespace std::literals;
109
183k
            return std::make_shared<QPDFObject::Description>(
110
183k
                input_name + ", " + object_description + " at offset $PO");
111
183k
        }
112
113
      private:
114
        /// @brief Construct a parser instance.
115
        /// @param input The input source to read from.
116
        /// @param sp_description Shared pointer to object description.
117
        /// @param object_description Description string for error messages.
118
        /// @param tokenizer The tokenizer to use for parsing.
119
        /// @param decrypter String decrypter for encrypted content.
120
        /// @param context The QPDF context.
121
        /// @param parse_pdf Whether parsing PDF objects (vs content streams).
122
        /// @param stream_id Object stream ID for object stream parsing.
123
        /// @param obj_id Object ID within object stream.
124
        /// @param sanity_checks Enable additional sanity checks.
125
        Parser(
126
            InputSource& input,
127
            std::shared_ptr<QPDFObject::Description> sp_description,
128
            std::string const& object_description,
129
            qpdf::Tokenizer& tokenizer,
130
            QPDFObjectHandle::StringDecrypter* decrypter,
131
            QPDF* context,
132
            bool parse_pdf,
133
            int stream_id = 0,
134
            int obj_id = 0,
135
            bool sanity_checks = false) :
136
1.05M
            input_(input),
137
1.05M
            object_description_(object_description),
138
1.05M
            tokenizer_(tokenizer),
139
1.05M
            decrypter_(decrypter),
140
1.05M
            context_(context),
141
1.05M
            description_(std::move(sp_description)),
142
1.05M
            parse_pdf_(parse_pdf),
143
1.05M
            stream_id_(stream_id),
144
1.05M
            obj_id_(obj_id),
145
1.05M
            sanity_checks_(sanity_checks)
146
1.05M
        {
147
1.05M
        }
148
149
        /// @brief Parser state enumeration.
150
        /// @note state <= st_dictionary_value indicates we're in a dictionary context.
151
        enum parser_state_e {
            // Enumerator order is significant: both dictionary states come before st_array so
            // that `state <= st_dictionary_value` identifies a dictionary context.
            st_dictionary_key,   // first enumerator (value 0)
            st_dictionary_value, // second enumerator (value 1)
            st_array             // last enumerator (value 2)
        };
152
153
        /// @brief Stack frame for tracking nested arrays and dictionaries.
154
        struct StackFrame
155
        {
156
            StackFrame(InputSource& input, parser_state_e state) :
157
625k
                state(state),
158
625k
                offset(input.tell())
159
625k
            {
160
625k
            }
161
162
            std::vector<QPDFObjectHandle> olist;          ///< Object list for arrays/dict values
163
            std::map<std::string, QPDFObjectHandle> dict; ///< Dictionary entries
164
            parser_state_e state;                         ///< Current parser state
165
            std::string key;                              ///< Current dictionary key
166
            qpdf_offset_t offset;                         ///< Offset of container start
167
            std::string contents_string;                  ///< For /Contents field in signatures
168
            qpdf_offset_t contents_offset{-1};            ///< Offset of /Contents value
169
            int null_count{0};                            ///< Count of null values in container
170
        };
171
172
        /// @brief Parse an object, handling exceptions and returning null on error.
173
        /// @param content_stream True if parsing a content stream.
174
        /// @return The parsed object handle, or null/uninitialized on error.
175
        QPDFObjectHandle parse(bool content_stream = false);
176
177
        /// @brief Parse the first token and dispatch to appropriate handler.
178
        /// @param content_stream True if parsing a content stream.
179
        /// @return The parsed object handle.
180
        QPDFObjectHandle parse_first(bool content_stream);
181
182
        /// @brief Parse the remainder of a composite object (array/dict/reference).
183
        /// @param content_stream True if parsing a content stream.
184
        /// @return The completed object handle.
185
        QPDFObjectHandle parse_remainder(bool content_stream);
186
187
        /// @brief Add an object to the current container.
188
        /// @param obj The object to add.
189
        void add(std::shared_ptr<QPDFObject>&& obj);
190
191
        /// @brief Add a null object to the current container.
192
        void add_null();
193
194
        /// @brief Add a null with a warning message.
195
        /// @param msg Warning message describing the error.
196
        void add_bad_null(std::string const& msg);
197
198
        /// @brief Add a buffered integer from int_buffer_.
199
        /// @param count Buffer index (1 or 2) to read from.
200
        void add_int(int count);
201
202
        /// @brief Create and add a scalar object to the current container.
203
        /// @tparam T The scalar object type (e.g., QPDF_Integer, QPDF_String).
204
        /// @tparam Args Constructor argument types.
205
        /// @param args Arguments to forward to the object constructor.
206
        template <typename T, typename... Args>
207
        void add_scalar(Args&&... args);
208
209
        /// @brief Check if too many bad tokens have been encountered and throw if so.
210
        void check_too_many_bad_tokens();
211
212
        /// @brief Issue a warning about a duplicate dictionary key.
213
        void warn_duplicate_key();
214
215
        /// @brief Fix dictionaries with missing keys by generating fake keys.
216
        void fix_missing_keys();
217
218
        /// @brief Report a limits error and throw.
219
        /// @param limit The limit identifier.
220
        /// @param msg Error message.
221
        [[noreturn]] void limits_error(std::string const& limit, std::string const& msg);
222
223
        /// @brief Issue a warning at a specific offset.
224
        /// @param offset File offset for the warning.
225
        /// @param msg Warning message.
226
        void warn(qpdf_offset_t offset, std::string const& msg) const;
227
228
        /// @brief Issue a warning at the current offset.
229
        /// @param msg Warning message.
230
        void warn(std::string const& msg) const;
231
232
        /// @brief Issue a warning from a QPDFExc exception.
233
        /// @param e The exception to report.
234
        void warn(QPDFExc const& e) const;
235
236
        /// @brief Create a scalar object with description and parsed offset.
237
        /// @tparam T The scalar object type.
238
        /// @tparam Args Constructor argument types.
239
        /// @param args Arguments to forward to the object constructor.
240
        /// @return Object handle with description and offset set.
241
        /// @note The offset includes any leading whitespace.
242
        template <typename T, typename... Args>
243
        QPDFObjectHandle with_description(Args&&... args);
244
245
        /// @brief Set the description and offset on an existing object.
246
        /// @param obj The object to update.
247
        /// @param parsed_offset The file offset where the object was parsed.
248
        void set_description(std::shared_ptr<QPDFObject>& obj, qpdf_offset_t parsed_offset);
249
250
        // Core parsing state
251
        InputSource& input_;                           ///< Input source to read from
252
        std::string const& object_description_;        ///< Description for error messages
253
        qpdf::Tokenizer& tokenizer_;                   ///< Tokenizer for lexical analysis
254
        QPDFObjectHandle::StringDecrypter* decrypter_; ///< Decrypter for encrypted strings
255
        QPDF* context_;                                ///< QPDF context for object resolution
256
        std::shared_ptr<QPDFObject::Description> description_; ///< Shared description for objects
257
        bool parse_pdf_{false};     ///< True if parsing PDF objects vs content streams
258
        int stream_id_{0};          ///< Object stream ID (for object stream parsing)
259
        int obj_id_{0};             ///< Object ID within object stream
260
        bool sanity_checks_{false}; ///< Enable additional validation checks
261
262
        // Composite object parsing state
263
        std::vector<StackFrame> stack_; ///< Stack of nested containers
264
        StackFrame* frame_{nullptr};    ///< Current stack frame pointer
265
266
        // Error tracking state
267
        /// Number of recent bad tokens. Always > 0 after first bad token encountered.
268
        int bad_count_{0};
269
        /// Number of bad tokens remaining before giving up.
270
        uint32_t max_bad_count_{Limits::parser_max_errors()};
271
        /// Number of good tokens since last bad token. Irrelevant if bad_count_ == 0.
272
        int good_count_{0};
273
274
        // Token buffering state
275
        /// Start offset of current object, including any leading whitespace.
276
        qpdf_offset_t start_{0};
277
        /// Number of successive integer tokens (for indirect reference detection).
278
        int int_count_{0};
279
        /// Buffer for up to 2 integer tokens.
280
        long long int_buffer_[2]{0, 0};
281
        /// Offsets corresponding to buffered integers.
282
        qpdf_offset_t last_offset_buffer_[2]{0, 0};
283
284
        /// True if object was empty (endobj without content).
285
        bool empty_{false};
286
    };
287
} // namespace qpdf::impl
288
289
#endif // QPDFPARSER_HH