/src/qpdf/libqpdf/qpdf/QPDFParser.hh
Line | Count | Source |
1 | | #ifndef QPDFPARSER_HH |
2 | | #define QPDFPARSER_HH |
3 | | |
4 | | #include <qpdf/InputSource_private.hh> |
5 | | #include <qpdf/QPDFObjectHandle_private.hh> |
6 | | #include <qpdf/QPDFObject_private.hh> |
7 | | #include <qpdf/QPDFTokenizer_private.hh> |
8 | | #include <qpdf/global_private.hh> |
9 | | |
10 | | #include <memory> |
11 | | #include <string> |
12 | | |
13 | | using namespace qpdf; |
14 | | using namespace qpdf::global; |
15 | | |
16 | | namespace qpdf::impl |
17 | | { |
18 | | /// @class Parser |
19 | | /// @brief Internal parser for PDF objects and content streams. |
20 | | /// @par |
21 | | /// The Parser class provides static methods for parsing PDF objects from input sources. |
22 | | /// It handles tokenization, error recovery, and object construction with proper offset |
23 | | /// tracking and description for error reporting. |
24 | | class Parser |
25 | | { |
26 | | public: |
27 | | /// @brief Exception thrown when the parser encounters an unrecoverable error. Declares no data members and does not override what(). |
28 | | class Error: public std::exception |
29 | | { |
30 | | public: |
31 | 8.37k | Error() = default; |
32 | | virtual ~Error() noexcept = default; |
33 | | }; |
34 | | |
35 | | /// @brief Parse a PDF object from an input source (overload that takes no tokenizer or decrypter). |
36 | | /// @param input The input source to read from. |
37 | | /// @param object_description Description of the object for error messages. |
38 | | /// @param context The QPDF context, or nullptr if parsing standalone. |
39 | | /// @return The parsed QPDFObjectHandle, or null if parsing fails. |
40 | | static QPDFObjectHandle |
41 | | parse(InputSource& input, std::string const& object_description, QPDF* context); |
42 | | |
43 | | /// @brief Parse a content stream from an input source. |
44 | | /// @param input The input source to read from. |
45 | | /// @param sp_description Shared pointer to the object description (passed by value). |
46 | | /// @param tokenizer The tokenizer to use for parsing. |
47 | | /// @param context The QPDF context (passed as a pointer). |
48 | | /// @return The parsed QPDFObjectHandle, or uninitialized handle on EOF. |
49 | | static QPDFObjectHandle parse_content( |
50 | | InputSource& input, |
51 | | std::shared_ptr<QPDFObject::Description> sp_description, |
52 | | qpdf::Tokenizer& tokenizer, |
53 | | QPDF* context); |
54 | | |
55 | | /// @brief Parse a PDF object (interface for deprecated QPDFObjectHandle::parse). |
56 | | /// @param input The input source to read from. |
57 | | /// @param object_description Description of the object for error messages. |
58 | | /// @param tokenizer The QPDFTokenizer to use for parsing (this overload takes QPDFTokenizer rather than qpdf::Tokenizer). |
59 | | /// @param empty Output parameter (non-const reference) indicating if object was empty. |
60 | | /// @param decrypter String decrypter for encrypted strings, or nullptr. |
61 | | /// @param context The QPDF context, or nullptr if parsing standalone. |
62 | | /// @return The parsed QPDFObjectHandle. |
63 | | static QPDFObjectHandle parse( |
64 | | InputSource& input, |
65 | | std::string const& object_description, |
66 | | QPDFTokenizer& tokenizer, |
67 | | bool& empty, |
68 | | QPDFObjectHandle::StringDecrypter* decrypter, |
69 | | QPDF* context); |
70 | | |
71 | | /// @brief Parse a PDF object for use by QPDF. |
72 | | /// @param input The input source to read from. |
73 | | /// @param object_description Description of the object for error messages. |
74 | | /// @param tokenizer The tokenizer to use for parsing. |
75 | | /// @param decrypter String decrypter for encrypted strings, or nullptr. |
76 | | /// @param context The QPDF context (passed by reference, so it cannot be null). |
77 | | /// @param sanity_checks Enable additional sanity checks during parsing. |
78 | | /// @return The parsed QPDFObjectHandle. |
79 | | static QPDFObjectHandle parse( |
80 | | InputSource& input, |
81 | | std::string const& object_description, |
82 | | qpdf::Tokenizer& tokenizer, |
83 | | QPDFObjectHandle::StringDecrypter* decrypter, |
84 | | QPDF& context, |
85 | | bool sanity_checks); |
86 | | |
87 | | /// @brief Parse an object from an object stream. |
88 | | /// @param input The offset buffer containing the object data. |
89 | | /// @param stream_id The object stream number. |
90 | | /// @param obj_id The object ID within the stream. |
91 | | /// @param tokenizer The tokenizer to use for parsing. |
92 | | /// @param context The QPDF context (passed by reference, so it cannot be null). |
93 | | /// @return The parsed QPDFObjectHandle. |
94 | | static QPDFObjectHandle parse( |
95 | | qpdf::is::OffsetBuffer& input, |
96 | | int stream_id, |
97 | | int obj_id, |
98 | | qpdf::Tokenizer& tokenizer, |
99 | | QPDF& context); |
100 | | |
101 | | /// @brief Create a description for a parsed object. |
102 | | /// @param input_name The name of the input source. |
103 | | /// @param object_description Description of the object being parsed. |
104 | | /// @return Shared pointer to a description built as input_name + ", " + object_description + " at offset $PO" ($PO is the offset placeholder). |
105 | | static std::shared_ptr<QPDFObject::Description> |
106 | | make_description(std::string const& input_name, std::string const& object_description) |
107 | 183k | { |
108 | 183k | using namespace std::literals; |
109 | 183k | return std::make_shared<QPDFObject::Description>( |
110 | 183k | input_name + ", " + object_description + " at offset $PO"); |
111 | 183k | } |
112 | | |
113 | | private: |
114 | | /// @brief Construct a parser instance; only stores its arguments (the constructor body is empty). |
115 | | /// @param input The input source to read from (stored by reference). |
116 | | /// @param sp_description Shared pointer to object description (moved into the parser). |
117 | | /// @param object_description Description string for error messages; stored by reference, so it must outlive the parser. |
118 | | /// @param tokenizer The tokenizer to use for parsing (stored by reference). |
119 | | /// @param decrypter String decrypter for encrypted content. |
120 | | /// @param context The QPDF context. |
121 | | /// @param parse_pdf Whether parsing PDF objects (vs content streams). |
122 | | /// @param stream_id Object stream ID for object stream parsing (defaults to 0). |
123 | | /// @param obj_id Object ID within object stream (defaults to 0). |
124 | | /// @param sanity_checks Enable additional sanity checks (defaults to false). |
125 | | Parser( |
126 | | InputSource& input, |
127 | | std::shared_ptr<QPDFObject::Description> sp_description, |
128 | | std::string const& object_description, |
129 | | qpdf::Tokenizer& tokenizer, |
130 | | QPDFObjectHandle::StringDecrypter* decrypter, |
131 | | QPDF* context, |
132 | | bool parse_pdf, |
133 | | int stream_id = 0, |
134 | | int obj_id = 0, |
135 | | bool sanity_checks = false) : |
136 | 1.05M | input_(input), |
137 | 1.05M | object_description_(object_description), |
138 | 1.05M | tokenizer_(tokenizer), |
139 | 1.05M | decrypter_(decrypter), |
140 | 1.05M | context_(context), |
141 | 1.05M | description_(std::move(sp_description)), |
142 | 1.05M | parse_pdf_(parse_pdf), |
143 | 1.05M | stream_id_(stream_id), |
144 | 1.05M | obj_id_(obj_id), |
145 | 1.05M | sanity_checks_(sanity_checks) |
146 | 1.05M | { |
147 | 1.05M | } |
148 | | |
149 | | /// @brief Parser state enumeration (two dictionary states followed by the array state). |
150 | | /// @note Enumerator order is significant: state <= st_dictionary_value indicates we're in a dictionary context. |
151 | | enum parser_state_e { st_dictionary_key, st_dictionary_value, st_array }; |
152 | | |
153 | | /// @brief Stack frame for tracking nested arrays and dictionaries; records input.tell() at construction. |
154 | | struct StackFrame |
155 | | { |
156 | | StackFrame(InputSource& input, parser_state_e state) : |
157 | 625k | state(state), |
158 | 625k | offset(input.tell()) |
159 | 625k | { |
160 | 625k | } |
161 | | |
162 | | std::vector<QPDFObjectHandle> olist; ///< Object list for arrays/dict values |
163 | | std::map<std::string, QPDFObjectHandle> dict; ///< Dictionary entries |
164 | | parser_state_e state; ///< Current parser state (set by the constructor) |
165 | | std::string key; ///< Current dictionary key |
166 | | qpdf_offset_t offset; ///< Offset of container start (input.tell() when the frame was constructed) |
167 | | std::string contents_string; ///< For /Contents field in signatures |
168 | | qpdf_offset_t contents_offset{-1}; ///< Offset of /Contents value (initialized to -1) |
169 | | int null_count{0}; ///< Count of null values in container |
170 | | }; |
171 | | |
172 | | /// @brief Parse an object, handling exceptions and returning null on error. |
173 | | /// @param content_stream True if parsing a content stream (defaults to false). |
174 | | /// @return The parsed object handle, or null/uninitialized on error. |
175 | | QPDFObjectHandle parse(bool content_stream = false); |
176 | | |
177 | | /// @brief Parse the first token and dispatch to appropriate handler. |
178 | | /// @param content_stream True if parsing a content stream (no default here; must be passed explicitly). |
179 | | /// @return The parsed object handle. |
180 | | QPDFObjectHandle parse_first(bool content_stream); |
181 | | |
182 | | /// @brief Parse the remainder of a composite object (array/dict/reference). |
183 | | /// @param content_stream True if parsing a content stream (no default here; must be passed explicitly). |
184 | | /// @return The completed object handle. |
185 | | QPDFObjectHandle parse_remainder(bool content_stream); |
186 | | |
187 | | /// @brief Add an object to the current container. |
188 | | /// @param obj The object to add (taken by rvalue reference). |
189 | | void add(std::shared_ptr<QPDFObject>&& obj); |
190 | | |
191 | | /// @brief Add a null object to the current container. |
192 | | void add_null(); |
193 | | |
194 | | /// @brief Add a null with a warning message. |
195 | | /// @param msg Warning message describing the error. |
196 | | void add_bad_null(std::string const& msg); |
197 | | |
198 | | /// @brief Add a buffered integer from int_buffer_. |
199 | | /// @param count Selects the buffered integer to add (documented as 1 or 2; int_buffer_ has two entries - TODO confirm how count maps to an index). |
200 | | void add_int(int count); |
201 | | |
202 | | /// @brief Create and add a scalar object to the current container. |
203 | | /// @tparam T The scalar object type (e.g., QPDF_Integer, QPDF_String). |
204 | | /// @tparam Args Constructor argument types (deduced from the forwarded arguments). |
205 | | /// @param args Arguments to forward to the object constructor. |
206 | | template <typename T, typename... Args> |
207 | | void add_scalar(Args&&... args); |
208 | | |
209 | | /// @brief Check if too many bad tokens have been encountered and throw if so. |
210 | | void check_too_many_bad_tokens(); |
211 | | |
212 | | /// @brief Issue a warning about a duplicate dictionary key. |
213 | | void warn_duplicate_key(); |
214 | | |
215 | | /// @brief Fix dictionaries with missing keys by generating fake keys. |
216 | | void fix_missing_keys(); |
217 | | |
218 | | /// @brief Report a limits error and throw; declared [[noreturn]], so it never returns to the caller. |
219 | | /// @param limit The limit identifier. |
220 | | /// @param msg Error message. |
221 | | [[noreturn]] void limits_error(std::string const& limit, std::string const& msg); |
222 | | |
223 | | /// @brief Issue a warning at a specific offset (const member function). |
224 | | /// @param offset File offset for the warning. |
225 | | /// @param msg Warning message. |
226 | | void warn(qpdf_offset_t offset, std::string const& msg) const; |
227 | | |
228 | | /// @brief Issue a warning at the current offset (const member function). |
229 | | /// @param msg Warning message. |
230 | | void warn(std::string const& msg) const; |
231 | | |
232 | | /// @brief Issue a warning from a QPDFExc exception (const member function). |
233 | | /// @param e The exception to report. |
234 | | void warn(QPDFExc const& e) const; |
235 | | |
236 | | /// @brief Create a scalar object with description and parsed offset. |
237 | | /// @tparam T The scalar object type. |
238 | | /// @tparam Args Constructor argument types (deduced from the forwarded arguments). |
239 | | /// @param args Arguments to forward to the object constructor. |
240 | | /// @return Object handle with description and offset set. |
241 | | /// @note The offset includes any leading whitespace. |
242 | | template <typename T, typename... Args> |
243 | | QPDFObjectHandle with_description(Args&&... args); |
244 | | |
245 | | /// @brief Set the description and offset on an existing object. |
246 | | /// @param obj The object to update (passed as a non-const reference to the shared pointer). |
247 | | /// @param parsed_offset The file offset where the object was parsed. |
248 | | void set_description(std::shared_ptr<QPDFObject>& obj, qpdf_offset_t parsed_offset); |
249 | | |
250 | | // Core parsing state |
251 | | InputSource& input_; ///< Input source to read from |
252 | | std::string const& object_description_; ///< Description for error messages (held by reference, not copied) |
253 | | qpdf::Tokenizer& tokenizer_; ///< Tokenizer for lexical analysis |
254 | | QPDFObjectHandle::StringDecrypter* decrypter_; ///< Decrypter for encrypted strings |
255 | | QPDF* context_; ///< QPDF context for object resolution |
256 | | std::shared_ptr<QPDFObject::Description> description_; ///< Shared description for objects |
257 | | bool parse_pdf_{false}; ///< True if parsing PDF objects vs content streams |
258 | | int stream_id_{0}; ///< Object stream ID (for object stream parsing) |
259 | | int obj_id_{0}; ///< Object ID within object stream |
260 | | bool sanity_checks_{false}; ///< Enable additional validation checks |
261 | | |
262 | | // Composite object parsing state |
263 | | std::vector<StackFrame> stack_; ///< Stack of nested containers |
264 | | StackFrame* frame_{nullptr}; ///< Current stack frame pointer (nullptr until set) |
265 | | |
266 | | // Error tracking state |
267 | | /// Number of recent bad tokens. Always > 0 after first bad token encountered. |
268 | | int bad_count_{0}; |
269 | | /// Number of bad tokens remaining before giving up; initialized from Limits::parser_max_errors(). |
270 | | uint32_t max_bad_count_{Limits::parser_max_errors()}; |
271 | | /// Number of good tokens since last bad token. Irrelevant if bad_count_ == 0. |
272 | | int good_count_{0}; |
273 | | |
274 | | // Token buffering state |
275 | | /// Start offset of current object, including any leading whitespace. |
276 | | qpdf_offset_t start_{0}; |
277 | | /// Number of successive integer tokens (for indirect reference detection). |
278 | | int int_count_{0}; |
279 | | /// Buffer for up to 2 integer tokens. |
280 | | long long int_buffer_[2]{0, 0}; |
281 | | /// Offsets corresponding to buffered integers (same size as int_buffer_). |
282 | | qpdf_offset_t last_offset_buffer_[2]{0, 0}; |
283 | | |
284 | | /// True if object was empty (endobj without content). |
285 | | bool empty_{false}; |
286 | | }; |
287 | | } // namespace qpdf::impl |
288 | | |
289 | | #endif // QPDFPARSER_HH |