/src/qpdf/include/qpdf/QPDFTokenizer.hh
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) 2005-2021 Jay Berkenbilt |
2 | | // Copyright (c) 2022-2025 Jay Berkenbilt and Manfred Holger |
3 | | // |
4 | | // This file is part of qpdf. |
5 | | // |
6 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
7 | | // in compliance with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
12 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
13 | | // or implied. See the License for the specific language governing permissions and limitations under |
14 | | // the License. |
15 | | // |
16 | | // Versions of qpdf prior to version 7 were released under the terms of version 2.0 of the Artistic |
17 | | // License. At your option, you may continue to consider qpdf to be licensed under those terms. |
18 | | // Please see the manual for additional information. |
19 | | |
20 | | #ifndef QPDFTOKENIZER_HH |
21 | | #define QPDFTOKENIZER_HH |
22 | | |
23 | | #include <qpdf/DLL.h> |
24 | | |
25 | | #include <qpdf/InputSource.hh> |
26 | | |
#include <cstdio>
#include <memory>
#include <string>
#include <utility>
30 | | |
// Forward declaration only: the full definition of qpdf::Tokenizer is not needed in this header.
namespace qpdf { class Tokenizer; } // namespace qpdf
35 | | |
36 | | class QPDFTokenizer |
37 | | { |
38 | | public: |
39 | | // Token type tt_eof is only returned of allowEOF() is called on the tokenizer. tt_eof was |
40 | | // introduced in QPDF version 4.1. tt_space, tt_comment, and tt_inline_image were added in QPDF |
41 | | // version 8. |
42 | | enum token_type_e { |
43 | | tt_bad, |
44 | | tt_array_close, |
45 | | tt_array_open, |
46 | | tt_brace_close, |
47 | | tt_brace_open, |
48 | | tt_dict_close, |
49 | | tt_dict_open, |
50 | | tt_integer, |
51 | | tt_name, |
52 | | tt_real, |
53 | | tt_string, |
54 | | tt_null, |
55 | | tt_bool, |
56 | | tt_word, |
57 | | tt_eof, |
58 | | tt_space, |
59 | | tt_comment, |
60 | | tt_inline_image, |
61 | | }; |
62 | | |
63 | | class Token |
64 | | { |
65 | | public: |
66 | | Token() : |
67 | 0 | type(tt_bad) |
68 | 0 | { |
69 | 0 | } |
70 | | QPDF_DLL |
71 | | Token(token_type_e type, std::string const& value); |
72 | | Token( |
73 | | token_type_e type, |
74 | | std::string const& value, |
75 | | std::string raw_value, |
76 | | std::string error_message) : |
77 | 0 | type(type), |
78 | 0 | value(value), |
79 | 0 | raw_value(raw_value), |
80 | 0 | error_message(error_message) |
81 | 0 | { |
82 | 0 | } |
83 | | token_type_e |
84 | | getType() const |
85 | 0 | { |
86 | 0 | return this->type; |
87 | 0 | } |
88 | | std::string const& |
89 | | getValue() const |
90 | 0 | { |
91 | 0 | return this->value; |
92 | 0 | } |
93 | | std::string const& |
94 | | getRawValue() const |
95 | 0 | { |
96 | 0 | return this->raw_value; |
97 | 0 | } |
98 | | std::string const& |
99 | | getErrorMessage() const |
100 | 0 | { |
101 | 0 | return this->error_message; |
102 | 0 | } |
103 | | bool |
104 | | operator==(Token const& rhs) const |
105 | 0 | { |
106 | 0 | // Ignore fields other than type and value |
107 | 0 | return ( |
108 | 0 | (this->type != tt_bad) && (this->type == rhs.type) && (this->value == rhs.value)); |
109 | 0 | } |
110 | | bool |
111 | | isInteger() const |
112 | 0 | { |
113 | 0 | return this->type == tt_integer; |
114 | 0 | } |
115 | | bool |
116 | | isWord() const |
117 | 0 | { |
118 | 0 | return this->type == tt_word; |
119 | 0 | } |
120 | | bool |
121 | | isWord(std::string const& value) const |
122 | 0 | { |
123 | 0 | return this->type == tt_word && this->value == value; |
124 | 0 | } |
125 | | |
126 | | private: |
127 | | token_type_e type; |
128 | | std::string value; |
129 | | std::string raw_value; |
130 | | std::string error_message; |
131 | | }; |
132 | | |
133 | | QPDF_DLL |
134 | | QPDFTokenizer(); |
135 | | |
136 | | QPDF_DLL |
137 | | ~QPDFTokenizer(); |
138 | | |
139 | | // If called, treat EOF as a separate token type instead of an error. This was introduced in |
140 | | // QPDF 4.1 to facilitate tokenizing content streams. |
141 | | QPDF_DLL |
142 | | void allowEOF(); |
143 | | |
144 | | // If called, readToken will return "ignorable" tokens for space and comments. This was added in |
145 | | // QPDF 8. |
146 | | QPDF_DLL |
147 | | void includeIgnorable(); |
148 | | |
149 | | // There are two modes of operation: push and pull. The pull method is easier but requires an |
150 | | // input source. The push method is more complicated but can be used to tokenize a stream of |
151 | | // incoming characters in a pipeline. |
152 | | |
153 | | // Push mode: |
154 | | |
155 | | // deprecated, please see <https:manual.qpdf.org/release-notes.html#r12-0-0-deprecate> |
156 | | |
157 | | // Keep presenting characters with presentCharacter() and presentEOF() and calling getToken() |
158 | | // until getToken() returns true. When it does, be sure to check unread_ch and to unread ch if |
159 | | // it is true. If these are called when a token is available, an exception will be thrown. |
160 | | QPDF_DLL |
161 | | void presentCharacter(char ch); |
162 | | QPDF_DLL |
163 | | void presentEOF(); |
164 | | |
165 | | // If a token is available, return true and initialize token with the token, unread_char with |
166 | | // whether or not we have to unread the last character, and if unread_char, ch with the |
167 | | // character to unread. |
168 | | QPDF_DLL |
169 | | bool getToken(Token& token, bool& unread_char, char& ch); |
170 | | |
171 | | // This function returns true of the current character is between tokens (i.e., white space that |
172 | | // is not part of a string) or is part of a comment. A tokenizing filter can call this to |
173 | | // determine whether to output the character. |
174 | | [[deprecated("see <https:manual.qpdf.org/release-notes.html#r12-0-0-deprecate>")]] QPDF_DLL bool |
175 | | betweenTokens(); |
176 | | |
177 | | // Pull mode: |
178 | | |
179 | | // Read a token from an input source. Context describes the context in which the token is being |
180 | | // read and is used in the exception thrown if there is an error. After a token is read, the |
181 | | // position of the input source returned by input->tell() points to just after the token, and |
182 | | // the input source's "last offset" as returned by input->getLastOffset() points to the |
183 | | // beginning of the token. |
184 | | QPDF_DLL |
185 | | Token readToken( |
186 | | InputSource& input, std::string const& context, bool allow_bad = false, size_t max_len = 0); |
187 | | QPDF_DLL |
188 | | Token readToken( |
189 | | std::shared_ptr<InputSource> input, |
190 | | std::string const& context, |
191 | | bool allow_bad = false, |
192 | | size_t max_len = 0); |
193 | | |
194 | | // Calling this method puts the tokenizer in a state for reading inline images. You should call |
195 | | // this method after reading the character following the ID operator. In that state, it will |
196 | | // return all data up to BUT NOT INCLUDING the next EI token. After you call this method, the |
197 | | // next call to readToken (or the token created next time getToken returns true) will either be |
198 | | // tt_inline_image or tt_bad. This is the only way readToken |
199 | | // returns a tt_inline_image token. |
200 | | QPDF_DLL |
201 | | void expectInlineImage(std::shared_ptr<InputSource> input); |
202 | | QPDF_DLL |
203 | | void expectInlineImage(InputSource& input); |
204 | | |
205 | | private: |
206 | | friend class QPDFParser; |
207 | | |
208 | | QPDFTokenizer(QPDFTokenizer const&) = delete; |
209 | | QPDFTokenizer& operator=(QPDFTokenizer const&) = delete; |
210 | | |
211 | | std::unique_ptr<qpdf::Tokenizer> m; |
212 | | }; |
213 | | |
214 | | #endif // QPDFTOKENIZER_HH |