Coverage Report

Created: 2025-07-01 06:10

/src/qpdf/include/qpdf/QPDFTokenizer.hh
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) 2005-2021 Jay Berkenbilt
2
// Copyright (c) 2022-2025 Jay Berkenbilt and Manfred Holger
3
//
4
// This file is part of qpdf.
5
//
6
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
7
// in compliance with the License. You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing, software distributed under the License
12
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
13
// or implied. See the License for the specific language governing permissions and limitations under
14
// the License.
15
//
16
// Versions of qpdf prior to version 7 were released under the terms of version 2.0 of the Artistic
17
// License. At your option, you may continue to consider qpdf to be licensed under those terms.
18
// Please see the manual for additional information.
19
20
#ifndef QPDFTOKENIZER_HH
21
#define QPDFTOKENIZER_HH
22
23
#include <qpdf/DLL.h>
24
25
#include <qpdf/InputSource.hh>
26
27
#include <cstdio>
28
#include <memory>
29
#include <string>
30
31
namespace qpdf
32
{
33
    class Tokenizer;
34
} // namespace qpdf
35
36
class QPDFTokenizer
37
{
38
  public:
39
    // Token type tt_eof is only returned if allowEOF() is called on the tokenizer. tt_eof was
40
    // introduced in QPDF version 4.1. tt_space, tt_comment, and tt_inline_image were added in QPDF
41
    // version 8.
42
    enum token_type_e {
43
        tt_bad,
44
        tt_array_close,
45
        tt_array_open,
46
        tt_brace_close,
47
        tt_brace_open,
48
        tt_dict_close,
49
        tt_dict_open,
50
        tt_integer,
51
        tt_name,
52
        tt_real,
53
        tt_string,
54
        tt_null,
55
        tt_bool,
56
        tt_word,
57
        tt_eof,
58
        tt_space,
59
        tt_comment,
60
        tt_inline_image,
61
    };
62
63
    class Token
64
    {
65
      public:
66
        Token() :
67
0
            type(tt_bad)
68
0
        {
69
0
        }
70
        QPDF_DLL
71
        Token(token_type_e type, std::string const& value);
72
        Token(
73
            token_type_e type,
74
            std::string const& value,
75
            std::string raw_value,
76
            std::string error_message) :
77
0
            type(type),
78
0
            value(value),
79
0
            raw_value(raw_value),
80
0
            error_message(error_message)
81
0
        {
82
0
        }
83
        token_type_e
84
        getType() const
85
0
        {
86
0
            return this->type;
87
0
        }
88
        std::string const&
89
        getValue() const
90
0
        {
91
0
            return this->value;
92
0
        }
93
        std::string const&
94
        getRawValue() const
95
0
        {
96
0
            return this->raw_value;
97
0
        }
98
        std::string const&
99
        getErrorMessage() const
100
0
        {
101
0
            return this->error_message;
102
0
        }
103
        bool
104
        operator==(Token const& rhs) const
105
0
        {
106
0
            // Ignore fields other than type and value
107
0
            return (
108
0
                (this->type != tt_bad) && (this->type == rhs.type) && (this->value == rhs.value));
109
0
        }
110
        bool
111
        isInteger() const
112
0
        {
113
0
            return this->type == tt_integer;
114
0
        }
115
        bool
116
        isWord() const
117
0
        {
118
0
            return this->type == tt_word;
119
0
        }
120
        bool
121
        isWord(std::string const& value) const
122
0
        {
123
0
            return this->type == tt_word && this->value == value;
124
0
        }
125
126
      private:
127
        token_type_e type;
128
        std::string value;
129
        std::string raw_value;
130
        std::string error_message;
131
    };
132
133
    QPDF_DLL
134
    QPDFTokenizer();
135
136
    QPDF_DLL
137
    ~QPDFTokenizer();
138
139
    // If called, treat EOF as a separate token type instead of an error.  This was introduced in
140
    // QPDF 4.1 to facilitate tokenizing content streams.
141
    QPDF_DLL
142
    void allowEOF();
143
144
    // If called, readToken will return "ignorable" tokens for space and comments. This was added in
145
    // QPDF 8.
146
    QPDF_DLL
147
    void includeIgnorable();
148
149
    // There are two modes of operation: push and pull. The pull method is easier but requires an
150
    // input source. The push method is more complicated but can be used to tokenize a stream of
151
    // incoming characters in a pipeline.
152
153
    // Push mode:
154
155
    // deprecated, please see <https://manual.qpdf.org/release-notes.html#r12-0-0-deprecate>
156
157
    // Keep presenting characters with presentCharacter() and presentEOF() and calling getToken()
158
    // until getToken() returns true. When it does, be sure to check unread_char and to unread ch if
159
    // it is true. If these are called when a token is available, an exception will be thrown.
160
    QPDF_DLL
161
    void presentCharacter(char ch);
162
    QPDF_DLL
163
    void presentEOF();
164
165
    // If a token is available, return true and initialize token with the token, unread_char with
166
    // whether or not we have to unread the last character, and if unread_char, ch with the
167
    // character to unread.
168
    QPDF_DLL
169
    bool getToken(Token& token, bool& unread_char, char& ch);
170
171
    // This function returns true if the current character is between tokens (i.e., white space that
172
    // is not part of a string) or is part of a comment.  A tokenizing filter can call this to
173
    // determine whether to output the character.
174
    [[deprecated("see <https://manual.qpdf.org/release-notes.html#r12-0-0-deprecate>")]] QPDF_DLL bool
175
    betweenTokens();
176
177
    // Pull mode:
178
179
    // Read a token from an input source. Context describes the context in which the token is being
180
    // read and is used in the exception thrown if there is an error. After a token is read, the
181
    // position of the input source returned by input->tell() points to just after the token, and
182
    // the input source's "last offset" as returned by input->getLastOffset() points to the
183
    // beginning of the token.
184
    QPDF_DLL
185
    Token readToken(
186
        InputSource& input, std::string const& context, bool allow_bad = false, size_t max_len = 0);
187
    QPDF_DLL
188
    Token readToken(
189
        std::shared_ptr<InputSource> input,
190
        std::string const& context,
191
        bool allow_bad = false,
192
        size_t max_len = 0);
193
194
    // Calling this method puts the tokenizer in a state for reading inline images. You should call
195
    // this method after reading the character following the ID operator. In that state, it will
196
    // return all data up to BUT NOT INCLUDING the next EI token. After you call this method, the
197
    // next call to readToken (or the token created next time getToken returns true) will either be
198
    // tt_inline_image or tt_bad. This is the only way readToken
199
    // returns a tt_inline_image token.
200
    QPDF_DLL
201
    void expectInlineImage(std::shared_ptr<InputSource> input);
202
    QPDF_DLL
203
    void expectInlineImage(InputSource& input);
204
205
  private:
206
    friend class QPDFParser;
207
208
    QPDFTokenizer(QPDFTokenizer const&) = delete;
209
    QPDFTokenizer& operator=(QPDFTokenizer const&) = delete;
210
211
    std::unique_ptr<qpdf::Tokenizer> m;
212
};
213
214
#endif // QPDFTOKENIZER_HH