Coverage Report

Created: 2026-04-12 06:59

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/qpdf/include/qpdf/QPDFTokenizer.hh
Line
Count
Source
1
// Copyright (c) 2005-2021 Jay Berkenbilt
2
// Copyright (c) 2022-2026 Jay Berkenbilt and Manfred Holger
3
//
4
// This file is part of qpdf.
5
//
6
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
7
// in compliance with the License. You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing, software distributed under the License
12
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
13
// or implied. See the License for the specific language governing permissions and limitations under
14
// the License.
15
//
16
// Versions of qpdf prior to version 7 were released under the terms of version 2.0 of the Artistic
17
// License. At your option, you may continue to consider qpdf to be licensed under those terms.
18
// Please see the manual for additional information.
19
20
#ifndef QPDFTOKENIZER_HH
21
#define QPDFTOKENIZER_HH
22
23
#include <qpdf/DLL.h>
24
25
#include <qpdf/InputSource.hh>
26
27
#include <cstdio>
28
#include <memory>
29
#include <string>
30
31
namespace qpdf
32
{
33
    class Tokenizer;
34
    namespace impl
35
    {
36
        class Parser;
37
    }
38
} // namespace qpdf
39
40
class QPDFTokenizer
41
{
42
  public:
43
    // Token type tt_eof is only returned if allowEOF() is called on the tokenizer. tt_eof was
44
    // introduced in QPDF version 4.1. tt_space, tt_comment, and tt_inline_image were added in QPDF
45
    // version 8.
46
    enum token_type_e {
47
        tt_bad,
48
        tt_array_close,
49
        tt_array_open,
50
        tt_brace_close,
51
        tt_brace_open,
52
        tt_dict_close,
53
        tt_dict_open,
54
        tt_integer,
55
        tt_name,
56
        tt_real,
57
        tt_string,
58
        tt_null,
59
        tt_bool,
60
        tt_word,
61
        tt_eof,
62
        tt_space,
63
        tt_comment,
64
        tt_inline_image,
65
    };
66
67
    class Token
68
    {
69
      public:
70
        Token() :
71
31.0M
            type(tt_bad)
72
31.0M
        {
73
31.0M
        }
74
        QPDF_DLL
75
        Token(token_type_e type, std::string const& value);
76
        Token(
77
            token_type_e type,
78
            std::string const& value,
79
            std::string raw_value,
80
            std::string error_message) :
81
31.0M
            type(type),
82
31.0M
            value(value),
83
31.0M
            raw_value(raw_value),
84
31.0M
            error_message(error_message)
85
31.0M
        {
86
31.0M
        }
87
        token_type_e
88
        getType() const
89
88.7M
        {
90
88.7M
            return this->type;
91
88.7M
        }
92
        std::string const&
93
        getValue() const
94
2.47M
        {
95
2.47M
            return this->value;
96
2.47M
        }
97
        std::string const&
98
        getRawValue() const
99
28.8M
        {
100
28.8M
            return this->raw_value;
101
28.8M
        }
102
        std::string const&
103
        getErrorMessage() const
104
0
        {
105
0
            return this->error_message;
106
0
        }
107
        bool
108
        operator==(Token const& rhs) const
109
0
        {
110
0
            // Ignore fields other than type and value
111
0
            return (
112
0
                (this->type != tt_bad) && (this->type == rhs.type) && (this->value == rhs.value));
113
0
        }
114
        bool
115
        isInteger() const
116
1.65M
        {
117
1.65M
            return this->type == tt_integer;
118
1.65M
        }
119
        bool
120
        isWord() const
121
0
        {
122
0
            return this->type == tt_word;
123
0
        }
124
        bool
125
        isWord(std::string const& value) const
126
30.9M
        {
127
30.9M
            return this->type == tt_word && this->value == value;
128
30.9M
        }
129
130
      private:
131
        token_type_e type;
132
        std::string value;
133
        std::string raw_value;
134
        std::string error_message;
135
    };
136
137
    QPDF_DLL
138
    QPDFTokenizer();
139
140
    QPDF_DLL
141
    ~QPDFTokenizer();
142
143
    // If called, treat EOF as a separate token type instead of an error.  This was introduced in
144
    // QPDF 4.1 to facilitate tokenizing content streams.
145
    QPDF_DLL
146
    void allowEOF();
147
148
    // If called, readToken will return "ignorable" tokens for space and comments. This was added in
149
    // QPDF 8.
150
    QPDF_DLL
151
    void includeIgnorable();
152
153
    // There are two modes of operation: push and pull. The pull method is easier but requires an
154
    // input source. The push method is more complicated but can be used to tokenize a stream of
155
    // incoming characters in a pipeline.
156
157
    // Push mode:
158
159
    // deprecated, please see <https://manual.qpdf.org/release-notes.html#r12-0-0-deprecate>
160
161
    // Keep presenting characters with presentCharacter() and presentEOF() and calling getToken()
162
    // until getToken() returns true. When it does, be sure to check unread_char and to unread ch if
163
    // it is true. If these are called when a token is available, an exception will be thrown.
164
    QPDF_DLL
165
    void presentCharacter(char ch);
166
    QPDF_DLL
167
    void presentEOF();
168
169
    // If a token is available, return true and initialize token with the token, unread_char with
170
    // whether or not we have to unread the last character, and if unread_char, ch with the
171
    // character to unread.
172
    QPDF_DLL
173
    bool getToken(Token& token, bool& unread_char, char& ch);
174
175
    // This function returns true if the current character is between tokens (i.e., white space that
176
    // is not part of a string) or is part of a comment.  A tokenizing filter can call this to
177
    // determine whether to output the character.
178
    [[deprecated("see <https:manual.qpdf.org/release-notes.html#r12-0-0-deprecate>")]] QPDF_DLL bool
179
    betweenTokens();
180
181
    // Pull mode:
182
183
    // Read a token from an input source. Context describes the context in which the token is being
184
    // read and is used in the exception thrown if there is an error. After a token is read, the
185
    // position of the input source returned by input->tell() points to just after the token, and
186
    // the input source's "last offset" as returned by input->getLastOffset() points to the
187
    // beginning of the token.
188
    QPDF_DLL
189
    Token readToken(
190
        InputSource& input, std::string const& context, bool allow_bad = false, size_t max_len = 0);
191
    QPDF_DLL
192
    Token readToken(
193
        std::shared_ptr<InputSource> input,
194
        std::string const& context,
195
        bool allow_bad = false,
196
        size_t max_len = 0);
197
198
    // Calling this method puts the tokenizer in a state for reading inline images. You should call
199
    // this method after reading the character following the ID operator. In that state, it will
200
    // return all data up to BUT NOT INCLUDING the next EI token. After you call this method, the
201
    // next call to readToken (or the token created next time getToken returns true) will either be
202
    // tt_inline_image or tt_bad. This is the only way readToken
203
    // returns a tt_inline_image token.
204
    QPDF_DLL
205
    void expectInlineImage(std::shared_ptr<InputSource> input);
206
    QPDF_DLL
207
    void expectInlineImage(InputSource& input);
208
209
  private:
210
    friend class qpdf::impl::Parser;
211
212
    QPDFTokenizer(QPDFTokenizer const&) = delete;
213
    QPDFTokenizer& operator=(QPDFTokenizer const&) = delete;
214
215
    std::unique_ptr<qpdf::Tokenizer> m;
216
};
217
218
#endif // QPDFTOKENIZER_HH