Coverage Report

Created: 2025-04-22 06:07

/src/yajl-ruby/ext/yajl/yajl_lex.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright 2010, Lloyd Hilaiel.
3
 * 
4
 * Redistribution and use in source and binary forms, with or without
5
 * modification, are permitted provided that the following conditions are
6
 * met:
7
 * 
8
 *  1. Redistributions of source code must retain the above copyright
9
 *     notice, this list of conditions and the following disclaimer.
10
 * 
11
 *  2. Redistributions in binary form must reproduce the above copyright
12
 *     notice, this list of conditions and the following disclaimer in
13
 *     the documentation and/or other materials provided with the
14
 *     distribution.
15
 * 
16
 *  3. Neither the name of Lloyd Hilaiel nor the names of its
17
 *     contributors may be used to endorse or promote products derived
18
 *     from this software without specific prior written permission.
19
 * 
20
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
21
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
22
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
24
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
25
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
28
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
29
 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30
 * POSSIBILITY OF SUCH DAMAGE.
31
 */ 
32
33
#include "yajl_lex.h"
34
#include "yajl_buf.h"
35
36
#include <stdlib.h>
37
#include <stdio.h>
38
#include <assert.h>
39
#include <string.h>
40
41
0
/* Translate a lexer token into a short, human-readable label.
 *
 * In this lexer the "brace" tokens are produced for '[' and ']' and the
 * "bracket" tokens for '{' and '}' (see yajl_lex_lex), which is why the
 * brace tokens map to the array labels and the bracket tokens to the
 * object labels.
 *
 * Returns a pointer to a string literal; a value that matches none of
 * the enumerators listed below yields "unknown". */
const char *yajl_tok_name(yajl_tok tok) {
    const char * label = "unknown";

    switch (tok) {
        case yajl_tok_bool:
            label = "bool";
            break;
        case yajl_tok_colon:
            label = "colon";
            break;
        case yajl_tok_comma:
            label = "comma";
            break;
        case yajl_tok_comment:
            label = "comment";
            break;
        case yajl_tok_eof:
            label = "eof";
            break;
        case yajl_tok_error:
            label = "error";
            break;
        case yajl_tok_left_brace:
            label = "open_array";
            break;
        case yajl_tok_left_bracket:
            label = "open_object";
            break;
        case yajl_tok_null:
            label = "null";
            break;
        case yajl_tok_integer:
            label = "integer";
            break;
        case yajl_tok_double:
            label = "double";
            break;
        case yajl_tok_right_brace:
            label = "close_array";
            break;
        case yajl_tok_right_bracket:
            label = "close_object";
            break;
        case yajl_tok_string:
            label = "string";
            break;
        case yajl_tok_string_with_escapes:
            label = "string_with_escapes";
            break;
    }

    return label;
}
61
62
/* Impact of the stream parsing feature on the lexer:
63
 *
64
 * YAJL supports stream parsing.  That is, the ability to parse the first
65
 * bits of a chunk of JSON before the last bits are available (still on
66
 * the network or disk).  This makes the lexer more complex.  The
67
 * responsibility of the lexer is to handle transparently the case where
68
 * a chunk boundary falls in the middle of a token.  This is
69
 * accomplished via a buffer and a character reading abstraction.
70
 *
71
 * Overview of implementation
72
 *
73
 * When we lex to end of input string before end of token is hit, we
74
 * copy all of the input text composing the token into our lexBuf.
75
 * 
76
 * Every time we read a character, we do so through the readChar function.
77
 * readChar's responsibility is to handle pulling all chars from the buffer
78
 * before pulling chars from input text
79
 */
80
81
struct yajl_lexer_t {
82
    /* the overal line and char offset into the data */
83
    unsigned int lineOff;
84
    unsigned int charOff;
85
86
    /* error */
87
    yajl_lex_error error;
88
89
    /* a input buffer to handle the case where a token is spread over
90
     * multiple chunks */ 
91
    yajl_buf buf;
92
93
    /* in the case where we have data in the lexBuf, bufOff holds
94
     * the current offset into the lexBuf. */
95
    unsigned int bufOff;
96
97
    /* are we using the lex buf? */
98
    unsigned int bufInUse;
99
100
    /* shall we allow comments? */
101
    unsigned int allowComments;
102
103
    /* shall we validate utf8 inside strings? */
104
    unsigned int validateUTF8;
105
106
    yajl_alloc_funcs * alloc;
107
};
108
109
/* Fetch the next input byte.
 *
 * While the lex buffer is in use and still has unread bytes, the byte is
 * taken from the buffer and lxr->bufOff is advanced; otherwise the byte
 * is taken from txt at index *off and *off is advanced.  No bounds check
 * on txt is performed here: callers compare *off with the text length
 * before reading.
 *
 * Every use of the lxr argument is parenthesized so that any pointer
 * expression (for example &lexer) expands correctly.  Note that lxr is
 * evaluated more than once, so it must be free of side effects. */
#define readChar(lxr, txt, off)                      \
    (((lxr)->bufInUse && yajl_buf_len((lxr)->buf) && (lxr)->bufOff < yajl_buf_len((lxr)->buf)) ? \
     (*((const unsigned char *) yajl_buf_data((lxr)->buf) + ((lxr)->bufOff)++)) : \
     ((txt)[(*(off))++]))

/* Step back over the byte most recently read: decrement *off when it is
 * non-zero, otherwise decrement the lex buffer offset. */
#define unreadChar(lxr, off) ((*(off) > 0) ? (*(off))-- : ((lxr)->bufOff--))
115
116
yajl_lexer
117
yajl_lex_alloc(yajl_alloc_funcs * alloc,
118
               unsigned int allowComments, unsigned int validateUTF8)
119
2.30k
{
120
2.30k
    yajl_lexer lxr = (yajl_lexer) YA_MALLOC(alloc, sizeof(struct yajl_lexer_t));
121
2.30k
    if (!lxr)
122
0
        return NULL;
123
2.30k
    memset((void *) lxr, 0, sizeof(struct yajl_lexer_t));
124
2.30k
    lxr->buf = yajl_buf_alloc(alloc);
125
2.30k
    lxr->allowComments = allowComments;
126
2.30k
    lxr->validateUTF8 = validateUTF8;
127
2.30k
    lxr->alloc = alloc;
128
2.30k
    return lxr;
129
2.30k
}
130
131
yajl_lexer
132
1.74M
yajl_lex_realloc(yajl_lexer orig) {
133
1.74M
    orig->lineOff = 0;
134
1.74M
    orig->charOff = 0;
135
1.74M
    orig->error = yajl_lex_e_ok;
136
1.74M
    yajl_buf_clear(orig->buf);
137
1.74M
    orig->bufOff = 0;
138
1.74M
    orig->bufInUse = 0;
139
1.74M
    return orig;
140
1.74M
}
141
142
void
143
yajl_lex_free(yajl_lexer lxr)
144
2.30k
{
145
2.30k
    yajl_buf_free(lxr->buf);
146
2.30k
    YA_FREE(lxr->alloc, lxr);
147
2.30k
    return;
148
2.30k
}
149
150
/* Per-byte classification table used while lexing strings.  Each entry
 * is a combination of three flags:
 *   VEC - valid escaped control char (may follow a backslash)
 *   IJC - invalid json char (may not appear unescaped in a string)
 *   VHC - valid hex char (may appear in a \u escape)
 * note.  the solidus '/' may be escaped or not.
 */
#define VEC 1
#define IJC 2
#define VHC 4
static const char charLookupTable[256] =
{
/* 0x00 - 0x0f: control characters */
    IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC,
/* 0x10 - 0x1f: control characters */
    IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC,

/* 0x20 - 0x27: the double quote sits at 0x22 */
    0, 0, VEC|IJC, 0, 0, 0, 0, 0,
/* 0x28 - 0x2f: the solidus sits at 0x2f */
    0, 0, 0, 0, 0, 0, 0, VEC,
/* 0x30 - 0x37: digits '0' - '7' */
    VHC, VHC, VHC, VHC, VHC, VHC, VHC, VHC,
/* 0x38 - 0x3f: digits '8' and '9' */
    VHC, VHC, 0, 0, 0, 0, 0, 0,

/* 0x40 - 0x47: 'A' - 'F' */
    0, VHC, VHC, VHC, VHC, VHC, VHC, 0,
/* 0x48 - 0x4f */
    0, 0, 0, 0, 0, 0, 0, 0,
/* 0x50 - 0x57 */
    0, 0, 0, 0, 0, 0, 0, 0,
/* 0x58 - 0x5f: the backslash sits at 0x5c */
    0, 0, 0, 0, VEC|IJC, 0, 0, 0,

/* 0x60 - 0x67: 'a' - 'f'; 'b' and 'f' are also escape letters */
    0, VHC, VEC|VHC, VHC, VHC, VHC, VEC|VHC, 0,
/* 0x68 - 0x6f: 'n' */
    0, 0, 0, 0, 0, 0, VEC, 0,
/* 0x70 - 0x77: 'r' and 't' */
    0, 0, VEC, 0, VEC, 0, 0, 0,
/* 0x78 - 0x7f */
    0, 0, 0, 0, 0, 0, 0, 0

/* 0x80 - 0xff: not listed.  The array is declared with 256 elements so
 * that any unsigned char can index it without a range check; elements
 * without an initializer are zero. */
};
203
204
/** process a variable length utf8 encoded codepoint.
205
 *
206
 *  returns:
207
 *    yajl_tok_string - if valid utf8 char was parsed and offset was
208
 *                      advanced
209
 *    yajl_tok_eof - if end of input was hit before validation could
210
 *                   complete
211
 *    yajl_tok_error - if invalid utf8 was encountered
212
 * 
213
 *  NOTE: on error the offset will point to the first char of the
214
 *  invalid utf8 */
215
6.93k
#define UTF8_CHECK_EOF if (*offset >= jsonTextLen) { return yajl_tok_eof; }
216
217
static yajl_tok
218
yajl_lex_utf8_char(yajl_lexer lexer, const unsigned char * jsonText,
219
                   unsigned int jsonTextLen, unsigned int * offset,
220
                   unsigned char curChar)
221
13.3M
{
222
13.3M
    if (curChar <= 0x7f) {
223
        /* single byte */
224
13.3M
        return yajl_tok_string;
225
13.3M
    } else if ((curChar >> 5) == 0x6) {
226
        /* two byte */ 
227
1.19k
        UTF8_CHECK_EOF;
228
1.19k
        curChar = readChar(lexer, jsonText, offset);
229
1.19k
        if ((curChar >> 6) == 0x2) return yajl_tok_string;
230
2.09k
    } else if ((curChar >> 4) == 0x0e) {
231
        /* three byte */
232
449
        UTF8_CHECK_EOF;
233
448
        curChar = readChar(lexer, jsonText, offset);
234
448
        if ((curChar >> 6) == 0x2) {
235
444
            UTF8_CHECK_EOF;
236
437
            curChar = readChar(lexer, jsonText, offset);
237
437
            if ((curChar >> 6) == 0x2) return yajl_tok_string;
238
437
        }
239
1.64k
    } else if ((curChar >> 3) == 0x1e) {
240
        /* four byte */
241
1.63k
        UTF8_CHECK_EOF;
242
1.62k
        curChar = readChar(lexer, jsonText, offset);
243
1.62k
        if ((curChar >> 6) == 0x2) {
244
1.62k
            UTF8_CHECK_EOF;
245
1.60k
            curChar = readChar(lexer, jsonText, offset);
246
1.60k
            if ((curChar >> 6) == 0x2) {
247
1.59k
                UTF8_CHECK_EOF;
248
1.57k
                curChar = readChar(lexer, jsonText, offset);
249
1.57k
                if ((curChar >> 6) == 0x2) return yajl_tok_string;
250
1.57k
            }
251
1.60k
        }
252
1.62k
    } 
253
254
51
    return yajl_tok_error;
255
13.3M
}
256
257
/* lex a string.  input is the lexer, pointer to beginning of
258
 * json text, and start of string (offset).
259
 * a token is returned which has the following meanings:
260
 * yajl_tok_string: lex of string was successful.  offset points to
261
 *                  terminating '"'.
262
 * yajl_tok_eof: end of text was encountered before we could complete
263
 *               the lex.
264
 * yajl_tok_error: embedded in the string were unallowable chars.  offset
265
 *               points to the offending char
266
 */
267
14.2M
#define STR_CHECK_EOF \
268
14.2M
if (*offset >= jsonTextLen) { \
269
142
   tok = yajl_tok_eof; \
270
142
   goto finish_string_lex; \
271
142
}
272
273
static yajl_tok
274
yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText,
275
                unsigned int jsonTextLen, unsigned int * offset)
276
569k
{
277
569k
    yajl_tok tok = yajl_tok_error;
278
569k
    int hasEscapes = 0;
279
280
14.0M
    for (;;) {
281
14.0M
    unsigned char curChar;
282
283
14.0M
    STR_CHECK_EOF;
284
285
14.0M
        curChar = readChar(lexer, jsonText, offset);
286
287
        /* quote terminates */
288
14.0M
        if (curChar == '"') {
289
569k
            tok = yajl_tok_string;
290
569k
            break;
291
569k
        }
292
        /* backslash escapes a set of control chars, */
293
13.4M
        else if (curChar == '\\') {
294
85.6k
            hasEscapes = 1;
295
85.6k
            STR_CHECK_EOF;
296
297
            /* special case \u */
298
85.6k
            curChar = readChar(lexer, jsonText, offset);
299
85.6k
            if (curChar == 'u') {
300
35.0k
                unsigned int i = 0;
301
302
175k
                for (i=0;i<4;i++) {
303
140k
                    STR_CHECK_EOF;                
304
140k
                    curChar = readChar(lexer, jsonText, offset);                
305
140k
                    if (!(charLookupTable[curChar] & VHC)) {
306
                        /* back up to offending char */
307
29
                        unreadChar(lexer, offset);
308
29
                        lexer->error = yajl_lex_string_invalid_hex_char;
309
29
                        goto finish_string_lex;
310
29
                    }
311
140k
                }
312
50.5k
            } else if (!(charLookupTable[curChar] & VEC)) {
313
                /* back up to offending char */
314
36
                unreadChar(lexer, offset);
315
36
                lexer->error = yajl_lex_string_invalid_escaped_char;
316
36
                goto finish_string_lex;                
317
36
            } 
318
85.6k
        }
319
        /* when not validating UTF8 it's a simple table lookup to determine
320
         * if the present character is invalid */
321
13.3M
        else if(charLookupTable[curChar] & IJC) {
322
            /* back up to offending char */
323
27
            unreadChar(lexer, offset);
324
27
            lexer->error = yajl_lex_string_invalid_json_char;
325
27
            goto finish_string_lex;                
326
27
        }
327
        /* when in validate UTF8 mode we need to do some extra work */
328
13.3M
        else if (lexer->validateUTF8) {
329
13.3M
            yajl_tok t = yajl_lex_utf8_char(lexer, jsonText, jsonTextLen,
330
13.3M
                                            offset, curChar);
331
            
332
13.3M
            if (t == yajl_tok_eof) {
333
51
                tok = yajl_tok_eof;
334
51
                goto finish_string_lex;
335
13.3M
            } else if (t == yajl_tok_error) {
336
51
                lexer->error = yajl_lex_string_invalid_utf8;
337
51
                goto finish_string_lex;
338
51
            } 
339
13.3M
        }
340
        /* accept it, and move on */ 
341
14.0M
    }
342
569k
  finish_string_lex:
343
    /* tell our buddy, the parser, wether he needs to process this string
344
     * again */
345
569k
    if (hasEscapes && tok == yajl_tok_string) {
346
3.26k
        tok = yajl_tok_string_with_escapes;
347
3.26k
    } 
348
349
569k
    return tok;
350
569k
}
351
352
65.4M
#define RETURN_IF_EOF if (*offset >= jsonTextLen) return yajl_tok_eof;
353
354
static yajl_tok
355
yajl_lex_number(yajl_lexer lexer, const unsigned char * jsonText,
356
                unsigned int jsonTextLen, unsigned int * offset)
357
1.70M
{
358
    /** XXX: numbers are the only entities in json that we must lex
359
     *       _beyond_ in order to know that they are complete.  There
360
     *       is an ambiguous case for integers at EOF. */
361
362
1.70M
    unsigned char c;
363
364
1.70M
    yajl_tok tok = yajl_tok_integer;
365
366
1.70M
    RETURN_IF_EOF;    
367
1.70M
    c = readChar(lexer, jsonText, offset);
368
369
    /* optional leading minus */
370
1.70M
    if (c == '-') {
371
13.1k
        RETURN_IF_EOF;    
372
13.1k
        c = readChar(lexer, jsonText, offset); 
373
13.1k
    }
374
375
    /* a single zero, or a series of integers */
376
1.70M
    if (c == '0') {
377
1.62M
        RETURN_IF_EOF;    
378
1.62M
        c = readChar(lexer, jsonText, offset); 
379
1.62M
    } else if (c >= '1' && c <= '9') {
380
4.05M
        do {
381
4.05M
            RETURN_IF_EOF;    
382
4.05M
            c = readChar(lexer, jsonText, offset); 
383
4.05M
        } while (c >= '0' && c <= '9');
384
77.5k
    } else {
385
36
        unreadChar(lexer, offset);
386
36
        lexer->error = yajl_lex_missing_integer_after_minus;
387
36
        return yajl_tok_error;
388
36
    }
389
390
    /* optional fraction (indicates this is floating point) */
391
1.70M
    if (c == '.') {
392
16.9k
        int numRd = 0;
393
        
394
16.9k
        RETURN_IF_EOF;
395
16.9k
        c = readChar(lexer, jsonText, offset); 
396
397
3.78M
        while (c >= '0' && c <= '9') {
398
3.77M
            numRd++;
399
3.77M
            RETURN_IF_EOF;
400
3.77M
            c = readChar(lexer, jsonText, offset); 
401
3.77M
        } 
402
403
16.8k
        if (!numRd) {
404
48
            unreadChar(lexer, offset);
405
48
            lexer->error = yajl_lex_missing_integer_after_decimal;
406
48
            return yajl_tok_error;
407
48
        }
408
16.8k
        tok = yajl_tok_double;
409
16.8k
    }
410
411
    /* optional exponent (indicates this is floating point) */
412
1.70M
    if (c == 'e' || c == 'E') {
413
17.1k
        RETURN_IF_EOF;
414
17.1k
        c = readChar(lexer, jsonText, offset); 
415
416
        /* optional sign */
417
17.1k
        if (c == '+' || c == '-') {
418
4.48k
            RETURN_IF_EOF;
419
4.46k
            c = readChar(lexer, jsonText, offset); 
420
4.46k
        }
421
422
17.1k
        if (c >= '0' && c <= '9') {
423
678k
            do {
424
678k
                RETURN_IF_EOF;
425
678k
                c = readChar(lexer, jsonText, offset); 
426
678k
            } while (c >= '0' && c <= '9');
427
17.0k
        } else {
428
51
            unreadChar(lexer, offset);
429
51
            lexer->error = yajl_lex_missing_integer_after_exponent;
430
51
            return yajl_tok_error;
431
51
        }
432
17.0k
        tok = yajl_tok_double;
433
17.0k
    }
434
    
435
    /* we always go "one too far" */
436
1.70M
    unreadChar(lexer, offset);
437
    
438
1.70M
    return tok;
439
1.70M
}
440
441
static yajl_tok
442
yajl_lex_comment(yajl_lexer lexer, const unsigned char * jsonText,
443
                 unsigned int jsonTextLen, unsigned int * offset)
444
21.3k
{
445
21.3k
    unsigned char c;
446
447
21.3k
    yajl_tok tok = yajl_tok_comment;
448
449
21.3k
    RETURN_IF_EOF;    
450
21.3k
    c = readChar(lexer, jsonText, offset);
451
452
    /* either slash or star expected */
453
21.3k
    if (c == '/') {
454
        /* now we throw away until end of line */
455
26.4M
        do {
456
26.4M
            RETURN_IF_EOF;    
457
26.4M
            c = readChar(lexer, jsonText, offset); 
458
26.4M
        } while (c != '\n');
459
13.3k
    } else if (c == '*') {
460
        /* now we throw away until end of comment */        
461
27.0M
        for (;;) {
462
27.0M
            RETURN_IF_EOF;    
463
27.0M
            c = readChar(lexer, jsonText, offset); 
464
27.0M
            if (c == '*') {
465
48.0k
                RETURN_IF_EOF;    
466
47.9k
                c = readChar(lexer, jsonText, offset);                 
467
47.9k
                if (c == '/') {
468
13.2k
                    break;
469
34.7k
                } else {
470
34.7k
                    unreadChar(lexer, offset);
471
34.7k
                }
472
47.9k
            }
473
27.0M
        }
474
13.3k
    } else {
475
18
        lexer->error = yajl_lex_invalid_char;
476
18
        tok = yajl_tok_error;
477
18
    }
478
    
479
21.1k
    return tok;
480
21.3k
}
481
482
yajl_tok
483
yajl_lex_lex(yajl_lexer lexer, const unsigned char * jsonText,
484
             unsigned int jsonTextLen, unsigned int * offset,
485
             const unsigned char ** outBuf, unsigned int * outLen)
486
42.6M
{
487
42.6M
    yajl_tok tok = yajl_tok_error;
488
42.6M
    unsigned char c;
489
42.6M
    unsigned int startOffset = *offset;
490
491
42.6M
    *outBuf = NULL;
492
42.6M
    *outLen = 0;
493
494
42.7M
    for (;;) {
495
42.7M
        assert(*offset <= jsonTextLen);
496
497
42.7M
        if (*offset >= jsonTextLen) {
498
787
            tok = yajl_tok_eof;
499
787
            goto lexed;
500
787
        }
501
502
42.7M
        c = readChar(lexer, jsonText, offset);
503
504
42.7M
        switch (c) {
505
539k
            case '{':
506
539k
                tok = yajl_tok_left_bracket;
507
539k
                goto lexed;
508
1.11k
            case '}':
509
1.11k
                tok = yajl_tok_right_bracket;
510
1.11k
                goto lexed;
511
33.1M
            case '[':
512
33.1M
                tok = yajl_tok_left_brace;
513
33.1M
                goto lexed;
514
6.06M
            case ']':
515
6.06M
                tok = yajl_tok_right_brace;
516
6.06M
                goto lexed;
517
23.0k
            case ',':
518
23.0k
                tok = yajl_tok_comma;
519
23.0k
                goto lexed;
520
538k
            case ':':
521
538k
                tok = yajl_tok_colon;
522
538k
                goto lexed;
523
107k
            case '\t': case '\n': case '\v': case '\f': case '\r': case ' ':
524
107k
                startOffset++;
525
107k
                break;
526
1.49k
            case 't': {
527
1.49k
                const char * want = "rue";
528
4.39k
                do {
529
4.39k
                    if (*offset >= jsonTextLen) {
530
12
                        tok = yajl_tok_eof;
531
12
                        goto lexed;
532
12
                    }
533
4.38k
                    c = readChar(lexer, jsonText, offset);
534
4.38k
                    if (c != *want) {
535
42
                        unreadChar(lexer, offset);
536
42
                        lexer->error = yajl_lex_invalid_string;
537
42
                        tok = yajl_tok_error;
538
42
                        goto lexed;
539
42
                    }
540
4.38k
                } while (*(++want));
541
1.43k
                tok = yajl_tok_bool;
542
1.43k
                goto lexed;
543
1.49k
            }
544
493
            case 'f': {
545
493
                const char * want = "alse";
546
1.82k
                do {
547
1.82k
                    if (*offset >= jsonTextLen) {
548
14
                        tok = yajl_tok_eof;
549
14
                        goto lexed;
550
14
                    }
551
1.81k
                    c = readChar(lexer, jsonText, offset);
552
1.81k
                    if (c != *want) {
553
43
                        unreadChar(lexer, offset);
554
43
                        lexer->error = yajl_lex_invalid_string;
555
43
                        tok = yajl_tok_error;
556
43
                        goto lexed;
557
43
                    }
558
1.81k
                } while (*(++want));
559
436
                tok = yajl_tok_bool;
560
436
                goto lexed;
561
493
            }
562
3.74k
            case 'n': {
563
3.74k
                const char * want = "ull";
564
11.1k
                do {
565
11.1k
                    if (*offset >= jsonTextLen) {
566
15
                        tok = yajl_tok_eof;
567
15
                        goto lexed;
568
15
                    }
569
11.1k
                    c = readChar(lexer, jsonText, offset);
570
11.1k
                    if (c != *want) {
571
31
                        unreadChar(lexer, offset);
572
31
                        lexer->error = yajl_lex_invalid_string;
573
31
                        tok = yajl_tok_error;
574
31
                        goto lexed;
575
31
                    }
576
11.1k
                } while (*(++want));
577
3.69k
                tok = yajl_tok_null;
578
3.69k
                goto lexed;
579
3.74k
            }
580
569k
            case '"': {
581
569k
                tok = yajl_lex_string(lexer, (const unsigned char *) jsonText,
582
569k
                                      jsonTextLen, offset);
583
569k
                goto lexed;
584
3.74k
            }
585
13.1k
            case '-':
586
1.65M
            case '0': case '1': case '2': case '3': case '4': 
587
1.70M
            case '5': case '6': case '7': case '8': case '9': {
588
                /* integer parsing wants to start from the beginning */
589
1.70M
                unreadChar(lexer, offset);
590
1.70M
                tok = yajl_lex_number(lexer, (const unsigned char *) jsonText,
591
1.70M
                                      jsonTextLen, offset);
592
1.70M
                goto lexed;
593
1.68M
            }
594
21.3k
            case '/':
595
                /* hey, look, a probable comment!  If comments are disabled
596
                 * it's an error. */
597
21.3k
                if (!lexer->allowComments) {
598
0
                    unreadChar(lexer, offset);
599
0
                    lexer->error = yajl_lex_unallowed_comment;
600
0
                    tok = yajl_tok_error;
601
0
                    goto lexed;
602
0
                }
603
                /* if comments are enabled, then we should try to lex
604
                 * the thing.  possible outcomes are
605
                 * - successful lex (tok_comment, which means continue),
606
                 * - malformed comment opening (slash not followed by
607
                 *   '*' or '/') (tok_error)
608
                 * - eof hit. (tok_eof) */
609
21.3k
                tok = yajl_lex_comment(lexer, (const unsigned char *) jsonText,
610
21.3k
                                       jsonTextLen, offset);
611
21.3k
                if (tok == yajl_tok_comment) {
612
                    /* "error" is silly, but that's the initial
613
                     * state of tok.  guilty until proven innocent. */  
614
21.1k
                    tok = yajl_tok_error;
615
21.1k
                    yajl_buf_clear(lexer->buf);
616
21.1k
                    lexer->bufInUse = 0;
617
21.1k
                    startOffset = *offset; 
618
21.1k
                    break;
619
21.1k
                }
620
                /* hit error or eof, bail */
621
147
                goto lexed;
622
213
            default:
623
213
                lexer->error = yajl_lex_invalid_char;
624
213
                tok = yajl_tok_error;
625
213
                goto lexed;
626
42.7M
        }
627
42.7M
    }
628
629
630
42.6M
  lexed:
631
    /* need to append to buffer if the buffer is in use or
632
     * if it's an EOF token */
633
42.6M
    if (tok == yajl_tok_eof || lexer->bufInUse) {
634
1.38k
        if (!lexer->bufInUse) yajl_buf_clear(lexer->buf);
635
1.38k
        lexer->bufInUse = 1;
636
1.38k
        yajl_buf_append(lexer->buf, jsonText + startOffset, *offset - startOffset);
637
1.38k
        lexer->bufOff = 0;
638
639
1.38k
        if (yajl_buf_err(lexer->buf)) {
640
0
            lexer->error = yajl_lex_alloc_failed;
641
0
            return yajl_tok_error;
642
0
        }
643
644
1.38k
        if (tok != yajl_tok_eof) {
645
0
            *outBuf = yajl_buf_data(lexer->buf);
646
0
            *outLen = yajl_buf_len(lexer->buf);
647
0
            lexer->bufInUse = 0;
648
0
        }
649
42.6M
    } else if (tok != yajl_tok_error) {
650
42.6M
        *outBuf = jsonText + startOffset;
651
42.6M
        *outLen = *offset - startOffset;
652
42.6M
    }
653
654
    /* special case for strings. skip the quotes. */
655
42.6M
    if (tok == yajl_tok_string || tok == yajl_tok_string_with_escapes)
656
569k
    {
657
569k
        assert(*outLen >= 2);
658
569k
        (*outBuf)++;
659
569k
        *outLen -= 2; 
660
569k
    }
661
662
663
#ifdef YAJL_LEXER_DEBUG
664
    if (tok == yajl_tok_error) {
665
        printf("lexical error: %s\n",
666
               yajl_lex_error_to_string(yajl_lex_get_error(lexer)));
667
    } else if (tok == yajl_tok_eof) {
668
        printf("EOF hit\n");
669
    } else {
670
        printf("lexed %s: '", tokToStr(tok));
671
        fwrite(*outBuf, 1, *outLen, stdout);
672
        printf("'\n");
673
    }
674
#endif
675
676
42.6M
    return tok;
677
42.6M
}
678
679
const char *
680
yajl_lex_error_to_string(yajl_lex_error error)
681
0
{
682
0
    switch (error) {
683
0
        case yajl_lex_e_ok:
684
0
            return "ok, no error";
685
0
        case yajl_lex_string_invalid_utf8:
686
0
            return "invalid bytes in UTF8 string.";
687
0
        case yajl_lex_string_invalid_escaped_char:
688
0
            return "inside a string, '\\' occurs before a character "
689
0
                   "which it may not.";
690
0
        case yajl_lex_string_invalid_json_char:            
691
0
            return "invalid character inside string.";
692
0
        case yajl_lex_string_invalid_hex_char:
693
0
            return "invalid (non-hex) character occurs after '\\u' inside "
694
0
                   "string.";
695
0
        case yajl_lex_invalid_char:
696
0
            return "invalid char in json text.";
697
0
        case yajl_lex_invalid_string:
698
0
            return "invalid string in json text.";
699
0
        case yajl_lex_missing_integer_after_exponent:
700
0
            return "malformed number, a digit is required after the exponent.";
701
0
        case yajl_lex_missing_integer_after_decimal:
702
0
            return "malformed number, a digit is required after the "
703
0
                   "decimal point.";
704
0
        case yajl_lex_missing_integer_after_minus:
705
0
            return "malformed number, a digit is required after the "
706
0
                   "minus sign.";
707
0
        case yajl_lex_unallowed_comment:
708
0
            return "probable comment found in input text, comments are "
709
0
                   "not enabled.";
710
0
        case yajl_lex_alloc_failed:
711
0
            return "allocation failed";
712
0
    }
713
0
    return "unknown error code";
714
0
}
715
716
717
/** allows access to more specific information about the lexical
718
 *  error when yajl_lex_lex returns yajl_tok_error. */
719
yajl_lex_error
720
yajl_lex_get_error(yajl_lexer lexer)
721
0
{
722
0
    if (lexer == NULL) return (yajl_lex_error) -1;
723
0
    return lexer->error;
724
0
}
725
726
/* Return the lexer's line offset counter.  The lexer must not be NULL.
 * NOTE(review): in the code visible in this file lineOff is only ever
 * reset to zero, never advanced; confirm before relying on it. */
unsigned int yajl_lex_current_line(yajl_lexer lexer)
{
    const unsigned int line = lexer->lineOff;

    return line;
}
730
731
/* Return the lexer's character offset counter.  The lexer must not be
 * NULL.
 * NOTE(review): in the code visible in this file charOff is only ever
 * reset to zero, never advanced; confirm before relying on it. */
unsigned int yajl_lex_current_char(yajl_lexer lexer)
{
    const unsigned int column = lexer->charOff;

    return column;
}
735
736
/* Report the kind of the next token without consuming it.
 *
 * offset is passed by value, so the caller's read position is not
 * moved.  The lexer's buffer length, buffer offset and in-use flag are
 * captured before lexing and put back afterwards, except when the
 * result is yajl_tok_eof, in which case the lexer is left exactly as
 * yajl_lex_lex left it.  lexer->error is not restored. */
yajl_tok yajl_lex_peek(yajl_lexer lexer, const unsigned char * jsonText,
                       unsigned int jsonTextLen, unsigned int offset)
{
    /* snapshot of the buffer state to roll back to */
    const unsigned int savedLen = yajl_buf_len(lexer->buf);
    const unsigned int savedOff = lexer->bufOff;
    const unsigned int savedInUse = lexer->bufInUse;

    /* token text is required by yajl_lex_lex but not needed here */
    const unsigned char * tokText;
    unsigned int tokLen;
    yajl_tok tok;

    tok = yajl_lex_lex(lexer, jsonText, jsonTextLen, &offset,
                       &tokText, &tokLen);

    if (tok != yajl_tok_eof) {
        lexer->bufOff = savedOff;
        lexer->bufInUse = savedInUse;
        yajl_buf_truncate(lexer->buf, savedLen);
    }

    return tok;
}