Coverage Report

Created: 2025-09-04 07:51

/src/fluent-bit/src/flb_unescape.c
Line
Count
Source (jump to first uncovered line)
1
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2
3
/*  Fluent Bit
4
 *  ==========
5
 *  Copyright (C) 2015-2024 The Fluent Bit Authors
6
 *
7
 *  Licensed under the Apache License, Version 2.0 (the "License");
8
 *  you may not use this file except in compliance with the License.
9
 *  You may obtain a copy of the License at
10
 *
11
 *      http://www.apache.org/licenses/LICENSE-2.0
12
 *
13
 *  Unless required by applicable law or agreed to in writing, software
14
 *  distributed under the License is distributed on an "AS IS" BASIS,
15
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
 *  See the License for the specific language governing permissions and
17
 *  limitations under the License.
18
 */
19
20
#include <fluent-bit/flb_compat.h>
21
#include <fluent-bit/flb_info.h>
22
#include <fluent-bit/flb_log.h>
23
24
#include <stdlib.h>
25
#include <string.h>
26
#include <inttypes.h>
27
28
static int octal_digit(char c)
29
439k
{
30
439k
    return (c >= '0' && c <= '7');
31
439k
}
32
33
static int hex_digit(char c)
34
704k
{
35
704k
    return ((c >= '0' && c <= '9') ||
36
704k
            (c >= 'A' && c <= 'F') ||
37
704k
            (c >= 'a' && c <= 'f'));
38
704k
}
39
40
static int u8_wc_toutf8(char *dest, uint32_t ch)
41
137M
{
42
137M
    if (ch < 0x80) {
43
129M
        dest[0] = (char)ch;
44
129M
        return 1;
45
129M
    }
46
7.47M
    if (ch < 0x800) {
47
16.3k
        dest[0] = (ch>>6) | 0xC0;
48
16.3k
        dest[1] = (ch & 0x3F) | 0x80;
49
16.3k
        return 2;
50
16.3k
    }
51
7.45M
    if (ch < 0x10000) {
52
76.6k
        dest[0] = (ch>>12) | 0xE0;
53
76.6k
        dest[1] = ((ch>>6) & 0x3F) | 0x80;
54
76.6k
        dest[2] = (ch & 0x3F) | 0x80;
55
76.6k
        return 3;
56
76.6k
    }
57
7.38M
    if (ch < 0x110000) {
58
16.9k
        dest[0] = (ch>>18) | 0xF0;
59
16.9k
        dest[1] = ((ch>>12) & 0x3F) | 0x80;
60
16.9k
        dest[2] = ((ch>>6) & 0x3F) | 0x80;
61
16.9k
        dest[3] = (ch & 0x3F) | 0x80;
62
16.9k
        return 4;
63
16.9k
    }
64
7.36M
    return 0;
65
7.38M
}
66
67
93.6k
static int u8_high_surrogate(uint32_t ch) {
68
93.6k
    return ch >= 0xD800 && ch <= 0xDBFF;
69
93.6k
}
70
71
123k
static int u8_low_surrogate(uint32_t ch) {
72
123k
    return ch >= 0xDC00 && ch <= 0xDFFF;
73
123k
}
74
75
13.4k
static uint32_t u8_combine_surrogates(uint32_t high, uint32_t low) {
76
13.4k
    return 0x10000 + (((high - 0xD800) << 10) | (low - 0xDC00));
77
13.4k
}
78
79
/* assumes that src points to the character after a backslash
80
   returns number of input characters processed */
81
static int u8_read_escape_sequence(const char *str, int size, uint32_t *dest)
82
437k
{
83
437k
    uint32_t ch = 0;
84
437k
    char digs[9]="\0\0\0\0\0\0\0\0";
85
437k
    char ldigs[9]="\0\0\0\0\0\0\0\0";
86
437k
    int dno=0, i=1;
87
437k
    uint32_t low = 0;
88
89
437k
    ch = (uint32_t)str[0];    /* take literal character */
90
91
437k
    if (str[0] == 'n')
92
0
        ch = L'\n';
93
437k
    else if (str[0] == 't')
94
0
        ch = L'\t';
95
437k
    else if (str[0] == 'r')
96
0
        ch = L'\r';
97
437k
    else if (str[0] == 'b')
98
0
        ch = L'\b';
99
437k
    else if (str[0] == 'f')
100
0
        ch = L'\f';
101
437k
    else if (str[0] == 'v')
102
1.81k
        ch = L'\v';
103
435k
    else if (str[0] == 'a')
104
6.85k
        ch = L'\a';
105
428k
    else if (octal_digit(str[0])) {
106
28.7k
        i = 0;
107
34.1k
        do {
108
34.1k
            digs[dno++] = str[i++];
109
34.1k
        } while (i < size && octal_digit(str[i]) && dno < 3);
110
28.7k
        ch = strtol(digs, NULL, 8);
111
28.7k
    }
112
399k
    else if (str[0] == 'x') {
113
50.5k
        while (i < size && hex_digit(str[i]) && dno < 2) {
114
29.4k
            digs[dno++] = str[i++];
115
29.4k
        }
116
21.0k
        if (dno > 0) {
117
16.7k
            ch = strtol(digs, NULL, 16);
118
16.7k
        }
119
21.0k
    }
120
378k
    else if (str[0] == 'u') {
121
494k
        while (i < size && hex_digit(str[i]) && dno < 4) {
122
389k
            digs[dno++] = str[i++];
123
389k
        }
124
104k
        if (dno != 4) {
125
            /* Incomplete \u escape sequence */
126
9.62k
            if (dno > 0) {
127
5.19k
                ch = L'\uFFFD';
128
5.19k
                goto invalid_sequence;
129
5.19k
            }
130
9.62k
        }
131
99.4k
        ch = strtol(digs, NULL, 16);
132
99.4k
        if (u8_low_surrogate(ch)) {
133
            /* Invalid: low surrogate without preceding high surrogate */
134
5.79k
            ch = L'\uFFFD';
135
5.79k
            goto invalid_sequence;
136
5.79k
        }
137
93.6k
        else if (u8_high_surrogate(ch)) {
138
            /* Handle a surrogate pair.
139
             * Note that i is already incremented with 4 here. */
140
56.1k
            if (i + 2 < size && str[i] == '\\' && str[i + 1] == 'u') {
141
28.5k
                dno = 0;
142
28.5k
                i += 2; /* Skip "\u" */
143
122k
                while (i < size && hex_digit(str[i]) && dno < 4) {
144
94.3k
                    ldigs[dno++] = str[i++];
145
94.3k
                }
146
28.5k
                if (dno != 4) {
147
                    /* Incomplete low surrogate */
148
7.39k
                    if (dno > 0) {
149
4.52k
                        ch = L'\uFFFD';
150
4.52k
                        goto invalid_sequence;
151
4.52k
                    }
152
7.39k
                }
153
24.0k
                low = strtol(ldigs, NULL, 16);
154
24.0k
                if (u8_low_surrogate(low)) {
155
13.4k
                    ch = u8_combine_surrogates(ch, low);
156
13.4k
                }
157
10.5k
                else {
158
                    /* Invalid: high surrogate not followed by low surrogate */
159
10.5k
                    ch = L'\uFFFD';
160
10.5k
                    goto invalid_sequence;
161
10.5k
                }
162
24.0k
            }
163
27.6k
            else {
164
                /* Invalid: high surrogate not followed by \u */
165
27.6k
                ch = L'\uFFFD';
166
27.6k
                goto invalid_sequence;
167
27.6k
            }
168
56.1k
        }
169
99.4k
    }
170
274k
    else if (str[0] == 'U') {
171
286k
        while (i < size && hex_digit(str[i]) && dno < 8) {
172
38.7k
            digs[dno++] = str[i++];
173
38.7k
        }
174
248k
        if (dno > 0) {
175
8.80k
            ch = strtol(digs, NULL, 16);
176
8.80k
        }
177
248k
    }
178
179
437k
invalid_sequence:
180
181
437k
    *dest = ch;
182
183
437k
    return i;
184
437k
}
185
186
int flb_unescape_string_utf8(const char *in_buf, int sz, char *out_buf)
187
7.79M
{
188
7.79M
    uint32_t ch;
189
7.79M
    char temp[4];
190
7.79M
    const char *end;
191
7.79M
    const char *next;
192
7.79M
                int size;
193
194
195
7.79M
    int count_out = 0;
196
7.79M
    int count_in = 0;
197
7.79M
    int esc_in = 0;
198
7.79M
    int esc_out = 0;
199
200
7.79M
    end = in_buf + sz;
201
144M
    while (in_buf < end && *in_buf && count_in < sz) {
202
137M
        next = in_buf + 1;
203
137M
        if (next < end && *in_buf == '\\') {
204
1.21M
            esc_in = 2;
205
1.21M
            switch (*next) {
206
142k
            case '"':
207
142k
                ch = '"';
208
142k
                break;
209
1.36k
            case '\'':
210
1.36k
                ch = '\'';
211
1.36k
                break;
212
570k
            case '\\':
213
570k
                ch = '\\';
214
570k
                break;
215
9.30k
            case '/':
216
9.30k
                ch = '/';
217
9.30k
                break;
218
7.25k
            case 'n':
219
7.25k
                ch = '\n';
220
7.25k
                break;
221
14.9k
            case 'b':
222
14.9k
                ch = '\b';
223
14.9k
                break;
224
9.34k
            case 't':
225
9.34k
                ch = '\t';
226
9.34k
                break;
227
13.7k
            case 'f':
228
13.7k
                ch = '\f';
229
13.7k
                break;
230
8.11k
            case 'r':
231
8.11k
                ch = '\r';
232
8.11k
                break;
233
437k
            default:
234
437k
                size = end - next;
235
437k
                if (size > 0) {
236
437k
                    esc_in = u8_read_escape_sequence(next, size, &ch) + 1;
237
437k
                }
238
0
                else {
239
                    /* because char is unsigned char by default on arm, so we need to do a explicit conversion */
240
0
                    ch = (uint32_t) (signed char) *in_buf;
241
0
                    esc_in = 1;
242
0
                }
243
1.21M
            }
244
1.21M
        }
245
135M
        else {
246
            /* explicit convert char to signed char */
247
135M
            ch = (uint32_t) (signed char) *in_buf;
248
135M
            esc_in = 1;
249
135M
        }
250
251
137M
        in_buf += esc_in;
252
137M
        count_in += esc_in;
253
254
137M
        esc_out = u8_wc_toutf8(temp, ch);
255
137M
        if (esc_out > sz-count_out) {
256
0
            flb_error("Crossing over string boundary");
257
0
            break;
258
0
        }
259
260
137M
        if (esc_out == 0) {
261
7.36M
            out_buf[count_out] = ch;
262
7.36M
            esc_out = 1;
263
7.36M
        }
264
129M
        else if (esc_out == 1) {
265
129M
            out_buf[count_out] = (char) temp[0];
266
129M
        }
267
109k
        else {
268
109k
            memcpy(&out_buf[count_out], temp, esc_out);
269
109k
        }
270
137M
        count_out += esc_out;
271
137M
    }
272
7.79M
    if (count_in < sz) {
273
1.95k
        flb_error("Not at boundary but still NULL terminating : %d - '%s'", sz, in_buf);
274
1.95k
    }
275
7.79M
    out_buf[count_out] = '\0';
276
7.79M
    return count_out;
277
7.79M
}
278
279
int flb_unescape_string(const char *buf, int buf_len, char **unesc_buf)
280
13.5k
{
281
13.5k
    int i = 0;
282
13.5k
    int j = 0;
283
13.5k
    char *p;
284
13.5k
    char n;
285
286
13.5k
    p = *unesc_buf;
287
3.62M
    while (i < buf_len) {
288
3.60M
        if (buf[i] == '\\') {
289
35.9k
            if (i + 1 < buf_len) {
290
34.9k
                n = buf[i + 1];
291
34.9k
                if (n == 'n') {
292
780
                    p[j++] = '\n';
293
780
                    i++;
294
780
                }
295
34.1k
                else if (n == 'a') {
296
622
                    p[j++] = '\a';
297
622
                    i++;
298
622
                }
299
33.5k
                else if (n == 'b') {
300
956
                    p[j++] = '\b';
301
956
                    i++;
302
956
                }
303
32.6k
                else if (n == 't') {
304
4.47k
                    p[j++] = '\t';
305
4.47k
                    i++;
306
4.47k
                }
307
28.1k
                else if (n == 'v') {
308
467
                    p[j++] = '\v';
309
467
                    i++;
310
467
                }
311
27.6k
                else if (n == 'f') {
312
932
                    p[j++] = '\f';
313
932
                    i++;
314
932
                }
315
26.7k
                else if (n == 'r') {
316
1.07k
                    p[j++] = '\r';
317
1.07k
                    i++;
318
1.07k
                }
319
25.6k
                else if (n == '\\') {
320
5.79k
                    p[j++] = '\\';
321
5.79k
                    i++;
322
5.79k
                }
323
34.9k
                i++;
324
34.9k
                continue;
325
34.9k
            }
326
990
            else {
327
990
                i++;
328
990
            }
329
35.9k
        }
330
3.57M
        p[j++] = buf[i++];
331
3.57M
    }
332
13.5k
    p[j] = '\0';
333
13.5k
    return j;
334
13.5k
}
335
336
337
/* mysql unquote */
338
int flb_mysql_unquote_string(char *buf, int buf_len, char **unesc_buf)
339
1.85k
{
340
1.85k
    int i = 0;
341
1.85k
    int j = 0;
342
1.85k
    char *p;
343
1.85k
    char n;
344
345
1.85k
    p = *unesc_buf;
346
891k
    while (i < buf_len) {
347
889k
        if ((n = buf[i++]) != '\\') {
348
776k
            p[j++] = n;
349
776k
        } else if(i >= buf_len) {
350
597
            p[j++] = n;
351
112k
        } else {
352
112k
            n = buf[i++];
353
112k
            switch(n) {
354
1.33k
            case 'n':
355
1.33k
                p[j++] = '\n';
356
1.33k
                break;
357
6.51k
            case 'r':
358
6.51k
                p[j++] = '\r';
359
6.51k
                break;
360
747
            case 't':
361
747
                p[j++] = '\t';
362
747
                break;
363
3.24k
            case '\\':
364
3.24k
                p[j++] = '\\';
365
3.24k
                break;
366
1.13k
            case '\'':
367
1.13k
                p[j++] = '\'';
368
1.13k
                break;
369
92.9k
            case '\"':
370
92.9k
                p[j++] = '\"';
371
92.9k
                break;
372
2.33k
            case '0':
373
2.33k
                p[j++] = 0;
374
2.33k
                break;
375
684
            case 'Z':
376
684
                p[j++] = 0x1a;
377
684
                break;
378
3.57k
            default:
379
3.57k
                p[j++] = '\\';
380
3.57k
                p[j++] = n;
381
3.57k
                break;
382
112k
            }
383
112k
        }
384
889k
    }
385
1.85k
    p[j] = '\0';
386
1.85k
    return j;
387
1.85k
}