Coverage Report

Created: 2023-09-25 07:10

/src/jsonnet/core/string_utils.cpp
Line
Count
Source (jump to first uncovered line)
1
/*
2
Copyright 2015 Google Inc. All rights reserved.
3
4
Licensed under the Apache License, Version 2.0 (the "License");
5
you may not use this file except in compliance with the License.
6
You may obtain a copy of the License at
7
8
    http://www.apache.org/licenses/LICENSE-2.0
9
10
Unless required by applicable law or agreed to in writing, software
11
distributed under the License is distributed on an "AS IS" BASIS,
12
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
See the License for the specific language governing permissions and
14
limitations under the License.
15
*/
16
17
#include <iomanip>
18
19
#include "static_error.h"
20
#include "string_utils.h"
21
22
namespace jsonnet::internal {
23
24
UString jsonnet_string_unparse(const UString &str, bool single)
25
975k
{
26
975k
    UStringStream ss;
27
975k
    ss << (single ? U'\'' : U'\"');
28
975k
    ss << jsonnet_string_escape(str, single);
29
975k
    ss << (single ? U'\'' : U'\"');
30
975k
    return ss.str();
31
975k
}
32
33
UString jsonnet_string_escape(const UString &str, bool single)
34
975k
{
35
975k
    UStringStream ss;
36
1.60G
    for (std::size_t i = 0; i < str.length(); ++i) {
37
1.60G
        char32_t c = str[i];
38
1.60G
        switch (c) {
39
1.54M
            case U'\"': ss << (single ? U"\"" : U"\\\""); break;
40
2.15M
            case U'\'': ss << (single ? U"\\\'" : U"\'"); break;
41
1.30G
            case U'\\': ss << U"\\\\"; break;
42
1.08M
            case U'\b': ss << U"\\b"; break;
43
519k
            case U'\f': ss << U"\\f"; break;
44
37.4M
            case U'\n': ss << U"\\n"; break;
45
3.48M
            case U'\r': ss << U"\\r"; break;
46
1.44M
            case U'\t': ss << U"\\t"; break;
47
7.51k
            case U'\0': ss << U"\\u0000"; break;
48
254M
            default: {
49
254M
                if (c < 0x20 || (c >= 0x7f && c <= 0x9f)) {
50
                    // Unprintable, use \u
51
78.7M
                    std::stringstream ss8;
52
78.7M
                    ss8 << "\\u" << std::hex << std::setfill('0') << std::setw(4)
53
78.7M
                        << (unsigned long)(c);
54
78.7M
                    ss << decode_utf8(ss8.str());
55
175M
                } else {
56
                    // Printable, write verbatim
57
175M
                    ss << c;
58
175M
                }
59
254M
            }
60
1.60G
        }
61
1.60G
    }
62
975k
    return ss.str();
63
975k
}
64
65
unsigned long jsonnet_string_parse_unicode(const LocationRange &loc, const char32_t *c)
66
54.5k
{
67
54.5k
    unsigned long codepoint = 0;
68
    // Expect 4 hex digits.
69
272k
    for (unsigned i = 0; i < 4; ++i) {
70
218k
        auto x = (unsigned char)(c[i]);
71
218k
        unsigned digit;
72
218k
        if (x == '\0') {
73
58
            auto msg = "Truncated unicode escape sequence in string literal.";
74
58
            throw StaticError(loc, msg);
75
217k
        } else if (x >= '0' && x <= '9') {
76
173k
            digit = x - '0';
77
173k
        } else if (x >= 'a' && x <= 'f') {
78
16.3k
            digit = x - 'a' + 10;
79
28.5k
        } else if (x >= 'A' && x <= 'F') {
80
28.3k
            digit = x - 'A' + 10;
81
28.3k
        } else {
82
174
            std::stringstream ss;
83
174
            ss << "Malformed unicode escape character, "
84
174
               << "should be hex: '" << x << "'";
85
174
            throw StaticError(loc, ss.str());
86
174
        }
87
217k
        codepoint *= 16;
88
217k
        codepoint += digit;
89
217k
    }
90
54.3k
    return codepoint;
91
54.5k
}
92
93
bool is_bmp_codepoint(const unsigned long codepoint)
94
52.1k
{
95
52.1k
    return codepoint < 0xd800 || (codepoint >= 0xe000 && codepoint < 0x10000);
96
52.1k
}
97
98
char32_t decode_utf16_surrogates(const LocationRange &loc, const unsigned long high, const unsigned long low)
99
2.18k
{
100
2.18k
    if (high >= 0xd800 && high < 0xdc00 && low >= 0xdc00 && low < 0xe000) {
101
2.14k
        return 0x10000 + ((high & 0x03ff) << 10) + (low & 0x03ff);
102
2.14k
    } else {
103
48
        std::stringstream ss;
104
48
        ss << "Invalid UTF-16 bytes";
105
48
        throw StaticError(loc, ss.str());
106
48
    }
107
2.18k
}
108
109
UString jsonnet_string_unescape(const LocationRange &loc, const UString &s)
110
22.4M
{
111
22.4M
    UString r;
112
22.4M
    const char32_t *s_ptr = s.c_str();
113
290M
    for (const char32_t *c = s_ptr; *c != U'\0'; ++c) {
114
267M
        switch (*c) {
115
1.09M
            case '\\':
116
1.09M
                switch (*(++c)) {
117
42.0k
                    case '"':
118
44.7k
                    case '\'': r += *c; break;
119
120
270k
                    case '\\': r += *c; break;
121
122
5.38k
                    case '/': r += *c; break;
123
124
26.6k
                    case 'b': r += '\b'; break;
125
126
62.6k
                    case 'f': r += '\f'; break;
127
128
516k
                    case 'n': r += '\n'; break;
129
130
39.2k
                    case 'r': r += '\r'; break;
131
132
78.0k
                    case 't': r += '\t'; break;
133
134
52.3k
                    case 'u': {
135
52.3k
                        ++c;  // Consume the 'u'.
136
52.3k
                        unsigned long codepoint = jsonnet_string_parse_unicode(loc, c);
137
138
                        // Leave us on the last char, ready for the ++c at
139
                        // the outer for loop.
140
52.3k
                        c += 3;
141
52.3k
                        if (!is_bmp_codepoint(codepoint)) {
142
2.27k
                           if (*(++c) != '\\') {
143
53
                                std::stringstream ss;
144
53
                                ss << "Invalid non-BMP Unicode escape in string literal";
145
53
                                throw StaticError(loc, ss.str());
146
53
                           }
147
2.22k
                           if (*(++c) != 'u') {
148
12
                                std::stringstream ss;
149
12
                                ss << "Invalid non-BMP Unicode escape in string literal";
150
12
                                throw StaticError(loc, ss.str());
151
12
                           }
152
2.21k
                           ++c;
153
2.21k
                           unsigned long codepoint2 = jsonnet_string_parse_unicode(loc, c);
154
2.21k
                           c += 3;
155
2.21k
                           codepoint = decode_utf16_surrogates(loc, codepoint, codepoint2);
156
2.21k
                       }
157
52.3k
                       r += codepoint;
158
52.3k
                    } break;
159
160
4
                    case '\0': {
161
4
                        auto msg = "Truncated escape sequence in string literal.";
162
4
                        throw StaticError(loc, msg);
163
52.3k
                    }
164
165
308
                    default: {
166
308
                        std::stringstream ss;
167
308
                        std::string utf8;
168
308
                        encode_utf8(*c, utf8);
169
308
                        ss << "Unknown escape sequence in string literal: '" << utf8 << "'";
170
308
                        throw StaticError(loc, ss.str());
171
52.3k
                    }
172
1.09M
                }
173
1.09M
                break;
174
175
266M
            default:
176
                // Just a regular letter.
177
266M
                r += *c;
178
267M
        }
179
267M
    }
180
22.4M
    return r;
181
22.4M
}
182
183
}  // namespace jsonnet::internal