Coverage Report

Created: 2025-09-05 06:51

/src/boost/boost/json/detail/utf8.hpp
Line
Count
Source (jump to first uncovered line)
1
//
2
// Copyright (c) 2020 Krystian Stasiowski (sdkrystian@gmail.com)
3
//
4
// Distributed under the Boost Software License, Version 1.0. (See accompanying
5
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
6
//
7
// Official repository: https://github.com/boostorg/json
8
//
9
10
#ifndef BOOST_JSON_DETAIL_UTF8_HPP
11
#define BOOST_JSON_DETAIL_UTF8_HPP
12
13
#include <boost/endian/conversion.hpp>
14
#include <boost/json/detail/config.hpp>
15
16
#include <cstddef>
17
#include <cstring>
18
#include <cstdint>
19
20
namespace boost {
21
namespace json {
22
namespace detail {
23
24
template<int N>
25
std::uint32_t
26
load_little_endian(void const* p)
27
1.03M
{
28
1.03M
    std::uint32_t v = 0;
29
1.03M
    std::memcpy(&v, p, N);
30
1.03M
    endian::little_to_native_inplace(v);
31
1.03M
    return v;
32
1.03M
}
unsigned int boost::json::detail::load_little_endian<2>(void const*)
Line
Count
Source
27
99.0k
{
28
99.0k
    std::uint32_t v = 0;
29
99.0k
    std::memcpy(&v, p, N);
30
99.0k
    endian::little_to_native_inplace(v);
31
99.0k
    return v;
32
99.0k
}
unsigned int boost::json::detail::load_little_endian<3>(void const*)
Line
Count
Source
27
16.0k
{
28
16.0k
    std::uint32_t v = 0;
29
16.0k
    std::memcpy(&v, p, N);
30
16.0k
    endian::little_to_native_inplace(v);
31
16.0k
    return v;
32
16.0k
}
unsigned int boost::json::detail::load_little_endian<4>(void const*)
Line
Count
Source
27
917k
{
28
917k
    std::uint32_t v = 0;
29
917k
    std::memcpy(&v, p, N);
30
917k
    endian::little_to_native_inplace(v);
31
917k
    return v;
32
917k
}
33
34
// Classify the lead byte of a multi-byte UTF-8 sequence.
//
// The result packs two values:
//   - low byte  : total length of the sequence in bytes (2, 3 or 4)
//   - high byte : a class number (1-7) that tells is_valid_utf8()
//                 which range the second byte must fall in
// A result of 0x000 means the byte cannot start a multi-byte sequence.
//
// Only the low seven bits of `c` are used to index the table, i.e. the
// table describes bytes 0x80-0xFF. A byte below 0x80 therefore receives
// the classification of (c | 0x80), so callers should only pass bytes
// that have the high bit set.
inline
uint16_t
classify_utf8(char c)
{
    // 0x000 = invalid
    // 0x102 = 2 bytes, second byte [80, BF]
    // 0x203 = 3 bytes, second byte [A0, BF]
    // 0x303 = 3 bytes, second byte [80, BF]
    // 0x403 = 3 bytes, second byte [80, 9F]
    // 0x504 = 4 bytes, second byte [90, BF]
    // 0x604 = 4 bytes, second byte [80, BF]
    // 0x704 = 4 bytes, second byte [80, 8F]
    static constexpr uint16_t first[128]
    {
       // 0x80 - 0xBF
       0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
       0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
       0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
       0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
       0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
       0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
       0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
       0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,

       // 0xC0 - 0xDF
       0x000, 0x000, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
       0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
       0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
       0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
       // 0xE0 - 0xEF
       0x203, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303,
       0x303, 0x303, 0x303, 0x303, 0x303, 0x403, 0x303, 0x303,
       // 0xF0 - 0xFF
       0x504, 0x604, 0x604, 0x604, 0x704, 0x000, 0x000, 0x000,
       0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
    };
    // Masking with 0x7F drops the high bit and keeps the index in
    // [0, 127] even when `char` is signed.
    return first[static_cast<unsigned char>(c & 0x7F)];
}
68
69
inline
70
bool
71
is_valid_utf8(const char* p, uint16_t first)
72
154k
{
73
154k
    uint32_t v;
74
154k
    switch(first >> 8)
75
154k
    {
76
303
    default:
77
303
        return false;
78
79
    // 2 bytes, second byte [80, BF]
80
99.0k
    case 1:
81
99.0k
        v = load_little_endian<2>(p);
82
99.0k
        return (v & 0xC000) == 0x8000;
83
84
    // 3 bytes, second byte [A0, BF]
85
10.0k
    case 2:
86
10.0k
        v = load_little_endian<3>(p);
87
10.0k
        return (v & 0xC0E000) == 0x80A000;
88
89
    // 3 bytes, second byte [80, BF]
90
3.47k
    case 3:
91
3.47k
        v = load_little_endian<3>(p);
92
3.47k
        return (v & 0xC0C000) == 0x808000;
93
94
    // 3 bytes, second byte [80, 9F]
95
2.55k
    case 4:
96
2.55k
        v = load_little_endian<3>(p);
97
2.55k
        return (v & 0xC0E000) == 0x808000;
98
99
    // 4 bytes, second byte [90, BF]
100
20.7k
    case 5:
101
20.7k
        v = load_little_endian<4>(p);
102
20.7k
        return (v & 0xC0C0FF00) + 0x7F7F7000 <= 0x2F00;
103
104
    // 4 bytes, second byte [80, BF]
105
17.1k
    case 6:
106
17.1k
        v = load_little_endian<4>(p);
107
17.1k
        return (v & 0xC0C0C000) == 0x80808000;
108
109
    // 4 bytes, second byte [80, 8F]
110
1.68k
    case 7:
111
1.68k
        v = load_little_endian<4>(p);
112
1.68k
        return (v & 0xC0C0F000) == 0x80808000;
113
154k
    }
114
154k
}
115
116
class utf8_sequence
117
{
118
    char seq_[4];
119
    uint16_t first_;
120
    uint8_t size_;
121
122
public:
123
    void
124
    save(
125
        const char* p,
126
        std::size_t remain) noexcept
127
1.07k
    {
128
1.07k
        first_ = classify_utf8(*p );
129
1.07k
        if(remain >= length())
130
996
            size_ = length();
131
75
        else
132
75
            size_ = static_cast<uint8_t>(remain);
133
1.07k
        std::memcpy(seq_, p, size_);
134
1.07k
    }
135
136
    uint8_t
137
    length() const noexcept
138
3.18k
    {
139
3.18k
        return first_ & 0xFF;
140
3.18k
    }
141
142
    bool
143
    complete() const noexcept
144
1.07k
    {
145
1.07k
        return size_ >= length();
146
1.07k
    }
147
148
    // returns true if complete
149
    bool
150
    append(
151
        const char* p,
152
        std::size_t remain) noexcept
153
16
    {
154
16
        if(BOOST_JSON_UNLIKELY(needed() == 0))
155
0
            return true;
156
16
        if(BOOST_JSON_LIKELY(remain >= needed()))
157
0
        {
158
0
            std::memcpy(
159
0
                seq_ + size_, p, needed());
160
0
            size_ = length();
161
0
            return true;
162
0
        }
163
16
        if(BOOST_JSON_LIKELY(remain > 0))
164
0
        {
165
0
            std::memcpy(seq_ + size_, p, remain);
166
0
            size_ += static_cast<uint8_t>(remain);
167
0
        }
168
16
        return false;
169
16
    }
170
171
    const char*
172
    data() const noexcept
173
0
    {
174
0
        return seq_;
175
0
    }
176
177
    uint8_t
178
    needed() const noexcept
179
48
    {
180
48
        return length() - size_;
181
48
    }
182
183
    bool
184
    valid() const noexcept
185
0
    {
186
0
        BOOST_ASSERT(size_ >= length());
187
0
        return is_valid_utf8(seq_, first_);
188
0
    }
189
};
190
191
} // detail
192
} // namespace json
193
} // namespace boost
194
195
#endif