/src/llama.cpp/common/unicode.cpp
Line | Count | Source |
1 | | #include "unicode.h" |
2 | | |
3 | | // implementation adopted from src/unicode.cpp |
4 | | |
// Returns the length in bytes (1-4) of the UTF-8 sequence announced by
// `first_byte`, decided solely by the byte's upper four bits.
//
// No validation is performed: bytes whose high nibble is 0x8-0xB
// (continuation bytes) report 1, and every byte with high nibble 0xF
// (0xF0-0xFF) reports 4.
size_t utf8_sequence_length(unsigned char first_byte) {
    switch (first_byte >> 4) {
        case 0xC:
        case 0xD:
            return 2;  // 110xxxxx
        case 0xE:
            return 3;  // 1110xxxx
        case 0xF:
            return 4;  // 1111xxxx
        default:
            return 1;  // 0xxxxxxx (ASCII) and 10xxxxxx (continuation)
    }
}
10 | | |
11 | 0 | utf8_parse_result parse_utf8_codepoint(std::string_view input, size_t offset) { |
12 | 0 | if (offset >= input.size()) { |
13 | 0 | return utf8_parse_result(utf8_parse_result::INCOMPLETE); |
14 | 0 | } |
15 | | |
16 | | // ASCII fast path |
17 | 0 | if (!(input[offset] & 0x80)) { |
18 | 0 | return utf8_parse_result(utf8_parse_result::SUCCESS, input[offset], 1); |
19 | 0 | } |
20 | | |
21 | | // Invalid: continuation byte as first byte |
22 | 0 | if (!(input[offset] & 0x40)) { |
23 | 0 | return utf8_parse_result(utf8_parse_result::INVALID); |
24 | 0 | } |
25 | | |
26 | | // 2-byte sequence |
27 | 0 | if (!(input[offset] & 0x20)) { |
28 | 0 | if (offset + 1 >= input.size()) { |
29 | 0 | return utf8_parse_result(utf8_parse_result::INCOMPLETE); |
30 | 0 | } |
31 | 0 | if ((input[offset + 1] & 0xc0) != 0x80) { |
32 | 0 | return utf8_parse_result(utf8_parse_result::INVALID); |
33 | 0 | } |
34 | 0 | auto result = ((input[offset] & 0x1f) << 6) | (input[offset + 1] & 0x3f); |
35 | 0 | return utf8_parse_result(utf8_parse_result::SUCCESS, result, 2); |
36 | 0 | } |
37 | | |
38 | | // 3-byte sequence |
39 | 0 | if (!(input[offset] & 0x10)) { |
40 | 0 | if (offset + 2 >= input.size()) { |
41 | 0 | return utf8_parse_result(utf8_parse_result::INCOMPLETE); |
42 | 0 | } |
43 | 0 | if ((input[offset + 1] & 0xc0) != 0x80 || (input[offset + 2] & 0xc0) != 0x80) { |
44 | 0 | return utf8_parse_result(utf8_parse_result::INVALID); |
45 | 0 | } |
46 | 0 | auto result = ((input[offset] & 0x0f) << 12) | ((input[offset + 1] & 0x3f) << 6) | (input[offset + 2] & 0x3f); |
47 | 0 | return utf8_parse_result(utf8_parse_result::SUCCESS, result, 3); |
48 | 0 | } |
49 | | |
50 | | // 4-byte sequence |
51 | 0 | if (!(input[offset] & 0x08)) { |
52 | 0 | if (offset + 3 >= input.size()) { |
53 | 0 | return utf8_parse_result(utf8_parse_result::INCOMPLETE); |
54 | 0 | } |
55 | 0 | if ((input[offset + 1] & 0xc0) != 0x80 || (input[offset + 2] & 0xc0) != 0x80 || (input[offset + 3] & 0xc0) != 0x80) { |
56 | 0 | return utf8_parse_result(utf8_parse_result::INVALID); |
57 | 0 | } |
58 | 0 | auto result = ((input[offset] & 0x07) << 18) | ((input[offset + 1] & 0x3f) << 12) | ((input[offset + 2] & 0x3f) << 6) | (input[offset + 3] & 0x3f); |
59 | 0 | return utf8_parse_result(utf8_parse_result::SUCCESS, result, 4); |
60 | 0 | } |
61 | | |
62 | | // Invalid first byte |
63 | 0 | return utf8_parse_result(utf8_parse_result::INVALID); |
64 | 0 | } |