/src/llama.cpp/common/unicode.cpp
Line | Count | Source |
1 | | #include "unicode.h" |
2 | | |
3 | | #include <algorithm> |
4 | | #include <cassert> |
5 | | #include <stdexcept> |
6 | | #include <string> |
7 | | #include <vector> |
8 | | |
9 | | // implementation adopted from src/unicode.cpp |
10 | | |
// Returns how many bytes a UTF-8 sequence occupies, judged only by the top
// four bits of its first byte:
//   0x0..0xB -> 1 (ASCII, and also stray continuation bytes)
//   0xC..0xD -> 2
//   0xE      -> 3
//   0xF      -> 4
size_t common_utf8_sequence_length(unsigned char first_byte) {
    switch (first_byte >> 4) {
        case 0xF:
            return 4;
        case 0xE:
            return 3;
        case 0xC:
        case 0xD:
            return 2;
        default:
            return 1;
    }
}
16 | | |
17 | 0 | utf8_parse_result common_parse_utf8_codepoint(std::string_view input, size_t offset) { |
18 | 0 | if (offset >= input.size()) { |
19 | 0 | return utf8_parse_result(utf8_parse_result::INCOMPLETE); |
20 | 0 | } |
21 | | |
22 | | // ASCII fast path |
23 | 0 | if (!(input[offset] & 0x80)) { |
24 | 0 | return utf8_parse_result(utf8_parse_result::SUCCESS, input[offset], 1); |
25 | 0 | } |
26 | | |
27 | | // Invalid: continuation byte as first byte |
28 | 0 | if (!(input[offset] & 0x40)) { |
29 | 0 | return utf8_parse_result(utf8_parse_result::INVALID); |
30 | 0 | } |
31 | | |
32 | | // 2-byte sequence |
33 | 0 | if (!(input[offset] & 0x20)) { |
34 | 0 | if (offset + 1 >= input.size()) { |
35 | 0 | return utf8_parse_result(utf8_parse_result::INCOMPLETE); |
36 | 0 | } |
37 | 0 | if ((input[offset + 1] & 0xc0) != 0x80) { |
38 | 0 | return utf8_parse_result(utf8_parse_result::INVALID); |
39 | 0 | } |
40 | 0 | auto result = ((input[offset] & 0x1f) << 6) | (input[offset + 1] & 0x3f); |
41 | 0 | return utf8_parse_result(utf8_parse_result::SUCCESS, result, 2); |
42 | 0 | } |
43 | | |
44 | | // 3-byte sequence |
45 | 0 | if (!(input[offset] & 0x10)) { |
46 | 0 | if (offset + 2 >= input.size()) { |
47 | 0 | return utf8_parse_result(utf8_parse_result::INCOMPLETE); |
48 | 0 | } |
49 | 0 | if ((input[offset + 1] & 0xc0) != 0x80 || (input[offset + 2] & 0xc0) != 0x80) { |
50 | 0 | return utf8_parse_result(utf8_parse_result::INVALID); |
51 | 0 | } |
52 | 0 | auto result = ((input[offset] & 0x0f) << 12) | ((input[offset + 1] & 0x3f) << 6) | (input[offset + 2] & 0x3f); |
53 | 0 | return utf8_parse_result(utf8_parse_result::SUCCESS, result, 3); |
54 | 0 | } |
55 | | |
56 | | // 4-byte sequence |
57 | 0 | if (!(input[offset] & 0x08)) { |
58 | 0 | if (offset + 3 >= input.size()) { |
59 | 0 | return utf8_parse_result(utf8_parse_result::INCOMPLETE); |
60 | 0 | } |
61 | 0 | if ((input[offset + 1] & 0xc0) != 0x80 || (input[offset + 2] & 0xc0) != 0x80 || (input[offset + 3] & 0xc0) != 0x80) { |
62 | 0 | return utf8_parse_result(utf8_parse_result::INVALID); |
63 | 0 | } |
64 | 0 | auto result = ((input[offset] & 0x07) << 18) | ((input[offset + 1] & 0x3f) << 12) | ((input[offset + 2] & 0x3f) << 6) | (input[offset + 3] & 0x3f); |
65 | 0 | return utf8_parse_result(utf8_parse_result::SUCCESS, result, 4); |
66 | 0 | } |
67 | | |
68 | | // Invalid first byte |
69 | 0 | return utf8_parse_result(utf8_parse_result::INVALID); |
70 | 0 | } |
71 | | |
// Reports whether s ends on a UTF-8 sequence boundary.
//
// Walks backwards over at most the last four bytes looking for the nearest
// non-continuation byte. If one is found, the answer is whether the bytes from
// there to the end cover the length that byte announces. An empty string
// counts as complete; a tail made only of continuation bytes does not.
bool common_utf8_is_complete(const std::string & s) {
    const size_t len    = s.size();
    const int    window = std::min(4, static_cast<int>(len));

    for (int back = 1; back <= window; ++back) {
        const unsigned char byte = s[len - back];

        if ((byte & 0xC0) == 0x80) {
            continue; // continuation byte, keep looking for the lead byte
        }

        int needed = 1;
        if (byte >= 0xF0) {
            needed = 4;
        } else if (byte >= 0xE0) {
            needed = 3;
        } else if (byte >= 0xC0) {
            needed = 2;
        }
        return back >= needed;
    }

    // Reached only when no lead byte was seen: true for "", false otherwise.
    return s.empty();
}
85 | | |
86 | 0 | std::string common_unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) { |
87 | 0 | std::string result; |
88 | 0 | for (size_t i = 0; i < cps.size(); ++i) { |
89 | 0 | result.append(common_unicode_cpt_to_utf8(cps[i])); |
90 | 0 | } |
91 | 0 | return result; |
92 | 0 | } |
93 | | |
// Encodes one code point as UTF-8.
//
//   U+0000..U+007F     -> 1 byte
//   U+0080..U+07FF     -> 2 bytes
//   U+0800..U+FFFF     -> 3 bytes
//   U+10000..U+10FFFF  -> 4 bytes
//
// Throws std::invalid_argument for any value above 0x10ffff.
std::string common_unicode_cpt_to_utf8(uint32_t cpt) {
    if (cpt > 0x10ffff) {
        throw std::invalid_argument("invalid codepoint");
    }

    std::string utf8;

    if (cpt < 0x80) {
        utf8 += static_cast<char>(cpt);
    } else if (cpt < 0x800) {
        utf8 += static_cast<char>(0xc0 | ((cpt >> 6) & 0x1f));
        utf8 += static_cast<char>(0x80 | (cpt & 0x3f));
    } else if (cpt < 0x10000) {
        utf8 += static_cast<char>(0xe0 | ((cpt >> 12) & 0x0f));
        utf8 += static_cast<char>(0x80 | ((cpt >> 6) & 0x3f));
        utf8 += static_cast<char>(0x80 | (cpt & 0x3f));
    } else {
        utf8 += static_cast<char>(0xf0 | ((cpt >> 18) & 0x07));
        utf8 += static_cast<char>(0x80 | ((cpt >> 12) & 0x3f));
        utf8 += static_cast<char>(0x80 | ((cpt >> 6) & 0x3f));
        utf8 += static_cast<char>(0x80 | (cpt & 0x3f));
    }

    return utf8;
}
122 | | |
123 | | |
124 | | |