/src/duckdb/third_party/utf8proc/utf8proc_wrapper.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | #include "utf8proc_wrapper.hpp" |
2 | | #include "utf8proc.hpp" |
3 | | |
4 | | using namespace std; |
5 | | |
6 | | namespace duckdb { |
7 | | |
8 | | // This function efficiently checks if a string is valid UTF8. |
9 | | // It was originally written by Sjoerd Mullender. |
10 | | |
11 | | // Here is the table that makes it work: |
12 | | |
13 | | // B = Number of Bytes in UTF8 encoding |
14 | | // C_MIN = First Unicode code point |
15 | | // C_MAX = Last Unicode code point |
16 | | // B1 = First Byte Prefix |
17 | | |
18 | | // B C_MIN C_MAX B1 |
19 | | // 1 U+000000 U+00007F 0xxxxxxx |
20 | | // 2 U+000080 U+0007FF 110xxxxx |
21 | | // 3 U+000800 U+00FFFF 1110xxxx |
22 | | // 4 U+010000 U+10FFFF 11110xxx |
23 | | |
24 | 3 | static void AssignInvalidUTF8Reason(UnicodeInvalidReason *invalid_reason, size_t *invalid_pos, size_t pos, UnicodeInvalidReason reason) { |
25 | 3 | if (invalid_reason) { |
26 | 0 | *invalid_reason = reason; |
27 | 0 | } |
28 | 3 | if (invalid_pos) { |
29 | 0 | *invalid_pos = pos; |
30 | 0 | } |
31 | 3 | } |
32 | | |
33 | 496 | UnicodeType Utf8Proc::Analyze(const char *s, size_t len, UnicodeInvalidReason *invalid_reason, size_t *invalid_pos) { |
34 | 496 | UnicodeType type = UnicodeType::ASCII; |
35 | 496 | char c; |
36 | 1.92k | for (size_t i = 0; i < len; i++) { |
37 | 1.43k | c = s[i]; |
38 | 1.43k | if (c == '\0') { |
39 | 1 | AssignInvalidUTF8Reason(invalid_reason, invalid_pos, i, UnicodeInvalidReason::NULL_BYTE); |
40 | 1 | return UnicodeType::INVALID; |
41 | 1 | } |
42 | | // 1 Byte / ASCII |
43 | 1.43k | if ((c & 0x80) == 0) { |
44 | 1.42k | continue; |
45 | 1.42k | } |
46 | 2 | type = UnicodeType::UNICODE; |
47 | 2 | if ((s[++i] & 0xC0) != 0x80) { |
48 | 2 | AssignInvalidUTF8Reason(invalid_reason, invalid_pos, i, UnicodeInvalidReason::BYTE_MISMATCH); |
49 | 2 | return UnicodeType::INVALID; |
50 | 2 | } |
51 | 0 | if ((c & 0xE0) == 0xC0) { |
52 | 0 | continue; |
53 | 0 | } |
54 | 0 | if ((s[++i] & 0xC0) != 0x80) { |
55 | 0 | AssignInvalidUTF8Reason(invalid_reason, invalid_pos, i, UnicodeInvalidReason::BYTE_MISMATCH); |
56 | 0 | return UnicodeType::INVALID; |
57 | 0 | } |
58 | 0 | if ((c & 0xF0) == 0xE0) { |
59 | 0 | continue; |
60 | 0 | } |
61 | 0 | if ((s[++i] & 0xC0) != 0x80) { |
62 | 0 | AssignInvalidUTF8Reason(invalid_reason, invalid_pos, i, UnicodeInvalidReason::BYTE_MISMATCH); |
63 | 0 | return UnicodeType::INVALID; |
64 | 0 | } |
65 | 0 | if ((c & 0xF8) == 0xF0) { |
66 | 0 | continue; |
67 | 0 | } |
68 | 0 | AssignInvalidUTF8Reason(invalid_reason, invalid_pos, i, UnicodeInvalidReason::BYTE_MISMATCH); |
69 | 0 | return UnicodeType::INVALID; |
70 | 0 | } |
71 | | |
72 | 493 | return type; |
73 | 496 | } |
74 | | |
75 | | |
76 | 0 | char* Utf8Proc::Normalize(const char *s, size_t len) { |
77 | 0 | assert(s); |
78 | 0 | assert(Utf8Proc::Analyze(s, len) != UnicodeType::INVALID); |
79 | 0 | return (char*) utf8proc_NFC((const utf8proc_uint8_t*) s, len); |
80 | 0 | } |
81 | | |
82 | 4 | bool Utf8Proc::IsValid(const char *s, size_t len) { |
83 | 4 | return Utf8Proc::Analyze(s, len) != UnicodeType::INVALID; |
84 | 4 | } |
85 | | |
86 | 4 | size_t Utf8Proc::NextGraphemeCluster(const char *s, size_t len, size_t cpos) { |
87 | 4 | return utf8proc_next_grapheme(s, len, cpos); |
88 | 4 | } |
89 | | |
90 | 0 | size_t Utf8Proc::PreviousGraphemeCluster(const char *s, size_t len, size_t cpos) { |
91 | 0 | if (!Utf8Proc::IsValid(s, len)) { |
92 | 0 | return cpos - 1; |
93 | 0 | } |
94 | 0 | size_t current_pos = 0; |
95 | 0 | while(true) { |
96 | 0 | size_t new_pos = NextGraphemeCluster(s, len, current_pos); |
97 | 0 | if (new_pos <= current_pos || new_pos >= cpos) { |
98 | 0 | return current_pos; |
99 | 0 | } |
100 | 0 | current_pos = new_pos; |
101 | 0 | } |
102 | 0 | } |
103 | | |
104 | 0 | bool Utf8Proc::CodepointToUtf8(int cp, int &sz, char *c) { |
105 | 0 | return utf8proc_codepoint_to_utf8(cp, sz, c); |
106 | 0 | } |
107 | | |
108 | 0 | int Utf8Proc::CodepointLength(int cp) { |
109 | 0 | return utf8proc_codepoint_length(cp); |
110 | 0 | } |
111 | | |
112 | 0 | int32_t Utf8Proc::UTF8ToCodepoint(const char *c, int &sz) { |
113 | 0 | return utf8proc_codepoint(c, sz); |
114 | 0 | } |
115 | | |
116 | 4 | size_t Utf8Proc::RenderWidth(const char *s, size_t len, size_t pos) { |
117 | 4 | int sz; |
118 | 4 | auto codepoint = duckdb::utf8proc_codepoint(s + pos, sz); |
119 | 4 | auto properties = duckdb::utf8proc_get_property(codepoint); |
120 | 4 | return properties->charwidth; |
121 | 4 | } |
122 | | |
123 | | } |