/src/mysql-server/strings/mb_wc.h
Line | Count | Source |
1 | | #ifndef MB_WC_INCLUDED |
2 | | #define MB_WC_INCLUDED |
3 | | |
4 | | /* Copyright (c) 2016, 2025, Oracle and/or its affiliates. |
5 | | |
6 | | This program is free software; you can redistribute it and/or modify |
7 | | it under the terms of the GNU General Public License, version 2.0, |
8 | | as published by the Free Software Foundation. |
9 | | |
10 | | This program is designed to work with certain software (including |
11 | | but not limited to OpenSSL) that is licensed under separate terms, |
12 | | as designated in a particular file or component or in included license |
13 | | documentation. The authors of MySQL hereby grant you an additional |
14 | | permission to link the program and your derivative works with the |
15 | | separately licensed software that they have either included with |
16 | | the program or referenced in the documentation. |
17 | | |
18 | | Without limiting anything contained in the foregoing, this file, |
19 | | which is part of C Driver for MySQL (Connector/C), is also subject to the |
20 | | Universal FOSS Exception, version 1.0, a copy of which can be found at |
21 | | http://oss.oracle.com/licenses/universal-foss-exception. |
22 | | |
23 | | This program is distributed in the hope that it will be useful, |
24 | | but WITHOUT ANY WARRANTY; without even the implied warranty of |
25 | | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
26 | | GNU General Public License, version 2.0, for more details. |
27 | | |
28 | | You should have received a copy of the GNU General Public License |
29 | | along with this program; if not, write to the Free Software |
30 | | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ |
31 | | |
32 | | /** |
33 | | @file mb_wc.h |
34 | | |
35 | | Definitions of mb_wc (multibyte to wide character, i.e., effectively |
36 | | “parse a UTF-8 character”) functions for UTF-8 (both three- and four-byte). |
37 | | These are available both as inline functions, as C-style thunks so that they |
38 | | can fit into MY_CHARSET_HANDLER, and as functors. |
39 | | |
40 | | The functors exist so that you can specialize a class on them and get them |
41 | | inlined instead of having to call them through the function pointer in |
42 | | MY_CHARSET_HANDLER; mb_wc is in itself so cheap (the most common case is |
43 | | just a single byte load and a predictable compare) that the call overhead |
44 | | in a tight loop is significant, and these routines tend to take up a lot |
45 | | of CPU time when sorting. Typically, at the outermost level, you'd simply |
46 | | compare cs->cset->mb_wc with my_mb_wc_{utf8mb3,utf8mb4}_thunk, and if they |
47 | | match, instantiate your function with the matching class. If they don't, |
48 | | you can use Mb_wc_through_function_pointer, which calls through the |
49 | | function pointer as usual. (It will cache the function pointer for you, |
50 | | which is typically faster than looking it up all the time -- the compiler |
51 | | cannot always figure out on its own that it doesn't change.) |
52 | | |
53 | | The Mb_wc_* classes should be sent by _value_, not by reference, since |
54 | | they are never larger than two pointers (and usually simply zero). |
55 | | */ |
56 | | #include <cstdint> |
57 | | |
58 | | #include <string.h> |
59 | | |
60 | | #include "my_compiler.h" // for ALWAYS_INLINE |
61 | | #include "my_config.h" |
62 | | #include "mysql/strings/m_ctype.h" |
63 | | |
64 | | template <bool RANGE_CHECK, bool SUPPORT_MB4> |
65 | | static int my_mb_wc_utf8_prototype(my_wc_t *pwc, const uint8_t *s, |
66 | | const uint8_t *e); |
67 | | |
68 | | static int my_mb_wc_utf8mb3(my_wc_t *pwc, const uint8_t *s, const uint8_t *e); |
69 | | static int my_mb_wc_utf8mb4(my_wc_t *pwc, const uint8_t *s, const uint8_t *e); |
70 | | |
71 | | /** |
72 | | Functor that converts a UTF-8 multibyte sequence (up to three bytes) |
73 | | to a wide character. |
74 | | */ |
75 | | struct Mb_wc_utf8mb3 { |
76 | | Mb_wc_utf8mb3() = default; |
77 | | |
78 | | ALWAYS_INLINE |
79 | 0 | int operator()(my_wc_t *pwc, const uint8_t *s, const uint8_t *e) const { |
80 | 0 | return my_mb_wc_utf8mb3(pwc, s, e); |
81 | 0 | } |
82 | | }; |
83 | | |
84 | | /** |
85 | | Functor that converts a UTF-8 multibyte sequence (up to four bytes) |
86 | | to a wide character. |
87 | | */ |
88 | | struct Mb_wc_utf8mb4 { |
89 | | Mb_wc_utf8mb4() = default; |
90 | | |
91 | | ALWAYS_INLINE |
92 | 0 | int operator()(my_wc_t *pwc, const uint8_t *s, const uint8_t *e) const { |
93 | 0 | return my_mb_wc_utf8mb4(pwc, s, e); |
94 | 0 | } |
95 | | }; |
96 | | |
97 | | /** |
98 | | Functor that uses a function pointer to convert a multibyte sequence |
99 | | to a wide character. |
100 | | */ |
101 | | class Mb_wc_through_function_pointer { |
102 | | public: |
103 | | explicit Mb_wc_through_function_pointer(const CHARSET_INFO *cs) |
104 | 0 | : m_funcptr(cs->cset->mb_wc), m_cs(cs) {} |
105 | | |
106 | 0 | int operator()(my_wc_t *pwc, const uint8_t *s, const uint8_t *e) const { |
107 | 0 | return m_funcptr(m_cs, pwc, s, e); |
108 | 0 | } |
109 | | |
110 | | private: |
111 | | typedef int (*mbwc_func_t)(const CHARSET_INFO *, my_wc_t *, const uint8_t *, |
112 | | const uint8_t *); |
113 | | |
114 | | const mbwc_func_t m_funcptr; |
115 | | const CHARSET_INFO *const m_cs; |
116 | | }; |
117 | | |
118 | | template <bool RANGE_CHECK, bool SUPPORT_MB4> |
119 | | static ALWAYS_INLINE int my_mb_wc_utf8_prototype(my_wc_t *pwc, const uint8_t *s, |
120 | 0 | const uint8_t *e) { |
121 | 0 | if (RANGE_CHECK && s >= e) return MY_CS_TOOSMALL; |
122 | | |
123 | 0 | uint8_t c = s[0]; |
124 | 0 | if (c < 0x80) { |
125 | 0 | *pwc = c; |
126 | 0 | return 1; |
127 | 0 | } |
128 | | |
129 | 0 | if (c < 0xe0) { |
130 | 0 | if (c < 0xc2) // Resulting code point would be less than 0x80. |
131 | 0 | return MY_CS_ILSEQ; |
132 | | |
133 | 0 | if (RANGE_CHECK && s + 2 > e) return MY_CS_TOOSMALL2; |
134 | | |
135 | 0 | if ((s[1] & 0xc0) != 0x80) // Next byte must be a continuation byte. |
136 | 0 | return MY_CS_ILSEQ; |
137 | | |
138 | 0 | *pwc = ((my_wc_t)(c & 0x1f) << 6) + (my_wc_t)(s[1] & 0x3f); |
139 | 0 | return 2; |
140 | 0 | } |
141 | | |
142 | 0 | if (c < 0xf0) { |
143 | 0 | if (RANGE_CHECK && s + 3 > e) return MY_CS_TOOSMALL3; |
144 | | |
145 | | // Next two bytes must be continuation bytes. |
146 | 0 | uint16_t two_bytes = 0; |
147 | 0 | memcpy(&two_bytes, s + 1, sizeof(two_bytes)); |
148 | 0 | if ((two_bytes & 0xc0c0) != 0x8080) // Endianness does not matter. |
149 | 0 | return MY_CS_ILSEQ; |
150 | | |
151 | 0 | *pwc = ((my_wc_t)(c & 0x0f) << 12) + ((my_wc_t)(s[1] & 0x3f) << 6) + |
152 | 0 | (my_wc_t)(s[2] & 0x3f); |
153 | 0 | if (*pwc < 0x800) return MY_CS_ILSEQ; |
154 | | /* |
155 | | According to RFC 3629, UTF-8 should prohibit characters between |
156 | | U+D800 and U+DFFF, which are reserved for surrogate pairs and do |
157 | | not directly represent characters. |
158 | | */ |
159 | 0 | if (*pwc >= 0xd800 && *pwc <= 0xdfff) return MY_CS_ILSEQ; |
160 | 0 | return 3; |
161 | 0 | } |
162 | | |
163 | 0 | if (SUPPORT_MB4) { |
164 | 0 | if (RANGE_CHECK && s + 4 > e) /* We need 4 characters */ |
165 | 0 | return MY_CS_TOOSMALL4; |
166 | | |
167 | | /* |
168 | | This byte must be of the form 11110xxx, and the next three bytes |
169 | | must be continuation bytes. |
170 | | */ |
171 | 0 | uint32_t four_bytes = 0; |
172 | 0 | memcpy(&four_bytes, s, sizeof(four_bytes)); |
173 | | #ifdef WORDS_BIGENDIAN |
174 | | if ((four_bytes & 0xf8c0c0c0) != 0xf0808080) |
175 | | #else |
176 | 0 | if ((four_bytes & 0xc0c0c0f8) != 0x808080f0) |
177 | 0 | #endif |
178 | 0 | return MY_CS_ILSEQ; |
179 | | |
180 | 0 | *pwc = ((my_wc_t)(c & 0x07) << 18) + ((my_wc_t)(s[1] & 0x3f) << 12) + |
181 | 0 | ((my_wc_t)(s[2] & 0x3f) << 6) + (my_wc_t)(s[3] & 0x3f); |
182 | 0 | if (*pwc < 0x10000 || *pwc > 0x10ffff) return MY_CS_ILSEQ; |
183 | 0 | return 4; |
184 | 0 | } |
185 | | |
186 | 0 | return MY_CS_ILSEQ; |
187 | 0 | } Unexecuted instantiation: my_error.cc:int my_mb_wc_utf8_prototype<true, true>(unsigned long*, unsigned char const*, unsigned char const*) Unexecuted instantiation: my_error.cc:int my_mb_wc_utf8_prototype<true, false>(unsigned long*, unsigned char const*, unsigned char const*) Unexecuted instantiation: ctype-uca.cc:int my_mb_wc_utf8_prototype<true, true>(unsigned long*, unsigned char const*, unsigned char const*) Unexecuted instantiation: ctype-uca.cc:int my_mb_wc_utf8_prototype<true, false>(unsigned long*, unsigned char const*, unsigned char const*) Unexecuted instantiation: ctype-utf8.cc:int my_mb_wc_utf8_prototype<true, false>(unsigned long*, unsigned char const*, unsigned char const*) Unexecuted instantiation: ctype-utf8.cc:int my_mb_wc_utf8_prototype<false, false>(unsigned long*, unsigned char const*, unsigned char const*) Unexecuted instantiation: ctype-utf8.cc:int my_mb_wc_utf8_prototype<true, true>(unsigned long*, unsigned char const*, unsigned char const*) Unexecuted instantiation: ctype-utf8.cc:int my_mb_wc_utf8_prototype<false, true>(unsigned long*, unsigned char const*, unsigned char const*) |
188 | | |
189 | | /** |
190 | | Parses a single UTF-8 character from a byte string. |
191 | | |
192 | | @param[out] pwc the parsed character, if any |
193 | | @param s the string to read from |
194 | | @param e the end of the string; will not read past this |
195 | | |
196 | | @return the number of bytes read from s, or a value <= 0 for failure |
197 | | (see m_ctype.h) |
198 | | */ |
199 | | static inline int my_mb_wc_utf8mb3(my_wc_t *pwc, const uint8_t *s, |
200 | 0 | const uint8_t *e) { |
201 | 0 | return my_mb_wc_utf8_prototype</*RANGE_CHECK=*/true, /*SUPPORT_MB4=*/false>( |
202 | 0 | pwc, s, e); |
203 | 0 | } Unexecuted instantiation: my_error.cc:my_mb_wc_utf8mb3(unsigned long*, unsigned char const*, unsigned char const*) Unexecuted instantiation: ctype-uca.cc:my_mb_wc_utf8mb3(unsigned long*, unsigned char const*, unsigned char const*) Unexecuted instantiation: ctype-utf8.cc:my_mb_wc_utf8mb3(unsigned long*, unsigned char const*, unsigned char const*) |
204 | | |
205 | | /** |
206 | | Parses a single UTF-8 character from a byte string. The difference |
207 | | between this and my_mb_wc_utf8mb3 is that this function also can handle |
208 | | four-byte UTF-8 characters. |
209 | | |
210 | | @param[out] pwc the parsed character, if any |
211 | | @param s the string to read from |
212 | | @param e the end of the string; will not read past this |
213 | | |
214 | | @return the number of bytes read from s, or a value <= 0 for failure |
215 | | (see m_ctype.h) |
216 | | */ |
217 | | static ALWAYS_INLINE int my_mb_wc_utf8mb4(my_wc_t *pwc, const uint8_t *s, |
218 | 0 | const uint8_t *e) { |
219 | 0 | return my_mb_wc_utf8_prototype</*RANGE_CHECK=*/true, /*SUPPORT_MB4=*/true>( |
220 | 0 | pwc, s, e); |
221 | 0 | } Unexecuted instantiation: my_error.cc:my_mb_wc_utf8mb4(unsigned long*, unsigned char const*, unsigned char const*) Unexecuted instantiation: ctype-uca.cc:my_mb_wc_utf8mb4(unsigned long*, unsigned char const*, unsigned char const*) Unexecuted instantiation: ctype-utf8.cc:my_mb_wc_utf8mb4(unsigned long*, unsigned char const*, unsigned char const*) |
222 | | |
223 | | // Non-inlined versions of the above. These are used as function pointers |
224 | | // in MY_CHARSET_HANDLER structs, and you can compare against them to see |
225 | | // if using the Mb_wc_utf8* functors would be appropriate. |
226 | | |
227 | | extern "C" int my_mb_wc_utf8mb3_thunk(const CHARSET_INFO *cs, my_wc_t *pwc, |
228 | | const uint8_t *s, const uint8_t *e); |
229 | | |
230 | | extern "C" int my_mb_wc_utf8mb4_thunk(const CHARSET_INFO *cs, my_wc_t *pwc, |
231 | | const uint8_t *s, const uint8_t *e); |
232 | | |
233 | | #endif // MB_WC_INCLUDED |