/src/samba/lib/util/charset/util_unistr.c
Line | Count | Source |
1 | | /* |
2 | | Unix SMB/CIFS implementation. |
3 | | Samba utility functions |
4 | | Copyright (C) Andrew Tridgell 1992-2001 |
5 | | Copyright (C) Simo Sorce 2001 |
6 | | |
7 | | This program is free software; you can redistribute it and/or modify |
8 | | it under the terms of the GNU General Public License as published by |
9 | | the Free Software Foundation; either version 3 of the License, or |
10 | | (at your option) any later version. |
11 | | |
12 | | This program is distributed in the hope that it will be useful, |
13 | | but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 | | GNU General Public License for more details. |
16 | | |
17 | | You should have received a copy of the GNU General Public License |
18 | | along with this program. If not, see <http://www.gnu.org/licenses/>. |
19 | | */ |
20 | | |
21 | | #include "replace.h" |
22 | | #include "system/locale.h" |
23 | | #include "charset.h" |
24 | | #include "lib/util/byteorder.h" |
25 | | #include "lib/util/fault.h" |
26 | | #include "lib/util/tsort.h" |
27 | | |
28 | | /** |
29 | | String replace. |
30 | | NOTE: oldc and newc must be 7 bit characters |
31 | | **/ |
32 | | _PUBLIC_ void string_replace_m(char *s, char oldc, char newc) |
33 | 0 | { |
34 | 0 | struct smb_iconv_handle *ic = get_iconv_handle(); |
35 | 0 | while (s && *s) { |
36 | 0 | size_t size; |
37 | 0 | codepoint_t c = next_codepoint_handle(ic, s, &size); |
38 | 0 | if (c == oldc) { |
39 | 0 | *s = newc; |
40 | 0 | } |
41 | 0 | s += size; |
42 | 0 | } |
43 | 0 | } |
44 | | |
45 | | /** |
46 | | Convert a string to lower case, allocated with talloc |
47 | | **/ |
48 | | _PUBLIC_ char *strlower_talloc_handle(struct smb_iconv_handle *iconv_handle, |
49 | | TALLOC_CTX *ctx, const char *src) |
50 | 0 | { |
51 | 0 | size_t size=0; |
52 | 0 | char *dest; |
53 | |
|
54 | 0 | if(src == NULL) { |
55 | 0 | return NULL; |
56 | 0 | } |
57 | | |
58 | | /* this takes advantage of the fact that upper/lower can't |
59 | | change the length of a character by more than 1 byte */ |
60 | 0 | dest = talloc_array(ctx, char, 2*(strlen(src))+1); |
61 | 0 | if (dest == NULL) { |
62 | 0 | return NULL; |
63 | 0 | } |
64 | | |
65 | 0 | while (*src) { |
66 | 0 | size_t c_size; |
67 | 0 | codepoint_t c = next_codepoint_handle(iconv_handle, src, &c_size); |
68 | 0 | src += c_size; |
69 | |
|
70 | 0 | c = tolower_m(c); |
71 | |
|
72 | 0 | c_size = push_codepoint_handle(iconv_handle, dest+size, c); |
73 | 0 | if (c_size == -1) { |
74 | 0 | talloc_free(dest); |
75 | 0 | return NULL; |
76 | 0 | } |
77 | 0 | size += c_size; |
78 | 0 | } |
79 | | |
80 | 0 | dest[size] = 0; |
81 | | |
82 | | /* trim it so talloc_append_string() works */ |
83 | 0 | dest = talloc_realloc(ctx, dest, char, size+1); |
84 | |
|
85 | 0 | talloc_set_name_const(dest, dest); |
86 | |
|
87 | 0 | return dest; |
88 | 0 | } |
89 | | |
90 | | _PUBLIC_ char *strlower_talloc(TALLOC_CTX *ctx, const char *src) |
91 | 0 | { |
92 | 0 | struct smb_iconv_handle *iconv_handle = get_iconv_handle(); |
93 | 0 | return strlower_talloc_handle(iconv_handle, ctx, src); |
94 | 0 | } |
95 | | |
96 | | /** |
97 | | Convert a string to UPPER case, allocated with talloc |
98 | | source length limited to n bytes, iconv handle supplied |
99 | | **/ |
100 | | _PUBLIC_ char *strupper_talloc_n_handle(struct smb_iconv_handle *iconv_handle, |
101 | | TALLOC_CTX *ctx, const char *src, size_t n) |
102 | 298 | { |
103 | 298 | size_t size=0; |
104 | 298 | char *dest; |
105 | | |
106 | 298 | if (!src) { |
107 | 0 | return NULL; |
108 | 0 | } |
109 | | |
110 | | /* this takes advantage of the fact that upper/lower can't |
111 | | change the length of a character by more than 1 byte */ |
112 | 298 | dest = talloc_array(ctx, char, 2*(n+1)); |
113 | 298 | if (dest == NULL) { |
114 | 0 | return NULL; |
115 | 0 | } |
116 | | |
117 | 85.3k | while (n && *src) { |
118 | 85.1k | size_t c_size; |
119 | 85.1k | codepoint_t c = next_codepoint_handle_ext(iconv_handle, src, n, |
120 | 85.1k | CH_UNIX, &c_size); |
121 | 85.1k | src += c_size; |
122 | 85.1k | n -= c_size; |
123 | | |
124 | 85.1k | c = toupper_m(c); |
125 | | |
126 | 85.1k | c_size = push_codepoint_handle(iconv_handle, dest+size, c); |
127 | 85.1k | if (c_size == -1) { |
128 | 53 | talloc_free(dest); |
129 | 53 | return NULL; |
130 | 53 | } |
131 | 85.0k | size += c_size; |
132 | 85.0k | } |
133 | | |
134 | 245 | dest[size] = 0; |
135 | | |
136 | | /* trim it so talloc_append_string() works */ |
137 | 245 | dest = talloc_realloc(ctx, dest, char, size+1); |
138 | | |
139 | 245 | talloc_set_name_const(dest, dest); |
140 | | |
141 | 245 | return dest; |
142 | 298 | } |
143 | | |
144 | | /** |
145 | | Convert a string to UPPER case, allocated with talloc |
146 | | source length limited to n bytes |
147 | | **/ |
148 | | _PUBLIC_ char *strupper_talloc_n(TALLOC_CTX *ctx, const char *src, size_t n) |
149 | 298 | { |
150 | 298 | struct smb_iconv_handle *iconv_handle = get_iconv_handle(); |
151 | 298 | return strupper_talloc_n_handle(iconv_handle, ctx, src, n); |
152 | 298 | } |
153 | | /** |
154 | | Convert a string to UPPER case, allocated with talloc |
155 | | **/ |
156 | | _PUBLIC_ char *strupper_talloc(TALLOC_CTX *ctx, const char *src) |
157 | 298 | { |
158 | 298 | return strupper_talloc_n(ctx, src, src?strlen(src):0); |
159 | 298 | } |
160 | | |
161 | | /** |
162 | | talloc_strdup() a unix string to upper case. |
163 | | **/ |
164 | | _PUBLIC_ char *talloc_strdup_upper(TALLOC_CTX *ctx, const char *src) |
165 | 0 | { |
166 | 0 | return strupper_talloc(ctx, src); |
167 | 0 | } |
168 | | |
169 | | |
170 | | /* |
171 | | * strncasecmp_ldb() works a *bit* like strncasecmp, with various |
172 | | * tricks to suit the way LDB compares strings. The differences are: |
173 | | * |
174 | | * 0. each string has its own length. |
175 | | * |
176 | | * 1. consecutive spaces are collapsed down to one space, so that |
177 | | * "a  b" equals "a b". (this is why each string needs its own |
178 | | * length). Leading and trailing spaces are removed altogether. |
179 | | * |
180 | | * 2. Comparisons are done in UPPER CASE, as Windows does, not in |
181 | | * lowercase as POSIX would have it. |
182 | | * |
183 | | * 3. An invalid byte compares higher than any real character. For example, |
184 | | * "hello\xc2\xff" would sort higher than "hello\xcd\xb6", because CD |
185 | | * B6 is a valid sequence and C2 FF is not. |
186 | | * |
187 | | * 4. If two strings become invalid on the same character, the rest |
188 | | * of the string is compared via ldb ASCII case fold rules. |
189 | | * |
190 | | * For example, "hellō\xC2\xFFworld" < " hElLŌ\xFE ", because the |
191 | | * strings are equal up to 'ō' by utf-8 casefold, but the "\xc2\xff" and |
192 | | * "\xfe" are invalid sequences. At that point, we skip to the byte-by-byte |
193 | | * (but space-eating, casefolding) comparison, and 0xc2 < 0xfe. |
194 | | */ |
195 | | |
196 | | #define EAT_SPACE(s, len, ends_in_space) \ |
197 | 9.90k | do { \ |
198 | 15.6k | while (len) { \ |
199 | 11.7k | if (*s != ' ') { \ |
200 | 6.00k | break; \ |
201 | 6.00k | } \ |
202 | 11.7k | s++; \ |
203 | 5.74k | len--; \ |
204 | 5.74k | } \ |
205 | 9.90k | ends_in_space = (len == 0 || *s == '\0'); \ |
206 | 9.90k | } while(0) |
207 | | |
208 | | |
209 | | _PUBLIC_ int strncasecmp_ldb(const char *s1, |
210 | | size_t len1, |
211 | | const char *s2, |
212 | | size_t len2) |
213 | 4.26k | { |
214 | 4.26k | struct smb_iconv_handle *iconv_handle = get_iconv_handle(); |
215 | 4.26k | codepoint_t c1, c2; |
216 | 4.26k | size_t cs1, cs2; |
217 | 4.26k | bool ends_in_space1, ends_in_space2; |
218 | 4.26k | int ret; |
219 | 4.26k | bool end1, end2; |
220 | | |
221 | 4.26k | EAT_SPACE(s1, len1, ends_in_space1); |
222 | 4.26k | EAT_SPACE(s2, len2, ends_in_space2); |
223 | | /* |
224 | | * if ends_in_space was set, the string was empty or only |
225 | | * spaces (which we treat as equivalent). |
226 | | */ |
227 | 4.26k | if (ends_in_space1 && ends_in_space2) { |
228 | 1.18k | return 0; |
229 | 1.18k | } |
230 | 3.07k | if (ends_in_space1) { |
231 | 720 | return -1; |
232 | 720 | } |
233 | 2.35k | if (ends_in_space2) { |
234 | 720 | return 1; |
235 | 720 | } |
236 | | |
237 | 4.95k | while (true) { |
238 | | /* |
239 | | * If the next byte is a space, we eat all the spaces, |
240 | | * and say we found a single codepoint. If the spaces |
241 | | * were at the end of the string, the codepoint is 0, |
242 | | * as if there were no spaces. Otherwise it is 0x20, |
243 | | * as if there was one space. |
244 | | * |
245 | | * Setting the codepoint to 0 will break the loop, but |
246 | | * only after codepoints have been found in both strings. |
247 | | */ |
248 | 4.95k | if (len1 == 0 || *s1 == 0) { |
249 | 528 | c1 = 0; |
250 | 4.43k | } else if (*s1 == ' ') { |
251 | 249 | EAT_SPACE(s1, len1, ends_in_space1); |
252 | 249 | c1 = ends_in_space1 ? 0 : ' '; |
253 | 4.18k | } else if ((*s1 & 0x80) == 0) { |
254 | 2.17k | c1 = *s1; |
255 | 2.17k | s1++; |
256 | 2.17k | len1--; |
257 | 2.17k | } else { |
258 | 2.01k | c1 = next_codepoint_handle_ext(iconv_handle, s1, len1, |
259 | 2.01k | CH_UNIX, &cs1); |
260 | 2.01k | if (c1 != INVALID_CODEPOINT) { |
261 | 1.22k | s1 += cs1; |
262 | 1.22k | len1 -= cs1; |
263 | 1.22k | } |
264 | 2.01k | } |
265 | | |
266 | 4.95k | if (len2 == 0 || *s2 == 0) { |
267 | 528 | c2 = 0; |
268 | 4.43k | } else if (*s2 == ' ') { |
269 | 249 | EAT_SPACE(s2, len2, ends_in_space2); |
270 | 249 | c2 = ends_in_space2 ? 0 : ' '; |
271 | 4.18k | } else if ((*s2 & 0x80) == 0) { |
272 | 2.17k | c2 = *s2; |
273 | 2.17k | s2++; |
274 | 2.17k | len2--; |
275 | 2.17k | } else { |
276 | 2.01k | c2 = next_codepoint_handle_ext(iconv_handle, s2, len2, |
277 | 2.01k | CH_UNIX, &cs2); |
278 | 2.01k | if (c2 != INVALID_CODEPOINT) { |
279 | 1.22k | s2 += cs2; |
280 | 1.22k | len2 -= cs2; |
281 | 1.22k | } |
282 | 2.01k | } |
283 | | |
284 | 4.95k | if (c1 == 0 || c2 == 0 || |
285 | 4.35k | c1 == INVALID_CODEPOINT || c2 == INVALID_CODEPOINT) { |
286 | 1.43k | break; |
287 | 1.43k | } |
288 | | |
289 | 3.52k | if (c1 == c2) { |
290 | 2.98k | continue; |
291 | 2.98k | } |
292 | 544 | c1 = toupper_m(c1); |
293 | 544 | c2 = toupper_m(c2); |
294 | 544 | if (c1 != c2) { |
295 | 206 | break; |
296 | 206 | } |
297 | 544 | } |
298 | | |
299 | | /* |
300 | | * Either a difference has been found, or one or both strings have |
301 | | * ended or hit invalid codepoints. |
302 | | */ |
303 | 1.63k | ret = NUMERIC_CMP(c1, c2); |
304 | | |
305 | 1.63k | if (ret != 0) { |
306 | 394 | return ret; |
307 | 394 | } |
308 | | /* |
309 | | * the strings are equal up to here, but one might be longer. |
310 | | */ |
311 | 1.24k | end1 = len1 == 0 || *s1 == 0; |
312 | 1.24k | end2 = len2 == 0 || *s2 == 0; |
313 | | |
314 | 1.24k | if (end1 && end2) { |
315 | 520 | return 0; |
316 | 520 | } |
317 | 724 | if (end1) { |
318 | 0 | return -1; |
319 | 0 | } |
320 | 724 | if (end2) { |
321 | 0 | return -1; |
322 | 0 | } |
323 | | |
324 | | /* |
325 | | * By elimination, if we got here, we have INVALID_CODEPOINT on both |
326 | | * sides. |
327 | | * |
328 | | * THere is no perfect option, but what we choose to do is continue on |
329 | | * with ascii case fold (as if calling ldb_comparison_fold_ascii() |
330 | | * which is private to ldb, so we can't just defer to it). |
331 | | */ |
332 | 22.9k | while (true) { |
333 | 22.9k | if (len1 == 0 || *s1 == 0) { |
334 | 526 | c1 = 0; |
335 | 22.4k | } else if (*s1 == ' ') { |
336 | 435 | EAT_SPACE(s1, len1, ends_in_space1); |
337 | 435 | c1 = ends_in_space1 ? 0 : ' '; |
338 | 22.0k | } else { |
339 | 22.0k | c1 = *s1; |
340 | 22.0k | s1++; |
341 | 22.0k | len1--; |
342 | 22.0k | c1 = ('a' <= c1 && c1 <= 'z') ? c1 ^ 0x20 : c1; |
343 | 22.0k | } |
344 | | |
345 | 22.9k | if (len2 == 0 || *s2 == 0) { |
346 | 526 | c2 = 0; |
347 | 22.4k | } else if (*s2 == ' ') { |
348 | 435 | EAT_SPACE(s2, len2, ends_in_space2); |
349 | 435 | c2 = ends_in_space2 ? 0 : ' '; |
350 | 22.0k | } else { |
351 | 22.0k | c2 = *s2; |
352 | 22.0k | s2++; |
353 | 22.0k | len2--; |
354 | 22.0k | c2 = ('a' <= c2 && c2 <= 'z') ? c2 ^ 0x20 : c2; |
355 | 22.0k | } |
356 | | |
357 | 22.9k | if (c1 == 0 || c2 == 0 || c1 != c2) { |
358 | 724 | break; |
359 | 724 | } |
360 | 22.9k | } |
361 | 724 | return NUMERIC_CMP(c1, c2); |
362 | 724 | } |
363 | | |
364 | | #undef EAT_SPACE |
365 | | |
366 | | |
367 | | /** |
368 | | Find the number of 'c' chars in a string |
369 | | **/ |
370 | | _PUBLIC_ size_t count_chars_m(const char *s, char c) |
371 | 0 | { |
372 | 0 | struct smb_iconv_handle *ic = get_iconv_handle(); |
373 | 0 | size_t count = 0; |
374 | |
|
375 | 0 | while (*s) { |
376 | 0 | size_t size; |
377 | 0 | codepoint_t c2 = next_codepoint_handle(ic, s, &size); |
378 | 0 | if (c2 == c) count++; |
379 | 0 | s += size; |
380 | 0 | } |
381 | |
|
382 | 0 | return count; |
383 | 0 | } |
384 | | |
385 | | size_t ucs2_align(const void *base_ptr, const void *p, int flags) |
386 | 0 | { |
387 | 0 | if (flags & (STR_NOALIGN|STR_ASCII)) { |
388 | 0 | return 0; |
389 | 0 | } |
390 | 0 | return PTR_DIFF(p, base_ptr) & 1; |
391 | 0 | } |
392 | | |
/**
 return the number of bytes occupied by a buffer in CH_UTF16 format

 The buffer is scanned two bytes at a time until a zero 16-bit unit is
 found; that terminator is not included in the result.
**/
size_t utf16_len(const void *buf)
{
	size_t len = 0;

	while (PULL_LE_U16(buf, len) != 0) {
		len += 2;
	}

	return len;
}
404 | | |
/**
 return the number of bytes occupied by a buffer in CH_UTF16 format
 the result includes the two byte null termination
**/
size_t utf16_null_terminated_len(const void *buf)
{
	const size_t terminator_bytes = 2;

	return utf16_len(buf) + terminator_bytes;
}
413 | | |
/**
 return the number of bytes occupied by a buffer in CH_UTF16 format
 limited by 'n' bytes

 Scanning stops at the first zero 16-bit unit, or when fewer than two
 bytes of the n byte window remain.
**/
size_t utf16_len_n(const void *src, size_t n)
{
	size_t len = 0;

	while (len + 2 <= n) {
		if (PULL_LE_U16(src, len) == 0) {
			break;
		}
		len += 2;
	}

	return len;
}
426 | | |
/**
 return the number of bytes occupied by a buffer in CH_UTF16 format
 the result includes the null termination
 limited by 'n' bytes

 The two terminator bytes are only counted when they fit within n.
**/
size_t utf16_null_terminated_len_n(const void *src, size_t n)
{
	const size_t len = utf16_len_n(src, n);

	return (len + 2 <= n) ? len + 2 : len;
}
444 | | |
445 | | unsigned char *talloc_utf16_strlendup(TALLOC_CTX *mem_ctx, const char *str, size_t len) |
446 | 758 | { |
447 | 758 | unsigned char *new_str = NULL; |
448 | | |
449 | | /* Check for overflow. */ |
450 | 758 | if (len > SIZE_MAX - 2) { |
451 | 0 | return NULL; |
452 | 0 | } |
453 | | |
454 | | /* |
455 | | * Allocate the new string, including space for the |
456 | | * UTF‐16 null terminator. |
457 | | */ |
458 | 758 | new_str = talloc_size(mem_ctx, len + 2); |
459 | 758 | if (new_str == NULL) { |
460 | 0 | return NULL; |
461 | 0 | } |
462 | | |
463 | 758 | memcpy(new_str, str, len); |
464 | | |
465 | | /* |
466 | | * Ensure that the UTF‐16 string is |
467 | | * null‐terminated. |
468 | | */ |
469 | 758 | new_str[len] = '\0'; |
470 | 758 | new_str[len + 1] = '\0'; |
471 | | |
472 | 758 | return new_str; |
473 | 758 | } |
474 | | |
475 | | unsigned char *talloc_utf16_strdup(TALLOC_CTX *mem_ctx, const char *str) |
476 | 0 | { |
477 | 0 | if (str == NULL) { |
478 | 0 | return NULL; |
479 | 0 | } |
480 | 0 | return talloc_utf16_strlendup(mem_ctx, str, utf16_len(str)); |
481 | 0 | } |
482 | | |
483 | | unsigned char *talloc_utf16_strndup(TALLOC_CTX *mem_ctx, const char *str, size_t n) |
484 | 0 | { |
485 | 0 | if (str == NULL) { |
486 | 0 | return NULL; |
487 | 0 | } |
488 | 0 | return talloc_utf16_strlendup(mem_ctx, str, utf16_len_n(str, n)); |
489 | 0 | } |
490 | | |
491 | | /** |
492 | | * Determine the length and validity of a utf-8 string. |
493 | | * |
494 | | * @param input the string pointer |
495 | | * @param maxlen maximum size of the string |
496 | | * @param byte_len receives the length of the valid section |
497 | | * @param char_len receives the number of unicode characters in the valid section |
498 | | * @param utf16_len receives the number of 16-bit code units (not bytes) the string would need in UTF-16 encoding. |
499 | | * |
500 | | * @return true if the input is valid up to maxlen, or a '\0' byte, otherwise false. |
501 | | */ |
bool utf8_check(const char *input, size_t maxlen,
		size_t *byte_len,
		size_t *char_len,
		size_t *utf16_len)
{
	const uint8_t *bytes = (const uint8_t *)input;
	size_t pos = 0;		/* offset of the sequence being examined */
	size_t n_chars = 0;	/* complete characters accepted so far */
	size_t n_pairs = 0;	/* characters that need two UTF-16 units */
	bool valid = true;

	while (valid && pos < maxlen && bytes[pos] != 0) {
		const uint8_t lead = bytes[pos];
		const size_t avail = maxlen - pos;
		/* stays 0 unless a well-formed sequence is recognised */
		size_t seq_len = 0;
		uint32_t cp = 0;

		if (lead < 0x80) {
			/* 0xxxxxxx */
			seq_len = 1;
		} else if ((lead & 0xe0) == 0xc0) {
			/* 110xxxxx 10xxxxxx */
			if (avail >= 2 &&
			    (bytes[pos + 1] & 0xc0) == 0x80) {
				cp = (uint32_t)(lead & 0x1f) << 6 |
				     (uint32_t)(bytes[pos + 1] & 0x3f);
				/* below 0x80 would be an overlong form */
				if (cp >= 0x80) {
					seq_len = 2;
				}
			}
		} else if ((lead & 0xf0) == 0xe0) {
			/* 1110xxxx 10xxxxxx 10xxxxxx */
			if (avail >= 3 &&
			    (bytes[pos + 1] & 0xc0) == 0x80 &&
			    (bytes[pos + 2] & 0xc0) == 0x80) {
				cp = (uint32_t)(lead & 0x0f) << 12 |
				     (uint32_t)(bytes[pos + 1] & 0x3f) << 6 |
				     (uint32_t)(bytes[pos + 2] & 0x3f);
				/*
				 * Reject overlong forms, and the range
				 * 0xd800-0xdfff: per RFC3629 those are
				 * halves of UTF-16 surrogate pairs and
				 * the character ought to have been
				 * encoded as a four byte sequence.
				 */
				if (cp >= 0x800 &&
				    !(cp >= 0xd800 && cp <= 0xdfff)) {
					seq_len = 3;
				}
			}
		} else if ((lead & 0xf8) == 0xf0) {
			/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
			if (avail >= 4 &&
			    (bytes[pos + 1] & 0xc0) == 0x80 &&
			    (bytes[pos + 2] & 0xc0) == 0x80 &&
			    (bytes[pos + 3] & 0xc0) == 0x80) {
				cp = (uint32_t)(lead & 0x07) << 18 |
				     (uint32_t)(bytes[pos + 1] & 0x3f) << 12 |
				     (uint32_t)(bytes[pos + 2] & 0x3f) << 6 |
				     (uint32_t)(bytes[pos + 3] & 0x3f);
				if (cp >= 0x10000 && cp <= 0x10ffff) {
					seq_len = 4;
				}
			}
		}

		if (seq_len == 0) {
			/*
			 * Anything not accepted above is wrong; pos is
			 * left pointing at the offending lead byte.
			 */
			valid = false;
		} else {
			if (seq_len == 4) {
				/* this one will need two UTF16 characters */
				n_pairs++;
			}
			pos += seq_len;
			n_chars++;
		}
	}

	/* The outputs describe the valid prefix whether or not we failed. */
	*byte_len = pos;
	*char_len = n_chars;
	*utf16_len = n_chars + n_pairs;
	return valid;
}
609 | | |
610 | | |
611 | | /** |
612 | | * Copy a string from a char* unix src to a dos codepage string destination. |
613 | | * |
614 | | * @converted_size the number of bytes occupied by the string in the destination. |
615 | | * @return bool true if success. |
616 | | * |
617 | | * @param flags can include |
618 | | * <dl> |
619 | | * <dt>STR_TERMINATE</dt> <dd>means include the null termination</dd> |
620 | | * <dt>STR_UPPER</dt> <dd>means uppercase in the destination</dd> |
621 | | * </dl> |
622 | | * |
623 | | * @param dest_len the maximum length in bytes allowed in the |
624 | | * destination. If @p dest_len is -1 then no maximum is used. |
625 | | **/ |
626 | | static bool push_ascii_string(void *dest, const char *src, size_t dest_len, int flags, size_t *converted_size) |
627 | 0 | { |
628 | 0 | size_t src_len; |
629 | 0 | bool ret; |
630 | |
|
631 | 0 | if (flags & STR_UPPER) { |
632 | 0 | char *tmpbuf = strupper_talloc(NULL, src); |
633 | 0 | if (tmpbuf == NULL) { |
634 | 0 | return false; |
635 | 0 | } |
636 | 0 | ret = push_ascii_string(dest, tmpbuf, dest_len, flags & ~STR_UPPER, converted_size); |
637 | 0 | talloc_free(tmpbuf); |
638 | 0 | return ret; |
639 | 0 | } |
640 | | |
641 | 0 | src_len = strlen(src); |
642 | |
|
643 | 0 | if (flags & (STR_TERMINATE | STR_TERMINATE_ASCII)) |
644 | 0 | src_len++; |
645 | |
|
646 | 0 | return convert_string(CH_UNIX, CH_DOS, src, src_len, dest, dest_len, converted_size); |
647 | 0 | } |
648 | | |
649 | | /** |
650 | | * Copy a string from a dos codepage source to a unix char* destination. |
651 | | * |
652 | | * The resulting string in "dest" is always null terminated. |
653 | | * |
654 | | * @param flags can have: |
655 | | * <dl> |
656 | | * <dt>STR_TERMINATE</dt> |
657 | | * <dd>STR_TERMINATE means the string in @p src |
658 | | * is null terminated, and src_len is ignored.</dd> |
659 | | * </dl> |
660 | | * |
661 | | * @param src_len is the length of the source area in bytes. |
662 | | * @returns the number of bytes occupied by the string in @p src. |
663 | | **/ |
664 | | static ssize_t pull_ascii_string(char *dest, const void *src, size_t dest_len, size_t src_len, int flags) |
665 | 0 | { |
666 | 0 | size_t size = 0; |
667 | |
|
668 | 0 | if (flags & (STR_TERMINATE | STR_TERMINATE_ASCII)) { |
669 | 0 | if (src_len == (size_t)-1) { |
670 | 0 | src_len = strlen((const char *)src) + 1; |
671 | 0 | } else { |
672 | 0 | size_t len = strnlen((const char *)src, src_len); |
673 | 0 | if (len < src_len) |
674 | 0 | len++; |
675 | 0 | src_len = len; |
676 | 0 | } |
677 | 0 | } |
678 | | |
679 | | /* We're ignoring the return here.. */ |
680 | 0 | (void)convert_string(CH_DOS, CH_UNIX, src, src_len, dest, dest_len, &size); |
681 | |
|
682 | 0 | if (dest_len) |
683 | 0 | dest[MIN(size, dest_len-1)] = 0; |
684 | |
|
685 | 0 | return src_len; |
686 | 0 | } |
687 | | |
688 | | /** |
689 | | * Copy a string from a char* src to a unicode destination. |
690 | | * |
691 | | * @returns the number of bytes occupied by the string in the destination. |
692 | | * |
693 | | * @param flags can have: |
694 | | * |
695 | | * <dl> |
696 | | * <dt>STR_TERMINATE <dd>means include the null termination. |
697 | | * <dt>STR_UPPER <dd>means uppercase in the destination. |
698 | | * <dt>STR_NOALIGN <dd>means don't do alignment. |
699 | | * </dl> |
700 | | * |
701 | | * @param dest_len is the maximum length allowed in the |
702 | | * destination. If dest_len is -1 then no maximum is used. |
703 | | **/ |
704 | | static ssize_t push_ucs2(void *dest, const char *src, size_t dest_len, int flags) |
705 | 0 | { |
706 | 0 | size_t len=0; |
707 | 0 | size_t src_len = strlen(src); |
708 | 0 | size_t size = 0; |
709 | 0 | bool ret; |
710 | |
|
711 | 0 | if (flags & STR_UPPER) { |
712 | 0 | char *tmpbuf = strupper_talloc(NULL, src); |
713 | 0 | ssize_t retval; |
714 | 0 | if (tmpbuf == NULL) { |
715 | 0 | return -1; |
716 | 0 | } |
717 | 0 | retval = push_ucs2(dest, tmpbuf, dest_len, flags & ~STR_UPPER); |
718 | 0 | talloc_free(tmpbuf); |
719 | 0 | return retval; |
720 | 0 | } |
721 | | |
722 | 0 | if (flags & STR_TERMINATE) |
723 | 0 | src_len++; |
724 | |
|
725 | 0 | if (ucs2_align(NULL, dest, flags)) { |
726 | 0 | *(char *)dest = 0; |
727 | 0 | dest = (void *)((char *)dest + 1); |
728 | 0 | if (dest_len) dest_len--; |
729 | 0 | len++; |
730 | 0 | } |
731 | | |
732 | | /* ucs2 is always a multiple of 2 bytes */ |
733 | 0 | dest_len &= ~1; |
734 | |
|
735 | 0 | ret = convert_string(CH_UNIX, CH_UTF16, src, src_len, dest, dest_len, &size); |
736 | 0 | if (ret == false) { |
737 | 0 | return 0; |
738 | 0 | } |
739 | | |
740 | 0 | len += size; |
741 | |
|
742 | 0 | return (ssize_t)len; |
743 | 0 | } |
744 | | |
745 | | |
746 | | /** |
747 | | Copy a string from a ucs2 source to a unix char* destination. |
748 | | Flags can have: |
749 | | STR_TERMINATE means the string in src is null terminated. |
750 | | STR_NOALIGN means don't try to align. |
751 | | if STR_TERMINATE is set then src_len is ignored if it is -1. |
752 | | src_len is the length of the source area in bytes |
753 | | Return the number of bytes occupied by the string in src. |
754 | | The resulting string in "dest" is always null terminated. |
755 | | **/ |
756 | | |
757 | | static size_t pull_ucs2(char *dest, const void *src, size_t dest_len, size_t src_len, int flags) |
758 | 0 | { |
759 | 0 | size_t size = 0; |
760 | |
|
761 | 0 | if (ucs2_align(NULL, src, flags)) { |
762 | 0 | src = (const void *)((const char *)src + 1); |
763 | 0 | if (src_len > 0) |
764 | 0 | src_len--; |
765 | 0 | } |
766 | |
|
767 | 0 | if (flags & STR_TERMINATE) { |
768 | 0 | if (src_len == (size_t)-1) { |
769 | 0 | src_len = utf16_null_terminated_len(src); |
770 | 0 | } else { |
771 | 0 | src_len = utf16_null_terminated_len_n(src, src_len); |
772 | 0 | } |
773 | 0 | } |
774 | | |
775 | | /* ucs2 is always a multiple of 2 bytes */ |
776 | 0 | if (src_len != (size_t)-1) |
777 | 0 | src_len &= ~1; |
778 | | |
779 | | /* We're ignoring the return here.. */ |
780 | 0 | (void)convert_string(CH_UTF16, CH_UNIX, src, src_len, dest, dest_len, &size); |
781 | 0 | if (dest_len) |
782 | 0 | dest[MIN(size, dest_len-1)] = 0; |
783 | |
|
784 | 0 | return src_len; |
785 | 0 | } |
786 | | |
787 | | /** |
788 | | Copy a string from a char* src to a unicode or ascii |
789 | | dos codepage destination choosing unicode or ascii based on the |
790 | | flags in the SMB buffer starting at base_ptr. |
791 | | Return the number of bytes occupied by the string in the destination. |
792 | | flags can have: |
793 | | STR_TERMINATE means include the null termination. |
794 | | STR_UPPER means uppercase in the destination. |
795 | | STR_ASCII use ascii even with unicode packet. |
796 | | STR_NOALIGN means don't do alignment. |
797 | | dest_len is the maximum length allowed in the destination. If dest_len |
798 | | is -1 then no maximum is used. |
799 | | **/ |
800 | | |
801 | | _PUBLIC_ ssize_t push_string(void *dest, const char *src, size_t dest_len, int flags) |
802 | 0 | { |
803 | 0 | if (flags & STR_ASCII) { |
804 | 0 | size_t size = 0; |
805 | 0 | if (push_ascii_string(dest, src, dest_len, flags, &size)) { |
806 | 0 | return (ssize_t)size; |
807 | 0 | } else { |
808 | 0 | return (ssize_t)-1; |
809 | 0 | } |
810 | 0 | } else if (flags & STR_UNICODE) { |
811 | 0 | return push_ucs2(dest, src, dest_len, flags); |
812 | 0 | } else { |
813 | 0 | smb_panic("push_string requires either STR_ASCII or STR_UNICODE flag to be set"); |
814 | 0 | return -1; |
815 | 0 | } |
816 | 0 | } |
817 | | |
818 | | |
819 | | /** |
820 | | Copy a string from a unicode or ascii source (depending on |
821 | | the packet flags) to a char* destination. |
822 | | Flags can have: |
823 | | STR_TERMINATE means the string in src is null terminated. |
824 | | STR_UNICODE means to force as unicode. |
825 | | STR_ASCII use ascii even with unicode packet. |
826 | | STR_NOALIGN means don't do alignment. |
827 | | if STR_TERMINATE is set then src_len is ignored if it is -1 |
828 | | src_len is the length of the source area in bytes. |
829 | | Return the number of bytes occupied by the string in src. |
830 | | The resulting string in "dest" is always null terminated. |
831 | | **/ |
832 | | |
833 | | _PUBLIC_ ssize_t pull_string(char *dest, const void *src, size_t dest_len, size_t src_len, int flags) |
834 | 0 | { |
835 | 0 | if (flags & STR_ASCII) { |
836 | 0 | return pull_ascii_string(dest, src, dest_len, src_len, flags); |
837 | 0 | } else if (flags & STR_UNICODE) { |
838 | 0 | return pull_ucs2(dest, src, dest_len, src_len, flags); |
839 | 0 | } else { |
840 | 0 | smb_panic("pull_string requires either STR_ASCII or STR_UNICODE flag to be set"); |
841 | 0 | return -1; |
842 | 0 | } |
843 | 0 | } |