/src/CMake/Source/cm_utf8.c
Line | Count | Source |
1 | | /* Distributed under the OSI-approved BSD 3-Clause License. See accompanying |
2 | | file LICENSE.rst or https://cmake.org/licensing for details. */ |
3 | | #include "cm_utf8.h" |
4 | | |
5 | | #include <string.h> |
6 | | |
7 | | /* |
8 | | RFC 3629 |
9 | | 07-bit: 0xxxxxxx |
10 | | 11-bit: 110xxxxx 10xxxxxx |
11 | | 16-bit: 1110xxxx 10xxxxxx 10xxxxxx |
12 | | 21-bit: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
13 | | |
14 | | Pre-RFC Compatibility |
15 | | 26-bit: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
16 | | 31-bit: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
17 | | */ |
18 | | |
/* For every possible byte value, the number of consecutive one bits
   found at the top of the byte before the first zero bit (8 for 0xFF,
   which has no zero bit at all).  Rows hold 16 entries each.  */
unsigned char const cm_utf8_ones[256] = {
  /* 0x00 - 0x7F: 0xxxxxxx */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x80 - 0xBF: 10xxxxxx */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xC0 - 0xDF: 110xxxxx */
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xE0 - 0xEF: 1110xxxx */
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  /* 0xF0 - 0xF7, 0xF8 - 0xFB, 0xFC - 0xFD, 0xFE, 0xFF */
  4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7, 8
};
32 | | |
/* Mask away control bits from bytes with n leading ones.  A byte with
   n leading ones spends n+1 bits on control (the ones plus the
   terminating zero), so entry n keeps the low 7-n bits.  Entry 0 is
   0x7F (0xxxxxxx); it previously read 0xEF, which did not clear the
   top bit and did not fit the pattern of the other entries.  */
static unsigned char const cm_utf8_mask[7] = { 0x7F, 0x3F, 0x1F, 0x0F,
                                               0x07, 0x03, 0x01 };
36 | | |
/* Smallest value accepted from a sequence whose first byte has n
   leading ones.  Anything below the entry is rejected by the decoder
   as too small for that sequence length.  */
static unsigned int const cm_utf8_min[7] = {
  0x0u,      /* n = 0 */
  0x0u,      /* n = 1 */
  0x80u,     /* n = 2: 1 << 7 */
  0x800u,    /* n = 3: 1 << 11 */
  0x10000u,  /* n = 4: 1 << 16 */
  0x200000u, /* n = 5: 1 << 21 */
  0x4000000u /* n = 6: 1 << 26 */
};
41 | | |
42 | | char const* cm_utf8_decode_character(char const* first, char const* last, |
43 | | unsigned int* pc) |
44 | 0 | { |
45 | | /* We need at least one byte. */ |
46 | 0 | if (first == last) { |
47 | 0 | return 0; |
48 | 0 | } |
49 | | |
50 | | /* Count leading ones in the first byte. */ |
51 | 0 | unsigned char c = (unsigned char)*first++; |
52 | 0 | unsigned char const ones = cm_utf8_ones[c]; |
53 | 0 | switch (ones) { |
54 | 0 | case 0: |
55 | 0 | *pc = c; |
56 | 0 | return first; /* One-byte character. */ |
57 | 0 | case 1: |
58 | 0 | case 7: |
59 | 0 | case 8: |
60 | 0 | return 0; /* Invalid leading byte. */ |
61 | 0 | default: |
62 | 0 | break; |
63 | 0 | } |
64 | | |
65 | | /* Extract bits from this multi-byte character. */ |
66 | 0 | { |
67 | 0 | unsigned int uc = c & cm_utf8_mask[ones]; |
68 | 0 | int left; |
69 | 0 | for (left = ones - 1; left && first != last; --left) { |
70 | 0 | c = (unsigned char)*first++; |
71 | 0 | if (cm_utf8_ones[c] != 1) { |
72 | 0 | return 0; |
73 | 0 | } |
74 | 0 | uc = (uc << 6) | (c & cm_utf8_mask[1]); |
75 | 0 | } |
76 | | |
77 | 0 | if (left > 0 || uc < cm_utf8_min[ones]) { |
78 | 0 | return 0; |
79 | 0 | } |
80 | | |
81 | | /* UTF-16 surrogate halves. */ |
82 | 0 | if (0xD800 <= uc && uc <= 0xDFFF) { |
83 | 0 | return 0; |
84 | 0 | } |
85 | | |
86 | | /* Invalid codepoints. */ |
87 | 0 | if (0x10FFFF < uc) { |
88 | 0 | return 0; |
89 | 0 | } |
90 | | |
91 | 0 | *pc = uc; |
92 | 0 | return first; |
93 | 0 | } |
94 | 0 | } |
95 | | |
/* Report whether the NUL-terminated string `s` decodes cleanly from
   start to end with cm_utf8_decode_character.  Returns 1 when every
   character decodes (including the empty string) and 0 when `s` is
   null or any character fails to decode.  */
int cm_utf8_is_valid(char const* s)
{
  if (!s) {
    return 0;
  }

  char const* const end = s + strlen(s);
  unsigned int decoded; /* Required by the decoder; value is unused. */

  for (char const* cursor = s; cursor != end;) {
    cursor = cm_utf8_decode_character(cursor, end, &decoded);
    if (!cursor) {
      /* The decoder rejected the sequence at this position. */
      return 0;
    }
  }

  /* Every byte up to the terminator was consumed. */
  return 1;
}