/src/openvswitch/lib/unicode.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2009, 2010 Nicira, Inc. |
3 | | * |
4 | | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | | * you may not use this file except in compliance with the License. |
6 | | * You may obtain a copy of the License at: |
7 | | * |
8 | | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | | * |
10 | | * Unless required by applicable law or agreed to in writing, software |
11 | | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | | * See the License for the specific language governing permissions and |
14 | | * limitations under the License. |
15 | | */ |
16 | | |
17 | | #include <config.h> |
18 | | |
19 | | #include "unicode.h" |
20 | | |
21 | | #include <inttypes.h> |
22 | | |
23 | | #include "openvswitch/dynamic-string.h" |
24 | | #include "util.h" |
25 | | |
/* Combines a UTF-16 surrogate pair into the Unicode code point it encodes.
 *
 * 'leading' should be a leading (high) surrogate and 'trailing' a trailing
 * (low) surrogate.  No range checking is done: only the low 10 bits of each
 * argument are used, so arguments outside the surrogate ranges produce a
 * meaningless result. */
int
utf16_decode_surrogate_pair(int leading, int trailing)
{
    /*
     *  Leading surrogate:  110110wwwwxxxxxx
     * Trailing surrogate:  110111xxxxxxxxxx
     *         Code point:  000uuuuuxxxxxxxxxxxxxxxx
     *
     * where uuuuu is wwww + 1.
     */
    int plane = ((leading >> 6) & 0xf) + 1;
    int high_bits = leading & 0x3f;
    int low_bits = trailing & 0x3ff;

    return (plane << 16) | (high_bits << 10) | low_bits;
}
44 | | |
/* Counts the Unicode characters in the null-terminated UTF-8 string 's_'.
 *
 * The string is not validated: every byte that is not a continuation byte
 * is counted as the start of one character. */
size_t
utf8_length(const char *s_)
{
    const uint8_t *p = (const uint8_t *) s_;
    size_t n_chars = 0;

    while (*p != '\0') {
        /* Continuation bytes have the form 2#10xxxxxx.  Any other byte
         * (2#00, 2#01, or 2#11 in its two top bits) begins a character. */
        if ((*p & 0xc0) != 0x80) {
            n_chars++;
        }
        p++;
    }
    return n_chars;
}
60 | | |
61 | | static char * |
62 | | invalid_utf8_sequence(const uint8_t *s, int n, size_t *lengthp) |
63 | 0 | { |
64 | 0 | struct ds msg; |
65 | 0 | int i; |
66 | |
|
67 | 0 | if (lengthp) { |
68 | 0 | *lengthp = 0; |
69 | 0 | } |
70 | |
|
71 | 0 | ds_init(&msg); |
72 | 0 | ds_put_cstr(&msg, "invalid UTF-8 sequence"); |
73 | 0 | for (i = 0; i < n; i++) { |
74 | 0 | ds_put_format(&msg, " 0x%02"PRIx8, s[i]); |
75 | 0 | } |
76 | 0 | return ds_steal_cstr(&msg); |
77 | 0 | } |
78 | | |
/* Permitted byte values for one class of well-formed UTF-8 sequence.
 *
 * octets[i][0] and octets[i][1] are the inclusive lower and upper bounds for
 * byte 'i' of the sequence.  An entry whose lower bound is 0 marks the end
 * of the sequence. */
struct utf8_sequence {
    uint8_t octets[5][2];
};
82 | | |
83 | | static const struct utf8_sequence * |
84 | | lookup_utf8_sequence(uint8_t c) |
85 | 0 | { |
86 | 0 | static const struct utf8_sequence seqs[] = { |
87 | 0 | { { { 0x01, 0x7f }, |
88 | 0 | { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 } } }, |
89 | |
|
90 | 0 | { { { 0xc2, 0xdf }, { 0x80, 0xbf }, |
91 | 0 | { 0, 0 }, { 0, 0 }, { 0, 0 } } }, |
92 | |
|
93 | 0 | { { { 0xe0, 0xe0 }, { 0xa0, 0xbf }, { 0x80, 0xbf }, |
94 | 0 | {0,0}, {0, 0 } } }, |
95 | |
|
96 | 0 | { { { 0xe1, 0xec }, { 0x80, 0xbf }, { 0x80, 0xbf }, |
97 | 0 | { 0, 0 }, { 0, 0 } } }, |
98 | |
|
99 | 0 | { { { 0xed, 0xed }, { 0x80, 0x9f }, { 0x80, 0xbf }, |
100 | 0 | { 0, 0 }, { 0, 0 } } }, |
101 | |
|
102 | 0 | { { { 0xee, 0xef }, { 0x80, 0xbf }, { 0x80, 0xbf }, |
103 | 0 | { 0, 0 }, { 0, 0 } } }, |
104 | |
|
105 | 0 | { { { 0xf0, 0xf0 }, { 0x90, 0xbf }, { 0x80, 0xbf }, { 0x80, 0xbf }, |
106 | 0 | { 0, 0 } } }, |
107 | |
|
108 | 0 | { { { 0xf1, 0xf3 }, { 0x80, 0xbf }, { 0x80, 0xbf }, { 0x80, 0xbf }, |
109 | 0 | { 0, 0 } } }, |
110 | |
|
111 | 0 | { { { 0xf4, 0xf4 }, { 0x80, 0x8f }, { 0x80, 0xbf }, { 0x80, 0xbf }, |
112 | 0 | { 0, 0 } } }, |
113 | 0 | }; |
114 | |
|
115 | 0 | size_t i; |
116 | |
|
117 | 0 | for (i = 0; i < ARRAY_SIZE(seqs); i++) { |
118 | 0 | const uint8_t *o = seqs[i].octets[0]; |
119 | 0 | if (c >= o[0] && c <= o[1]) { |
120 | 0 | return &seqs[i]; |
121 | 0 | } |
122 | 0 | } |
123 | 0 | return NULL; |
124 | 0 | } |
125 | | |
126 | | /* Checks that 's' is a valid, null-terminated UTF-8 string. If so, returns a |
127 | | * null pointer and sets '*lengthp' to the number of Unicode characters in |
128 | | * 's'. If not, returns an error message that the caller must free and sets |
129 | | * '*lengthp' to 0. |
130 | | * |
131 | | * 'lengthp' may be NULL if the length is not needed. */ |
132 | | char * |
133 | | utf8_validate(const char *s_, size_t *lengthp) |
134 | 0 | { |
135 | 0 | size_t length = 0; |
136 | 0 | const uint8_t *s; |
137 | |
|
138 | 0 | for (s = (const uint8_t *) s_; *s != '\0'; ) { |
139 | 0 | length++; |
140 | 0 | if (s[0] < 0x80) { |
141 | 0 | s++; |
142 | 0 | } else { |
143 | 0 | const struct utf8_sequence *seq; |
144 | 0 | int i; |
145 | |
|
146 | 0 | seq = lookup_utf8_sequence(s[0]); |
147 | 0 | if (!seq) { |
148 | 0 | return invalid_utf8_sequence(s, 1, lengthp); |
149 | 0 | } |
150 | | |
151 | 0 | for (i = 1; seq->octets[i][0]; i++) { |
152 | 0 | const uint8_t *o = seq->octets[i]; |
153 | 0 | if (s[i] < o[0] || s[i] > o[1]) { |
154 | 0 | return invalid_utf8_sequence(s, i + 1, lengthp); |
155 | 0 | } |
156 | 0 | } |
157 | 0 | s += i; |
158 | 0 | } |
159 | 0 | } |
160 | 0 | if (lengthp) { |
161 | 0 | *lengthp = length; |
162 | 0 | } |
163 | 0 | return NULL; |
164 | 0 | } |