/src/mosquitto/libcommon/utf8_common.c
Line | Count | Source |
1 | | /* |
2 | | Copyright (c) 2016-2021 Roger Light <roger@atchoo.org> |
3 | | |
4 | | All rights reserved. This program and the accompanying materials |
5 | | are made available under the terms of the Eclipse Public License 2.0 |
6 | | and Eclipse Distribution License v1.0 which accompany this distribution. |
7 | | |
8 | | The Eclipse Public License is available at |
9 | | https://www.eclipse.org/legal/epl-2.0/ |
10 | | and the Eclipse Distribution License is available at |
11 | | http://www.eclipse.org/org/documents/edl-v10.php. |
12 | | |
13 | | SPDX-License-Identifier: EPL-2.0 OR BSD-3-Clause |
14 | | |
15 | | Contributors: |
16 | | Roger Light - initial implementation. |
17 | | */ |
18 | | |
19 | | #include "config.h" |
20 | | |
21 | | #include <stdio.h> |
22 | | #include "mosquitto.h" |
23 | | |
24 | | |
25 | | BROKER_EXPORT int mosquitto_validate_utf8(const char *str, int len) |
26 | 7.21M | { |
27 | 7.21M | int i; |
28 | 7.21M | int j; |
29 | 7.21M | int codelen; |
30 | 7.21M | int codepoint; |
31 | 7.21M | const unsigned char *ustr = (const unsigned char *)str; |
32 | | |
33 | 7.21M | if(!str){ |
34 | 111k | return MOSQ_ERR_INVAL; |
35 | 111k | } |
36 | 7.10M | if(len < 0 || len > 65536){ |
37 | 10 | return MOSQ_ERR_INVAL; |
38 | 10 | } |
39 | | |
40 | 22.7M | for(i=0; i<len; i++){ |
41 | 15.6M | if(ustr[i] == 0){ |
42 | 751 | return MOSQ_ERR_MALFORMED_UTF8; |
43 | 15.6M | }else if(ustr[i] <= 0x7f){ |
44 | 15.6M | codelen = 1; |
45 | 15.6M | codepoint = ustr[i]; |
46 | 15.6M | }else if((ustr[i] & 0xE0) == 0xC0){ |
47 | | /* 110xxxxx - 2 byte sequence */ |
48 | 6.07k | if(ustr[i] == 0xC0 || ustr[i] == 0xC1){ |
49 | | /* Invalid bytes */ |
50 | 492 | return MOSQ_ERR_MALFORMED_UTF8; |
51 | 492 | } |
52 | 5.58k | codelen = 2; |
53 | 5.58k | codepoint = (ustr[i] & 0x1F); |
54 | 14.5k | }else if((ustr[i] & 0xF0) == 0xE0){ |
55 | | /* 1110xxxx - 3 byte sequence */ |
56 | 6.66k | codelen = 3; |
57 | 6.66k | codepoint = (ustr[i] & 0x0F); |
58 | 7.92k | }else if((ustr[i] & 0xF8) == 0xF0){ |
59 | | /* 11110xxx - 4 byte sequence */ |
60 | 7.21k | if(ustr[i] > 0xF4){ |
61 | | /* Invalid, this would produce values > 0x10FFFF. */ |
62 | 219 | return MOSQ_ERR_MALFORMED_UTF8; |
63 | 219 | } |
64 | 6.99k | codelen = 4; |
65 | 6.99k | codepoint = (ustr[i] & 0x07); |
66 | 6.99k | }else{ |
67 | | /* Unexpected continuation byte. */ |
68 | 709 | return MOSQ_ERR_MALFORMED_UTF8; |
69 | 709 | } |
70 | | |
71 | | /* Reconstruct full code point */ |
72 | 15.6M | if(i >= len-codelen+1){ |
73 | | /* Not enough data */ |
74 | 838 | return MOSQ_ERR_MALFORMED_UTF8; |
75 | 838 | } |
76 | 15.6M | for(j=0; j<codelen-1; j++){ |
77 | 38.1k | if((ustr[++i] & 0xC0) != 0x80){ |
78 | | /* Not a continuation byte */ |
79 | 496 | return MOSQ_ERR_MALFORMED_UTF8; |
80 | 496 | } |
81 | 37.6k | codepoint = (codepoint<<6) | (ustr[i] & 0x3F); |
82 | 37.6k | } |
83 | | |
84 | | /* Check for UTF-16 high/low surrogates */ |
85 | 15.6M | if(codepoint >= 0xD800 && codepoint <= 0xDFFF){ |
86 | 302 | return MOSQ_ERR_MALFORMED_UTF8; |
87 | 302 | } |
88 | | |
89 | | /* Check for overlong or out of range encodings */ |
90 | | /* Checking codelen == 2 isn't necessary here, because it is already |
91 | | * covered above in the C0 and C1 checks. |
92 | | * if(codelen == 2 && codepoint < 0x0080){ |
93 | | * return MOSQ_ERR_MALFORMED_UTF8; |
94 | | * }else |
95 | | */ |
96 | 15.6M | if(codelen == 3 && codepoint < 0x0800){ |
97 | 304 | return MOSQ_ERR_MALFORMED_UTF8; |
98 | 15.6M | }else if(codelen == 4 && (codepoint < 0x10000 || codepoint > 0x10FFFF)){ |
99 | 1.00k | return MOSQ_ERR_MALFORMED_UTF8; |
100 | 1.00k | } |
101 | | |
102 | | /* Check for non-characters */ |
103 | 15.6M | if(codepoint >= 0xFDD0 && codepoint <= 0xFDEF){ |
104 | 242 | return MOSQ_ERR_MALFORMED_UTF8; |
105 | 242 | } |
106 | 15.6M | if((codepoint & 0xFFFF) == 0xFFFE || (codepoint & 0xFFFF) == 0xFFFF){ |
107 | 317 | return MOSQ_ERR_MALFORMED_UTF8; |
108 | 317 | } |
109 | | /* Check for control characters */ |
110 | 15.6M | if(codepoint <= 0x001F || (codepoint >= 0x007F && codepoint <= 0x009F)){ |
111 | 684 | return MOSQ_ERR_MALFORMED_UTF8; |
112 | 684 | } |
113 | 15.6M | } |
114 | 7.09M | return MOSQ_ERR_SUCCESS; |
115 | 7.10M | } |