/src/mosquitto/lib/utf8_mosq.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | Copyright (c) 2016-2021 Roger Light <roger@atchoo.org> |
3 | | |
4 | | All rights reserved. This program and the accompanying materials |
5 | | are made available under the terms of the Eclipse Public License 2.0 |
6 | | and Eclipse Distribution License v1.0 which accompany this distribution. |
7 | | |
8 | | The Eclipse Public License is available at |
9 | | https://www.eclipse.org/legal/epl-2.0/ |
10 | | and the Eclipse Distribution License is available at |
11 | | http://www.eclipse.org/org/documents/edl-v10.php. |
12 | | |
13 | | SPDX-License-Identifier: EPL-2.0 OR BSD-3-Clause |
14 | | |
15 | | Contributors: |
16 | | Roger Light - initial implementation. |
17 | | */ |
18 | | |
19 | | #include "config.h" |
20 | | |
21 | | #include <stdio.h> |
22 | | #include "mosquitto.h" |
23 | | |
24 | | BROKER_EXPORT int mosquitto_validate_utf8(const char *str, int len) |
25 | 0 | { |
26 | 0 | int i; |
27 | 0 | int j; |
28 | 0 | int codelen; |
29 | 0 | int codepoint; |
30 | 0 | const unsigned char *ustr = (const unsigned char *)str; |
31 | |
|
32 | 0 | if(!str) return MOSQ_ERR_INVAL; |
33 | 0 | if(len < 0 || len > 65536) return MOSQ_ERR_INVAL; |
34 | | |
35 | 0 | for(i=0; i<len; i++){ |
36 | 0 | if(ustr[i] == 0){ |
37 | 0 | return MOSQ_ERR_MALFORMED_UTF8; |
38 | 0 | }else if(ustr[i] <= 0x7f){ |
39 | 0 | codelen = 1; |
40 | 0 | codepoint = ustr[i]; |
41 | 0 | }else if((ustr[i] & 0xE0) == 0xC0){ |
42 | | /* 110xxxxx - 2 byte sequence */ |
43 | 0 | if(ustr[i] == 0xC0 || ustr[i] == 0xC1){ |
44 | | /* Invalid bytes */ |
45 | 0 | return MOSQ_ERR_MALFORMED_UTF8; |
46 | 0 | } |
47 | 0 | codelen = 2; |
48 | 0 | codepoint = (ustr[i] & 0x1F); |
49 | 0 | }else if((ustr[i] & 0xF0) == 0xE0){ |
50 | | /* 1110xxxx - 3 byte sequence */ |
51 | 0 | codelen = 3; |
52 | 0 | codepoint = (ustr[i] & 0x0F); |
53 | 0 | }else if((ustr[i] & 0xF8) == 0xF0){ |
54 | | /* 11110xxx - 4 byte sequence */ |
55 | 0 | if(ustr[i] > 0xF4){ |
56 | | /* Invalid, this would produce values > 0x10FFFF. */ |
57 | 0 | return MOSQ_ERR_MALFORMED_UTF8; |
58 | 0 | } |
59 | 0 | codelen = 4; |
60 | 0 | codepoint = (ustr[i] & 0x07); |
61 | 0 | }else{ |
62 | | /* Unexpected continuation byte. */ |
63 | 0 | return MOSQ_ERR_MALFORMED_UTF8; |
64 | 0 | } |
65 | | |
66 | | /* Reconstruct full code point */ |
67 | 0 | if(i >= len-codelen+1){ |
68 | | /* Not enough data */ |
69 | 0 | return MOSQ_ERR_MALFORMED_UTF8; |
70 | 0 | } |
71 | 0 | for(j=0; j<codelen-1; j++){ |
72 | 0 | if((ustr[++i] & 0xC0) != 0x80){ |
73 | | /* Not a continuation byte */ |
74 | 0 | return MOSQ_ERR_MALFORMED_UTF8; |
75 | 0 | } |
76 | 0 | codepoint = (codepoint<<6) | (ustr[i] & 0x3F); |
77 | 0 | } |
78 | | |
79 | | /* Check for UTF-16 high/low surrogates */ |
80 | 0 | if(codepoint >= 0xD800 && codepoint <= 0xDFFF){ |
81 | 0 | return MOSQ_ERR_MALFORMED_UTF8; |
82 | 0 | } |
83 | | |
84 | | /* Check for overlong or out of range encodings */ |
85 | | /* Checking codelen == 2 isn't necessary here, because it is already |
86 | | * covered above in the C0 and C1 checks. |
87 | | * if(codelen == 2 && codepoint < 0x0080){ |
88 | | * return MOSQ_ERR_MALFORMED_UTF8; |
89 | | * }else |
90 | | */ |
91 | 0 | if(codelen == 3 && codepoint < 0x0800){ |
92 | 0 | return MOSQ_ERR_MALFORMED_UTF8; |
93 | 0 | }else if(codelen == 4 && (codepoint < 0x10000 || codepoint > 0x10FFFF)){ |
94 | 0 | return MOSQ_ERR_MALFORMED_UTF8; |
95 | 0 | } |
96 | | |
97 | | /* Check for non-characters */ |
98 | 0 | if(codepoint >= 0xFDD0 && codepoint <= 0xFDEF){ |
99 | 0 | return MOSQ_ERR_MALFORMED_UTF8; |
100 | 0 | } |
101 | 0 | if((codepoint & 0xFFFF) == 0xFFFE || (codepoint & 0xFFFF) == 0xFFFF){ |
102 | 0 | return MOSQ_ERR_MALFORMED_UTF8; |
103 | 0 | } |
104 | | /* Check for control characters */ |
105 | 0 | if(codepoint <= 0x001F || (codepoint >= 0x007F && codepoint <= 0x009F)){ |
106 | 0 | return MOSQ_ERR_MALFORMED_UTF8; |
107 | 0 | } |
108 | 0 | } |
109 | 0 | return MOSQ_ERR_SUCCESS; |
110 | 0 | } |