/src/bind9/lib/isc/utf8.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (C) Internet Systems Consortium, Inc. ("ISC") |
3 | | * |
4 | | * SPDX-License-Identifier: MPL-2.0 |
5 | | * |
6 | | * This Source Code Form is subject to the terms of the Mozilla Public |
7 | | * License, v. 2.0. If a copy of the MPL was not distributed with this |
8 | | * file, you can obtain one at https://mozilla.org/MPL/2.0/. |
9 | | * |
10 | | * See the COPYRIGHT file distributed with this work for additional |
11 | | * information regarding copyright ownership. |
12 | | */ |
13 | | |
14 | | #include <string.h> |
15 | | |
16 | | #include <isc/utf8.h> |
17 | | #include <isc/util.h> |
18 | | |
/*
 * UTF-8 is defined in "The Unicode Standard -- Version 4.0"
 * Also see RFC 3629.
 *
 * Char. number range  |        UTF-8 octet sequence
 *    (hexadecimal)    |              (binary)
 * --------------------+---------------------------------------------
 * 0000 0000-0000 007F | 0xxxxxxx
 * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
 * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * isc_utf8_valid() returns true when the 'len' octets starting at 'buf'
 * consist solely of well-formed UTF-8 sequences, and false otherwise.
 * A sequence is rejected when it:
 *
 *	- is truncated, or has a continuation octet that is not 10xxxxxx;
 *	- starts with an octet that is not a valid lead octet;
 *	- is overlong (encodes a value that fits in a shorter sequence);
 *	- encodes a UTF-16 surrogate (U+D800..U+DFFF), which RFC 3629
 *	  section 3 prohibits;
 *	- encodes a value above U+10FFFF.
 *
 * An empty buffer (len == 0) is valid.  'buf' must not be NULL; this is
 * checked with REQUIRE.
 */
bool
isc_utf8_valid(const unsigned char *buf, size_t len) {
	REQUIRE(buf != NULL);

	for (size_t i = 0; i < len; i++) {
		/* Single octet: plain ASCII. */
		if (buf[i] <= 0x7f) {
			continue;
		}

		/* Two octets: 110xxxxx 10xxxxxx. */
		if ((i + 1) < len && (buf[i] & 0xe0) == 0xc0 &&
		    (buf[i + 1] & 0xc0) == 0x80)
		{
			unsigned int w;
			w = (buf[i] & 0x1f) << 6;
			w |= (buf[++i] & 0x3f);
			/* Overlong encoding of an ASCII value. */
			if (w < 0x80) {
				return false;
			}
			continue;
		}

		/* Three octets: 1110xxxx 10xxxxxx 10xxxxxx. */
		if ((i + 2) < len && (buf[i] & 0xf0) == 0xe0 &&
		    (buf[i + 1] & 0xc0) == 0x80 && (buf[i + 2] & 0xc0) == 0x80)
		{
			unsigned int w;
			w = (buf[i] & 0x0f) << 12;
			w |= (buf[++i] & 0x3f) << 6;
			w |= (buf[++i] & 0x3f);
			/* Overlong encoding. */
			if (w < 0x0800) {
				return false;
			}
			/*
			 * UTF-16 surrogates are not Unicode scalar values
			 * and must never appear in UTF-8.
			 */
			if (w >= 0xd800 && w <= 0xdfff) {
				return false;
			}
			continue;
		}

		/* Four octets: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx. */
		if ((i + 3) < len && (buf[i] & 0xf8) == 0xf0 &&
		    (buf[i + 1] & 0xc0) == 0x80 &&
		    (buf[i + 2] & 0xc0) == 0x80 && (buf[i + 3] & 0xc0) == 0x80)
		{
			unsigned int w;
			w = (buf[i] & 0x07) << 18;
			w |= (buf[++i] & 0x3f) << 12;
			w |= (buf[++i] & 0x3f) << 6;
			w |= (buf[++i] & 0x3f);
			/* Overlong encoding, or beyond the Unicode range. */
			if (w < 0x10000 || w > 0x10FFFF) {
				return false;
			}
			continue;
		}

		/*
		 * Stray continuation octet, invalid lead octet, malformed
		 * continuation, or a sequence cut short by the end of 'buf'.
		 */
		return false;
	}
	return true;
}
80 | | |
/*
 * Report whether 'buf' starts with the three-octet UTF-8 encoding of the
 * byte order mark (EF BB BF).  Buffers shorter than three octets never
 * match.  'buf' must not be NULL; this is checked with REQUIRE.
 */
bool
isc_utf8_bom(const unsigned char *buf, size_t len) {
	static const unsigned char bom[] = { 0xef, 0xbb, 0xbf };

	REQUIRE(buf != NULL);

	/* Too short to hold the mark at all. */
	if (len < sizeof(bom)) {
		return false;
	}

	return memcmp(buf, bom, sizeof(bom)) == 0;
}