/src/open62541/deps/utf8.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* This Source Code Form is subject to the terms of the Mozilla Public |
2 | | * License, v. 2.0. If a copy of the MPL was not distributed with this |
3 | | * file, You can obtain one at http://mozilla.org/MPL/2.0/. |
4 | | * |
5 | | * Copyright 2024 (c) Fraunhofer IOSB (Author: Julius Pfrommer) |
6 | | */ |
7 | | |
8 | | #include "utf8.h" |
9 | | |
/* Fold the continuation byte at str[pos] into *codepoint.
 *
 * Helper for the decoder below. It relies on names from the enclosing
 * function: `str` (input buffer), `byte` (scratch variable) and `codepoint`
 * (output pointer). If str[pos] is not of the form 10xxxxxx, the macro makes
 * the ENCLOSING function return 0. Otherwise the six payload bits are shifted
 * into the low end of *codepoint. The caller must have verified that str[pos]
 * is within the buffer. */
#define UTF_PARSE_BYTE(pos) do {                                  \
        byte = str[(pos)];                                         \
        if(UTF_UNLIKELY((byte & 0xC0) != 0x80))                    \
            return 0; /* Not a continuation byte */                \
        *codepoint = (*codepoint << 6) | (unsigned)(byte & 0x3F);  \
    } while(0)
16 | | |
/* Decode the first UTF-8 encoded character in a buffer.
 *
 * str:       Buffer with the encoded bytes. At most the first four bytes are
 *            read and never more than len bytes.
 * len:       Number of bytes available in str.
 * codepoint: Receives the decoded code point when the return value is
 *            non-zero. When len > 0 and decoding fails, *codepoint has still
 *            been written and holds an intermediate value that must not be
 *            used.
 *
 * Returns the number of bytes consumed (1 to 4). Returns 0 if the buffer is
 * empty, the sequence is truncated, a lead or continuation byte is invalid,
 * the encoding is overlong, the value is a UTF-16 surrogate (U+D800..U+DFFF)
 * or the value lies above U+10FFFF. */
unsigned
utf8_to_codepoint(const unsigned char *str, size_t len, unsigned *codepoint) {
    /* Ensure there is a character to read */
    if(UTF_UNLIKELY(len == 0))
        return 0;

    *codepoint = str[0];
    if(UTF_LIKELY(*codepoint < 0x80))
        return 1; /* Normal ASCII */

    /* 0x80..0xBF are continuation bytes and cannot start a character.
     * 0xC0 and 0xC1 could only start an overlong 2-byte encoding of an
     * ASCII value. */
    if(UTF_UNLIKELY(*codepoint <= 0xC1))
        return 0;

    unsigned count;
    unsigned char byte; /* Scratch variable used by UTF_PARSE_BYTE */
    if(*codepoint <= 0xDF) { /* 2-byte sequence */
        if(len < 2)
            return 0;
        count = 2;
        *codepoint &= 0x1F;
        UTF_PARSE_BYTE(1);
        /* No overlong check required: the lead byte is at least 0xC2, so
         * the decoded value is at least 0x80. */
    } else if(*codepoint <= 0xEF) { /* 3-byte sequence */
        if(len < 3)
            return 0;
        count = 3;
        *codepoint &= 0xF;
        UTF_PARSE_BYTE(1);
        UTF_PARSE_BYTE(2);
        if(UTF_UNLIKELY(*codepoint < 0x800))
            return 0; /* Too small for the encoding length */
        /* U+D800..U+DFFF are reserved for UTF-16 surrogate pairs. Their
         * UTF-8 encoding (ED A0 80 .. ED BF BF) is ill-formed. */
        if(UTF_UNLIKELY(*codepoint >= 0xD800 && *codepoint <= 0xDFFF))
            return 0;
    } else if(*codepoint <= 0xF4) { /* 4-byte sequence */
        if(len < 4)
            return 0;
        count = 4;
        *codepoint &= 0x7;
        UTF_PARSE_BYTE(1);
        UTF_PARSE_BYTE(2);
        UTF_PARSE_BYTE(3);
        if(UTF_UNLIKELY(*codepoint < 0x10000))
            return 0; /* Too small for the encoding length */
    } else {
        return 0; /* Lead bytes 0xF5..0xFF never occur in UTF-8 */
    }

    /* Only reachable with a 0xF4 lead byte followed by 0x90 or above */
    if(UTF_UNLIKELY(*codepoint > 0x10FFFF))
        return 0; /* Not in the Unicode range */

    return count;
}