/src/jansson-2.14/src/utf.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2009-2016 Petri Lehtinen <petri@digip.org> |
3 | | * |
4 | | * Jansson is free software; you can redistribute it and/or modify |
5 | | * it under the terms of the MIT license. See LICENSE for details. |
6 | | */ |
7 | | |
8 | | #include "utf.h" |
9 | | #include <string.h> |
10 | | |
/*
 * Encode a single Unicode code point as UTF-8.
 *
 * codepoint: the scalar value to encode.
 * buffer:    destination; up to 4 bytes are written, so the caller must
 *            provide room for at least 4. No NUL terminator is appended.
 * size:      receives the number of bytes written (1..4).
 *
 * Returns 0 on success. Returns -1, leaving buffer and *size untouched,
 * when codepoint is negative, above U+10FFFF, or a UTF-16 surrogate
 * (U+D800..U+DFFF).
 */
int utf8_encode(int32_t codepoint, char *buffer, size_t *size) {
    if (codepoint < 0)
        return -1;

    /* Surrogate halves are not Unicode scalar values and must never be
       encoded as UTF-8; utf8_check_full() rejects the resulting bytes. */
    if (0xD800 <= codepoint && codepoint <= 0xDFFF)
        return -1;

    if (codepoint < 0x80) {
        buffer[0] = (char)codepoint;
        *size = 1;
    } else if (codepoint < 0x800) {
        buffer[0] = (char)(0xC0 + ((codepoint & 0x7C0) >> 6));
        buffer[1] = (char)(0x80 + ((codepoint & 0x03F)));
        *size = 2;
    } else if (codepoint < 0x10000) {
        buffer[0] = (char)(0xE0 + ((codepoint & 0xF000) >> 12));
        buffer[1] = (char)(0x80 + ((codepoint & 0x0FC0) >> 6));
        buffer[2] = (char)(0x80 + ((codepoint & 0x003F)));
        *size = 3;
    } else if (codepoint <= 0x10FFFF) {
        buffer[0] = (char)(0xF0 + ((codepoint & 0x1C0000) >> 18));
        buffer[1] = (char)(0x80 + ((codepoint & 0x03F000) >> 12));
        buffer[2] = (char)(0x80 + ((codepoint & 0x000FC0) >> 6));
        buffer[3] = (char)(0x80 + ((codepoint & 0x00003F)));
        *size = 4;
    } else {
        /* beyond the Unicode range */
        return -1;
    }

    return 0;
}
37 | | |
/*
 * Classify the first byte of a UTF-8 sequence.
 *
 * Returns the total length in bytes (1..4) of the sequence that this
 * byte introduces, or 0 if the byte cannot start a valid sequence.
 */
size_t utf8_check_first(char byte) {
    const unsigned char lead = (unsigned char)byte;

    /* The thresholds are tested in ascending order, so each test only
       needs an upper bound. */

    /* 0x00..0x7F: plain ASCII */
    if (lead <= 0x7F)
        return 1;

    /* 0x80..0xBF: continuation byte; 0xC0..0xC1: overlong ASCII lead */
    if (lead <= 0xC1)
        return 0;

    /* 0xC2..0xDF: lead of a 2-byte sequence */
    if (lead <= 0xDF)
        return 2;

    /* 0xE0..0xEF: lead of a 3-byte sequence */
    if (lead <= 0xEF)
        return 3;

    /* 0xF0..0xF4: lead of a 4-byte sequence */
    if (lead <= 0xF4)
        return 4;

    /* 0xF5..0xFF: restricted (4-, 5- or 6-byte lead) or invalid UTF-8 */
    return 0;
}
68 | | |
/*
 * Validate and decode one multi-byte UTF-8 sequence.
 *
 * buffer:    points at the lead byte of the sequence.
 * size:      length of the sequence in bytes; only 2, 3 and 4 are accepted.
 * codepoint: if non-NULL, receives the decoded value on success.
 *
 * Returns 1 if the sequence is valid, 0 otherwise. On failure *codepoint
 * is not modified. The lead byte is only masked here, not range-checked.
 */
size_t utf8_check_full(const char *buffer, size_t size, int32_t *codepoint) {
    const unsigned char lead = (unsigned char)buffer[0];
    int32_t decoded;
    int32_t smallest; /* lowest value that genuinely needs `size` bytes */
    size_t pos;

    switch (size) {
        case 2:
            decoded = lead & 0x1F;
            smallest = 0x80;
            break;
        case 3:
            decoded = lead & 0x0F;
            smallest = 0x800;
            break;
        case 4:
            decoded = lead & 0x07;
            smallest = 0x10000;
            break;
        default:
            return 0;
    }

    for (pos = 1; pos < size; pos++) {
        const unsigned char cont = (unsigned char)buffer[pos];

        /* a continuation byte has the bit pattern 10xxxxxx */
        if ((cont & 0xC0) != 0x80)
            return 0;

        decoded = (decoded << 6) | (cont & 0x3F);
    }

    /* outside the Unicode range */
    if (decoded > 0x10FFFF)
        return 0;

    /* UTF-16 surrogate halves are not valid code points */
    if (decoded >= 0xD800 && decoded <= 0xDFFF)
        return 0;

    /* overlong encoding: the value would have fit in fewer bytes */
    if (decoded < smallest)
        return 0;

    if (codepoint != NULL)
        *codepoint = decoded;

    return 1;
}
115 | | |
/*
 * Decode the UTF-8 sequence at the start of buffer and step past it.
 *
 * buffer:    start of the data to read.
 * bufsize:   number of bytes available at buffer.
 * codepoint: if non-NULL, receives the decoded value when a sequence
 *            was consumed.
 *
 * Returns a pointer to the byte following the decoded sequence, buffer
 * itself when bufsize is 0 (in which case *codepoint is not written),
 * or NULL when the data is invalid or truncated.
 */
const char *utf8_iterate(const char *buffer, size_t bufsize, int32_t *codepoint) {
    size_t seq_len;
    int32_t decoded;

    /* nothing to consume */
    if (bufsize == 0)
        return buffer;

    seq_len = utf8_check_first(buffer[0]);
    if (seq_len == 0)
        return NULL;

    if (seq_len == 1) {
        /* single byte: the code point is the byte value itself */
        decoded = (unsigned char)buffer[0];
    } else if (seq_len > bufsize || !utf8_check_full(buffer, seq_len, &decoded)) {
        /* sequence runs past the end of the buffer, or is malformed */
        return NULL;
    }

    if (codepoint != NULL)
        *codepoint = decoded;

    return buffer + seq_len;
}
139 | | |
/*
 * Check that the first `length` bytes of `string` are valid UTF-8.
 *
 * Returns 1 when every sequence is accepted by utf8_check_first() and
 * utf8_check_full() (an empty range is valid), 0 otherwise.
 */
int utf8_check_string(const char *string, size_t length) {
    size_t pos = 0;

    while (pos < length) {
        const size_t seq_len = utf8_check_first(string[pos]);

        if (seq_len == 0)
            return 0;

        if (seq_len > 1) {
            /* the sequence must fit in what is left of the string */
            if (seq_len > length - pos)
                return 0;

            if (!utf8_check_full(string + pos, seq_len, NULL))
                return 0;
        }

        /* step over the whole sequence */
        pos += seq_len;
    }

    return 1;
}