/src/krb5/src/util/support/utf8.c
Line | Count | Source |
1 | | /* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil -*- */ |
2 | | /* util/support/utf8.c */ |
3 | | /* |
4 | | * Copyright 2008 by the Massachusetts Institute of Technology. |
5 | | * All Rights Reserved. |
6 | | * |
7 | | * Export of this software from the United States of America may |
8 | | * require a specific license from the United States Government. |
9 | | * It is the responsibility of any person or organization contemplating |
10 | | * export to obtain such a license before exporting. |
11 | | * |
12 | | * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and |
13 | | * distribute this software and its documentation for any purpose and |
14 | | * without fee is hereby granted, provided that the above copyright |
15 | | * notice appear in all copies and that both that copyright notice and |
16 | | * this permission notice appear in supporting documentation, and that |
17 | | * the name of M.I.T. not be used in advertising or publicity pertaining |
18 | | * to distribution of the software without specific, written prior |
19 | | * permission. Furthermore if you modify this software you must label |
20 | | * your software as modified software and not distribute it in such a |
21 | | * fashion that it might be confused with the original M.I.T. software. |
22 | | * M.I.T. makes no representations about the suitability of |
23 | | * this software for any purpose. It is provided "as is" without express |
24 | | * or implied warranty. |
25 | | */ |
26 | | /* |
27 | | * Copyright 1998-2008 The OpenLDAP Foundation. |
28 | | * All rights reserved. |
29 | | * |
30 | | * Redistribution and use in source and binary forms, with or without |
31 | | * modification, are permitted only as authorized by the OpenLDAP |
32 | | * Public License. |
33 | | * |
34 | | * A copy of this license is available in the file LICENSE in the |
35 | | * top-level directory of the distribution or, alternatively, at |
36 | | * <https://www.OpenLDAP.org/license.html>. |
37 | | */ |
38 | | |
39 | | /* This work is part of OpenLDAP Software <https://www.openldap.org/>. */ |
40 | | |
41 | | /* Basic UTF-8 routines |
42 | | * |
43 | | * These routines are "dumb". Though they understand UTF-8, |
44 | | * they don't grok Unicode. That is, they can push bits, |
45 | | * but don't have a clue what the bits represent. That's |
46 | | * good enough for use with the KRB5 Client SDK. |
47 | | * |
48 | | * These routines are not optimized. |
49 | | */ |
50 | | |
51 | | #include "k5-platform.h" |
52 | | #include "k5-utf8.h" |
53 | | #include "supp-int.h" |
54 | | |
/*
 * Sequence-length table for non-ASCII lead octets.
 *
 * Each entry is the total number of octets (2, 3 or 4) in the UTF-8
 * sequence introduced by a given first octet, or 0 if that octet cannot
 * begin a valid sequence.  The table has 128 entries.
 *
 * NOTE(review): the index is presumably (first octet ^ 0x80), i.e. entry
 * 0 corresponds to octet 0x80 -- confirm against the KRB5_UTF8_CHARLEN
 * macro in k5-utf8.h.  The per-row octet ranges below rely on that
 * assumption.
 */
const char krb5int_utf8_lentab[] = {
    /* 0x80-0x8f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90-0x9f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xa0-0xaf */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xb0-0xbf */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0-0xcf */ 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xd0-0xdf */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0-0xef */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0-0xff */ 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
67 | | |
68 | | /* |
69 | | * Make sure the UTF-8 char used the shortest possible encoding |
70 | | * returns charlen if valid, 0 if not. |
71 | | * |
72 | | * Here are the valid UTF-8 encodings, taken from RFC 3629 page 4. |
73 | | * The table is slightly modified from that of the RFC. |
74 | | * |
75 | | * UCS-4 range (hex) UTF-8 sequence (binary) |
76 | | * 0000 0000-0000 007F 0....... |
77 | | * 0000 0080-0000 07FF 110++++. 10...... |
78 | | * 0000 0800-0000 FFFF 1110++++ 10+..... 10...... |
79 | | * 0001 0000-0010 FFFF 11110+++ 10++.... 10...... 10...... |
80 | | * |
81 | | * The '.' bits are "don't cares". When validating a UTF-8 sequence, |
82 | | * at least one of the '+' bits must be set, otherwise the character |
83 | | * should have been encoded in fewer octets. Note that in the two-octet |
84 | | * case, only the first octet needs to be validated, and this is done |
85 | | * in the krb5int_utf8_lentab[] above. |
86 | | */ |
87 | | |
/*
 * Mask of required bits in the second octet of a multi-octet sequence.
 *
 * At least one of the bits in the selected entry must be set in the
 * second octet; otherwise the character should have been encoded in
 * fewer octets.  An entry of 0x00 can never be satisfied.  The table has
 * 32 entries.
 *
 * NOTE(review): the index is presumably (first octet & 0x1f) -- confirm
 * against the KRB5_UTF8_CHARLEN2 macro in k5-utf8.h.  The per-row octet
 * ranges below rely on that assumption.
 *
 * The explicit casts to char keep the 0x80 initializers well-formed when
 * plain char is signed.
 */
const char krb5int_utf8_mintab[] = {
    /* 0xe0-0xe3 */ (char)0x20, (char)0x80, (char)0x80, (char)0x80,
    /* 0xe4-0xe7 */ (char)0x80, (char)0x80, (char)0x80, (char)0x80,
    /* 0xe8-0xeb */ (char)0x80, (char)0x80, (char)0x80, (char)0x80,
    /* 0xec-0xef */ (char)0x80, (char)0x80, (char)0x80, (char)0x80,
    /* 0xf0-0xf3 */ (char)0x30, (char)0x80, (char)0x80, (char)0x80,
    /* 0xf4-0xf7 */ (char)0x80, (char)0x00, (char)0x00, (char)0x00,
    /* 0xf8-0xfb */ (char)0x00, (char)0x00, (char)0x00, (char)0x00,
    /* 0xfc-0xff */ (char)0x00, (char)0x00, (char)0x00, (char)0x00
};
97 | | |
98 | | /* |
99 | | * Convert a UTF8 character to a UCS4 character. Return 0 on success, |
100 | | * -1 on failure. |
101 | | */ |
102 | | int krb5int_utf8_to_ucs4(const char *p, krb5_ucs4 *out) |
103 | 906 | { |
104 | 906 | const unsigned char *c = (const unsigned char *) p; |
105 | 906 | krb5_ucs4 ch; |
106 | 906 | int len, i; |
107 | 906 | static unsigned char mask[] = { |
108 | 906 | 0, 0x7f, 0x1f, 0x0f, 0x07 }; |
109 | | |
110 | 906 | *out = 0; |
111 | 906 | len = KRB5_UTF8_CHARLEN2(p, len); |
112 | | |
113 | 906 | if (len == 0) |
114 | 117 | return -1; |
115 | | |
116 | 789 | ch = c[0] & mask[len]; |
117 | | |
118 | 1.09k | for (i = 1; i < len; i++) { |
119 | 348 | if ((c[i] & 0xc0) != 0x80) |
120 | 44 | return -1; |
121 | | |
122 | 304 | ch <<= 6; |
123 | 304 | ch |= c[i] & 0x3f; |
124 | 304 | } |
125 | | |
126 | 745 | if (ch > 0x10ffff) |
127 | 3 | return -1; |
128 | | |
129 | 742 | *out = ch; |
130 | 742 | return 0; |
131 | 745 | } |
132 | | |
133 | | /* conv UCS-4 to UTF-8 */ |
134 | | size_t krb5int_ucs4_to_utf8(krb5_ucs4 c, char *buf) |
135 | 61.5k | { |
136 | 61.5k | size_t len = 0; |
137 | 61.5k | unsigned char *p = (unsigned char *) buf; |
138 | | |
139 | | /* not a valid Unicode character */ |
140 | 61.5k | if (c > 0x10ffff) |
141 | 0 | return 0; |
142 | | |
143 | | /* Just return length, don't convert */ |
144 | 61.5k | if (buf == NULL) { |
145 | 30.7k | if (c < 0x80) return 1; |
146 | 23.9k | else if (c < 0x800) return 2; |
147 | 20.6k | else if (c < 0x10000) return 3; |
148 | 1.15k | else return 4; |
149 | 30.7k | } |
150 | | |
151 | 30.7k | if (c < 0x80) { |
152 | 6.83k | p[len++] = c; |
153 | 23.9k | } else if (c < 0x800) { |
154 | 3.33k | p[len++] = 0xc0 | ( c >> 6 ); |
155 | 3.33k | p[len++] = 0x80 | ( c & 0x3f ); |
156 | 20.6k | } else if (c < 0x10000) { |
157 | 19.4k | p[len++] = 0xe0 | ( c >> 12 ); |
158 | 19.4k | p[len++] = 0x80 | ( (c >> 6) & 0x3f ); |
159 | 19.4k | p[len++] = 0x80 | ( c & 0x3f ); |
160 | 19.4k | } else /* if (c < 0x110000) */ { |
161 | 1.15k | p[len++] = 0xf0 | ( c >> 18 ); |
162 | 1.15k | p[len++] = 0x80 | ( (c >> 12) & 0x3f ); |
163 | 1.15k | p[len++] = 0x80 | ( (c >> 6) & 0x3f ); |
164 | 1.15k | p[len++] = 0x80 | ( c & 0x3f ); |
165 | 1.15k | } |
166 | | |
167 | 30.7k | return len; |
168 | 61.5k | } |