Line | Count | Source |
1 | | /** |
2 | | * UTF-8 utility functions |
3 | | * |
4 | | * (c) 2010-2016 Steve Bennett <steveb@workware.net.au> |
5 | | * |
6 | | * See LICENCE for licence details. |
7 | | */ |
8 | | |
9 | | #include <ctype.h> |
10 | | #include <stdlib.h> |
11 | | #include <string.h> |
12 | | #include <stdio.h> |
13 | | #include <assert.h> |
14 | | #include "utf8.h" |
15 | | |
/**
 * Encodes the code point 'uc' as UTF-8 and stores the bytes at 'p'.
 *
 * 'p' must have room for up to 4 bytes. No terminating NUL is written.
 *
 * Returns the number of bytes stored (1 to 4).
 *
 * This one is always implemented.
 */
int utf8_fromunicode(char *p, unsigned uc)
{
    unsigned lead;  /* marker bits of the first byte of the sequence */
    int len;        /* total number of bytes in the sequence */
    int i;

    if (uc <= 0x7f) {
        /* Plain ASCII is stored as-is */
        *p = uc;
        return 1;
    }

    if (uc <= 0x7ff) {
        len = 2;
        lead = 0xc0;
    }
    else if (uc <= 0xffff) {
        len = 3;
        lead = 0xe0;
    }
    else {
        /* Note: We silently truncate to 21 bits here: 0x1fffff */
        uc &= 0x1fffff;
        len = 4;
        lead = 0xf0;
    }

    /* Emit the continuation bytes last-to-first, consuming 6 bits each */
    for (i = len - 1; i > 0; i--) {
        p[i] = 0x80 | (uc & 0x3f);
        uc >>= 6;
    }
    /* Whatever bits remain belong in the lead byte */
    p[0] = lead | uc;

    return len;
}
43 | | |
44 | | #if defined(USE_UTF8) && !defined(JIM_BOOTSTRAP) |
/**
 * Returns the length in bytes (1 to 4) of the UTF-8 sequence that is
 * introduced by the lead byte 'c'.
 *
 * Only the low 8 bits of 'c' are examined. A byte that is not a valid
 * lead byte (a continuation byte, or 0xf8 and above) is treated as a
 * single byte and yields 1.
 */
int utf8_charlen(int c)
{
    /* Sequence length indexed by the top five bits of the byte */
    static const unsigned char length_by_prefix[32] = {
        /* 0xxxxxxx: single byte */
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        /* 10xxxxxx: stray continuation byte, treated as a single byte */
        1, 1, 1, 1, 1, 1, 1, 1,
        /* 110xxxxx */
        2, 2, 2, 2,
        /* 1110xxxx */
        3, 3,
        /* 11110xxx */
        4,
        /* 11111xxx: invalid sequence, so treat it as a single byte */
        1
    };

    return length_by_prefix[(c & 0xf8) >> 3];
}
62 | | |
/**
 * Returns the byte offset into 'str' of the character at character
 * position 'index'.
 *
 * The string is walked one UTF-8 sequence at a time using
 * utf8_charlen(). No check is made for the end of the string, so the
 * caller must ensure 'str' contains at least 'index' characters.
 *
 * A negative 'index' is treated as 0. Previously it caused the loop to
 * run off the end of the buffer until the counter wrapped around.
 */
int utf8_index(const char *str, int index)
{
    const char *s = str;

    while (index-- > 0) {
        s += utf8_charlen(*s);
    }
    /* Pointer difference is ptrdiff_t; the interface returns int */
    return (int)(s - str);
}
71 | | |
/**
 * Decodes the UTF-8 sequence at the start of 'str' and stores the
 * resulting code point in '*uc'.
 *
 * Returns the number of bytes consumed (1 to 4).
 *
 * A byte below 0xc0 (ASCII or a stray continuation byte) is returned
 * unchanged as a 1-byte character. If the sequence is invalid (a bad
 * continuation byte, an overlong encoding, or a lead byte of 0xf8 or
 * above), the first byte is stored in '*uc' and 1 is returned.
 */
int utf8_tounicode(const char *str, int *uc)
{
    const unsigned char *bytes = (const unsigned char *)str;
    int len;        /* sequence length announced by the lead byte */
    int lowest;     /* smallest code point allowed for that length */
    int value;      /* code point accumulated so far */
    int i;

    if (bytes[0] < 0xc0) {
        *uc = bytes[0];
        return 1;
    }

    /* Classify the lead byte and extract its payload bits */
    if (bytes[0] < 0xe0) {
        len = 2;
        lowest = 0x80;
        value = bytes[0] & 0x3f;
    }
    else if (bytes[0] < 0xf0) {
        len = 3;
        lowest = 0x800;
        value = bytes[0] & 0x1f;
    }
    else if (bytes[0] < 0xf8) {
        len = 4;
        lowest = 0x10000;
        value = bytes[0] & 0x0f;
    }
    else {
        /* Invalid sequence, so just return the byte */
        *uc = bytes[0];
        return 1;
    }

    /* Each following byte must be 10xxxxxx and contributes 6 bits */
    for (i = 1; i < len; i++) {
        if ((bytes[i] & 0xc0) != 0x80) {
            /* Invalid sequence, so just return the byte */
            *uc = bytes[0];
            return 1;
        }
        value = (value << 6) | (bytes[i] & 0x7f);
    }

    *uc = value;
    if (value >= lowest) {
        return len;
    }

    /* Overlong encoding is invalid, so just return the byte */
    *uc = bytes[0];
    return 1;
}
112 | | |
/*
 * One entry of a case mapping table: a code point paired with its
 * alternate case form. Both fields are 16 bits wide, so only code
 * points up to 0xffff can be represented.
 */
struct casemap {
    /* code point (the key compared by cmp_casemap()) */
    unsigned short code;
    /* alternate case code point */
    unsigned short altcode;
};
117 | | |
118 | | |
119 | | /* Generated mapping tables */ |
120 | | #include "_unicode_mapping.c" |
121 | | |
/*
 * Number of elements in the array A.
 *
 * A must be a true array, not a pointer. The whole expansion is
 * parenthesized so that the macro can be used safely inside a larger
 * expression, e.g. x / ARRAYSIZE(a).
 */
#define ARRAYSIZE(A) (sizeof(A) / sizeof(*(A)))
123 | | |
124 | | static int cmp_casemap(const void *key, const void *cm) |
125 | 0 | { |
126 | 0 | return *(int *)key - (int)((const struct casemap *)cm)->code; |
127 | 0 | } |
128 | | |
129 | | static int utf8_map_case(const struct casemap *mapping, int num, int ch) |
130 | 0 | { |
131 | | /* We only support 16 bit case mapping */ |
132 | 0 | if (ch <= 0xffff) { |
133 | 0 | const struct casemap *cm = |
134 | 0 | bsearch(&ch, mapping, num, sizeof(*mapping), cmp_casemap); |
135 | |
|
136 | 0 | if (cm) { |
137 | 0 | return cm->altcode; |
138 | 0 | } |
139 | 0 | } |
140 | 0 | return ch; |
141 | 0 | } |
142 | | |
143 | | int utf8_upper(int ch) |
144 | 0 | { |
145 | 0 | if (isascii(ch)) { |
146 | 0 | return toupper(ch); |
147 | 0 | } |
148 | 0 | return utf8_map_case(unicode_case_mapping_upper, ARRAYSIZE(unicode_case_mapping_upper), ch); |
149 | 0 | } |
150 | | #endif /* JIM_BOOTSTRAP */ |