/src/quickjs/libunicode.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Unicode utilities |
3 | | * |
4 | | * Copyright (c) 2017-2018 Fabrice Bellard |
5 | | * |
6 | | * Permission is hereby granted, free of charge, to any person obtaining a copy |
7 | | * of this software and associated documentation files (the "Software"), to deal |
8 | | * in the Software without restriction, including without limitation the rights |
9 | | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
10 | | * copies of the Software, and to permit persons to whom the Software is |
11 | | * furnished to do so, subject to the following conditions: |
12 | | * |
13 | | * The above copyright notice and this permission notice shall be included in |
14 | | * all copies or substantial portions of the Software. |
15 | | * |
16 | | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
17 | | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
18 | | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
19 | | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
20 | | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
21 | | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
22 | | * THE SOFTWARE. |
23 | | */ |
24 | | #ifndef LIBUNICODE_H |
25 | | #define LIBUNICODE_H |
26 | | |
#include <stddef.h>
#include <stdint.h>
28 | | |
29 | | /* define it to include all the unicode tables (40KB larger) */ |
30 | | #define CONFIG_ALL_UNICODE |
31 | | |
32 | | #define LRE_CC_RES_LEN_MAX 3 |
33 | | |
34 | | /* char ranges */ |
35 | | |
/*
 * Growable array of code point boundaries.
 *
 * The inline helpers in this header append to 'points' at index 'len'
 * and call cr_realloc() first whenever the append would not fit within
 * 'size'.
 */
typedef struct {
    /* number of entries used in 'points'; counted in points, always even */
    int len;
    /* entry count that 'len' is checked against before appending */
    int size;
    /* boundary values, sorted by increasing value */
    uint32_t *points;
    /* presumably handed back as 'opaque' to realloc_func -- TODO confirm */
    void *mem_opaque;
    /* allocator callback supplied through cr_init() */
    void *(*realloc_func)(void *opaque, void *ptr, size_t size);
} CharRange;
43 | | |
/*
 * Operation selector passed as the 'op' argument of cr_op() / cr_op1().
 * Going by the names: union, intersection, symmetric difference and
 * subtraction -- the implementation lives outside this header.
 */
typedef enum {
    CR_OP_UNION = 0,
    CR_OP_INTER = 1,
    CR_OP_XOR   = 2,
    CR_OP_SUB   = 3,
} CharRangeOpEnum;
50 | | |
51 | | void cr_init(CharRange *cr, void *mem_opaque, void *(*realloc_func)(void *opaque, void *ptr, size_t size)); |
52 | | void cr_free(CharRange *cr); |
53 | | int cr_realloc(CharRange *cr, int size); |
54 | | int cr_copy(CharRange *cr, const CharRange *cr1); |
55 | | |
56 | | static inline int cr_add_point(CharRange *cr, uint32_t v) |
57 | 20 | { |
58 | 20 | if (cr->len >= cr->size) { |
59 | 9 | if (cr_realloc(cr, cr->len + 1)) |
60 | 0 | return -1; |
61 | 9 | } |
62 | 20 | cr->points[cr->len++] = v; |
63 | 20 | return 0; |
64 | 20 | } Unexecuted instantiation: quickjs.c:cr_add_point Line | Count | Source | 57 | 20 | { | 58 | 20 | if (cr->len >= cr->size) { | 59 | 9 | if (cr_realloc(cr, cr->len + 1)) | 60 | 0 | return -1; | 61 | 9 | } | 62 | 20 | cr->points[cr->len++] = v; | 63 | 20 | return 0; | 64 | 20 | } |
Unexecuted instantiation: libunicode.c:cr_add_point |
65 | | |
66 | | static inline int cr_add_interval(CharRange *cr, uint32_t c1, uint32_t c2) |
67 | 0 | { |
68 | 0 | if ((cr->len + 2) > cr->size) { |
69 | 0 | if (cr_realloc(cr, cr->len + 2)) |
70 | 0 | return -1; |
71 | 0 | } |
72 | 0 | cr->points[cr->len++] = c1; |
73 | 0 | cr->points[cr->len++] = c2; |
74 | 0 | return 0; |
75 | 0 | } Unexecuted instantiation: quickjs.c:cr_add_interval Unexecuted instantiation: libregexp.c:cr_add_interval Unexecuted instantiation: libunicode.c:cr_add_interval |
76 | | |
77 | | int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len, |
78 | | const uint32_t *b_pt, int b_len, int op); |
79 | | int cr_op1(CharRange *cr, const uint32_t *b_pt, int b_len, int op); |
80 | | |
81 | | static inline int cr_union_interval(CharRange *cr, uint32_t c1, uint32_t c2) |
82 | 0 | { |
83 | 0 | uint32_t b_pt[2]; |
84 | 0 | b_pt[0] = c1; |
85 | 0 | b_pt[1] = c2 + 1; |
86 | 0 | return cr_op1(cr, b_pt, 2, CR_OP_UNION); |
87 | 0 | } Unexecuted instantiation: quickjs.c:cr_union_interval Unexecuted instantiation: libregexp.c:cr_union_interval Unexecuted instantiation: libunicode.c:cr_union_interval |
88 | | |
89 | | int cr_invert(CharRange *cr); |
90 | | |
91 | | int cr_regexp_canonicalize(CharRange *cr, int is_unicode); |
92 | | |
/* Normalization form selector taken by unicode_normalize() as 'n_type'. */
typedef enum {
    UNICODE_NFC  = 0,
    UNICODE_NFD  = 1,
    UNICODE_NFKC = 2,
    UNICODE_NFKD = 3,
} UnicodeNormalizationEnum;
99 | | |
100 | | int unicode_normalize(uint32_t **pdst, const uint32_t *src, int src_len, |
101 | | UnicodeNormalizationEnum n_type, |
102 | | void *opaque, void *(*realloc_func)(void *opaque, void *ptr, size_t size)); |
103 | | |
104 | | /* Unicode character range functions */ |
105 | | |
106 | | int unicode_script(CharRange *cr, const char *script_name, int is_ext); |
107 | | int unicode_general_category(CharRange *cr, const char *gc_name); |
108 | | int unicode_prop(CharRange *cr, const char *prop_name); |
109 | | |
110 | | typedef void UnicodeSequencePropCB(void *opaque, const uint32_t *buf, int len); |
111 | | int unicode_sequence_prop(const char *prop_name, UnicodeSequencePropCB *cb, void *opaque, |
112 | | CharRange *cr); |
113 | | |
/*
 * Case conversion helpers (defined outside this header).
 *
 * NOTE(review): 'res' presumably needs room for LRE_CC_RES_LEN_MAX entries
 * -- confirm against the implementation.
 */
int lre_case_conv(uint32_t *res,
                  uint32_t c,
                  int conv_type);

int lre_canonicalize(uint32_t c,
                     int is_unicode);
116 | | |
/*
 * Code point type categories.
 *
 * One bit per category; the lre_is_xxx_byte() helpers test these bits in
 * lre_ctype_bits[].
 */
enum {
    UNICODE_C_SPACE  = 0x01,
    UNICODE_C_DIGIT  = 0x02,
    UNICODE_C_UPPER  = 0x04,
    UNICODE_C_LOWER  = 0x08,
    UNICODE_C_UNDER  = 0x10,
    UNICODE_C_DOLLAR = 0x20,
    UNICODE_C_XDIGIT = 0x40,
};

/* Category bit mask for each of the 256 byte values (defined elsewhere). */
extern const uint8_t lre_ctype_bits[256];
128 | | |
/*
 * Code point predicates (defined outside this header).
 * Each returns zero or a non-zero value; do not compare the result to 1.
 */
int lre_is_cased(uint32_t c);
int lre_is_case_ignorable(uint32_t c);
int lre_is_id_start(uint32_t c);
int lre_is_id_continue(uint32_t c);
134 | | |
135 | 144 | static inline int lre_is_space_byte(uint8_t c) { |
136 | 144 | return lre_ctype_bits[c] & UNICODE_C_SPACE; |
137 | 144 | } quickjs.c:lre_is_space_byte Line | Count | Source | 135 | 144 | static inline int lre_is_space_byte(uint8_t c) { | 136 | 144 | return lre_ctype_bits[c] & UNICODE_C_SPACE; | 137 | 144 | } |
Unexecuted instantiation: libregexp.c:lre_is_space_byte Unexecuted instantiation: libunicode.c:lre_is_space_byte |
138 | | |
139 | 145 | static inline int lre_is_id_start_byte(uint8_t c) { |
140 | 145 | return lre_ctype_bits[c] & (UNICODE_C_UPPER | UNICODE_C_LOWER | |
141 | 145 | UNICODE_C_UNDER | UNICODE_C_DOLLAR); |
142 | 145 | } quickjs.c:lre_is_id_start_byte Line | Count | Source | 139 | 145 | static inline int lre_is_id_start_byte(uint8_t c) { | 140 | 145 | return lre_ctype_bits[c] & (UNICODE_C_UPPER | UNICODE_C_LOWER | | 141 | 145 | UNICODE_C_UNDER | UNICODE_C_DOLLAR); | 142 | 145 | } |
Unexecuted instantiation: libregexp.c:lre_is_id_start_byte Unexecuted instantiation: libunicode.c:lre_is_id_start_byte |
143 | | |
144 | 3.14M | static inline int lre_is_id_continue_byte(uint8_t c) { |
145 | 3.14M | return lre_ctype_bits[c] & (UNICODE_C_UPPER | UNICODE_C_LOWER | |
146 | 3.14M | UNICODE_C_UNDER | UNICODE_C_DOLLAR | |
147 | 3.14M | UNICODE_C_DIGIT); |
148 | 3.14M | } quickjs.c:lre_is_id_continue_byte Line | Count | Source | 144 | 3.14M | static inline int lre_is_id_continue_byte(uint8_t c) { | 145 | 3.14M | return lre_ctype_bits[c] & (UNICODE_C_UPPER | UNICODE_C_LOWER | | 146 | 3.14M | UNICODE_C_UNDER | UNICODE_C_DOLLAR | | 147 | 3.14M | UNICODE_C_DIGIT); | 148 | 3.14M | } |
Unexecuted instantiation: libregexp.c:lre_is_id_continue_byte Unexecuted instantiation: libunicode.c:lre_is_id_continue_byte |
149 | | |
/* Out-of-line test used by lre_is_space() for values of 256 and above. */
int lre_is_space_non_ascii(uint32_t c);

/*
 * Return non-zero when 'c' is classified as a space.
 *
 * Values below 256 are answered from the byte table via
 * lre_is_space_byte(); everything else is delegated to
 * lre_is_space_non_ascii().
 */
static inline int lre_is_space(uint32_t c)
{
    return (c < 256) ? lre_is_space_byte(c)
                     : lre_is_space_non_ascii(c);
}
158 | | |
/*
 * Return non-zero when 'c' may start a JavaScript identifier.
 *
 * Values below 128 are answered from the byte table. For larger values:
 *  - with CONFIG_ALL_UNICODE, the answer comes from lre_is_id_start();
 *  - without it, every value that lre_is_space_non_ascii() rejects is
 *    accepted.
 */
static inline int lre_js_is_ident_first(uint32_t c)
{
    if (c < 128)
        return lre_is_id_start_byte(c);

#ifdef CONFIG_ALL_UNICODE
    return lre_is_id_start(c);
#else
    return !lre_is_space_non_ascii(c);
#endif
}
170 | | |
/*
 * Return non-zero when 'c' may continue a JavaScript identifier.
 *
 * Values below 128 are answered from the byte table. For larger values:
 *  - U+200C (ZWNJ) and U+200D (ZWJ) are always accepted;
 *  - with CONFIG_ALL_UNICODE, the answer comes from lre_is_id_continue();
 *  - without it, every value that lre_is_space_non_ascii() rejects is
 *    accepted.
 *
 * The result is a zero / non-zero flag, not necessarily 0 or 1.
 */
static inline int lre_js_is_ident_next(uint32_t c)
{
    if (c < 128)
        return lre_is_id_continue_byte(c);

    /* ZWNJ and ZWJ are accepted in identifiers */
    if (c >= 0x200C && c <= 0x200D) {
        /* Literal 1 instead of TRUE: this header does not define TRUE and
           does not include a header that does, so using the macro/enum
           made the header depend on the includer's include order. */
        return 1;
    }

#ifdef CONFIG_ALL_UNICODE
    return lre_is_id_continue(c);
#else
    return !lre_is_space_non_ascii(c);
#endif
}
185 | | |
186 | | #endif /* LIBUNICODE_H */ |