/src/mozilla-central/gfx/thebes/nsUnicodeRange.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
2 | | /* This Source Code Form is subject to the terms of the Mozilla Public |
3 | | * License, v. 2.0. If a copy of the MPL was not distributed with this |
4 | | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
5 | | |
6 | | #include "nsUnicodeRange.h" |
7 | | |
8 | | /********************************************************************** |
9 | | * Unicode subranges as defined in unicode 3.0 |
10 | | * x-western -> latin |
11 | | * 0000 - 036f |
12 | | * 1e00 - 1eff |
13 | | * 2000 - 206f (general punctuation) |
14 | | * 20a0 - 20cf (currency symbols) |
15 | | * 2100 - 214f (letterlike symbols) |
16 | | * 2150 - 218f (Number Forms) |
17 | | * el -> greek |
18 | | * 0370 - 03ff |
19 | | * 1f00 - 1fff |
20 | | * x-cyrillic -> cyrillic |
21 | | * 0400 - 04ff |
22 | | * he -> hebrew |
23 | | * 0590 - 05ff |
24 | | * ar -> arabic |
25 | | * 0600 - 06ff |
26 | | * fb50 - fdff (arabic presentation forms) |
27 | | * fe70 - feff (arabic presentation forms b) |
28 | | * th -> thai |
29 | | * 0e00 - 0e7f |
30 | | * ko -> korean |
31 | | * ac00 - d7af (hangul Syllables) |
32 | | * 1100 - 11ff (jamo) |
33 | | * 3130 - 318f (hangul compatibility jamo) |
34 | | * ja |
35 | | * 3040 - 309f (hiragana) |
36 | | * 30a0 - 30ff (katakana) |
37 | | * zh-CN |
38 | | * zh-TW |
39 | | * |
40 | | * CJK |
41 | | * 3100 - 312f (bopomofo) |
42 | | * 31a0 - 31bf (bopomofo extended) |
43 | | * 3000 - 303f (CJK Symbols and Punctuation) |
44 | | * 2e80 - 2eff (CJK radicals supplement) |
45 | | * 2f00 - 2fdf (Kangxi Radicals) |
46 | | * 2ff0 - 2fff (Ideographic Description Characters) |
47 | | * 3190 - 319f (kanbun) |
48 | | * 3200 - 32ff (Enclosed CJK letters and Months) |
49 | | * 3300 - 33ff (CJK compatibility) |
50 | | * 3400 - 4dbf (CJK Unified Ideographs Extension A) |
51 | | * 4e00 - 9faf (CJK Unified Ideographs) |
52 | | * f900 - fa5f (CJK Compatibility Ideographs) |
53 | | * fe30 - fe4f (CJK compatibility Forms) |
54 | | * ff00 - ffef (halfwidth and fullwidth forms) |
55 | | * |
56 | | * Armenian |
57 | | * 0530 - 058f |
58 | | * Syriac |
59 | | * 0700 - 074f |
60 | | * Thaana |
61 | | * 0780 - 07bf |
62 | | * Devanagari |
63 | | * 0900 - 097f |
64 | | * Bengali |
65 | | * 0980 - 09ff |
66 | | * Gurmukhi |
67 | | * 0a00 - 0a7f |
68 | | * Gujarati |
69 | | * 0a80 - 0aff |
70 | | * Oriya |
71 | | * 0b00 - 0b7f |
72 | | * Tamil |
73 | | * 0b80 - 0bff |
74 | | * Telugu |
75 | | * 0c00 - 0c7f |
76 | | * Kannada |
77 | | * 0c80 - 0cff |
78 | | * Malayalam |
79 | | * 0d00 - 0d7f |
80 | | * Sinhala |
81 | | * 0d80 - 0def |
82 | | * Lao |
83 | | * 0e80 - 0eff |
84 | | * Tibetan |
85 | | * 0f00 - 0fbf |
86 | | * Myanmar |
87 | | * 1000 - 109f |
88 | | * Georgian |
89 | | * 10a0 - 10ff |
90 | | * Ethiopic |
91 | | * 1200 - 137f |
92 | | * Cherokee |
93 | | * 13a0 - 13ff |
94 | | * Canadian Aboriginal Syllabics |
95 | | * 1400 - 167f |
96 | | * Ogham |
97 | | * 1680 - 169f |
98 | | * Runic |
99 | | * 16a0 - 16ff |
100 | | * Khmer |
101 | | * 1780 - 17ff |
102 | | * Mongolian |
103 | | * 1800 - 18af |
104 | | * Misc - superscripts and subscripts |
105 | | * 2070 - 209f |
106 | | * Misc - Combining Diacritical Marks for Symbols |
107 | | * 20d0 - 20ff |
108 | | * Misc - Arrows |
109 | | * 2190 - 21ff |
110 | | * Misc - Mathematical Operators |
111 | | * 2200 - 22ff |
112 | | * Misc - Miscellaneous Technical |
113 | | * 2300 - 23ff |
114 | | * Misc - Control picture |
115 | | * 2400 - 243f |
116 | | * Misc - Optical character recognition |
117 | | * 2440 - 2450 |
118 | | * Misc - Enclosed Alphanumerics |
119 | | * 2460 - 24ff |
120 | | * Misc - Box Drawing |
121 | | * 2500 - 257f |
122 | | * Misc - Block Elements |
123 | | * 2580 - 259f |
124 | | * Misc - Geometric Shapes |
125 | | * 25a0 - 25ff |
126 | | * Misc - Miscellaneous Symbols |
127 | | * 2600 - 267f |
128 | | * Misc - Dingbats |
129 | | * 2700 - 27bf |
130 | | * Misc - Braille Patterns |
131 | | * 2800 - 28ff |
132 | | * Yi Syllables |
133 | | * a000 - a48f |
134 | | * Yi radicals |
135 | | * a490 - a4cf |
136 | | * Alphabetic Presentation Forms |
137 | | * fb00 - fb4f |
138 | | * Misc - Combining half Marks |
139 | | * fe20 - fe2f |
140 | | * Misc - small form variants |
141 | | * fe50 - fe6f |
142 | | * Misc - Specials |
143 | | * fff0 - ffff |
144 | | *********************************************************************/ |
145 | | |
146 | | |
147 | | |
148 | | #define NUM_OF_SUBTABLES 10 |
149 | | #define SUBTABLE_SIZE 16 |
150 | | |
151 | | static const uint8_t gUnicodeSubrangeTable[NUM_OF_SUBTABLES][SUBTABLE_SIZE] = |
152 | | { |
153 | | { // [0] table for X---: indexed by the top hex digit; a kRangeTableBase+k entry selects sub-table [k] |
154 | | kRangeTableBase+1, //u0xxx |
155 | | kRangeTableBase+2, //u1xxx |
156 | | kRangeTableBase+3, //u2xxx |
157 | | kRangeSetCJK, //u3xxx |
158 | | kRangeSetCJK, //u4xxx |
159 | | kRangeSetCJK, //u5xxx |
160 | | kRangeSetCJK, //u6xxx |
161 | | kRangeSetCJK, //u7xxx |
162 | | kRangeSetCJK, //u8xxx |
163 | | kRangeSetCJK, //u9xxx |
164 | | kRangeTableBase+4, //uaxxx |
165 | | kRangeKorean, //ubxxx |
166 | | kRangeKorean, //ucxxx |
167 | | kRangeTableBase+5, //udxxx |
168 | | kRangePrivate, //uexxx |
169 | | kRangeTableBase+6 //ufxxx |
170 | | }, |
171 | | { // [1] table for 0X--: indexed by the second hex digit |
172 | | kRangeSetLatin, //u00xx |
173 | | kRangeSetLatin, //u01xx |
174 | | kRangeSetLatin, //u02xx |
175 | | kRangeGreek, //u03xx XXX 0300-036f is in fact kRangeCombiningDiacriticalMarks |
176 | | kRangeCyrillic, //u04xx |
177 | | kRangeTableBase+7, //u05xx, includes Cyrillic supplement, Hebrew, and Armenian; split further in sub-table [7] |
178 | | kRangeArabic, //u06xx |
179 | | kRangeTertiaryTable, //u07xx, resolved through gUnicodeTertiaryRangeTable |
180 | | kRangeUnassigned, //u08xx |
181 | | kRangeTertiaryTable, //u09xx |
182 | | kRangeTertiaryTable, //u0axx |
183 | | kRangeTertiaryTable, //u0bxx |
184 | | kRangeTertiaryTable, //u0cxx |
185 | | kRangeTertiaryTable, //u0dxx |
186 | | kRangeTertiaryTable, //u0exx |
187 | | kRangeTibetan //u0fxx |
188 | | }, |
189 | | { // [2] table for 1X--: indexed by the second hex digit |
190 | | kRangeTertiaryTable, //u10xx |
191 | | kRangeKorean, //u11xx |
192 | | kRangeEthiopic, //u12xx |
193 | | kRangeTertiaryTable, //u13xx |
194 | | kRangeCanadian, //u14xx |
195 | | kRangeCanadian, //u15xx |
196 | | kRangeTertiaryTable, //u16xx |
197 | | kRangeKhmer, //u17xx |
198 | | kRangeMongolian, //u18xx |
199 | | kRangeUnassigned, //u19xx |
200 | | kRangeUnassigned, //u1axx |
201 | | kRangeUnassigned, //u1bxx |
202 | | kRangeUnassigned, //u1cxx |
203 | | kRangeUnassigned, //u1dxx |
204 | | kRangeSetLatin, //u1exx |
205 | | kRangeGreek //u1fxx |
206 | | }, |
207 | | { // [3] table for 2X--: indexed by the second hex digit |
208 | | kRangeSetLatin, //u20xx |
209 | | kRangeSetLatin, //u21xx |
210 | | kRangeMathOperators, //u22xx |
211 | | kRangeMiscTechnical, //u23xx |
212 | | kRangeControlOpticalEnclose, //u24xx |
213 | | kRangeBoxBlockGeometrics, //u25xx |
214 | | kRangeMiscSymbols, //u26xx |
215 | | kRangeDingbats, //u27xx |
216 | | kRangeBraillePattern, //u28xx |
217 | | kRangeUnassigned, //u29xx |
218 | | kRangeUnassigned, //u2axx |
219 | | kRangeUnassigned, //u2bxx |
220 | | kRangeUnassigned, //u2cxx |
221 | | kRangeUnassigned, //u2dxx |
222 | | kRangeSetCJK, //u2exx |
223 | | kRangeSetCJK //u2fxx |
224 | | }, |
225 | | { // [4] table for aX--: indexed by the second hex digit |
226 | | kRangeYi, //ua0xx |
227 | | kRangeYi, //ua1xx |
228 | | kRangeYi, //ua2xx |
229 | | kRangeYi, //ua3xx |
230 | | kRangeYi, //ua4xx |
231 | | kRangeUnassigned, //ua5xx |
232 | | kRangeUnassigned, //ua6xx |
233 | | kRangeUnassigned, //ua7xx |
234 | | kRangeUnassigned, //ua8xx |
235 | | kRangeUnassigned, //ua9xx |
236 | | kRangeUnassigned, //uaaxx |
237 | | kRangeUnassigned, //uabxx |
238 | | kRangeKorean, //uacxx |
239 | | kRangeKorean, //uadxx |
240 | | kRangeKorean, //uaexx |
241 | | kRangeKorean //uafxx |
242 | | }, |
243 | | { // [5] table for dX--: indexed by the second hex digit |
244 | | kRangeKorean, //ud0xx |
245 | | kRangeKorean, //ud1xx |
246 | | kRangeKorean, //ud2xx |
247 | | kRangeKorean, //ud3xx |
248 | | kRangeKorean, //ud4xx |
249 | | kRangeKorean, //ud5xx |
250 | | kRangeKorean, //ud6xx |
251 | | kRangeKorean, //ud7xx |
252 | | kRangeSurrogate, //ud8xx |
253 | | kRangeSurrogate, //ud9xx |
254 | | kRangeSurrogate, //udaxx |
255 | | kRangeSurrogate, //udbxx |
256 | | kRangeSurrogate, //udcxx |
257 | | kRangeSurrogate, //uddxx |
258 | | kRangeSurrogate, //udexx |
259 | | kRangeSurrogate //udfxx |
260 | | }, |
261 | | { // [6] table for fX--: indexed by the second hex digit |
262 | | kRangePrivate, //uf0xx |
263 | | kRangePrivate, //uf1xx |
264 | | kRangePrivate, //uf2xx |
265 | | kRangePrivate, //uf3xx |
266 | | kRangePrivate, //uf4xx |
267 | | kRangePrivate, //uf5xx |
268 | | kRangePrivate, //uf6xx |
269 | | kRangePrivate, //uf7xx |
270 | | kRangePrivate, //uf8xx |
271 | | kRangeSetCJK, //uf9xx |
272 | | kRangeSetCJK, //ufaxx |
273 | | kRangeArabic, //ufbxx, includes Alphabetic Presentation Forms (fb00 - fb4f) |
274 | | kRangeArabic, //ufcxx |
275 | | kRangeArabic, //ufdxx |
276 | | kRangeTableBase+8, //ufexx, split further in sub-table [8] |
277 | | kRangeTableBase+9 //uffxx, halfwidth and fullwidth forms, includes Specials; split further in sub-table [9] |
278 | | }, |
279 | | { // [7] table for 0x0500 - 0x05ff: indexed by the third hex digit |
280 | | kRangeCyrillic, //u050x |
281 | | kRangeCyrillic, //u051x |
282 | | kRangeCyrillic, //u052x |
283 | | kRangeArmenian, //u053x |
284 | | kRangeArmenian, //u054x |
285 | | kRangeArmenian, //u055x |
286 | | kRangeArmenian, //u056x |
287 | | kRangeArmenian, //u057x |
288 | | kRangeArmenian, //u058x |
289 | | kRangeHebrew, //u059x |
290 | | kRangeHebrew, //u05ax |
291 | | kRangeHebrew, //u05bx |
292 | | kRangeHebrew, //u05cx |
293 | | kRangeHebrew, //u05dx |
294 | | kRangeHebrew, //u05ex |
295 | | kRangeHebrew //u05fx |
296 | | }, |
297 | | { // [8] table for 0xfe00 - 0xfeff: indexed by the third hex digit |
298 | | kRangeSetCJK, //ufe0x |
299 | | kRangeSetCJK, //ufe1x |
300 | | kRangeSetCJK, //ufe2x |
301 | | kRangeSetCJK, //ufe3x |
302 | | kRangeSetCJK, //ufe4x |
303 | | kRangeSetCJK, //ufe5x |
304 | | kRangeSetCJK, //ufe6x |
305 | | kRangeArabic, //ufe7x |
306 | | kRangeArabic, //ufe8x |
307 | | kRangeArabic, //ufe9x |
308 | | kRangeArabic, //ufeax |
309 | | kRangeArabic, //ufebx |
310 | | kRangeArabic, //ufecx |
311 | | kRangeArabic, //ufedx |
312 | | kRangeArabic, //ufeex |
313 | | kRangeArabic //ufefx |
314 | | }, |
315 | | { // [9] table for 0xff00 - 0xffff: indexed by the third hex digit |
316 | | kRangeSetCJK, //uff0x, fullwidth latin |
317 | | kRangeSetCJK, //uff1x, fullwidth latin |
318 | | kRangeSetCJK, //uff2x, fullwidth latin |
319 | | kRangeSetCJK, //uff3x, fullwidth latin |
320 | | kRangeSetCJK, //uff4x, fullwidth latin |
321 | | kRangeSetCJK, //uff5x, fullwidth latin |
322 | | kRangeSetCJK, //uff6x, halfwidth katakana |
323 | | kRangeSetCJK, //uff7x, halfwidth katakana |
324 | | kRangeSetCJK, //uff8x, halfwidth katakana |
325 | | kRangeSetCJK, //uff9x, halfwidth katakana |
326 | | kRangeSetCJK, //uffax, halfwidth hangul jamo |
327 | | kRangeSetCJK, //uffbx, halfwidth hangul jamo |
328 | | kRangeSetCJK, //uffcx, halfwidth hangul jamo |
329 | | kRangeSetCJK, //uffdx, halfwidth hangul jamo |
330 | | kRangeSetCJK, //uffex, fullwidth symbols |
331 | | kRangeSpecials, //ufffx, Specials |
332 | | }, |
333 | | }; |
334 | | |
335 | | // Third-level table for U+0700 - U+16FF, indexed by (ch - 0x0700) >> 7. |
336 | | // Most scripts in that span are assigned a chunk of 128 (0x80) code points, |
337 | | // so the number of entries is (0x1700 - 0x0700) divided by 128. |
338 | | // Exceptions: Ethiopic, Tibetan, Hangul Jamo and Canadian Aboriginal |
339 | | // Syllabics take multiple chunks, and Ogham and Runic share a single chunk. |
340 | | #define TERTIARY_TABLE_SIZE ((0x1700 - 0x0700) / 0x80) |
341 | | |
342 | | static const uint8_t gUnicodeTertiaryRangeTable[TERTIARY_TABLE_SIZE] = |
343 | | { //table for 0x0700 - 0x16ff, one entry per 0x80 code points; labels give the start of each chunk |
344 | | kRangeSyriac, //u070x |
345 | | kRangeThaana, //u078x |
346 | | kRangeUnassigned, //u080x placeholder (resolved in the secondary table) |
347 | | kRangeUnassigned, //u088x placeholder (resolved in the secondary table) |
348 | | kRangeDevanagari, //u090x |
349 | | kRangeBengali, //u098x |
350 | | kRangeGurmukhi, //u0a0x |
351 | | kRangeGujarati, //u0a8x |
352 | | kRangeOriya, //u0b0x |
353 | | kRangeTamil, //u0b8x |
354 | | kRangeTelugu, //u0c0x |
355 | | kRangeKannada, //u0c8x |
356 | | kRangeMalayalam, //u0d0x |
357 | | kRangeSinhala, //u0d8x |
358 | | kRangeThai, //u0e0x |
359 | | kRangeLao, //u0e8x |
360 | | kRangeTibetan, //u0f0x placeholder (resolved in the secondary table) |
361 | | kRangeTibetan, //u0f8x placeholder (resolved in the secondary table) |
362 | | kRangeMyanmar, //u100x |
363 | | kRangeGeorgian, //u108x |
364 | | kRangeKorean, //u110x placeholder (resolved in the secondary table) |
365 | | kRangeKorean, //u118x placeholder (resolved in the secondary table) |
366 | | kRangeEthiopic, //u120x placeholder (resolved in the secondary table) |
367 | | kRangeEthiopic, //u128x placeholder (resolved in the secondary table) |
368 | | kRangeEthiopic, //u130x |
369 | | kRangeCherokee, //u138x |
370 | | kRangeCanadian, //u140x placeholder (resolved in the secondary table) |
371 | | kRangeCanadian, //u148x placeholder (resolved in the secondary table) |
372 | | kRangeCanadian, //u150x placeholder (resolved in the secondary table) |
373 | | kRangeCanadian, //u158x placeholder (resolved in the secondary table) |
374 | | kRangeCanadian, //u160x |
375 | | kRangeOghamRunic //u168x this contains two scripts, Ogham & Runic |
376 | | }; |
377 | | |
378 | | // Maps the code point ch to a kRange* value using nested table lookups. |
379 | | // Code points above U+FFFF are classified by plane only: plane 1 gives |
380 | | // kRangeSMP, plane 2 gives kRangeSetCJK, anything else kRangeHigherPlanes. |
381 | | // BMP code points are looked up by their top hex digit, then, if needed, by |
382 | | // the second digit, and for u05xx, ufexx and uffxx by the third digit. |
383 | | // Entries marked kRangeTertiaryTable (scripts between U+0700 and U+16FF) |
384 | | // are resolved through gUnicodeTertiaryRangeTable in 128 code point blocks. |
385 | | // Limitation: u03xx is not split, so 0300-036f is reported as kRangeGreek. |
386 | | uint32_t FindCharUnicodeRange(uint32_t ch) |
387 | 0 | { |
388 | 0 | uint32_t range; |
389 | 0 | |
390 | 0 | // non-BMP code points: classify by plane number (ch >> 16) only |
391 | 0 | if (ch > 0xFFFF) { |
392 | 0 | uint32_t p = (ch >> 16); |
393 | 0 | if (p == 1) { |
394 | 0 | return kRangeSMP; |
395 | 0 | } else if (p == 2) { |
396 | 0 | return kRangeSetCJK; |
397 | 0 | } |
398 | 0 | return kRangeHigherPlanes; |
399 | 0 | } |
400 | 0 | |
401 | 0 | // BMP code points: explicit table lookup |
402 | 0 | // first level: sub-table 0, indexed by the top hex digit |
403 | 0 | range = gUnicodeSubrangeTable[0][ch >> 12]; |
404 | 0 | |
405 | 0 | // values below kRangeTableBase are final ranges, not sub-table references |
406 | 0 | if (range < kRangeTableBase) |
407 | 0 | // the top hex digit alone determined the range |
408 | 0 | return range; |
409 | 0 | |
410 | 0 | // second level: sub-table (range - kRangeTableBase), indexed by the second hex digit |
411 | 0 | range = gUnicodeSubrangeTable[range - kRangeTableBase][(ch & 0x0f00) >> 8]; |
412 | 0 | if (range < kRangeTableBase) |
413 | 0 | return range; |
414 | 0 | if (range < kRangeTertiaryTable) |
415 | 0 | return gUnicodeSubrangeTable[range - kRangeTableBase][(ch & 0x00f0) >> 4]; |
416 | 0 | |
417 | 0 | // range is kRangeTertiaryTable or above: U+0700 - U+16FF in 128 code point blocks |
418 | 0 | return gUnicodeTertiaryRangeTable[(ch - 0x0700) >> 7]; |
419 | 0 | } |