Coverage Report

Created: 2018-09-25 14:53

/src/mozilla-central/gfx/thebes/nsUnicodeRange.cpp
Line
Count
Source (jump to first uncovered line)
1
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2
/* This Source Code Form is subject to the terms of the Mozilla Public
3
 * License, v. 2.0. If a copy of the MPL was not distributed with this
4
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5
6
#include "nsUnicodeRange.h"
7
8
/**********************************************************************
9
 * Unicode subranges as defined in unicode 3.0
10
 * x-western  -> latin
11
 *  0000 - 036f
12
 *  1e00 - 1eff
13
 *  2000 - 206f  (general punctuation)
14
 *  20a0 - 20cf  (currency symbols)
15
 *  2100 - 214f  (letterlike symbols)
16
 *  2150 - 218f  (Number Forms)
17
 * el         -> greek
18
 *  0370 - 03ff
19
 *  1f00 - 1fff
20
 * x-cyrillic -> cyrillic
21
 *  0400 - 04ff
22
 * he         -> hebrew
23
 *  0590 - 05ff
24
 * ar         -> arabic
25
 *  0600 - 06ff
26
 *  fb50 - fdff (arabic presentation forms)
27
 *  fe70 - feff (arabic presentation forms b)
28
 * th - thai
29
 *  0e00 - 0e7f
30
 * ko        -> korean
31
 *  ac00 - d7af  (hangul Syllables)
32
 *  1100 - 11ff    (jamo)
33
 *  3130 - 318f (hangul compatibility jamo)
34
 * ja
35
 *  3040 - 309f (hiragana)
36
 *  30a0 - 30ff (katakana)
37
 * zh-CN
38
 * zh-TW
39
 *
40
 * CJK
41
 *  3100 - 312f (bopomofo)
42
 *  31a0 - 31bf (bopomofo extended)
43
 *  3000 - 303f (CJK Symbols and Punctuation) 
44
 *  2e80 - 2eff (CJK radicals supplement)
45
 *  2f00 - 2fdf (Kangxi Radicals)
46
 *  2ff0 - 2fff (Ideographic Description Characters)
47
 *  3190 - 319f (kanbun)
48
 *  3200 - 32ff (Enclosed CJK letters and Months)
49
 *  3300 - 33ff (CJK compatibility)
50
 *  3400 - 4dbf (CJK Unified Ideographs Extension A)
51
 *  4e00 - 9faf (CJK Unified Ideographs)
52
 *  f900 - fa5f (CJK Compatibility Ideographs)
53
 *  fe30 - fe4f (CJK compatibility Forms)
54
 *  ff00 - ffef (halfwidth and fullwidth forms)
55
 *
56
 * Armenian
57
 *  0530 - 058f 
58
 * Syriac
59
 *  0700 - 074f
60
 * Thaana
61
 *  0780 - 07bf
62
 * Devanagari
63
 *  0900 - 097f
64
 * Bengali
65
 *  0980 - 09ff
66
 * Gurmukhi
67
 *  0a00 - 0a7f
68
 * Gujarati
69
 *  0a80 - 0aff
70
 * Oriya
71
 *  0b00 - 0b7f
72
 * Tamil
73
 *  0b80 - 0bff
74
 * Telugu
75
 *  0c00 - 0c7f
76
 * Kannada
77
 *  0c80 - 0cff
78
 * Malayalam
79
 *  0d00 - 0d7f
80
 * Sinhala
81
 *  0d80 - 0def
82
 * Lao
83
 *  0e80 - 0eff
84
 * Tibetan
85
 *  0f00 - 0fbf
86
 * Myanmar
87
 *  1000 - 109f
88
 * Georgian
89
 *  10a0 - 10ff
90
 * Ethiopic
91
 *  1200 - 137f
92
 * Cherokee
93
 *  13a0 - 13ff
94
 * Canadian Aboriginal Syllabics
95
 *  1400 - 167f
96
 * Ogham
97
 *  1680 - 169f
98
 * Runic 
99
 *  16a0 - 16ff
100
 * Khmer
101
 *  1780 - 17ff
102
 * Mongolian
103
 *  1800 - 18af
104
 * Misc - superscripts and subscripts
105
 *  2070 - 209f
106
 * Misc - Combining Diacritical Marks for Symbols
107
 *  20d0 - 20ff
108
 * Misc - Arrows
109
 *  2190 - 21ff
110
 * Misc - Mathematical Operators
111
 *  2200 - 22ff
112
 * Misc - Miscellaneous Technical
113
 *  2300 - 23ff
114
 * Misc - Control Pictures
115
 *  2400 - 243f
116
 * Misc - Optical character recognition
117
 *  2440 - 245f
118
 * Misc - Enclosed Alphanumerics
119
 *  2460 - 24ff
120
 * Misc - Box Drawing 
121
 *  2500 - 257f
122
 * Misc - Block Elements
123
 *  2580 - 259f
124
 * Misc - Geometric Shapes
125
 *  25a0 - 25ff
126
 * Misc - Miscellaneous Symbols
127
 *  2600 - 267f
128
 * Misc - Dingbats
129
 *  2700 - 27bf
130
 * Misc - Braille Patterns
131
 *  2800 - 28ff
132
 * Yi Syllables
133
 *  a000 - a48f
134
 * Yi radicals
135
 *  a490 - a4cf
136
 * Alphabetic Presentation Forms
137
 *  fb00 - fb4f
138
 * Misc - Combining half Marks
139
 *  fe20 - fe2f
140
 * Misc - small form variants
141
 *  fe50 - fe6f
142
 * Misc - Specials
143
 *  fff0 - ffff
144
 *********************************************************************/
145
146
147
148
// Hex-digit-indexed lookup rows consulted by FindCharUnicodeRange().
//
// Row 0 is indexed by the most significant hex digit of a BMP code point.
// An entry below kRangeTableBase is a final range value.  An entry of
// kRangeTableBase+N (and below kRangeTertiaryTable) selects row N of this
// table, which is indexed by the next hex digit of the code point.  An entry
// of kRangeTertiaryTable defers to gUnicodeTertiaryRangeTable instead.
//
// NUM_OF_SUBTABLES is the number of rows, SUBTABLE_SIZE the number of entries
// per row (one per possible hex digit).
#define NUM_OF_SUBTABLES      10
149
#define SUBTABLE_SIZE         16
150
151
static const uint8_t gUnicodeSubrangeTable[NUM_OF_SUBTABLES][SUBTABLE_SIZE] = 
152
{ 
153
  { // row 0: indexed by the first hex digit (X---)
154
    kRangeTableBase+1,  //u0xxx
155
    kRangeTableBase+2,  //u1xxx
156
    kRangeTableBase+3,  //u2xxx
157
    kRangeSetCJK,       //u3xxx
158
    kRangeSetCJK,       //u4xxx
159
    kRangeSetCJK,       //u5xxx
160
    kRangeSetCJK,       //u6xxx
161
    kRangeSetCJK,       //u7xxx
162
    kRangeSetCJK,       //u8xxx
163
    kRangeSetCJK,       //u9xxx
164
    kRangeTableBase+4,  //uaxxx
165
    kRangeKorean,       //ubxxx
166
    kRangeKorean,       //ucxxx
167
    kRangeTableBase+5,  //udxxx
168
    kRangePrivate,      //uexxx
169
    kRangeTableBase+6   //ufxxx
170
  },
171
  { // row 1: indexed by the second hex digit (0X--)
172
    kRangeSetLatin,          //u00xx
173
    kRangeSetLatin,          //u01xx
174
    kRangeSetLatin,          //u02xx
175
    kRangeGreek,             //u03xx     XXX 0300-036f is in fact kRangeCombiningDiacriticalMarks
176
    kRangeCyrillic,          //u04xx
177
    kRangeTableBase+7,       //u05xx, includes Cyrillic supplement, Hebrew, and Armenian
178
    kRangeArabic,            //u06xx
179
    kRangeTertiaryTable,     //u07xx
180
    kRangeUnassigned,        //u08xx
181
    kRangeTertiaryTable,     //u09xx
182
    kRangeTertiaryTable,     //u0axx
183
    kRangeTertiaryTable,     //u0bxx
184
    kRangeTertiaryTable,     //u0cxx
185
    kRangeTertiaryTable,     //u0dxx
186
    kRangeTertiaryTable,     //u0exx
187
    kRangeTibetan            //u0fxx
188
  },
189
  { // row 2: indexed by the second hex digit (1X--)
190
    kRangeTertiaryTable,     //u10xx
191
    kRangeKorean,            //u11xx
192
    kRangeEthiopic,          //u12xx
193
    kRangeTertiaryTable,     //u13xx
194
    kRangeCanadian,          //u14xx
195
    kRangeCanadian,          //u15xx
196
    kRangeTertiaryTable,     //u16xx
197
    kRangeKhmer,             //u17xx
198
    kRangeMongolian,         //u18xx
199
    kRangeUnassigned,        //u19xx
200
    kRangeUnassigned,        //u1axx
201
    kRangeUnassigned,        //u1bxx
202
    kRangeUnassigned,        //u1cxx
203
    kRangeUnassigned,        //u1dxx
204
    kRangeSetLatin,          //u1exx
205
    kRangeGreek              //u1fxx
206
  },
207
  { // row 3: indexed by the second hex digit (2X--)
208
    kRangeSetLatin,          //u20xx
209
    kRangeSetLatin,          //u21xx
210
    kRangeMathOperators,     //u22xx
211
    kRangeMiscTechnical,     //u23xx
212
    kRangeControlOpticalEnclose, //u24xx
213
    kRangeBoxBlockGeometrics, //u25xx
214
    kRangeMiscSymbols,       //u26xx
215
    kRangeDingbats,          //u27xx
216
    kRangeBraillePattern,    //u28xx
217
    kRangeUnassigned,        //u29xx
218
    kRangeUnassigned,        //u2axx
219
    kRangeUnassigned,        //u2bxx
220
    kRangeUnassigned,        //u2cxx
221
    kRangeUnassigned,        //u2dxx
222
    kRangeSetCJK,            //u2exx
223
    kRangeSetCJK             //u2fxx
224
  },
225
  {  // row 4: indexed by the second hex digit (aX--)
226
    kRangeYi,                //ua0xx
227
    kRangeYi,                //ua1xx
228
    kRangeYi,                //ua2xx
229
    kRangeYi,                //ua3xx
230
    kRangeYi,                //ua4xx
231
    kRangeUnassigned,        //ua5xx
232
    kRangeUnassigned,        //ua6xx
233
    kRangeUnassigned,        //ua7xx
234
    kRangeUnassigned,        //ua8xx
235
    kRangeUnassigned,        //ua9xx
236
    kRangeUnassigned,        //uaaxx
237
    kRangeUnassigned,        //uabxx
238
    kRangeKorean,            //uacxx
239
    kRangeKorean,            //uadxx
240
    kRangeKorean,            //uaexx
241
    kRangeKorean             //uafxx
242
  },
243
  {  // row 5: indexed by the second hex digit (dX--)
244
    kRangeKorean,            //ud0xx
245
    kRangeKorean,            //ud1xx
246
    kRangeKorean,            //ud2xx
247
    kRangeKorean,            //ud3xx
248
    kRangeKorean,            //ud4xx
249
    kRangeKorean,            //ud5xx
250
    kRangeKorean,            //ud6xx
251
    kRangeKorean,            //ud7xx
252
    kRangeSurrogate,         //ud8xx
253
    kRangeSurrogate,         //ud9xx
254
    kRangeSurrogate,         //udaxx
255
    kRangeSurrogate,         //udbxx
256
    kRangeSurrogate,         //udcxx
257
    kRangeSurrogate,         //uddxx
258
    kRangeSurrogate,         //udexx
259
    kRangeSurrogate          //udfxx
260
  },
261
  { // row 6: indexed by the second hex digit (fX--)
262
    kRangePrivate,           //uf0xx 
263
    kRangePrivate,           //uf1xx 
264
    kRangePrivate,           //uf2xx 
265
    kRangePrivate,           //uf3xx 
266
    kRangePrivate,           //uf4xx 
267
    kRangePrivate,           //uf5xx 
268
    kRangePrivate,           //uf6xx 
269
    kRangePrivate,           //uf7xx 
270
    kRangePrivate,           //uf8xx 
271
    kRangeSetCJK,            //uf9xx 
272
    kRangeSetCJK,            //ufaxx 
273
    kRangeArabic,            //ufbxx, includes alphabetic presentation forms
274
    kRangeArabic,            //ufcxx
275
    kRangeArabic,            //ufdxx
276
    kRangeTableBase+8,       //ufexx
277
    kRangeTableBase+9        //uffxx, halfwidth and fullwidth forms, includes Specials
278
  },
279
  { // row 7: 0x0500 - 0x05ff, indexed by the third hex digit
280
    kRangeCyrillic,          //u050x
281
    kRangeCyrillic,          //u051x
282
    kRangeCyrillic,          //u052x
283
    kRangeArmenian,          //u053x
284
    kRangeArmenian,          //u054x
285
    kRangeArmenian,          //u055x
286
    kRangeArmenian,          //u056x
287
    kRangeArmenian,          //u057x
288
    kRangeArmenian,          //u058x
289
    kRangeHebrew,            //u059x
290
    kRangeHebrew,            //u05ax
291
    kRangeHebrew,            //u05bx
292
    kRangeHebrew,            //u05cx
293
    kRangeHebrew,            //u05dx
294
    kRangeHebrew,            //u05ex
295
    kRangeHebrew             //u05fx
296
  },
297
  { // row 8: 0xfe00 - 0xfeff, indexed by the third hex digit
298
    kRangeSetCJK,            //ufe0x
299
    kRangeSetCJK,            //ufe1x
300
    kRangeSetCJK,            //ufe2x
301
    kRangeSetCJK,            //ufe3x
302
    kRangeSetCJK,            //ufe4x
303
    kRangeSetCJK,            //ufe5x
304
    kRangeSetCJK,            //ufe6x
305
    kRangeArabic,            //ufe7x
306
    kRangeArabic,            //ufe8x
307
    kRangeArabic,            //ufe9x
308
    kRangeArabic,            //ufeax
309
    kRangeArabic,            //ufebx
310
    kRangeArabic,            //ufecx
311
    kRangeArabic,            //ufedx
312
    kRangeArabic,            //ufeex
313
    kRangeArabic             //ufefx
314
  },
315
  { // row 9: 0xff00 - 0xffff, indexed by the third hex digit
316
    kRangeSetCJK,            //uff0x, fullwidth latin
317
    kRangeSetCJK,            //uff1x, fullwidth latin
318
    kRangeSetCJK,            //uff2x, fullwidth latin
319
    kRangeSetCJK,            //uff3x, fullwidth latin
320
    kRangeSetCJK,            //uff4x, fullwidth latin
321
    kRangeSetCJK,            //uff5x, fullwidth latin
322
    kRangeSetCJK,            //uff6x, halfwidth katakana
323
    kRangeSetCJK,            //uff7x, halfwidth katakana
324
    kRangeSetCJK,            //uff8x, halfwidth katakana
325
    kRangeSetCJK,            //uff9x, halfwidth katakana
326
    kRangeSetCJK,            //uffax, halfwidth hangul jamo
327
    kRangeSetCJK,            //uffbx, halfwidth hangul jamo
328
    kRangeSetCJK,            //uffcx, halfwidth hangul jamo
329
    kRangeSetCJK,            //uffdx, halfwidth hangul jamo
330
    kRangeSetCJK,            //uffex, fullwidth symbols
331
    kRangeSpecials,          //ufffx, Specials
332
  },
333
};
334
335
// Most scripts between U+0700 and U+16FF are assigned a chunk of 128 (0x80)
336
// code points, so the number of entries in the tertiary range table for
337
// that span is obtained by dividing (0x1700 - 0x0700) by 128.
338
// Exceptions: Ethiopic, Tibetan, Hangul Jamo and Canadian aboriginal
339
// syllabaries take multiple chunks, and Ogham and Runic share a single chunk.
//
// FindCharUnicodeRange() indexes this table with (ch - 0x0700) >> 7, and only
// does so when the second-level lookup yields kRangeTertiaryTable.
340
#define TERTIARY_TABLE_SIZE ((0x1700 - 0x0700) / 0x80)
341
342
static const uint8_t gUnicodeTertiaryRangeTable[TERTIARY_TABLE_SIZE] =
343
{ // table for 0x0700 - 0x16ff, one entry per 0x80 code points;
  // each comment below names the first code point row of its chunk
344
    kRangeSyriac,            //u070x
345
    kRangeThaana,            //u078x
346
    kRangeUnassigned,        //u080x  place holder (resolved in the secondary table)
347
    kRangeUnassigned,        //u088x  place holder (resolved in the secondary table)
348
    kRangeDevanagari,        //u090x
349
    kRangeBengali,           //u098x
350
    kRangeGurmukhi,          //u0a0x
351
    kRangeGujarati,          //u0a8x
352
    kRangeOriya,             //u0b0x
353
    kRangeTamil,             //u0b8x
354
    kRangeTelugu,            //u0c0x
355
    kRangeKannada,           //u0c8x
356
    kRangeMalayalam,         //u0d0x
357
    kRangeSinhala,           //u0d8x
358
    kRangeThai,              //u0e0x
359
    kRangeLao,               //u0e8x
360
    kRangeTibetan,           //u0f0x  place holder (resolved in the secondary table)
361
    kRangeTibetan,           //u0f8x  place holder (resolved in the secondary table)
362
    kRangeMyanmar,           //u100x
363
    kRangeGeorgian,          //u108x
364
    kRangeKorean,            //u110x  place holder (resolved in the secondary table)
365
    kRangeKorean,            //u118x  place holder (resolved in the secondary table)
366
    kRangeEthiopic,          //u120x  place holder (resolved in the secondary table)
367
    kRangeEthiopic,          //u128x  place holder (resolved in the secondary table)
368
    kRangeEthiopic,          //u130x
369
    kRangeCherokee,          //u138x
370
    kRangeCanadian,          //u140x  place holder (resolved in the secondary table)
371
    kRangeCanadian,          //u148x  place holder (resolved in the secondary table)
372
    kRangeCanadian,          //u150x  place holder (resolved in the secondary table)
373
    kRangeCanadian,          //u158x  place holder (resolved in the secondary table)
374
    kRangeCanadian,          //u160x
375
    kRangeOghamRunic         //u168x  this contains two scripts, Ogham & Runic
376
};
377
378
// A two level index is almost enough for locating a range, with the 
379
// exception of u03xx and u05xx. Since we don't really care about range for
380
// combining diacritical marks in our font application, they are 
381
// not discriminated further. But future adoption of this module for other use 
382
// should be aware of this limitation. The implementation can be extended if 
383
// there is such a need.
384
// For Indic, Southeast Asian scripts and some other scripts between
385
// U+0700 and U+16FF, it's extended to the third level.
386
uint32_t FindCharUnicodeRange(uint32_t ch)
387
0
{
388
0
  uint32_t range;
389
0
  
390
0
  // aggregate ranges for non-BMP codepoints
391
0
  if (ch > 0xFFFF) {
392
0
    uint32_t p = (ch >> 16);
393
0
    if (p == 1) {
394
0
        return kRangeSMP;
395
0
    } else if (p == 2) {
396
0
        return kRangeSetCJK;
397
0
    }
398
0
    return kRangeHigherPlanes;
399
0
  }
400
0
401
0
  // lookup explicit range for BMP codepoints
402
0
  // first general range
403
0
  range = gUnicodeSubrangeTable[0][ch >> 12];
404
0
  
405
0
  // if general range is good enough, return that
406
0
  if (range < kRangeTableBase)
407
0
    // we try to get a specific range 
408
0
    return range;
409
0
410
0
  // otherwise, use subrange tables
411
0
  range = gUnicodeSubrangeTable[range - kRangeTableBase][(ch & 0x0f00) >> 8];
412
0
  if (range < kRangeTableBase)
413
0
    return range;
414
0
  if (range < kRangeTertiaryTable)
415
0
    return gUnicodeSubrangeTable[range - kRangeTableBase][(ch & 0x00f0) >> 4];
416
0
417
0
  // Yet another table to look at : U+0700 - U+16FF : 128 code point blocks
418
0
  return gUnicodeTertiaryRangeTable[(ch - 0x0700) >> 7];
419
0
}