Coverage Report

Created: 2018-09-25 14:53

/src/mozilla-central/intl/unicharutil/util/nsUnicodeProperties.cpp
Line
Count
Source (jump to first uncovered line)
1
/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2
/* vim:set ts=4 sw=4 sts=4 et cindent: */
3
/* This Source Code Form is subject to the terms of the Mozilla Public
4
 * License, v. 2.0. If a copy of the MPL was not distributed with this
5
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6
7
#include "nsUnicodeProperties.h"
8
#include "nsUnicodePropertyData.cpp"
9
10
#include "mozilla/ArrayUtils.h"
11
#include "nsCharTraits.h"
12
13
88.5k
// First code point value beyond plane 0 (the BMP). Code points below this
// limit take the direct, single-plane lookup paths in this file.
#define UNICODE_BMP_LIMIT 0x10000
14
// One past the highest Unicode code point (U+10FFFF).
#define UNICODE_LIMIT     0x110000
15
16
// Returns a reference to the nsCharProps2 record for code point aCh.
//
// The lookup walks the generated tables sCharProp2Planes / sCharProp2Pages /
// sCharProp2Values. The low kCharProp2CharBits bits of aCh select the entry
// within a values row; the remaining bits of the 16-bit in-plane offset
// select the page. For aCh at or beyond (kCharProp2MaxPlane + 1) * 0x10000
// the function asserts (MOZ_ASSERT_UNREACHABLE) and returns a static default
// record.
const nsCharProps2&
17
GetCharProps2(uint32_t aCh)
18
88.5k
{
19
88.5k
    // Plane 0: its pages are row 0 of sCharProp2Pages, so no plane lookup
    // is needed.
    if (aCh < UNICODE_BMP_LIMIT) {
20
88.5k
        return sCharProp2Values[sCharProp2Pages[0][aCh >> kCharProp2CharBits]]
21
88.5k
                              [aCh & ((1 << kCharProp2CharBits) - 1)];
22
88.5k
    }
23
34
    // Planes 1..kCharProp2MaxPlane: sCharProp2Planes is indexed by
    // (plane - 1) and yields the row of sCharProp2Pages to use.
    if (aCh < (kCharProp2MaxPlane + 1) * 0x10000) {
24
34
        return sCharProp2Values[sCharProp2Pages[sCharProp2Planes[(aCh >> 16) - 1]]
25
34
                                               [(aCh & 0xffff) >> kCharProp2CharBits]]
26
34
                               [aCh & ((1 << kCharProp2CharBits) - 1)];
27
34
    }
28
0
29
0
    MOZ_ASSERT_UNREACHABLE("Getting CharProps for codepoint outside Unicode "
30
0
                           "range");
31
0
32
0
    // Default values for unassigned
33
0
    using namespace mozilla::unicode;
34
0
    // Function-local static so that the returned reference stays valid after
    // this call returns.
    static const nsCharProps2 undefined = {
35
0
        VERTICAL_ORIENTATION_R,
36
0
        0 // IdentifierType
37
0
    };
38
0
    return undefined;
39
0
}
40
41
namespace mozilla {
42
43
namespace unicode {
44
45
/*
46
To store properties for a million Unicode codepoints compactly, we use
47
a three-level array structure, with the Unicode values considered as
48
three elements: Plane, Page, and Char.
49
50
Space optimization happens because multiple Planes can refer to the same
51
Page array, and multiple Pages can refer to the same Char array holding
52
the actual values. In practice, most of the higher planes are empty and
53
thus share the same data; and within the BMP, there are also many pages
54
that repeat the same data for any given property.
55
56
Plane is usually zero, so we skip a lookup in this case, and require
57
that the Plane 0 pages are always the first set of entries in the Page
58
array.
59
60
The division of the remaining 16 bits into Page and Char fields is
61
adjusted for each property (by experiment using the generation tool)
62
to provide the most compact storage, depending on the distribution
63
of values.
64
*/
65
66
// Maps a detailed general category (used as the array index) to its coarse
// nsUGenCategory group: kOther, kLetter, kMark, kNumber, kPunctuation,
// kSymbol or kSeparator.
const nsUGenCategory sDetailedToGeneralCategory[] = {
67
  /*
68
   * The order here corresponds to the HB_UNICODE_GENERAL_CATEGORY_* constants
69
   * of the hb_unicode_general_category_t enum in gfx/harfbuzz/src/hb-unicode.h.
70
   */
71
  /* CONTROL */             nsUGenCategory::kOther,
72
  /* FORMAT */              nsUGenCategory::kOther,
73
  /* UNASSIGNED */          nsUGenCategory::kOther,
74
  /* PRIVATE_USE */         nsUGenCategory::kOther,
75
  /* SURROGATE */           nsUGenCategory::kOther,
76
  /* LOWERCASE_LETTER */    nsUGenCategory::kLetter,
77
  /* MODIFIER_LETTER */     nsUGenCategory::kLetter,
78
  /* OTHER_LETTER */        nsUGenCategory::kLetter,
79
  /* TITLECASE_LETTER */    nsUGenCategory::kLetter,
80
  /* UPPERCASE_LETTER */    nsUGenCategory::kLetter,
81
  /* SPACING_MARK */        nsUGenCategory::kMark,
82
  /* ENCLOSING_MARK */      nsUGenCategory::kMark,
83
  /* NON_SPACING_MARK */    nsUGenCategory::kMark,
84
  /* DECIMAL_NUMBER */      nsUGenCategory::kNumber,
85
  /* LETTER_NUMBER */       nsUGenCategory::kNumber,
86
  /* OTHER_NUMBER */        nsUGenCategory::kNumber,
87
  /* CONNECT_PUNCTUATION */ nsUGenCategory::kPunctuation,
88
  /* DASH_PUNCTUATION */    nsUGenCategory::kPunctuation,
89
  /* CLOSE_PUNCTUATION */   nsUGenCategory::kPunctuation,
90
  /* FINAL_PUNCTUATION */   nsUGenCategory::kPunctuation,
91
  /* INITIAL_PUNCTUATION */ nsUGenCategory::kPunctuation,
92
  /* OTHER_PUNCTUATION */   nsUGenCategory::kPunctuation,
93
  /* OPEN_PUNCTUATION */    nsUGenCategory::kPunctuation,
94
  /* CURRENCY_SYMBOL */     nsUGenCategory::kSymbol,
95
  /* MODIFIER_SYMBOL */     nsUGenCategory::kSymbol,
96
  /* MATH_SYMBOL */         nsUGenCategory::kSymbol,
97
  /* OTHER_SYMBOL */        nsUGenCategory::kSymbol,
98
  /* LINE_SEPARATOR */      nsUGenCategory::kSeparator,
99
  /* PARAGRAPH_SEPARATOR */ nsUGenCategory::kSeparator,
100
  /* SPACE_SEPARATOR */     nsUGenCategory::kSeparator
101
};
102
103
// Translation table from ICU general-category values to the corresponding
// HarfBuzz hb_unicode_general_category_t constants. The array index is the
// ICU value; the trailing comment on each entry names the ICU enumerator
// (and its numeric value) that the entry stands for. The table has
// U_CHAR_CATEGORY_COUNT entries.
const hb_unicode_general_category_t sICUtoHBcategory[U_CHAR_CATEGORY_COUNT] = {
104
  HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED, // U_GENERAL_OTHER_TYPES = 0,
105
  HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER, // U_UPPERCASE_LETTER = 1,
106
  HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER, // U_LOWERCASE_LETTER = 2,
107
  HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER, // U_TITLECASE_LETTER = 3,
108
  HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER, // U_MODIFIER_LETTER = 4,
109
  HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER, // U_OTHER_LETTER = 5,
110
  HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK, // U_NON_SPACING_MARK = 6,
111
  HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK, // U_ENCLOSING_MARK = 7,
112
  HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK, // U_COMBINING_SPACING_MARK = 8,
113
  HB_UNICODE_GENERAL_CATEGORY_DECIMAL_NUMBER, // U_DECIMAL_DIGIT_NUMBER = 9,
114
  HB_UNICODE_GENERAL_CATEGORY_LETTER_NUMBER, // U_LETTER_NUMBER = 10,
115
  HB_UNICODE_GENERAL_CATEGORY_OTHER_NUMBER, // U_OTHER_NUMBER = 11,
116
  HB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR, // U_SPACE_SEPARATOR = 12,
117
  HB_UNICODE_GENERAL_CATEGORY_LINE_SEPARATOR, // U_LINE_SEPARATOR = 13,
118
  HB_UNICODE_GENERAL_CATEGORY_PARAGRAPH_SEPARATOR, // U_PARAGRAPH_SEPARATOR = 14,
119
  HB_UNICODE_GENERAL_CATEGORY_CONTROL, // U_CONTROL_CHAR = 15,
120
  HB_UNICODE_GENERAL_CATEGORY_FORMAT, // U_FORMAT_CHAR = 16,
121
  HB_UNICODE_GENERAL_CATEGORY_PRIVATE_USE, // U_PRIVATE_USE_CHAR = 17,
122
  HB_UNICODE_GENERAL_CATEGORY_SURROGATE, // U_SURROGATE = 18,
123
  HB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION, // U_DASH_PUNCTUATION = 19,
124
  HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION, // U_START_PUNCTUATION = 20,
125
  HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION, // U_END_PUNCTUATION = 21,
126
  HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION, // U_CONNECTOR_PUNCTUATION = 22,
127
  HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION, // U_OTHER_PUNCTUATION = 23,
128
  HB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL, // U_MATH_SYMBOL = 24,
129
  HB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL, // U_CURRENCY_SYMBOL = 25,
130
  HB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL, // U_MODIFIER_SYMBOL = 26,
131
  HB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL, // U_OTHER_SYMBOL = 27,
132
  HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION, // U_INITIAL_PUNCTUATION = 28,
133
  HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION, // U_FINAL_PUNCTUATION = 29,
134
};
135
136
// Defines `uint32_t Get<prefix_>(uint32_t aCh)`, a single-plane (BMP-only)
// character mapping lookup backed by the tables s<prefix_>Pages and
// s<prefix_>Values and the constant k<prefix_>CharBits.
//
// The generated function:
//   - returns aCh unchanged when aCh >= UNICODE_BMP_LIMIT;
//   - otherwise uses the high bits of aCh (aCh >> k<prefix_>CharBits) to pick
//     a page and the low k<prefix_>CharBits bits to pick the entry;
//   - treats a stored value of 0 as "no mapping" and returns aCh in that case.
#define DEFINE_BMP_1PLANE_MAPPING_GET_FUNC(prefix_) \
137
  uint32_t Get##prefix_(uint32_t aCh) \
138
0
  { \
139
0
    if (aCh >= UNICODE_BMP_LIMIT) { \
140
0
      return aCh; \
141
0
    } \
142
0
    auto page = s##prefix_##Pages[aCh >> k##prefix_##CharBits]; \
143
0
    auto index = aCh & ((1 << k##prefix_##CharBits) - 1); \
144
0
    uint32_t v = s##prefix_##Values[page][index]; \
145
0
    return v ? v : aCh; \
146
0
  }
Unexecuted instantiation: mozilla::unicode::GetFullWidth(unsigned int)
Unexecuted instantiation: mozilla::unicode::GetFullWidthInverse(unsigned int)
147
148
// Full-width mappings only exist for BMP characters; all other code points are
149
// returned unchanged. The two macro invocations below define GetFullWidth()
// and GetFullWidthInverse() respectively.
150
DEFINE_BMP_1PLANE_MAPPING_GET_FUNC(FullWidth)
151
DEFINE_BMP_1PLANE_MAPPING_GET_FUNC(FullWidthInverse)
152
153
// Returns true if a character should be treated as extending the cluster it
// follows rather than starting a new one.
//
// aCh       - the code point being tested.
// aCategory - its general category, compared against the
//             HB_UNICODE_GENERAL_CATEGORY_* constants.
//
// A character is an extender when its category lies in the range
// SPACING_MARK..NON_SPACING_MARK, or when aCh falls in one of the explicit
// code point ranges listed below.
bool
154
IsClusterExtender(uint32_t aCh, uint8_t aCategory)
155
0
{
156
0
    return ((aCategory >= HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK &&
157
0
             aCategory <= HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK) ||
158
0
            (aCh >= 0x200c && aCh <= 0x200d) || // ZWNJ, ZWJ
159
0
            (aCh >= 0xff9e && aCh <= 0xff9f) || // katakana sound marks
160
0
            (aCh >= 0x1F3FB && aCh <= 0x1F3FF) || // Fitzpatrick skin tone modifiers
161
0
            (aCh >= 0xe0020 && aCh <= 0xe007f)); // emoji (flag) tag characters
162
0
}
163
164
// Hangul syllable types. Each enumerator is defined as the matching ICU
// U_HST_* constant, so a value of ICU's Hangul syllable type property can be
// converted to HSType directly (as GetHangulSyllableType() does).
enum HSType {
165
    HST_NONE = U_HST_NOT_APPLICABLE,
166
    HST_L    = U_HST_LEADING_JAMO,
167
    HST_V    = U_HST_VOWEL_JAMO,
168
    HST_T    = U_HST_TRAILING_JAMO,
169
    HST_LV   = U_HST_LV_SYLLABLE,
170
    HST_LVT  = U_HST_LVT_SYLLABLE
171
};
172
173
// Returns the Hangul syllable type of aCh, obtained by querying ICU's
// UCHAR_HANGUL_SYLLABLE_TYPE property and converting the result to HSType.
static HSType
174
GetHangulSyllableType(uint32_t aCh)
175
0
{
176
0
    return HSType(u_getIntPropertyValue(aCh, UCHAR_HANGUL_SYLLABLE_TYPE));
177
0
}
178
179
// Advances mPos forward past one cluster.
//
// If the iterator is already AtEnd(), a warning is issued and mPos is left
// untouched. Otherwise the function consumes, in order:
//   1. one code unit, or a high+low surrogate pair; or, when the first code
//      unit lies in 0x1100..0x11ff, 0xa960..0xa97f or 0xac00..0xd7ff, a
//      sequence of conjoining Hangul Jamo / syllables;
//   2. every following character for which IsClusterExtender() is true.
// On exit it asserts that mPos has moved past mText and not beyond mLimit.
void
180
ClusterIterator::Next()
181
0
{
182
0
    if (AtEnd()) {
183
0
        NS_WARNING("ClusterIterator has already reached the end");
184
0
        return;
185
0
    }
186
0
187
0
    // Always consume at least one code unit.
    uint32_t ch = *mPos++;
188
0
189
0
    // A high surrogate followed by a low surrogate is combined into a single
    // code point, and the low surrogate is consumed as well.
    if (NS_IS_HIGH_SURROGATE(ch) && mPos < mLimit &&
190
0
        NS_IS_LOW_SURROGATE(*mPos)) {
191
0
        ch = SURROGATE_TO_UCS4(ch, *mPos++);
192
0
    } else if ((ch & ~0xff) == 0x1100 ||
193
0
        (ch >= 0xa960 && ch <= 0xa97f) ||
194
0
        (ch >= 0xac00 && ch <= 0xd7ff)) {
195
0
        // Handle conjoining Jamo that make Hangul syllables
196
0
        // hangulState holds the syllable type of the last character accepted
        // into the cluster; it decides which types may follow.
        HSType hangulState = GetHangulSyllableType(ch);
197
0
        while (mPos < mLimit) {
198
0
            ch = *mPos;
199
0
            HSType hangulType = GetHangulSyllableType(ch);
200
0
            // Each accepting branch advances mPos and `continue`s the while
            // loop; any other path reaches the `break` after the switch and
            // ends the Hangul scan without consuming the character.
            switch (hangulType) {
201
0
            case HST_L:
202
0
            case HST_LV:
203
0
            case HST_LVT:
204
0
                // L, LV and LVT are accepted only directly after an L.
                if (hangulState == HST_L) {
205
0
                    hangulState = hangulType;
206
0
                    mPos++;
207
0
                    continue;
208
0
                }
209
0
                break;
210
0
            case HST_V:
211
0
                // V is accepted after L, V or LV (not after NONE, T or LVT).
                if ((hangulState != HST_NONE) && (hangulState != HST_T) &&
212
0
                    (hangulState != HST_LVT)) {
213
0
                    hangulState = hangulType;
214
0
                    mPos++;
215
0
                    continue;
216
0
                }
217
0
                break;
218
0
            case HST_T:
219
0
                // T is accepted after anything except NONE or L.
                if (hangulState != HST_NONE && hangulState != HST_L) {
220
0
                    hangulState = hangulType;
221
0
                    mPos++;
222
0
                    continue;
223
0
                }
224
0
                break;
225
0
            default:
226
0
                break;
227
0
            }
228
0
            break;
229
0
        }
230
0
    }
231
0
232
0
    // Absorb any trailing cluster extenders.
    while (mPos < mLimit) {
233
0
        ch = *mPos;
234
0
235
0
        // Check for surrogate pairs; note that isolated surrogates will just
236
0
        // be treated as generic (non-cluster-extending) characters here,
237
0
        // which is fine for cluster-iterating purposes
238
0
        if (NS_IS_HIGH_SURROGATE(ch) && mPos < mLimit - 1 &&
239
0
            NS_IS_LOW_SURROGATE(*(mPos + 1))) {
240
0
            ch = SURROGATE_TO_UCS4(ch, *(mPos + 1));
241
0
        }
242
0
243
0
        if (!IsClusterExtender(ch)) {
244
0
            break;
245
0
        }
246
0
247
0
        // Consume the extender: one code unit, plus a second one when it was
        // decoded from a surrogate pair (i.e. it is outside the BMP).
        mPos++;
248
0
        if (!IS_IN_BMP(ch)) {
249
0
            mPos++;
250
0
        }
251
0
    }
252
0
253
0
    NS_ASSERTION(mText < mPos && mPos <= mLimit,
254
0
                 "ClusterIterator::Next has overshot the string!");
255
0
}
256
257
// Moves mPos backwards past one cluster.
//
// For this iterator mLimit is the lower bound: the loop runs while
// mPos > mLimit. If the iterator is already AtEnd(), a warning is issued and
// mPos is left untouched. Otherwise the function steps back one character at
// a time (combining a low+high surrogate pair into one code point) and stops
// as soon as it has stepped over a character that is not a cluster extender,
// or when mPos reaches mLimit. Conjoining Hangul Jamo get no special handling
// here (see the XXX note below).
void
258
ClusterReverseIterator::Next()
259
0
{
260
0
    if (AtEnd()) {
261
0
        NS_WARNING("ClusterReverseIterator has already reached the end");
262
0
        return;
263
0
    }
264
0
265
0
    uint32_t ch;
266
0
    do {
267
0
        // Step back one code unit and read it.
        ch = *--mPos;
268
0
269
0
        // If this is the low half of a surrogate pair, step back once more
        // and decode the full code point.
        if (NS_IS_LOW_SURROGATE(ch) && mPos > mLimit &&
270
0
            NS_IS_HIGH_SURROGATE(*(mPos - 1))) {
271
0
            ch = SURROGATE_TO_UCS4(*--mPos, ch);
272
0
        }
273
0
274
0
        // A non-extender is the start of the cluster; stop here.
        if (!IsClusterExtender(ch)) {
275
0
            break;
276
0
        }
277
0
    } while (mPos > mLimit);
278
0
279
0
    // XXX May need to handle conjoining Jamo
280
0
281
0
    NS_ASSERTION(mPos >= mLimit,
282
0
                 "ClusterReverseIterator::Next has overshot the string!");
283
0
}
284
285
// Counts the clusters in a UTF-16 buffer by stepping a ClusterIterator from
// start to end.
//
// aText   - pointer to the char16_t text.
// aLength - passed unchanged to the ClusterIterator constructor (presumably
//           the number of char16_t code units in aText; confirm against the
//           ClusterIterator declaration).
//
// Returns the number of ClusterIterator::Next() steps needed to reach the
// end; 0 if the iterator is AtEnd() immediately.
uint32_t
286
CountGraphemeClusters(const char16_t* aText, uint32_t aLength)
287
0
{
288
0
  ClusterIterator iter(aText, aLength);
289
0
  uint32_t result = 0;
290
0
  while (!iter.AtEnd()) {
291
0
    ++result;
292
0
    iter.Next();
293
0
  }
294
0
  return result;
295
0
}
296
297
} // end namespace unicode
298
299
} // end namespace mozilla