Coverage Report

Created: 2018-09-25 14:53

/src/mozilla-central/intl/icu/source/common/normlzr.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
 *************************************************************************
5
 * COPYRIGHT: 
6
 * Copyright (c) 1996-2012, International Business Machines Corporation and
7
 * others. All Rights Reserved.
8
 *************************************************************************
9
 */
10
11
#include "unicode/utypes.h"
12
13
#if !UCONFIG_NO_NORMALIZATION
14
15
#include "unicode/uniset.h"
16
#include "unicode/unistr.h"
17
#include "unicode/chariter.h"
18
#include "unicode/schriter.h"
19
#include "unicode/uchriter.h"
20
#include "unicode/normlzr.h"
21
#include "unicode/utf16.h"
22
#include "cmemory.h"
23
#include "normalizer2impl.h"
24
#include "uprops.h"  // for uniset_getUnicode32Instance()
25
26
#if defined(_ARM64_) && defined(move32)
27
 // System can define move32 intrinsics, but the char iters define move32 method
28
 // using same undef trick in headers, so undef here to re-enable the method.
29
#undef move32
30
#endif
31
32
U_NAMESPACE_BEGIN
33
34
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer)
35
36
//-------------------------------------------------------------------------
37
// Constructors and other boilerplate
38
//-------------------------------------------------------------------------
39
40
/**
 * Builds a Normalizer over the given string.
 * The string is wrapped in a newly allocated StringCharacterIterator that this
 * object deletes in its destructor. Options start at 0, both input indexes at 0
 * and the output buffer empty; init() then selects the Normalizer2 to use.
 */
Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode)
    : UObject(),
      fFilteredNorm2(NULL),
      fNorm2(NULL),
      fUMode(mode),
      fOptions(0),
      text(new StringCharacterIterator(str)),
      currentIndex(0),
      nextIndex(0),
      buffer(),
      bufferPos(0)
{
    init();
}
48
49
/**
 * Builds a Normalizer over a UTF-16 array of the given length.
 * The array is wrapped in a newly allocated UCharCharacterIterator that this
 * object deletes in its destructor. Options start at 0, both input indexes at 0
 * and the output buffer empty; init() then selects the Normalizer2 to use.
 */
Normalizer::Normalizer(ConstChar16Ptr str, int32_t length, UNormalizationMode mode)
    : UObject(),
      fFilteredNorm2(NULL),
      fNorm2(NULL),
      fUMode(mode),
      fOptions(0),
      text(new UCharCharacterIterator(str, length)),
      currentIndex(0),
      nextIndex(0),
      buffer(),
      bufferPos(0)
{
    init();
}
57
58
/**
 * Builds a Normalizer over a clone of the given character iterator.
 * The clone is deleted by this object's destructor. Options start at 0, both
 * input indexes at 0 and the output buffer empty; init() then selects the
 * Normalizer2 to use.
 */
Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode)
    : UObject(),
      fFilteredNorm2(NULL),
      fNorm2(NULL),
      fUMode(mode),
      fOptions(0),
      text(iter.clone()),
      currentIndex(0),
      nextIndex(0),
      buffer(),
      bufferPos(0)
{
    init();
}
66
67
/**
 * Copy constructor.
 * Copies mode, options, both input indexes, the output buffer and the buffer
 * position, and clones the source's text iterator. The normalizer pointers are
 * not copied; init() sets them up again from the copied mode and options.
 */
Normalizer::Normalizer(const Normalizer &copy)
    : UObject(copy),
      fFilteredNorm2(NULL),
      fNorm2(NULL),
      fUMode(copy.fUMode),
      fOptions(copy.fOptions),
      text(copy.text->clone()),
      currentIndex(copy.currentIndex),
      nextIndex(copy.nextIndex),
      buffer(copy.buffer),
      bufferPos(copy.bufferPos)
{
    init();
}
75
76
void
77
0
Normalizer::init() {
78
0
    UErrorCode errorCode=U_ZERO_ERROR;
79
0
    fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode);
80
0
    if(fOptions&UNORM_UNICODE_3_2) {
81
0
        delete fFilteredNorm2;
82
0
        fNorm2=fFilteredNorm2=
83
0
            new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode));
84
0
    }
85
0
    if(U_FAILURE(errorCode)) {
86
0
        errorCode=U_ZERO_ERROR;
87
0
        fNorm2=Normalizer2Factory::getNoopInstance(errorCode);
88
0
    }
89
0
}
90
91
/**
 * Destructor: frees the filtered normalizer (if one was created) and the
 * text iterator held by this object.
 */
Normalizer::~Normalizer() {
    delete fFilteredNorm2;  // may be NULL
    delete text;
}
96
97
/**
 * Returns a newly allocated copy of this object, made with the copy constructor.
 */
Normalizer*
Normalizer::clone() const {
    Normalizer *duplicate=new Normalizer(*this);
    return duplicate;
}
102
103
/**
104
 * Generates a hash code for this iterator.
105
 */
106
int32_t Normalizer::hashCode() const
107
0
{
108
0
    return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex;
109
0
}
110
    
111
/**
 * Equality test. Two Normalizers are equal when they are the same object, or
 * when mode, options, text iterator, output buffer, buffer position and
 * nextIndex all match. currentIndex is not part of the comparison.
 */
UBool Normalizer::operator==(const Normalizer& that) const
{
    if(this==&that) {
        return true;
    }
    // Same short-circuit order as a single && chain: stop at the first mismatch.
    if(fUMode!=that.fUMode) {
        return false;
    }
    if(fOptions!=that.fOptions) {
        return false;
    }
    if(!(*text==*that.text)) {
        return false;
    }
    if(!(buffer==that.buffer)) {
        return false;
    }
    if(bufferPos!=that.bufferPos) {
        return false;
    }
    return nextIndex==that.nextIndex;
}
122
123
//-------------------------------------------------------------------------
124
// Static utility methods
125
//-------------------------------------------------------------------------
126
127
void U_EXPORT2
128
Normalizer::normalize(const UnicodeString& source, 
129
                      UNormalizationMode mode, int32_t options,
130
                      UnicodeString& result, 
131
0
                      UErrorCode &status) {
132
0
    if(source.isBogus() || U_FAILURE(status)) {
133
0
        result.setToBogus();
134
0
        if(U_SUCCESS(status)) {
135
0
            status=U_ILLEGAL_ARGUMENT_ERROR;
136
0
        }
137
0
    } else {
138
0
        UnicodeString localDest;
139
0
        UnicodeString *dest;
140
0
141
0
        if(&source!=&result) {
142
0
            dest=&result;
143
0
        } else {
144
0
            // the source and result strings are the same object, use a temporary one
145
0
            dest=&localDest;
146
0
        }
147
0
        const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
148
0
        if(U_SUCCESS(status)) {
149
0
            if(options&UNORM_UNICODE_3_2) {
150
0
                FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
151
0
                    normalize(source, *dest, status);
152
0
            } else {
153
0
                n2->normalize(source, *dest, status);
154
0
            }
155
0
        }
156
0
        if(dest==&localDest && U_SUCCESS(status)) {
157
0
            result=*dest;
158
0
        }
159
0
    }
160
0
}
161
162
void U_EXPORT2
163
Normalizer::compose(const UnicodeString& source, 
164
                    UBool compat, int32_t options,
165
                    UnicodeString& result, 
166
0
                    UErrorCode &status) {
167
0
    normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status);
168
0
}
169
170
void U_EXPORT2
171
Normalizer::decompose(const UnicodeString& source, 
172
                      UBool compat, int32_t options,
173
                      UnicodeString& result, 
174
0
                      UErrorCode &status) {
175
0
    normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status);
176
0
}
177
178
/**
 * Runs the quick check of the Normalizer2 selected by mode on source.
 * Returns UNORM_MAYBE if the Normalizer2 instance cannot be obtained. When
 * the UNORM_UNICODE_3_2 bit is set in options, the check goes through a
 * FilteredNormalizer2 built on the set from uniset_getUnicode32Instance().
 */
UNormalizationCheckResult
Normalizer::quickCheck(const UnicodeString& source,
                       UNormalizationMode mode, int32_t options,
                       UErrorCode &status) {
    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
    if(U_FAILURE(status)) {
        return UNORM_MAYBE;
    }
    if(options&UNORM_UNICODE_3_2) {
        FilteredNormalizer2 filtered(*n2, *uniset_getUnicode32Instance(status));
        return filtered.quickCheck(source, status);
    }
    return n2->quickCheck(source, status);
}
194
195
/**
 * Asks the Normalizer2 selected by mode whether source is normalized.
 * Returns FALSE if the Normalizer2 instance cannot be obtained. When the
 * UNORM_UNICODE_3_2 bit is set in options, the test goes through a
 * FilteredNormalizer2 built on the set from uniset_getUnicode32Instance().
 */
UBool
Normalizer::isNormalized(const UnicodeString& source,
                         UNormalizationMode mode, int32_t options,
                         UErrorCode &status) {
    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
    if(U_FAILURE(status)) {
        return FALSE;
    }
    if(options&UNORM_UNICODE_3_2) {
        FilteredNormalizer2 filtered(*n2, *uniset_getUnicode32Instance(status));
        return filtered.isNormalized(source, status);
    }
    return n2->isNormalized(source, status);
}
211
212
/**
 * Copies left into result and appends right through the Normalizer2 selected
 * by mode (Normalizer2::append), returning result.
 *
 * A bogus input or an incoming failure code makes result bogus; if errorCode
 * was still a success code it becomes U_ILLEGAL_ARGUMENT_ERROR. When the
 * UNORM_UNICODE_3_2 bit is set in options, the append goes through a
 * FilteredNormalizer2 built on the set from uniset_getUnicode32Instance().
 * right and result may be the same object.
 */
UnicodeString & U_EXPORT2
Normalizer::concatenate(const UnicodeString &left, const UnicodeString &right,
                        UnicodeString &result,
                        UNormalizationMode mode, int32_t options,
                        UErrorCode &errorCode) {
    if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) {
        result.setToBogus();
        if(U_SUCCESS(errorCode)) {
            errorCode=U_ILLEGAL_ARGUMENT_ERROR;
        }
        return result;
    }

    // When right and result alias, build the output in a scratch string so
    // that right is still intact when it gets appended.
    UnicodeString scratch;
    UnicodeString &target=(&right==&result) ? scratch : result;
    target=left;

    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode);
    if(U_SUCCESS(errorCode)) {
        if(options&UNORM_UNICODE_3_2) {
            FilteredNormalizer2 filtered(*n2, *uniset_getUnicode32Instance(errorCode));
            filtered.append(target, right, errorCode);
        } else {
            n2->append(target, right, errorCode);
        }
    }

    if(&target==&scratch && U_SUCCESS(errorCode)) {
        result=scratch;
    }
    return result;
}
248
249
//-------------------------------------------------------------------------
250
// Iteration API
251
//-------------------------------------------------------------------------
252
253
/**
254
 * Return the current character in the normalized text.
255
 */
256
0
UChar32 Normalizer::current() {
257
0
    if(bufferPos<buffer.length() || nextNormalize()) {
258
0
        return buffer.char32At(bufferPos);
259
0
    } else {
260
0
        return DONE;
261
0
    }
262
0
}
263
264
/**
265
 * Return the next character in the normalized text and advance
266
 * the iteration position by one.  If the end
267
 * of the text has already been reached, {@link #DONE} is returned.
268
 */
269
0
UChar32 Normalizer::next() {
270
0
    if(bufferPos<buffer.length() ||  nextNormalize()) {
271
0
        UChar32 c=buffer.char32At(bufferPos);
272
0
        bufferPos+=U16_LENGTH(c);
273
0
        return c;
274
0
    } else {
275
0
        return DONE;
276
0
    }
277
0
}
278
279
/**
280
 * Return the previous character in the normalized text and decrement
281
 * the iteration position by one.  If the beginning
282
 * of the text has already been reached, {@link #DONE} is returned.
283
 */
284
0
UChar32 Normalizer::previous() {
285
0
    if(bufferPos>0 || previousNormalize()) {
286
0
        UChar32 c=buffer.char32At(bufferPos-1);
287
0
        bufferPos-=U16_LENGTH(c);
288
0
        return c;
289
0
    } else {
290
0
        return DONE;
291
0
    }
292
0
}
293
294
0
void Normalizer::reset() {
295
0
    currentIndex=nextIndex=text->setToStart();
296
0
    clearBuffer();
297
0
}
298
299
void
300
0
Normalizer::setIndexOnly(int32_t index) {
301
0
    text->setIndex(index);  // pins index
302
0
    currentIndex=nextIndex=text->getIndex();
303
0
    clearBuffer();
304
0
}
305
306
/**
307
 * Return the first character in the normalized text.  This resets
308
 * the <tt>Normalizer's</tt> position to the beginning of the text.
309
 */
310
0
UChar32 Normalizer::first() {
311
0
    reset();
312
0
    return next();
313
0
}
314
315
/**
316
 * Return the last character in the normalized text.  This resets
317
 * the <tt>Normalizer's</tt> position to be just before the
318
 * the input text corresponding to that normalized character.
319
 */
320
0
UChar32 Normalizer::last() {
321
0
    currentIndex=nextIndex=text->setToEnd();
322
0
    clearBuffer();
323
0
    return previous();
324
0
}
325
326
/**
327
 * Retrieve the current iteration position in the input text that is
328
 * being normalized.  This method is useful in applications such as
329
 * searching, where you need to be able to determine the position in
330
 * the input text that corresponds to a given normalized output character.
331
 * <p>
332
 * <b>Note:</b> This method sets the position in the <em>input</em>, while
333
 * {@link #next} and {@link #previous} iterate through characters in the
334
 * <em>output</em>.  This means that there is not necessarily a one-to-one
335
 * correspondence between characters returned by <tt>next</tt> and
336
 * <tt>previous</tt> and the indices passed to and returned from
337
 * <tt>setIndex</tt> and {@link #getIndex}.
338
 *
339
 */
340
0
int32_t Normalizer::getIndex() const {
341
0
    if(bufferPos<buffer.length()) {
342
0
        return currentIndex;
343
0
    } else {
344
0
        return nextIndex;
345
0
    }
346
0
}
347
348
/**
349
 * Retrieve the index of the start of the input text.  This is the begin index
350
 * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
351
 * over which this <tt>Normalizer</tt> is iterating
352
 */
353
0
int32_t Normalizer::startIndex() const {
354
0
    return text->startIndex();
355
0
}
356
357
/**
358
 * Retrieve the index of the end of the input text.  This is the end index
359
 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
360
 * over which this <tt>Normalizer</tt> is iterating
361
 */
362
0
int32_t Normalizer::endIndex() const {
363
0
    return text->endIndex();
364
0
}
365
366
//-------------------------------------------------------------------------
367
// Property access methods
368
//-------------------------------------------------------------------------
369
370
void
371
Normalizer::setMode(UNormalizationMode newMode) 
372
0
{
373
0
    fUMode = newMode;
374
0
    init();
375
0
}
376
377
/**
 * Returns the normalization mode currently stored in this object.
 */
UNormalizationMode
Normalizer::getUMode() const {
    return fUMode;
}
382
383
void
384
Normalizer::setOption(int32_t option, 
385
                      UBool value) 
386
0
{
387
0
    if (value) {
388
0
        fOptions |= option;
389
0
    } else {
390
0
        fOptions &= (~option);
391
0
    }
392
0
    init();
393
0
}
394
395
/**
 * Reports whether any of the given option bit(s) is set in fOptions.
 */
UBool
Normalizer::getOption(int32_t option) const {
    const int32_t masked=fOptions & option;
    return masked != 0;
}
400
401
/**
402
 * Set the input text over which this <tt>Normalizer</tt> will iterate.
403
 * The iteration position is set to the beginning of the input text.
404
 */
405
void
406
Normalizer::setText(const UnicodeString& newText, 
407
                    UErrorCode &status)
408
0
{
409
0
    if (U_FAILURE(status)) {
410
0
        return;
411
0
    }
412
0
    CharacterIterator *newIter = new StringCharacterIterator(newText);
413
0
    if (newIter == NULL) {
414
0
        status = U_MEMORY_ALLOCATION_ERROR;
415
0
        return;
416
0
    }
417
0
    delete text;
418
0
    text = newIter;
419
0
    reset();
420
0
}
421
422
/**
423
 * Set the input text over which this <tt>Normalizer</tt> will iterate.
424
 * The iteration position is set to the beginning of the string.
425
 */
426
void
427
Normalizer::setText(const CharacterIterator& newText, 
428
                    UErrorCode &status) 
429
0
{
430
0
    if (U_FAILURE(status)) {
431
0
        return;
432
0
    }
433
0
    CharacterIterator *newIter = newText.clone();
434
0
    if (newIter == NULL) {
435
0
        status = U_MEMORY_ALLOCATION_ERROR;
436
0
        return;
437
0
    }
438
0
    delete text;
439
0
    text = newIter;
440
0
    reset();
441
0
}
442
443
void
444
Normalizer::setText(ConstChar16Ptr newText,
445
                    int32_t length,
446
                    UErrorCode &status)
447
0
{
448
0
    if (U_FAILURE(status)) {
449
0
        return;
450
0
    }
451
0
    CharacterIterator *newIter = new UCharCharacterIterator(newText, length);
452
0
    if (newIter == NULL) {
453
0
        status = U_MEMORY_ALLOCATION_ERROR;
454
0
        return;
455
0
    }
456
0
    delete text;
457
0
    text = newIter;
458
0
    reset();
459
0
}
460
461
/**
462
 * Copies the text under iteration into the UnicodeString referred to by "result".
463
 * @param result Receives a copy of the text under iteration.
464
 */
465
void
466
Normalizer::getText(UnicodeString&  result) 
467
0
{
468
0
    text->getText(result);
469
0
}
470
471
//-------------------------------------------------------------------------
472
// Private utility methods
473
//-------------------------------------------------------------------------
474
475
0
void Normalizer::clearBuffer() {
476
0
    buffer.remove();
477
0
    bufferPos=0;
478
0
}
479
480
/**
 * Normalizes the input segment that starts at nextIndex into the buffer.
 *
 * The segment is the first code point at nextIndex plus every following code
 * point up to, but not including, the next one for which
 * fNorm2->hasBoundaryBefore() is true. Afterwards currentIndex is the old
 * nextIndex and nextIndex is the input position just past the segment.
 *
 * @return TRUE if normalization succeeded and produced a non-empty buffer;
 *         FALSE otherwise, including when there is no more input.
 */
UBool
Normalizer::nextNormalize() {
    clearBuffer();
    currentIndex=nextIndex;
    text->setIndex(nextIndex);
    if(!text->hasNext()) {
        return FALSE;
    }

    // Always take at least one code point so that iteration makes progress.
    UnicodeString piece(text->next32PostInc());
    for(;;) {
        if(!text->hasNext()) {
            break;
        }
        const UChar32 cp=text->next32PostInc();
        if(fNorm2->hasBoundaryBefore(cp)) {
            // cp starts the following segment: step back over it.
            text->move32(-1, CharacterIterator::kCurrent);
            break;
        }
        piece.append(cp);
    }
    nextIndex=text->getIndex();

    UErrorCode errorCode=U_ZERO_ERROR;
    fNorm2->normalize(piece, buffer, errorCode);
    if(U_FAILURE(errorCode)) {
        return FALSE;
    }
    return !buffer.isEmpty();
}
503
504
/**
 * Normalizes the input segment that ends at currentIndex into the buffer.
 *
 * Code points are collected backwards from currentIndex up to and including
 * the first one for which fNorm2->hasBoundaryBefore() is true (or until the
 * start of the text). Afterwards nextIndex is the old currentIndex,
 * currentIndex is the input position where the segment starts, and the buffer
 * position is at the end of the buffer.
 *
 * @return TRUE if normalization succeeded and produced a non-empty buffer;
 *         FALSE otherwise, including when there is no preceding input.
 */
UBool
Normalizer::previousNormalize() {
    clearBuffer();
    nextIndex=currentIndex;
    text->setIndex(currentIndex);
    if(!text->hasPrevious()) {
        return FALSE;
    }

    UnicodeString piece;
    UBool atBoundary=FALSE;
    while(!atBoundary && text->hasPrevious()) {
        const UChar32 cp=text->previous32();
        piece.insert(0, cp);  // prepend: we are walking backwards
        if(fNorm2->hasBoundaryBefore(cp)) {
            atBoundary=TRUE;
        }
    }
    currentIndex=text->getIndex();

    UErrorCode errorCode=U_ZERO_ERROR;
    fNorm2->normalize(piece, buffer, errorCode);
    bufferPos=buffer.length();
    if(U_FAILURE(errorCode)) {
        return FALSE;
    }
    return !buffer.isEmpty();
}
526
527
U_NAMESPACE_END
528
529
#endif /* #if !UCONFIG_NO_NORMALIZATION */