/src/mozilla-central/intl/icu/source/common/unorm.cpp

Source (jump to first uncovered line)
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
* Copyright (c) 1996-2014, International Business Machines
* Corporation and others. All Rights Reserved.
******************************************************************************
* File unorm.cpp
*
* Created by: Vladimir Weinstein 12052000
*
* Modification history :
*
* Date        Name        Description
* 02/01/01    synwee      Added normalization quickcheck enum and method.
* 02/12/01    synwee      Commented out quickcheck util api has been approved
*                         Added private method for doing FCD checks
* 02/23/01    synwee      Modified quickcheck and checkFCE to run through 
*                         string for codepoints < 0x300 for the normalization 
*                         mode NFC.
* 05/25/01+   Markus Scherer total rewrite, implement all normalization here
*                         instead of just wrappers around normlzr.cpp,
*                         load unorm.dat, support Unicode 3.1 with
*                         supplementary code points, etc.
* 2009-nov..2010-jan  Markus Scherer  total rewrite, new Normalizer2 API & code
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_NORMALIZATION

#include "unicode/udata.h"
#include "unicode/ustring.h"
#include "unicode/uiter.h"
#include "unicode/unorm.h"
#include "unicode/unorm2.h"
#include "normalizer2impl.h"
#include "unormimp.h"
#include "uprops.h"
#include "ustr_imp.h"

U_NAMESPACE_USE

/* quick check functions ---------------------------------------------------- */

/**
 * Quick check without options.
 * Fetches the Normalizer2 that Normalizer2Factory::getInstance() provides for
 * the requested mode and forwards the call to unorm2_quickCheck().
 *
 * @param src        text to check
 * @param srcLength  length of src, passed through unchanged
 * @param mode       normalization mode used to select the Normalizer2
 * @param pErrorCode in/out ICU error code; dereferenced here, must not be NULL
 * @return whatever unorm2_quickCheck() returns
 */
U_CAPI UNormalizationCheckResult U_EXPORT2
unorm_quickCheck(const UChar *src,
                 int32_t srcLength,
                 UNormalizationMode mode,
                 UErrorCode *pErrorCode) {
    const Normalizer2 *normalizer=Normalizer2Factory::getInstance(mode, *pErrorCode);
    const UNormalizer2 *cNormalizer=reinterpret_cast<const UNormalizer2 *>(normalizer);
    return unorm2_quickCheck(cNormalizer, src, srcLength, pErrorCode);
}

/**
 * Quick check with options.
 * When options contains UNORM_UNICODE_3_2, the mode's Normalizer2 is wrapped
 * in a FilteredNormalizer2 restricted to the set returned by
 * uniset_getUnicode32Instance(); otherwise the mode's Normalizer2 is used
 * directly. The actual check is done by unorm2_quickCheck().
 *
 * @param src        text to check
 * @param srcLength  length of src, passed through unchanged
 * @param mode       normalization mode used to select the Normalizer2
 * @param options    bit set; only UNORM_UNICODE_3_2 is examined here
 * @param pErrorCode in/out ICU error code; dereferenced here, must not be NULL
 * @return the result of unorm2_quickCheck(), or UNORM_NO if the normalizer or
 *         the Unicode 3.2 set could not be obtained
 */
U_CAPI UNormalizationCheckResult U_EXPORT2
unorm_quickCheckWithOptions(const UChar *src, int32_t srcLength,
                            UNormalizationMode mode, int32_t options,
                            UErrorCode *pErrorCode) {
    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
    if(options&UNORM_UNICODE_3_2) {
        const UnicodeSet *uni32=uniset_getUnicode32Instance(*pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            /*
             * Same guard as in unorm_iterate()/unorm_concatenate():
             * after a failure n2 and/or uni32 may be NULL and must not be
             * dereferenced to build the filtered normalizer.
             */
            return UNORM_NO;
        }
        FilteredNormalizer2 fn2(*n2, *uni32);
        /* Convert to the Normalizer2 base first so the opaque C handle points at the base subobject. */
        return unorm2_quickCheck(
            reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)),
            src, srcLength, pErrorCode);
    }
    return unorm2_quickCheck(reinterpret_cast<const UNormalizer2 *>(n2),
                             src, srcLength, pErrorCode);
}

/**
 * Normalization test without options.
 * Fetches the Normalizer2 that Normalizer2Factory::getInstance() provides for
 * the requested mode and forwards the call to unorm2_isNormalized().
 *
 * @param src        text to test
 * @param srcLength  length of src, passed through unchanged
 * @param mode       normalization mode used to select the Normalizer2
 * @param pErrorCode in/out ICU error code; dereferenced here, must not be NULL
 * @return whatever unorm2_isNormalized() returns
 */
U_CAPI UBool U_EXPORT2
unorm_isNormalized(const UChar *src, int32_t srcLength,
                   UNormalizationMode mode,
                   UErrorCode *pErrorCode) {
    const Normalizer2 *normalizer=Normalizer2Factory::getInstance(mode, *pErrorCode);
    const UNormalizer2 *cNormalizer=reinterpret_cast<const UNormalizer2 *>(normalizer);
    return unorm2_isNormalized(cNormalizer, src, srcLength, pErrorCode);
}

/**
 * Normalization test with options.
 * When options contains UNORM_UNICODE_3_2, the mode's Normalizer2 is wrapped
 * in a FilteredNormalizer2 restricted to the set returned by
 * uniset_getUnicode32Instance(); otherwise the mode's Normalizer2 is used
 * directly. The actual test is done by unorm2_isNormalized().
 *
 * @param src        text to test
 * @param srcLength  length of src, passed through unchanged
 * @param mode       normalization mode used to select the Normalizer2
 * @param options    bit set; only UNORM_UNICODE_3_2 is examined here
 * @param pErrorCode in/out ICU error code; dereferenced here, must not be NULL
 * @return the result of unorm2_isNormalized(), or FALSE if the normalizer or
 *         the Unicode 3.2 set could not be obtained
 */
U_CAPI UBool U_EXPORT2
unorm_isNormalizedWithOptions(const UChar *src, int32_t srcLength,
                              UNormalizationMode mode, int32_t options,
                              UErrorCode *pErrorCode) {
    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
    if(options&UNORM_UNICODE_3_2) {
        const UnicodeSet *uni32=uniset_getUnicode32Instance(*pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            /*
             * Same guard as in unorm_iterate()/unorm_concatenate():
             * after a failure n2 and/or uni32 may be NULL and must not be
             * dereferenced to build the filtered normalizer.
             */
            return FALSE;
        }
        FilteredNormalizer2 fn2(*n2, *uni32);
        /* Convert to the Normalizer2 base first so the opaque C handle points at the base subobject. */
        return unorm2_isNormalized(
            reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)),
            src, srcLength, pErrorCode);
    }
    return unorm2_isNormalized(reinterpret_cast<const UNormalizer2 *>(n2),
                               src, srcLength, pErrorCode);
}

/* normalize() API ---------------------------------------------------------- */

/**
 * Public API for normalizing.
 * When options contains UNORM_UNICODE_3_2, the mode's Normalizer2 is wrapped
 * in a FilteredNormalizer2 restricted to the set returned by
 * uniset_getUnicode32Instance(); otherwise the mode's Normalizer2 is used
 * directly. The actual work is done by unorm2_normalize().
 *
 * @param src          text to normalize
 * @param srcLength    length of src, passed through unchanged
 * @param mode         normalization mode used to select the Normalizer2
 * @param options      bit set; only UNORM_UNICODE_3_2 is examined here
 * @param dest         output buffer, passed through unchanged
 * @param destCapacity capacity of dest, passed through unchanged
 * @param pErrorCode   in/out ICU error code; dereferenced here, must not be NULL
 * @return the result of unorm2_normalize(), or 0 if the normalizer or the
 *         Unicode 3.2 set could not be obtained
 */
U_CAPI int32_t U_EXPORT2
unorm_normalize(const UChar *src, int32_t srcLength,
                UNormalizationMode mode, int32_t options,
                UChar *dest, int32_t destCapacity,
                UErrorCode *pErrorCode) {
    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
    if(options&UNORM_UNICODE_3_2) {
        const UnicodeSet *uni32=uniset_getUnicode32Instance(*pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            /*
             * Same guard as in unorm_iterate()/unorm_concatenate():
             * after a failure n2 and/or uni32 may be NULL and must not be
             * dereferenced to build the filtered normalizer.
             */
            return 0;
        }
        FilteredNormalizer2 fn2(*n2, *uni32);
        /* Convert to the Normalizer2 base first so the opaque C handle points at the base subobject. */
        return unorm2_normalize(
            reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)),
            src, srcLength, dest, destCapacity, pErrorCode);
    }
    return unorm2_normalize(reinterpret_cast<const UNormalizer2 *>(n2),
                            src, srcLength, dest, destCapacity, pErrorCode);
}


/* iteration functions ------------------------------------------------------ */

/**
 * Shared worker for the iterator-based API.
 * Pulls one run of code points out of src (moving forward or backward),
 * stopping where n2 reports a boundary, then writes that run to dest either
 * normalized through n2 or copied verbatim.
 *
 * @param src                iterator to read from; must not be NULL
 * @param forward            TRUE to read with next32, FALSE to read with previous32
 * @param dest               output buffer; may be NULL only if destCapacity is 0
 * @param destCapacity       capacity of dest; must not be negative
 * @param n2                 normalizer used for boundary tests and normalization
 * @param doNormalize        if FALSE the collected run is copied without normalizing
 * @param pNeededToNormalize optional out flag; reset to FALSE, then set to
 *                           whether the normalized text differs from the run
 * @param pErrorCode         in/out ICU error code
 * @return length of the text produced for dest, or 0 on early exit
 */
static int32_t
_iterate(UCharIterator *src, UBool forward,
              UChar *dest, int32_t destCapacity,
              const Normalizer2 *n2,
              UBool doNormalize, UBool *pNeededToNormalize,
              UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }

    /* argument validation */
    const UBool badDest= destCapacity<0 || (dest==NULL && destCapacity>0);
    if(badDest || src==NULL) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    if(pNeededToNormalize!=NULL) {
        *pNeededToNormalize=FALSE;
    }

    /* nothing left in the requested direction: produce an empty result */
    const UBool hasMore= forward ? src->hasNext(src) : src->hasPrevious(src);
    if(!hasMore) {
        return u_terminateUChars(dest, destCapacity, 0, pErrorCode);
    }

    UnicodeString segment;
    if(forward) {
        /* the first code point is taken unconditionally, without a boundary test */
        segment.append(uiter_next32(src));
        /* keep taking code points until one has a boundary before it */
        for(;;) {
            const UChar32 cp=uiter_next32(src);
            if(cp<0) {
                break;
            }
            if(n2->hasBoundaryBefore(cp)) {
                /* un-read this code point so the iterator rests on the boundary */
                src->move(src, -U16_LENGTH(cp), UITER_CURRENT);
                break;
            }
            segment.append(cp);
        }
    } else {
        for(;;) {
            const UChar32 cp=uiter_previous32(src);
            if(cp<0) {
                break;
            }
            /* going backward, each code point is prepended to the run */
            segment.insert(0, cp);
            /* the code point just stored starts the run if a boundary precedes it */
            if(n2->hasBoundaryBefore(cp)) {
                break;
            }
        }
    }

    UnicodeString destString(dest, 0, destCapacity);
    if(segment.length()==0 || !doNormalize) {
        /* no normalization requested (or nothing collected): plain copy */
        return segment.extract(dest, destCapacity, *pErrorCode);
    }

    n2->normalize(segment, destString, *pErrorCode).extract(dest, destCapacity, *pErrorCode);
    if(pNeededToNormalize!=NULL && U_SUCCESS(*pErrorCode)) {
        *pNeededToNormalize= destString!=segment;
    }
    return destString.length();
}

/**
 * Picks the normalizer for mode/options and runs _iterate() with it.
 * With UNORM_UNICODE_3_2 set in options, the mode's Normalizer2 is wrapped in
 * a FilteredNormalizer2 over the set from uniset_getUnicode32Instance();
 * returns 0 if *pErrorCode indicates failure after fetching that set.
 */
static int32_t
unorm_iterate(UCharIterator *src, UBool forward,
              UChar *dest, int32_t destCapacity,
              UNormalizationMode mode, int32_t options,
              UBool doNormalize, UBool *pNeededToNormalize,
              UErrorCode *pErrorCode) {
    const Normalizer2 *baseNorm=Normalizer2Factory::getInstance(mode, *pErrorCode);
    if((options&UNORM_UNICODE_3_2)==0) {
        /* unfiltered: use the mode's normalizer as-is */
        return _iterate(src, forward, dest, destCapacity,
                baseNorm, doNormalize, pNeededToNormalize, pErrorCode);
    }

    const UnicodeSet *unicode32Set=uniset_getUnicode32Instance(*pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }
    FilteredNormalizer2 filteredNorm(*baseNorm, *unicode32Set);
    return _iterate(src, forward, dest, destCapacity,
            &filteredNorm, doNormalize, pNeededToNormalize, pErrorCode);
}

/**
 * Backward iteration entry point: calls unorm_iterate() with forward=FALSE
 * and passes every other argument through unchanged.
 */
U_CAPI int32_t U_EXPORT2
unorm_previous(UCharIterator *src,
               UChar *dest, int32_t destCapacity,
               UNormalizationMode mode, int32_t options,
               UBool doNormalize, UBool *pNeededToNormalize,
               UErrorCode *pErrorCode) {
    const UBool forward=FALSE;
    return unorm_iterate(src, forward, dest, destCapacity, mode, options,
                         doNormalize, pNeededToNormalize, pErrorCode);
}

/**
 * Forward iteration entry point: calls unorm_iterate() with forward=TRUE
 * and passes every other argument through unchanged.
 */
U_CAPI int32_t U_EXPORT2
unorm_next(UCharIterator *src,
           UChar *dest, int32_t destCapacity,
           UNormalizationMode mode, int32_t options,
           UBool doNormalize, UBool *pNeededToNormalize,
           UErrorCode *pErrorCode) {
    const UBool forward=TRUE;
    return unorm_iterate(src, forward, dest, destCapacity, mode, options,
                         doNormalize, pNeededToNormalize, pErrorCode);
}

/* Concatenation of normalized strings -------------------------------------- */

/**
 * Shared worker for unorm_concatenate().
 * Validates the arguments, rejects a right operand that overlaps dest, loads
 * left into a string backed by dest, and lets n2 append right to it before
 * extracting the result into dest.
 *
 * @param left         left operand; must not be NULL; may be the same pointer as dest
 * @param leftLength   length of left, or -1
 * @param right        right operand; must not be NULL and must not overlap dest
 * @param rightLength  length of right, or -1 (then treated as terminated)
 * @param dest         output buffer; may be NULL only if destCapacity is 0
 * @param destCapacity capacity of dest; must not be negative
 * @param n2           normalizer whose append() performs the concatenation
 * @param pErrorCode   in/out ICU error code
 * @return the value returned by extract() on the result, or 0 on early exit
 */
static int32_t
_concatenate(const UChar *left, int32_t leftLength,
                  const UChar *right, int32_t rightLength,
                  UChar *dest, int32_t destCapacity,
                  const Normalizer2 *n2,
                  UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }

    /* argument validation */
    const UBool badDest= destCapacity<0 || (dest==NULL && destCapacity>0);
    const UBool badLeft= left==NULL || leftLength<-1;
    const UBool badRight= right==NULL || rightLength<-1;
    if(badDest || badLeft || badRight) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    /* right must not overlap the destination buffer */
    if(dest!=NULL) {
        const UBool rightStartsInDest= right>=dest && right<(dest+destCapacity);
        /* the end of right is only computed when its length is known and positive */
        const UBool destStartsInRight=
            rightLength>0 && dest>=right && dest<(right+rightLength);
        if(rightStartsInDest || destStartsInRight) {
            *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
            return 0;
        }
    }

    /* left may be the destination itself; then its text is already in place */
    UnicodeString destString;
    if(left!=dest) {
        destString.setTo(dest, 0, destCapacity);
        destString.append(left, leftLength);
    } else {
        destString.setTo(dest, leftLength, destCapacity);
    }

    const UnicodeString rightString(rightLength<0, right, rightLength);
    return n2->append(destString, rightString, *pErrorCode).
           extract(dest, destCapacity, *pErrorCode);
}

/**
 * Public concatenation entry point.
 * Picks the normalizer for mode/options and runs _concatenate() with it.
 * With UNORM_UNICODE_3_2 set in options, the mode's Normalizer2 is wrapped in
 * a FilteredNormalizer2 over the set from uniset_getUnicode32Instance();
 * returns 0 if *pErrorCode indicates failure after fetching that set.
 */
U_CAPI int32_t U_EXPORT2
unorm_concatenate(const UChar *left, int32_t leftLength,
                  const UChar *right, int32_t rightLength,
                  UChar *dest, int32_t destCapacity,
                  UNormalizationMode mode, int32_t options,
                  UErrorCode *pErrorCode) {
    const Normalizer2 *baseNorm=Normalizer2Factory::getInstance(mode, *pErrorCode);
    if((options&UNORM_UNICODE_3_2)==0) {
        /* unfiltered: use the mode's normalizer as-is */
        return _concatenate(left, leftLength, right, rightLength,
            dest, destCapacity, baseNorm, pErrorCode);
    }

    const UnicodeSet *unicode32Set=uniset_getUnicode32Instance(*pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }
    FilteredNormalizer2 filteredNorm(*baseNorm, *unicode32Set);
    return _concatenate(left, leftLength, right, rightLength,
        dest, destCapacity, &filteredNorm, pErrorCode);
}

#endif /* #if !UCONFIG_NO_NORMALIZATION */

Coverage Report

Created: 2018-09-25 14:53

Line	Count	Source (jump to first uncovered line)
1		// © 2016 and later: Unicode, Inc. and others.
2		// License & terms of use: http://www.unicode.org/copyright.html
3		/*
4		******************************************************************************
5		* Copyright (c) 1996-2014, International Business Machines
6		* Corporation and others. All Rights Reserved.
7		******************************************************************************
8		* File unorm.cpp
9		*
10		* Created by: Vladimir Weinstein 12052000
11		*
12		* Modification history :
13		*
14		* Date Name Description
15		* 02/01/01 synwee Added normalization quickcheck enum and method.
16		* 02/12/01 synwee Commented out quickcheck util api has been approved
17		* Added private method for doing FCD checks
18		* 02/23/01 synwee Modified quickcheck and checkFCE to run through
19		* string for codepoints < 0x300 for the normalization
20		* mode NFC.
21		* 05/25/01+ Markus Scherer total rewrite, implement all normalization here
22		* instead of just wrappers around normlzr.cpp,
23		* load unorm.dat, support Unicode 3.1 with
24		* supplementary code points, etc.
25		* 2009-nov..2010-jan Markus Scherer total rewrite, new Normalizer2 API & code
26		*/
27
28		#include "unicode/utypes.h"
29
30		#if !UCONFIG_NO_NORMALIZATION
31
32		#include "unicode/udata.h"
33		#include "unicode/ustring.h"
34		#include "unicode/uiter.h"
35		#include "unicode/unorm.h"
36		#include "unicode/unorm2.h"
37		#include "normalizer2impl.h"
38		#include "unormimp.h"
39		#include "uprops.h"
40		#include "ustr_imp.h"
41
42		U_NAMESPACE_USE
43
44		/* quick check functions ---------------------------------------------------- */
45
46		U_CAPI UNormalizationCheckResult U_EXPORT2
47		unorm_quickCheck(const UChar *src,
48		int32_t srcLength,
49		UNormalizationMode mode,
50	0	UErrorCode *pErrorCode) {
51	0	const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
52	0	return unorm2_quickCheck((const UNormalizer2 *)n2, src, srcLength, pErrorCode);
53	0	}
54
55		U_CAPI UNormalizationCheckResult U_EXPORT2
56		unorm_quickCheckWithOptions(const UChar *src, int32_t srcLength,
57		UNormalizationMode mode, int32_t options,
58	0	UErrorCode *pErrorCode) {
59	0	const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
60	0	if(options&UNORM_UNICODE_3_2) {
61	0	FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode));
62	0	return unorm2_quickCheck(
63	0	reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)),
64	0	src, srcLength, pErrorCode);
65	0	} else {
66	0	return unorm2_quickCheck((const UNormalizer2 *)n2, src, srcLength, pErrorCode);
67	0	}
68	0	}
69
70		U_CAPI UBool U_EXPORT2
71		unorm_isNormalized(const UChar *src, int32_t srcLength,
72		UNormalizationMode mode,
73	0	UErrorCode *pErrorCode) {
74	0	const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
75	0	return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode);
76	0	}
77
78		U_CAPI UBool U_EXPORT2
79		unorm_isNormalizedWithOptions(const UChar *src, int32_t srcLength,
80		UNormalizationMode mode, int32_t options,
81	0	UErrorCode *pErrorCode) {
82	0	const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
83	0	if(options&UNORM_UNICODE_3_2) {
84	0	FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode));
85	0	return unorm2_isNormalized(
86	0	reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)),
87	0	src, srcLength, pErrorCode);
88	0	} else {
89	0	return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode);
90	0	}
91	0	}
92
93		/* normalize() API ---------------------------------------------------------- */
94
95		/** Public API for normalizing. */
96		U_CAPI int32_t U_EXPORT2
97		unorm_normalize(const UChar *src, int32_t srcLength,
98		UNormalizationMode mode, int32_t options,
99		UChar *dest, int32_t destCapacity,
100	0	UErrorCode *pErrorCode) {
101	0	const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
102	0	if(options&UNORM_UNICODE_3_2) {
103	0	FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode));
104	0	return unorm2_normalize(
105	0	reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)),
106	0	src, srcLength, dest, destCapacity, pErrorCode);
107	0	} else {
108	0	return unorm2_normalize((const UNormalizer2 *)n2,
109	0	src, srcLength, dest, destCapacity, pErrorCode);
110	0	}
111	0	}
112
113
114		/* iteration functions ------------------------------------------------------ */
115
116		static int32_t
117		_iterate(UCharIterator *src, UBool forward,
118		UChar *dest, int32_t destCapacity,
119		const Normalizer2 *n2,
120		UBool doNormalize, UBool *pNeededToNormalize,
121	0	UErrorCode *pErrorCode) {
122	0	if(U_FAILURE(*pErrorCode)) {
123	0	return 0;
124	0	}
125	0	if(destCapacity<0 || (dest==NULL && destCapacity>0) || src==NULL) {
126	0	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
127	0	return 0;
128	0	}
129	0
130	0	if(pNeededToNormalize!=NULL) {
131	0	*pNeededToNormalize=FALSE;
132	0	}
133	0	if(!(forward ? src->hasNext(src) : src->hasPrevious(src))) {
134	0	return u_terminateUChars(dest, destCapacity, 0, pErrorCode);
135	0	}
136	0
137	0	UnicodeString buffer;
138	0	UChar32 c;
139	0	if(forward) {
140	0	/* get one character and ignore its properties */
141	0	buffer.append(uiter_next32(src));
142	0	/* get all following characters until we see a boundary */
143	0	while((c=uiter_next32(src))>=0) {
144	0	if(n2->hasBoundaryBefore(c)) {
145	0	/* back out the latest movement to stop at the boundary */
146	0	src->move(src, -U16_LENGTH(c), UITER_CURRENT);
147	0	break;
148	0	} else {
149	0	buffer.append(c);
150	0	}
151	0	}
152	0	} else {
153	0	while((c=uiter_previous32(src))>=0) {
154	0	/* always write this character to the front of the buffer */
155	0	buffer.insert(0, c);
156	0	/* stop if this just-copied character is a boundary */
157	0	if(n2->hasBoundaryBefore(c)) {
158	0	break;
159	0	}
160	0	}
161	0	}
162	0
163	0	UnicodeString destString(dest, 0, destCapacity);
164	0	if(buffer.length()>0 && doNormalize) {
165	0	n2->normalize(buffer, destString, *pErrorCode).extract(dest, destCapacity, *pErrorCode);
166	0	if(pNeededToNormalize!=NULL && U_SUCCESS(*pErrorCode)) {
167	0	*pNeededToNormalize= destString!=buffer;
168	0	}
169	0	return destString.length();
170	0	} else {
171	0	/* just copy the source characters */
172	0	return buffer.extract(dest, destCapacity, *pErrorCode);
173	0	}
174	0	}
175
176		static int32_t
177		unorm_iterate(UCharIterator *src, UBool forward,
178		UChar *dest, int32_t destCapacity,
179		UNormalizationMode mode, int32_t options,
180		UBool doNormalize, UBool *pNeededToNormalize,
181	0	UErrorCode *pErrorCode) {
182	0	const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
183	0	if(options&UNORM_UNICODE_3_2) {
184	0	const UnicodeSet *uni32 = uniset_getUnicode32Instance(*pErrorCode);
185	0	if(U_FAILURE(*pErrorCode)) {
186	0	return 0;
187	0	}
188	0	FilteredNormalizer2 fn2(*n2, *uni32);
189	0	return _iterate(src, forward, dest, destCapacity,
190	0	&fn2, doNormalize, pNeededToNormalize, pErrorCode);
191	0	}
192	0	return _iterate(src, forward, dest, destCapacity,
193	0	n2, doNormalize, pNeededToNormalize, pErrorCode);
194	0	}
195
196		U_CAPI int32_t U_EXPORT2
197		unorm_previous(UCharIterator *src,
198		UChar *dest, int32_t destCapacity,
199		UNormalizationMode mode, int32_t options,
200		UBool doNormalize, UBool *pNeededToNormalize,
201	0	UErrorCode *pErrorCode) {
202	0	return unorm_iterate(src, FALSE,
203	0	dest, destCapacity,
204	0	mode, options,
205	0	doNormalize, pNeededToNormalize,
206	0	pErrorCode);
207	0	}
208
209		U_CAPI int32_t U_EXPORT2
210		unorm_next(UCharIterator *src,
211		UChar *dest, int32_t destCapacity,
212		UNormalizationMode mode, int32_t options,
213		UBool doNormalize, UBool *pNeededToNormalize,
214	0	UErrorCode *pErrorCode) {
215	0	return unorm_iterate(src, TRUE,
216	0	dest, destCapacity,
217	0	mode, options,
218	0	doNormalize, pNeededToNormalize,
219	0	pErrorCode);
220	0	}
221
222		/* Concatenation of normalized strings -------------------------------------- */
223
224		static int32_t
225		_concatenate(const UChar *left, int32_t leftLength,
226		const UChar *right, int32_t rightLength,
227		UChar *dest, int32_t destCapacity,
228		const Normalizer2 *n2,
229	0	UErrorCode *pErrorCode) {
230	0	if(U_FAILURE(*pErrorCode)) {
231	0	return 0;
232	0	}
233	0	if(destCapacity<0 || (dest==NULL && destCapacity>0) ||
234	0	left==NULL || leftLength<-1 || right==NULL || rightLength<-1) {
235	0	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
236	0	return 0;
237	0	}
238	0
239	0	/* check for overlapping right and destination */
240	0	if( dest!=NULL &&
241	0	((right>=dest && right<(dest+destCapacity)) ||
242	0	(rightLength>0 && dest>=right && dest<(right+rightLength)))
243	0	) {
244	0	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
245	0	return 0;
246	0	}
247	0
248	0	/* allow left==dest */
249	0	UnicodeString destString;
250	0	if(left==dest) {
251	0	destString.setTo(dest, leftLength, destCapacity);
252	0	} else {
253	0	destString.setTo(dest, 0, destCapacity);
254	0	destString.append(left, leftLength);
255	0	}
256	0	return n2->append(destString, UnicodeString(rightLength<0, right, rightLength), *pErrorCode).
257	0	extract(dest, destCapacity, *pErrorCode);
258	0	}
259
260		U_CAPI int32_t U_EXPORT2
261		unorm_concatenate(const UChar *left, int32_t leftLength,
262		const UChar *right, int32_t rightLength,
263		UChar *dest, int32_t destCapacity,
264		UNormalizationMode mode, int32_t options,
265	0	UErrorCode *pErrorCode) {
266	0	const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
267	0	if(options&UNORM_UNICODE_3_2) {
268	0	const UnicodeSet *uni32 = uniset_getUnicode32Instance(*pErrorCode);
269	0	if(U_FAILURE(*pErrorCode)) {
270	0	return 0;
271	0	}
272	0	FilteredNormalizer2 fn2(*n2, *uni32);
273	0	return _concatenate(left, leftLength, right, rightLength,
274	0	dest, destCapacity, &fn2, pErrorCode);
275	0	}
276	0	return _concatenate(left, leftLength, right, rightLength,
277	0	dest, destCapacity, n2, pErrorCode);
278	0	}
279
280		#endif /* #if !UCONFIG_NO_NORMALIZATION */