/src/icu/source/common/unistr_case.cpp
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | // © 2016 and later: Unicode, Inc. and others.  | 
2  |  | // License & terms of use: http://www.unicode.org/copyright.html  | 
3  |  | /*  | 
4  |  | *******************************************************************************  | 
5  |  | *  | 
6  |  | *   Copyright (C) 1999-2014, International Business Machines  | 
7  |  | *   Corporation and others.  All Rights Reserved.  | 
8  |  | *  | 
9  |  | *******************************************************************************  | 
10  |  | *   file name:  unistr_case.cpp  | 
11  |  | *   encoding:   UTF-8  | 
12  |  | *   tab size:   8 (not used)  | 
13  |  | *   indentation:2  | 
14  |  | *  | 
15  |  | *   created on: 2004aug19  | 
16  |  | *   created by: Markus W. Scherer  | 
17  |  | *  | 
18  |  | *   Case-mapping functions moved here from unistr.cpp  | 
19  |  | */  | 
20  |  |  | 
21  |  | #include "unicode/utypes.h"  | 
22  |  | #include "unicode/brkiter.h"  | 
23  |  | #include "unicode/casemap.h"  | 
24  |  | #include "unicode/edits.h"  | 
25  |  | #include "unicode/putil.h"  | 
26  |  | #include "cstring.h"  | 
27  |  | #include "cmemory.h"  | 
28  |  | #include "unicode/ustring.h"  | 
29  |  | #include "unicode/unistr.h"  | 
30  |  | #include "unicode/uchar.h"  | 
31  |  | #include "uassert.h"  | 
32  |  | #include "ucasemap_imp.h"  | 
33  |  | #include "uelement.h"  | 
34  |  |  | 
35  |  | U_NAMESPACE_BEGIN  | 
36  |  |  | 
37  |  | //========================================  | 
38  |  | // Read-only implementation  | 
39  |  | //========================================  | 
40  |  |  | 
41  |  | int8_t  | 
42  |  | UnicodeString::doCaseCompare(int32_t start,  | 
43  |  |                              int32_t length,  | 
44  |  |                              const UChar *srcChars,  | 
45  |  |                              int32_t srcStart,  | 
46  |  |                              int32_t srcLength,  | 
47  |  |                              uint32_t options) const  | 
48  | 0  | { | 
49  |  |   // compare illegal string values  | 
50  |  |   // treat const UChar *srcChars==NULL as an empty string  | 
51  | 0  |   if(isBogus()) { | 
52  | 0  |     return -1;  | 
53  | 0  |   }  | 
54  |  |  | 
55  |  |   // pin indices to legal values  | 
56  | 0  |   pinIndices(start, length);  | 
57  |  | 
  | 
58  | 0  |   if(srcChars == NULL) { | 
59  | 0  |     srcStart = srcLength = 0;  | 
60  | 0  |   }  | 
61  |  |  | 
62  |  |   // get the correct pointer  | 
63  | 0  |   const UChar *chars = getArrayStart();  | 
64  |  | 
  | 
65  | 0  |   chars += start;  | 
66  | 0  |   if(srcStart!=0) { | 
67  | 0  |     srcChars += srcStart;  | 
68  | 0  |   }  | 
69  |  | 
  | 
70  | 0  |   if(chars != srcChars) { | 
71  | 0  |     UErrorCode errorCode=U_ZERO_ERROR;  | 
72  | 0  |     int32_t result=u_strcmpFold(chars, length, srcChars, srcLength,  | 
73  | 0  |                                 options|U_COMPARE_IGNORE_CASE, &errorCode);  | 
74  | 0  |     if(result!=0) { | 
75  | 0  |       return (int8_t)(result >> 24 | 1);  | 
76  | 0  |     }  | 
77  | 0  |   } else { | 
78  |  |     // get the srcLength if necessary  | 
79  | 0  |     if(srcLength < 0) { | 
80  | 0  |       srcLength = u_strlen(srcChars + srcStart);  | 
81  | 0  |     }  | 
82  | 0  |     if(length != srcLength) { | 
83  | 0  |       return (int8_t)((length - srcLength) >> 24 | 1);  | 
84  | 0  |     }  | 
85  | 0  |   }  | 
86  | 0  |   return 0;  | 
87  | 0  | }  | 
88  |  |  | 
89  |  | //========================================  | 
90  |  | // Write implementation  | 
91  |  | //========================================  | 
92  |  |  | 
93  |  | UnicodeString &  | 
94  |  | UnicodeString::caseMap(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM  | 
95  | 0  |                        UStringCaseMapper *stringCaseMapper) { | 
96  | 0  |   if(isEmpty() || !isWritable()) { | 
97  |  |     // nothing to do  | 
98  | 0  |     return *this;  | 
99  | 0  |   }  | 
100  |  |  | 
101  | 0  |   UChar oldBuffer[2 * US_STACKBUF_SIZE];  | 
102  | 0  |   UChar *oldArray;  | 
103  | 0  |   int32_t oldLength = length();  | 
104  | 0  |   int32_t newLength;  | 
105  | 0  |   UBool writable = isBufferWritable();  | 
106  | 0  |   UErrorCode errorCode = U_ZERO_ERROR;  | 
107  |  | 
  | 
108  | 0  | #if !UCONFIG_NO_BREAK_ITERATION  | 
109  |  |   // Read-only alias to the original string contents for the titlecasing BreakIterator.  | 
110  |  |   // We cannot set the iterator simply to *this because *this is being modified.  | 
111  | 0  |   UnicodeString oldString;  | 
112  | 0  | #endif  | 
113  |  |  | 
114  |  |   // Try to avoid heap-allocating a new character array for this string.  | 
115  | 0  |   if (writable ? oldLength <= UPRV_LENGTHOF(oldBuffer) : oldLength < US_STACKBUF_SIZE) { | 
116  |  |     // Short string: Copy the contents into a temporary buffer and  | 
117  |  |     // case-map back into the current array, or into the stack buffer.  | 
118  | 0  |     UChar *buffer = getArrayStart();  | 
119  | 0  |     int32_t capacity;  | 
120  | 0  |     oldArray = oldBuffer;  | 
121  | 0  |     u_memcpy(oldBuffer, buffer, oldLength);  | 
122  | 0  |     if (writable) { | 
123  | 0  |       capacity = getCapacity();  | 
124  | 0  |     } else { | 
125  |  |       // Switch from the read-only alias or shared heap buffer to the stack buffer.  | 
126  | 0  |       if (!cloneArrayIfNeeded(US_STACKBUF_SIZE, US_STACKBUF_SIZE, /* doCopyArray= */ FALSE)) { | 
127  | 0  |         return *this;  | 
128  | 0  |       }  | 
129  | 0  |       U_ASSERT(fUnion.fFields.fLengthAndFlags & kUsingStackBuffer);  | 
130  | 0  |       buffer = fUnion.fStackFields.fBuffer;  | 
131  | 0  |       capacity = US_STACKBUF_SIZE;  | 
132  | 0  |     }  | 
133  | 0  | #if !UCONFIG_NO_BREAK_ITERATION  | 
134  | 0  |     if (iter != nullptr) { | 
135  | 0  |       oldString.setTo(FALSE, oldArray, oldLength);  | 
136  | 0  |       iter->setText(oldString);  | 
137  | 0  |     }  | 
138  | 0  | #endif  | 
139  | 0  |     newLength = stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR  | 
140  | 0  |                                  buffer, capacity,  | 
141  | 0  |                                  oldArray, oldLength, NULL, errorCode);  | 
142  | 0  |     if (U_SUCCESS(errorCode)) { | 
143  | 0  |       setLength(newLength);  | 
144  | 0  |       return *this;  | 
145  | 0  |     } else if (errorCode == U_BUFFER_OVERFLOW_ERROR) { | 
146  |  |       // common overflow handling below  | 
147  | 0  |     } else { | 
148  | 0  |       setToBogus();  | 
149  | 0  |       return *this;  | 
150  | 0  |     }  | 
151  | 0  |   } else { | 
152  |  |     // Longer string or read-only buffer:  | 
153  |  |     // Collect only changes and then apply them to this string.  | 
154  |  |     // Case mapping often changes only small parts of a string,  | 
155  |  |     // and often does not change its length.  | 
156  | 0  |     oldArray = getArrayStart();  | 
157  | 0  |     Edits edits;  | 
158  | 0  |     UChar replacementChars[200];  | 
159  | 0  | #if !UCONFIG_NO_BREAK_ITERATION  | 
160  | 0  |     if (iter != nullptr) { | 
161  | 0  |       oldString.setTo(FALSE, oldArray, oldLength);  | 
162  | 0  |       iter->setText(oldString);  | 
163  | 0  |     }  | 
164  | 0  | #endif  | 
165  | 0  |     stringCaseMapper(caseLocale, options | U_OMIT_UNCHANGED_TEXT, UCASEMAP_BREAK_ITERATOR  | 
166  | 0  |                      replacementChars, UPRV_LENGTHOF(replacementChars),  | 
167  | 0  |                      oldArray, oldLength, &edits, errorCode);  | 
168  | 0  |     if (U_SUCCESS(errorCode)) { | 
169  |  |       // Grow the buffer at most once, not for multiple doReplace() calls.  | 
170  | 0  |       newLength = oldLength + edits.lengthDelta();  | 
171  | 0  |       if (newLength > oldLength && !cloneArrayIfNeeded(newLength, newLength)) { | 
172  | 0  |         return *this;  | 
173  | 0  |       }  | 
174  | 0  |       for (Edits::Iterator ei = edits.getCoarseChangesIterator(); ei.next(errorCode);) { | 
175  | 0  |         doReplace(ei.destinationIndex(), ei.oldLength(),  | 
176  | 0  |                   replacementChars, ei.replacementIndex(), ei.newLength());  | 
177  | 0  |       }  | 
178  | 0  |       if (U_FAILURE(errorCode)) { | 
179  | 0  |         setToBogus();  | 
180  | 0  |       }  | 
181  | 0  |       return *this;  | 
182  | 0  |     } else if (errorCode == U_BUFFER_OVERFLOW_ERROR) { | 
183  |  |       // common overflow handling below  | 
184  | 0  |       newLength = oldLength + edits.lengthDelta();  | 
185  | 0  |     } else { | 
186  | 0  |       setToBogus();  | 
187  | 0  |       return *this;  | 
188  | 0  |     }  | 
189  | 0  |   }  | 
190  |  |  | 
191  |  |   // Handle buffer overflow, newLength is known.  | 
192  |  |   // We need to allocate a new buffer for the internal string case mapping function.  | 
193  |  |   // This is very similar to how doReplace() keeps the old array pointer  | 
194  |  |   // and deletes the old array itself after it is done.  | 
195  |  |   // In addition, we are forcing cloneArrayIfNeeded() to always allocate a new array.  | 
196  | 0  |   int32_t *bufferToDelete = 0;  | 
197  | 0  |   if (!cloneArrayIfNeeded(newLength, newLength, FALSE, &bufferToDelete, TRUE)) { | 
198  | 0  |     return *this;  | 
199  | 0  |   }  | 
200  | 0  |   errorCode = U_ZERO_ERROR;  | 
201  |  |   // No need to iter->setText() again: The case mapper restarts via iter->first().  | 
202  | 0  |   newLength = stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR  | 
203  | 0  |                                getArrayStart(), getCapacity(),  | 
204  | 0  |                                oldArray, oldLength, NULL, errorCode);  | 
205  | 0  |   if (bufferToDelete) { | 
206  | 0  |     uprv_free(bufferToDelete);  | 
207  | 0  |   }  | 
208  | 0  |   if (U_SUCCESS(errorCode)) { | 
209  | 0  |     setLength(newLength);  | 
210  | 0  |   } else { | 
211  | 0  |     setToBogus();  | 
212  | 0  |   }  | 
213  | 0  |   return *this;  | 
214  | 0  | }  | 
215  |  |  | 
216  |  | UnicodeString &  | 
217  | 0  | UnicodeString::foldCase(uint32_t options) { | 
218  | 0  |   return caseMap(UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL ustrcase_internalFold);  | 
219  | 0  | }  | 
220  |  |  | 
221  |  | U_NAMESPACE_END  | 
222  |  |  | 
223  |  | // Defined here to reduce dependencies on break iterator  | 
224  |  | U_CAPI int32_t U_EXPORT2  | 
225  | 0  | uhash_hashCaselessUnicodeString(const UElement key) { | 
226  | 0  |     U_NAMESPACE_USE  | 
227  | 0  |     const UnicodeString *str = (const UnicodeString*) key.pointer;  | 
228  | 0  |     if (str == NULL) { | 
229  | 0  |         return 0;  | 
230  | 0  |     }  | 
231  |  |     // Inefficient; a better way would be to have a hash function in  | 
232  |  |     // UnicodeString that does case folding on the fly.  | 
233  | 0  |     UnicodeString copy(*str);  | 
234  | 0  |     return copy.foldCase().hashCode();  | 
235  | 0  | }  | 
236  |  |  | 
237  |  | // Defined here to reduce dependencies on break iterator  | 
238  |  | U_CAPI UBool U_EXPORT2  | 
239  | 0  | uhash_compareCaselessUnicodeString(const UElement key1, const UElement key2) { | 
240  | 0  |     U_NAMESPACE_USE  | 
241  | 0  |     const UnicodeString *str1 = (const UnicodeString*) key1.pointer;  | 
242  | 0  |     const UnicodeString *str2 = (const UnicodeString*) key2.pointer;  | 
243  | 0  |     if (str1 == str2) { | 
244  | 0  |         return TRUE;  | 
245  | 0  |     }  | 
246  | 0  |     if (str1 == NULL || str2 == NULL) { | 
247  | 0  |         return FALSE;  | 
248  | 0  |     }  | 
249  | 0  |     return str1->caseCompare(*str2, U_FOLD_CASE_DEFAULT) == 0;  | 
250  | 0  | }  |