/src/icu/source/common/ustring.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | // © 2016 and later: Unicode, Inc. and others. |
2 | | // License & terms of use: http://www.unicode.org/copyright.html |
3 | | /* |
4 | | ****************************************************************************** |
5 | | * |
6 | | * Copyright (C) 1998-2016, International Business Machines |
7 | | * Corporation and others. All Rights Reserved. |
8 | | * |
9 | | ****************************************************************************** |
10 | | * |
11 | | * File ustring.cpp |
12 | | * |
13 | | * Modification History: |
14 | | * |
15 | | * Date Name Description |
16 | | * 12/07/98 bertrand Creation. |
17 | | ****************************************************************************** |
18 | | */ |
19 | | |
20 | | #include "unicode/utypes.h" |
21 | | #include "unicode/putil.h" |
22 | | #include "unicode/uchar.h" |
23 | | #include "unicode/ustring.h" |
24 | | #include "unicode/utf16.h" |
25 | | #include "cstring.h" |
26 | | #include "cwchar.h" |
27 | | #include "cmemory.h" |
28 | | #include "ustr_imp.h" |
29 | | |
30 | | /* ANSI string.h - style functions ------------------------------------------ */ |
31 | | |
/*
 * Highest code point of the Basic Multilingual Plane, i.e. the largest
 * value that fits into a single 16-bit UChar.
 */
#define U_BMP_MAX 0xFFFF
34 | | |
35 | | /* Forward binary string search functions ----------------------------------- */ |
36 | | |
37 | | /* |
38 | | * Test if a substring match inside a string is at code point boundaries. |
39 | | * All pointers refer to the same buffer. |
40 | | * The limit pointer may be NULL, all others must be real pointers. |
41 | | */ |
42 | | static inline UBool |
43 | 0 | isMatchAtCPBoundary(const UChar *start, const UChar *match, const UChar *matchLimit, const UChar *limit) { |
44 | 0 | if(U16_IS_TRAIL(*match) && start!=match && U16_IS_LEAD(*(match-1))) { |
45 | | /* the leading edge of the match is in the middle of a surrogate pair */ |
46 | 0 | return FALSE; |
47 | 0 | } |
48 | 0 | if(U16_IS_LEAD(*(matchLimit-1)) && matchLimit!=limit && U16_IS_TRAIL(*matchLimit)) { |
49 | | /* the trailing edge of the match is in the middle of a surrogate pair */ |
50 | 0 | return FALSE; |
51 | 0 | } |
52 | 0 | return TRUE; |
53 | 0 | } |
54 | | |
55 | | U_CAPI UChar * U_EXPORT2 |
56 | | u_strFindFirst(const UChar *s, int32_t length, |
57 | 0 | const UChar *sub, int32_t subLength) { |
58 | 0 | const UChar *start, *p, *q, *subLimit; |
59 | 0 | UChar c, cs, cq; |
60 | |
|
61 | 0 | if(sub==NULL || subLength<-1) { |
62 | 0 | return (UChar *)s; |
63 | 0 | } |
64 | 0 | if(s==NULL || length<-1) { |
65 | 0 | return NULL; |
66 | 0 | } |
67 | | |
68 | 0 | start=s; |
69 | |
|
70 | 0 | if(length<0 && subLength<0) { |
71 | | /* both strings are NUL-terminated */ |
72 | 0 | if((cs=*sub++)==0) { |
73 | 0 | return (UChar *)s; |
74 | 0 | } |
75 | 0 | if(*sub==0 && !U16_IS_SURROGATE(cs)) { |
76 | | /* the substring consists of a single, non-surrogate BMP code point */ |
77 | 0 | return u_strchr(s, cs); |
78 | 0 | } |
79 | | |
80 | 0 | while((c=*s++)!=0) { |
81 | 0 | if(c==cs) { |
82 | | /* found first substring UChar, compare rest */ |
83 | 0 | p=s; |
84 | 0 | q=sub; |
85 | 0 | for(;;) { |
86 | 0 | if((cq=*q)==0) { |
87 | 0 | if(isMatchAtCPBoundary(start, s-1, p, NULL)) { |
88 | 0 | return (UChar *)(s-1); /* well-formed match */ |
89 | 0 | } else { |
90 | 0 | break; /* no match because surrogate pair is split */ |
91 | 0 | } |
92 | 0 | } |
93 | 0 | if((c=*p)==0) { |
94 | 0 | return NULL; /* no match, and none possible after s */ |
95 | 0 | } |
96 | 0 | if(c!=cq) { |
97 | 0 | break; /* no match */ |
98 | 0 | } |
99 | 0 | ++p; |
100 | 0 | ++q; |
101 | 0 | } |
102 | 0 | } |
103 | 0 | } |
104 | | |
105 | | /* not found */ |
106 | 0 | return NULL; |
107 | 0 | } |
108 | | |
109 | 0 | if(subLength<0) { |
110 | 0 | subLength=u_strlen(sub); |
111 | 0 | } |
112 | 0 | if(subLength==0) { |
113 | 0 | return (UChar *)s; |
114 | 0 | } |
115 | | |
116 | | /* get sub[0] to search for it fast */ |
117 | 0 | cs=*sub++; |
118 | 0 | --subLength; |
119 | 0 | subLimit=sub+subLength; |
120 | |
|
121 | 0 | if(subLength==0 && !U16_IS_SURROGATE(cs)) { |
122 | | /* the substring consists of a single, non-surrogate BMP code point */ |
123 | 0 | return length<0 ? u_strchr(s, cs) : u_memchr(s, cs, length); |
124 | 0 | } |
125 | | |
126 | 0 | if(length<0) { |
127 | | /* s is NUL-terminated */ |
128 | 0 | while((c=*s++)!=0) { |
129 | 0 | if(c==cs) { |
130 | | /* found first substring UChar, compare rest */ |
131 | 0 | p=s; |
132 | 0 | q=sub; |
133 | 0 | for(;;) { |
134 | 0 | if(q==subLimit) { |
135 | 0 | if(isMatchAtCPBoundary(start, s-1, p, NULL)) { |
136 | 0 | return (UChar *)(s-1); /* well-formed match */ |
137 | 0 | } else { |
138 | 0 | break; /* no match because surrogate pair is split */ |
139 | 0 | } |
140 | 0 | } |
141 | 0 | if((c=*p)==0) { |
142 | 0 | return NULL; /* no match, and none possible after s */ |
143 | 0 | } |
144 | 0 | if(c!=*q) { |
145 | 0 | break; /* no match */ |
146 | 0 | } |
147 | 0 | ++p; |
148 | 0 | ++q; |
149 | 0 | } |
150 | 0 | } |
151 | 0 | } |
152 | 0 | } else { |
153 | 0 | const UChar *limit, *preLimit; |
154 | | |
155 | | /* subLength was decremented above */ |
156 | 0 | if(length<=subLength) { |
157 | 0 | return NULL; /* s is shorter than sub */ |
158 | 0 | } |
159 | | |
160 | 0 | limit=s+length; |
161 | | |
162 | | /* the substring must start before preLimit */ |
163 | 0 | preLimit=limit-subLength; |
164 | |
|
165 | 0 | while(s!=preLimit) { |
166 | 0 | c=*s++; |
167 | 0 | if(c==cs) { |
168 | | /* found first substring UChar, compare rest */ |
169 | 0 | p=s; |
170 | 0 | q=sub; |
171 | 0 | for(;;) { |
172 | 0 | if(q==subLimit) { |
173 | 0 | if(isMatchAtCPBoundary(start, s-1, p, limit)) { |
174 | 0 | return (UChar *)(s-1); /* well-formed match */ |
175 | 0 | } else { |
176 | 0 | break; /* no match because surrogate pair is split */ |
177 | 0 | } |
178 | 0 | } |
179 | 0 | if(*p!=*q) { |
180 | 0 | break; /* no match */ |
181 | 0 | } |
182 | 0 | ++p; |
183 | 0 | ++q; |
184 | 0 | } |
185 | 0 | } |
186 | 0 | } |
187 | 0 | } |
188 | | |
189 | | /* not found */ |
190 | 0 | return NULL; |
191 | 0 | } |
192 | | |
193 | | U_CAPI UChar * U_EXPORT2 |
194 | 0 | u_strstr(const UChar *s, const UChar *substring) { |
195 | 0 | return u_strFindFirst(s, -1, substring, -1); |
196 | 0 | } |
197 | | |
198 | | U_CAPI UChar * U_EXPORT2 |
199 | 0 | u_strchr(const UChar *s, UChar c) { |
200 | 0 | if(U16_IS_SURROGATE(c)) { |
201 | | /* make sure to not find half of a surrogate pair */ |
202 | 0 | return u_strFindFirst(s, -1, &c, 1); |
203 | 0 | } else { |
204 | 0 | UChar cs; |
205 | | |
206 | | /* trivial search for a BMP code point */ |
207 | 0 | for(;;) { |
208 | 0 | if((cs=*s)==c) { |
209 | 0 | return (UChar *)s; |
210 | 0 | } |
211 | 0 | if(cs==0) { |
212 | 0 | return NULL; |
213 | 0 | } |
214 | 0 | ++s; |
215 | 0 | } |
216 | 0 | } |
217 | 0 | } |
218 | | |
219 | | U_CAPI UChar * U_EXPORT2 |
220 | 0 | u_strchr32(const UChar *s, UChar32 c) { |
221 | 0 | if((uint32_t)c<=U_BMP_MAX) { |
222 | | /* find BMP code point */ |
223 | 0 | return u_strchr(s, (UChar)c); |
224 | 0 | } else if((uint32_t)c<=UCHAR_MAX_VALUE) { |
225 | | /* find supplementary code point as surrogate pair */ |
226 | 0 | UChar cs, lead=U16_LEAD(c), trail=U16_TRAIL(c); |
227 | |
|
228 | 0 | while((cs=*s++)!=0) { |
229 | 0 | if(cs==lead && *s==trail) { |
230 | 0 | return (UChar *)(s-1); |
231 | 0 | } |
232 | 0 | } |
233 | 0 | return NULL; |
234 | 0 | } else { |
235 | | /* not a Unicode code point, not findable */ |
236 | 0 | return NULL; |
237 | 0 | } |
238 | 0 | } |
239 | | |
240 | | U_CAPI UChar * U_EXPORT2 |
241 | 0 | u_memchr(const UChar *s, UChar c, int32_t count) { |
242 | 0 | if(count<=0) { |
243 | 0 | return NULL; /* no string */ |
244 | 0 | } else if(U16_IS_SURROGATE(c)) { |
245 | | /* make sure to not find half of a surrogate pair */ |
246 | 0 | return u_strFindFirst(s, count, &c, 1); |
247 | 0 | } else { |
248 | | /* trivial search for a BMP code point */ |
249 | 0 | const UChar *limit=s+count; |
250 | 0 | do { |
251 | 0 | if(*s==c) { |
252 | 0 | return (UChar *)s; |
253 | 0 | } |
254 | 0 | } while(++s!=limit); |
255 | 0 | return NULL; |
256 | 0 | } |
257 | 0 | } |
258 | | |
259 | | U_CAPI UChar * U_EXPORT2 |
260 | 0 | u_memchr32(const UChar *s, UChar32 c, int32_t count) { |
261 | 0 | if((uint32_t)c<=U_BMP_MAX) { |
262 | | /* find BMP code point */ |
263 | 0 | return u_memchr(s, (UChar)c, count); |
264 | 0 | } else if(count<2) { |
265 | | /* too short for a surrogate pair */ |
266 | 0 | return NULL; |
267 | 0 | } else if((uint32_t)c<=UCHAR_MAX_VALUE) { |
268 | | /* find supplementary code point as surrogate pair */ |
269 | 0 | const UChar *limit=s+count-1; /* -1 so that we do not need a separate check for the trail unit */ |
270 | 0 | UChar lead=U16_LEAD(c), trail=U16_TRAIL(c); |
271 | |
|
272 | 0 | do { |
273 | 0 | if(*s==lead && *(s+1)==trail) { |
274 | 0 | return (UChar *)s; |
275 | 0 | } |
276 | 0 | } while(++s!=limit); |
277 | 0 | return NULL; |
278 | 0 | } else { |
279 | | /* not a Unicode code point, not findable */ |
280 | 0 | return NULL; |
281 | 0 | } |
282 | 0 | } |
283 | | |
284 | | /* Backward binary string search functions ---------------------------------- */ |
285 | | |
286 | | U_CAPI UChar * U_EXPORT2 |
287 | | u_strFindLast(const UChar *s, int32_t length, |
288 | 0 | const UChar *sub, int32_t subLength) { |
289 | 0 | const UChar *start, *limit, *p, *q, *subLimit; |
290 | 0 | UChar c, cs; |
291 | |
|
292 | 0 | if(sub==NULL || subLength<-1) { |
293 | 0 | return (UChar *)s; |
294 | 0 | } |
295 | 0 | if(s==NULL || length<-1) { |
296 | 0 | return NULL; |
297 | 0 | } |
298 | | |
299 | | /* |
300 | | * This implementation is more lazy than the one for u_strFindFirst(): |
301 | | * There is no special search code for NUL-terminated strings. |
302 | | * It does not seem to be worth it for searching substrings to |
303 | | * search forward and find all matches like in u_strrchr() and similar. |
304 | | * Therefore, we simply get both string lengths and search backward. |
305 | | * |
306 | | * markus 2002oct23 |
307 | | */ |
308 | | |
309 | 0 | if(subLength<0) { |
310 | 0 | subLength=u_strlen(sub); |
311 | 0 | } |
312 | 0 | if(subLength==0) { |
313 | 0 | return (UChar *)s; |
314 | 0 | } |
315 | | |
316 | | /* get sub[subLength-1] to search for it fast */ |
317 | 0 | subLimit=sub+subLength; |
318 | 0 | cs=*(--subLimit); |
319 | 0 | --subLength; |
320 | |
|
321 | 0 | if(subLength==0 && !U16_IS_SURROGATE(cs)) { |
322 | | /* the substring consists of a single, non-surrogate BMP code point */ |
323 | 0 | return length<0 ? u_strrchr(s, cs) : u_memrchr(s, cs, length); |
324 | 0 | } |
325 | | |
326 | 0 | if(length<0) { |
327 | 0 | length=u_strlen(s); |
328 | 0 | } |
329 | | |
330 | | /* subLength was decremented above */ |
331 | 0 | if(length<=subLength) { |
332 | 0 | return NULL; /* s is shorter than sub */ |
333 | 0 | } |
334 | | |
335 | 0 | start=s; |
336 | 0 | limit=s+length; |
337 | | |
338 | | /* the substring must start no later than s+subLength */ |
339 | 0 | s+=subLength; |
340 | |
|
341 | 0 | while(s!=limit) { |
342 | 0 | c=*(--limit); |
343 | 0 | if(c==cs) { |
344 | | /* found last substring UChar, compare rest */ |
345 | 0 | p=limit; |
346 | 0 | q=subLimit; |
347 | 0 | for(;;) { |
348 | 0 | if(q==sub) { |
349 | 0 | if(isMatchAtCPBoundary(start, p, limit+1, start+length)) { |
350 | 0 | return (UChar *)p; /* well-formed match */ |
351 | 0 | } else { |
352 | 0 | break; /* no match because surrogate pair is split */ |
353 | 0 | } |
354 | 0 | } |
355 | 0 | if(*(--p)!=*(--q)) { |
356 | 0 | break; /* no match */ |
357 | 0 | } |
358 | 0 | } |
359 | 0 | } |
360 | 0 | } |
361 | | |
362 | | /* not found */ |
363 | 0 | return NULL; |
364 | 0 | } |
365 | | |
366 | | U_CAPI UChar * U_EXPORT2 |
367 | 0 | u_strrstr(const UChar *s, const UChar *substring) { |
368 | 0 | return u_strFindLast(s, -1, substring, -1); |
369 | 0 | } |
370 | | |
371 | | U_CAPI UChar * U_EXPORT2 |
372 | 0 | u_strrchr(const UChar *s, UChar c) { |
373 | 0 | if(U16_IS_SURROGATE(c)) { |
374 | | /* make sure to not find half of a surrogate pair */ |
375 | 0 | return u_strFindLast(s, -1, &c, 1); |
376 | 0 | } else { |
377 | 0 | const UChar *result=NULL; |
378 | 0 | UChar cs; |
379 | | |
380 | | /* trivial search for a BMP code point */ |
381 | 0 | for(;;) { |
382 | 0 | if((cs=*s)==c) { |
383 | 0 | result=s; |
384 | 0 | } |
385 | 0 | if(cs==0) { |
386 | 0 | return (UChar *)result; |
387 | 0 | } |
388 | 0 | ++s; |
389 | 0 | } |
390 | 0 | } |
391 | 0 | } |
392 | | |
393 | | U_CAPI UChar * U_EXPORT2 |
394 | 0 | u_strrchr32(const UChar *s, UChar32 c) { |
395 | 0 | if((uint32_t)c<=U_BMP_MAX) { |
396 | | /* find BMP code point */ |
397 | 0 | return u_strrchr(s, (UChar)c); |
398 | 0 | } else if((uint32_t)c<=UCHAR_MAX_VALUE) { |
399 | | /* find supplementary code point as surrogate pair */ |
400 | 0 | const UChar *result=NULL; |
401 | 0 | UChar cs, lead=U16_LEAD(c), trail=U16_TRAIL(c); |
402 | |
|
403 | 0 | while((cs=*s++)!=0) { |
404 | 0 | if(cs==lead && *s==trail) { |
405 | 0 | result=s-1; |
406 | 0 | } |
407 | 0 | } |
408 | 0 | return (UChar *)result; |
409 | 0 | } else { |
410 | | /* not a Unicode code point, not findable */ |
411 | 0 | return NULL; |
412 | 0 | } |
413 | 0 | } |
414 | | |
415 | | U_CAPI UChar * U_EXPORT2 |
416 | 0 | u_memrchr(const UChar *s, UChar c, int32_t count) { |
417 | 0 | if(count<=0) { |
418 | 0 | return NULL; /* no string */ |
419 | 0 | } else if(U16_IS_SURROGATE(c)) { |
420 | | /* make sure to not find half of a surrogate pair */ |
421 | 0 | return u_strFindLast(s, count, &c, 1); |
422 | 0 | } else { |
423 | | /* trivial search for a BMP code point */ |
424 | 0 | const UChar *limit=s+count; |
425 | 0 | do { |
426 | 0 | if(*(--limit)==c) { |
427 | 0 | return (UChar *)limit; |
428 | 0 | } |
429 | 0 | } while(s!=limit); |
430 | 0 | return NULL; |
431 | 0 | } |
432 | 0 | } |
433 | | |
434 | | U_CAPI UChar * U_EXPORT2 |
435 | 0 | u_memrchr32(const UChar *s, UChar32 c, int32_t count) { |
436 | 0 | if((uint32_t)c<=U_BMP_MAX) { |
437 | | /* find BMP code point */ |
438 | 0 | return u_memrchr(s, (UChar)c, count); |
439 | 0 | } else if(count<2) { |
440 | | /* too short for a surrogate pair */ |
441 | 0 | return NULL; |
442 | 0 | } else if((uint32_t)c<=UCHAR_MAX_VALUE) { |
443 | | /* find supplementary code point as surrogate pair */ |
444 | 0 | const UChar *limit=s+count-1; |
445 | 0 | UChar lead=U16_LEAD(c), trail=U16_TRAIL(c); |
446 | |
|
447 | 0 | do { |
448 | 0 | if(*limit==trail && *(limit-1)==lead) { |
449 | 0 | return (UChar *)(limit-1); |
450 | 0 | } |
451 | 0 | } while(s!=--limit); |
452 | 0 | return NULL; |
453 | 0 | } else { |
454 | | /* not a Unicode code point, not findable */ |
455 | 0 | return NULL; |
456 | 0 | } |
457 | 0 | } |
458 | | |
459 | | /* Tokenization functions --------------------------------------------------- */ |
460 | | |
461 | | /* |
462 | | * Match each code point in a string against each code point in the matchSet. |
463 | | * Return the index of the first string code point that |
464 | | * is (polarity==TRUE) or is not (FALSE) contained in the matchSet. |
465 | | * Return -(string length)-1 if there is no such code point. |
466 | | */ |
467 | | static int32_t |
468 | 0 | _matchFromSet(const UChar *string, const UChar *matchSet, UBool polarity) { |
469 | 0 | int32_t matchLen, matchBMPLen, strItr, matchItr; |
470 | 0 | UChar32 stringCh, matchCh; |
471 | 0 | UChar c, c2; |
472 | | |
473 | | /* first part of matchSet contains only BMP code points */ |
474 | 0 | matchBMPLen = 0; |
475 | 0 | while((c = matchSet[matchBMPLen]) != 0 && U16_IS_SINGLE(c)) { |
476 | 0 | ++matchBMPLen; |
477 | 0 | } |
478 | | |
479 | | /* second part of matchSet contains BMP and supplementary code points */ |
480 | 0 | matchLen = matchBMPLen; |
481 | 0 | while(matchSet[matchLen] != 0) { |
482 | 0 | ++matchLen; |
483 | 0 | } |
484 | |
|
485 | 0 | for(strItr = 0; (c = string[strItr]) != 0;) { |
486 | 0 | ++strItr; |
487 | 0 | if(U16_IS_SINGLE(c)) { |
488 | 0 | if(polarity) { |
489 | 0 | for(matchItr = 0; matchItr < matchLen; ++matchItr) { |
490 | 0 | if(c == matchSet[matchItr]) { |
491 | 0 | return strItr - 1; /* one matches */ |
492 | 0 | } |
493 | 0 | } |
494 | 0 | } else { |
495 | 0 | for(matchItr = 0; matchItr < matchLen; ++matchItr) { |
496 | 0 | if(c == matchSet[matchItr]) { |
497 | 0 | goto endloop; |
498 | 0 | } |
499 | 0 | } |
500 | 0 | return strItr - 1; /* none matches */ |
501 | 0 | } |
502 | 0 | } else { |
503 | | /* |
504 | | * No need to check for string length before U16_IS_TRAIL |
505 | | * because c2 could at worst be the terminating NUL. |
506 | | */ |
507 | 0 | if(U16_IS_SURROGATE_LEAD(c) && U16_IS_TRAIL(c2 = string[strItr])) { |
508 | 0 | ++strItr; |
509 | 0 | stringCh = U16_GET_SUPPLEMENTARY(c, c2); |
510 | 0 | } else { |
511 | 0 | stringCh = c; /* unpaired trail surrogate */ |
512 | 0 | } |
513 | |
|
514 | 0 | if(polarity) { |
515 | 0 | for(matchItr = matchBMPLen; matchItr < matchLen;) { |
516 | 0 | U16_NEXT(matchSet, matchItr, matchLen, matchCh); |
517 | 0 | if(stringCh == matchCh) { |
518 | 0 | return strItr - U16_LENGTH(stringCh); /* one matches */ |
519 | 0 | } |
520 | 0 | } |
521 | 0 | } else { |
522 | 0 | for(matchItr = matchBMPLen; matchItr < matchLen;) { |
523 | 0 | U16_NEXT(matchSet, matchItr, matchLen, matchCh); |
524 | 0 | if(stringCh == matchCh) { |
525 | 0 | goto endloop; |
526 | 0 | } |
527 | 0 | } |
528 | 0 | return strItr - U16_LENGTH(stringCh); /* none matches */ |
529 | 0 | } |
530 | 0 | } |
531 | 0 | endloop: |
532 | 0 | /* wish C had continue with labels like Java... */; |
533 | 0 | } |
534 | | |
535 | | /* Didn't find it. */ |
536 | 0 | return -strItr-1; |
537 | 0 | } |
538 | | |
539 | | /* Search for a codepoint in a string that matches one of the matchSet codepoints. */ |
540 | | U_CAPI UChar * U_EXPORT2 |
541 | | u_strpbrk(const UChar *string, const UChar *matchSet) |
542 | 0 | { |
543 | 0 | int32_t idx = _matchFromSet(string, matchSet, TRUE); |
544 | 0 | if(idx >= 0) { |
545 | 0 | return (UChar *)string + idx; |
546 | 0 | } else { |
547 | 0 | return NULL; |
548 | 0 | } |
549 | 0 | } |
550 | | |
551 | | /* Search for a codepoint in a string that matches one of the matchSet codepoints. */ |
552 | | U_CAPI int32_t U_EXPORT2 |
553 | | u_strcspn(const UChar *string, const UChar *matchSet) |
554 | 0 | { |
555 | 0 | int32_t idx = _matchFromSet(string, matchSet, TRUE); |
556 | 0 | if(idx >= 0) { |
557 | 0 | return idx; |
558 | 0 | } else { |
559 | 0 | return -idx - 1; /* == u_strlen(string) */ |
560 | 0 | } |
561 | 0 | } |
562 | | |
563 | | /* Search for a codepoint in a string that does not match one of the matchSet codepoints. */ |
564 | | U_CAPI int32_t U_EXPORT2 |
565 | | u_strspn(const UChar *string, const UChar *matchSet) |
566 | 0 | { |
567 | 0 | int32_t idx = _matchFromSet(string, matchSet, FALSE); |
568 | 0 | if(idx >= 0) { |
569 | 0 | return idx; |
570 | 0 | } else { |
571 | 0 | return -idx - 1; /* == u_strlen(string) */ |
572 | 0 | } |
573 | 0 | } |
574 | | |
575 | | /* ----- Text manipulation functions --- */ |
576 | | |
577 | | U_CAPI UChar* U_EXPORT2 |
578 | | u_strtok_r(UChar *src, |
579 | | const UChar *delim, |
580 | | UChar **saveState) |
581 | 0 | { |
582 | 0 | UChar *tokSource; |
583 | 0 | UChar *nextToken; |
584 | 0 | uint32_t nonDelimIdx; |
585 | | |
586 | | /* If saveState is NULL, the user messed up. */ |
587 | 0 | if (src != NULL) { |
588 | 0 | tokSource = src; |
589 | 0 | *saveState = src; /* Set to "src" in case there are no delimiters */ |
590 | 0 | } |
591 | 0 | else if (*saveState) { |
592 | 0 | tokSource = *saveState; |
593 | 0 | } |
594 | 0 | else { |
595 | | /* src == NULL && *saveState == NULL */ |
596 | | /* This shouldn't happen. We already finished tokenizing. */ |
597 | 0 | return NULL; |
598 | 0 | } |
599 | | |
600 | | /* Skip initial delimiters */ |
601 | 0 | nonDelimIdx = u_strspn(tokSource, delim); |
602 | 0 | tokSource = &tokSource[nonDelimIdx]; |
603 | |
|
604 | 0 | if (*tokSource) { |
605 | 0 | nextToken = u_strpbrk(tokSource, delim); |
606 | 0 | if (nextToken != NULL) { |
607 | | /* Create a token */ |
608 | 0 | *(nextToken++) = 0; |
609 | 0 | *saveState = nextToken; |
610 | 0 | return tokSource; |
611 | 0 | } |
612 | 0 | else if (*saveState) { |
613 | | /* Return the last token */ |
614 | 0 | *saveState = NULL; |
615 | 0 | return tokSource; |
616 | 0 | } |
617 | 0 | } |
618 | 0 | else { |
619 | | /* No tokens were found. Only delimiters were left. */ |
620 | 0 | *saveState = NULL; |
621 | 0 | } |
622 | 0 | return NULL; |
623 | 0 | } |
624 | | |
625 | | /* Miscellaneous functions -------------------------------------------------- */ |
626 | | |
627 | | U_CAPI UChar* U_EXPORT2 |
628 | | u_strcat(UChar *dst, |
629 | | const UChar *src) |
630 | 0 | { |
631 | 0 | UChar *anchor = dst; /* save a pointer to start of dst */ |
632 | |
|
633 | 0 | while(*dst != 0) { /* To end of first string */ |
634 | 0 | ++dst; |
635 | 0 | } |
636 | 0 | while((*(dst++) = *(src++)) != 0) { /* copy string 2 over */ |
637 | 0 | } |
638 | |
|
639 | 0 | return anchor; |
640 | 0 | } |
641 | | |
642 | | U_CAPI UChar* U_EXPORT2 |
643 | | u_strncat(UChar *dst, |
644 | | const UChar *src, |
645 | | int32_t n ) |
646 | 0 | { |
647 | 0 | if(n > 0) { |
648 | 0 | UChar *anchor = dst; /* save a pointer to start of dst */ |
649 | |
|
650 | 0 | while(*dst != 0) { /* To end of first string */ |
651 | 0 | ++dst; |
652 | 0 | } |
653 | 0 | while((*dst = *src) != 0) { /* copy string 2 over */ |
654 | 0 | ++dst; |
655 | 0 | if(--n == 0) { |
656 | 0 | *dst = 0; |
657 | 0 | break; |
658 | 0 | } |
659 | 0 | ++src; |
660 | 0 | } |
661 | |
|
662 | 0 | return anchor; |
663 | 0 | } else { |
664 | 0 | return dst; |
665 | 0 | } |
666 | 0 | } |
667 | | |
668 | | /* ----- Text property functions --- */ |
669 | | |
670 | | U_CAPI int32_t U_EXPORT2 |
671 | | u_strcmp(const UChar *s1, |
672 | | const UChar *s2) |
673 | 0 | { |
674 | 0 | UChar c1, c2; |
675 | |
|
676 | 0 | for(;;) { |
677 | 0 | c1=*s1++; |
678 | 0 | c2=*s2++; |
679 | 0 | if (c1 != c2 || c1 == 0) { |
680 | 0 | break; |
681 | 0 | } |
682 | 0 | } |
683 | 0 | return (int32_t)c1 - (int32_t)c2; |
684 | 0 | } |
685 | | |
686 | | U_CFUNC int32_t U_EXPORT2 |
687 | | uprv_strCompare(const UChar *s1, int32_t length1, |
688 | | const UChar *s2, int32_t length2, |
689 | 0 | UBool strncmpStyle, UBool codePointOrder) { |
690 | 0 | const UChar *start1, *start2, *limit1, *limit2; |
691 | 0 | UChar c1, c2; |
692 | | |
693 | | /* setup for fix-up */ |
694 | 0 | start1=s1; |
695 | 0 | start2=s2; |
696 | | |
697 | | /* compare identical prefixes - they do not need to be fixed up */ |
698 | 0 | if(length1<0 && length2<0) { |
699 | | /* strcmp style, both NUL-terminated */ |
700 | 0 | if(s1==s2) { |
701 | 0 | return 0; |
702 | 0 | } |
703 | | |
704 | 0 | for(;;) { |
705 | 0 | c1=*s1; |
706 | 0 | c2=*s2; |
707 | 0 | if(c1!=c2) { |
708 | 0 | break; |
709 | 0 | } |
710 | 0 | if(c1==0) { |
711 | 0 | return 0; |
712 | 0 | } |
713 | 0 | ++s1; |
714 | 0 | ++s2; |
715 | 0 | } |
716 | | |
717 | | /* setup for fix-up */ |
718 | 0 | limit1=limit2=NULL; |
719 | 0 | } else if(strncmpStyle) { |
720 | | /* special handling for strncmp, assume length1==length2>=0 but also check for NUL */ |
721 | 0 | if(s1==s2) { |
722 | 0 | return 0; |
723 | 0 | } |
724 | | |
725 | 0 | limit1=start1+length1; |
726 | |
|
727 | 0 | for(;;) { |
728 | | /* both lengths are same, check only one limit */ |
729 | 0 | if(s1==limit1) { |
730 | 0 | return 0; |
731 | 0 | } |
732 | | |
733 | 0 | c1=*s1; |
734 | 0 | c2=*s2; |
735 | 0 | if(c1!=c2) { |
736 | 0 | break; |
737 | 0 | } |
738 | 0 | if(c1==0) { |
739 | 0 | return 0; |
740 | 0 | } |
741 | 0 | ++s1; |
742 | 0 | ++s2; |
743 | 0 | } |
744 | | |
745 | | /* setup for fix-up */ |
746 | 0 | limit2=start2+length1; /* use length1 here, too, to enforce assumption */ |
747 | 0 | } else { |
748 | | /* memcmp/UnicodeString style, both length-specified */ |
749 | 0 | int32_t lengthResult; |
750 | |
|
751 | 0 | if(length1<0) { |
752 | 0 | length1=u_strlen(s1); |
753 | 0 | } |
754 | 0 | if(length2<0) { |
755 | 0 | length2=u_strlen(s2); |
756 | 0 | } |
757 | | |
758 | | /* limit1=start1+min(length1, length2) */ |
759 | 0 | if(length1<length2) { |
760 | 0 | lengthResult=-1; |
761 | 0 | limit1=start1+length1; |
762 | 0 | } else if(length1==length2) { |
763 | 0 | lengthResult=0; |
764 | 0 | limit1=start1+length1; |
765 | 0 | } else /* length1>length2 */ { |
766 | 0 | lengthResult=1; |
767 | 0 | limit1=start1+length2; |
768 | 0 | } |
769 | |
|
770 | 0 | if(s1==s2) { |
771 | 0 | return lengthResult; |
772 | 0 | } |
773 | | |
774 | 0 | for(;;) { |
775 | | /* check pseudo-limit */ |
776 | 0 | if(s1==limit1) { |
777 | 0 | return lengthResult; |
778 | 0 | } |
779 | | |
780 | 0 | c1=*s1; |
781 | 0 | c2=*s2; |
782 | 0 | if(c1!=c2) { |
783 | 0 | break; |
784 | 0 | } |
785 | 0 | ++s1; |
786 | 0 | ++s2; |
787 | 0 | } |
788 | | |
789 | | /* setup for fix-up */ |
790 | 0 | limit1=start1+length1; |
791 | 0 | limit2=start2+length2; |
792 | 0 | } |
793 | | |
794 | | /* if both values are in or above the surrogate range, fix them up */ |
795 | 0 | if(c1>=0xd800 && c2>=0xd800 && codePointOrder) { |
796 | | /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */ |
797 | 0 | if( |
798 | 0 | (c1<=0xdbff && (s1+1)!=limit1 && U16_IS_TRAIL(*(s1+1))) || |
799 | 0 | (U16_IS_TRAIL(c1) && start1!=s1 && U16_IS_LEAD(*(s1-1))) |
800 | 0 | ) { |
801 | | /* part of a surrogate pair, leave >=d800 */ |
802 | 0 | } else { |
803 | | /* BMP code point - may be surrogate code point - make <d800 */ |
804 | 0 | c1-=0x2800; |
805 | 0 | } |
806 | |
|
807 | 0 | if( |
808 | 0 | (c2<=0xdbff && (s2+1)!=limit2 && U16_IS_TRAIL(*(s2+1))) || |
809 | 0 | (U16_IS_TRAIL(c2) && start2!=s2 && U16_IS_LEAD(*(s2-1))) |
810 | 0 | ) { |
811 | | /* part of a surrogate pair, leave >=d800 */ |
812 | 0 | } else { |
813 | | /* BMP code point - may be surrogate code point - make <d800 */ |
814 | 0 | c2-=0x2800; |
815 | 0 | } |
816 | 0 | } |
817 | | |
818 | | /* now c1 and c2 are in the requested (code unit or code point) order */ |
819 | 0 | return (int32_t)c1-(int32_t)c2; |
820 | 0 | } |
821 | | |
822 | | /* |
823 | | * Compare two strings as presented by UCharIterators. |
824 | | * Use code unit or code point order. |
825 | | * When the function returns, it is undefined where the iterators |
826 | | * have stopped. |
827 | | */ |
828 | | U_CAPI int32_t U_EXPORT2 |
829 | 0 | u_strCompareIter(UCharIterator *iter1, UCharIterator *iter2, UBool codePointOrder) { |
830 | 0 | UChar32 c1, c2; |
831 | | |
832 | | /* argument checking */ |
833 | 0 | if(iter1==NULL || iter2==NULL) { |
834 | 0 | return 0; /* bad arguments */ |
835 | 0 | } |
836 | 0 | if(iter1==iter2) { |
837 | 0 | return 0; /* identical iterators */ |
838 | 0 | } |
839 | | |
840 | | /* reset iterators to start? */ |
841 | 0 | iter1->move(iter1, 0, UITER_START); |
842 | 0 | iter2->move(iter2, 0, UITER_START); |
843 | | |
844 | | /* compare identical prefixes - they do not need to be fixed up */ |
845 | 0 | for(;;) { |
846 | 0 | c1=iter1->next(iter1); |
847 | 0 | c2=iter2->next(iter2); |
848 | 0 | if(c1!=c2) { |
849 | 0 | break; |
850 | 0 | } |
851 | 0 | if(c1==-1) { |
852 | 0 | return 0; |
853 | 0 | } |
854 | 0 | } |
855 | | |
856 | | /* if both values are in or above the surrogate range, fix them up */ |
857 | 0 | if(c1>=0xd800 && c2>=0xd800 && codePointOrder) { |
858 | | /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */ |
859 | 0 | if( |
860 | 0 | (c1<=0xdbff && U16_IS_TRAIL(iter1->current(iter1))) || |
861 | 0 | (U16_IS_TRAIL(c1) && (iter1->previous(iter1), U16_IS_LEAD(iter1->previous(iter1)))) |
862 | 0 | ) { |
863 | | /* part of a surrogate pair, leave >=d800 */ |
864 | 0 | } else { |
865 | | /* BMP code point - may be surrogate code point - make <d800 */ |
866 | 0 | c1-=0x2800; |
867 | 0 | } |
868 | |
|
869 | 0 | if( |
870 | 0 | (c2<=0xdbff && U16_IS_TRAIL(iter2->current(iter2))) || |
871 | 0 | (U16_IS_TRAIL(c2) && (iter2->previous(iter2), U16_IS_LEAD(iter2->previous(iter2)))) |
872 | 0 | ) { |
873 | | /* part of a surrogate pair, leave >=d800 */ |
874 | 0 | } else { |
875 | | /* BMP code point - may be surrogate code point - make <d800 */ |
876 | 0 | c2-=0x2800; |
877 | 0 | } |
878 | 0 | } |
879 | | |
880 | | /* now c1 and c2 are in the requested (code unit or code point) order */ |
881 | 0 | return (int32_t)c1-(int32_t)c2; |
882 | 0 | } |
883 | | |
884 | | #if 0 |
885 | | /* |
886 | | * u_strCompareIter() does not leave the iterators _on_ the different units. |
887 | | * This is possible but would cost a few extra indirect function calls to back |
888 | | * up if the last unit (c1 or c2 respectively) was >=0. |
889 | | * |
890 | | * Consistently leaving them _behind_ the different units is not an option |
891 | | * because the current "unit" is the end of the string if that is reached, |
892 | | * and in such a case the iterator does not move. |
893 | | * For example, when comparing "ab" with "abc", both iterators rest _on_ the end |
894 | | * of their strings. Calling previous() on each does not move them to where |
895 | | * the comparison fails. |
896 | | * |
897 | | * So the simplest semantics is to not define where the iterators end up. |
898 | | * |
899 | | * The following fragment is part of what would need to be done for backing up. |
900 | | */ |
901 | | void fragment { |
902 | | /* iff a surrogate is part of a surrogate pair, leave >=d800 */ |
903 | | if(c1<=0xdbff) { |
904 | | if(!U16_IS_TRAIL(iter1->current(iter1))) { |
905 | | /* lead surrogate code point - make <d800 */ |
906 | | c1-=0x2800; |
907 | | } |
908 | | } else if(c1<=0xdfff) { |
909 | | int32_t idx=iter1->getIndex(iter1, UITER_CURRENT); |
910 | | iter1->previous(iter1); /* ==c1 */ |
911 | | if(!U16_IS_LEAD(iter1->previous(iter1))) { |
912 | | /* trail surrogate code point - make <d800 */ |
913 | | c1-=0x2800; |
914 | | } |
915 | | /* go back to behind where the difference is */ |
916 | | iter1->move(iter1, idx, UITER_ZERO); |
917 | | } else /* 0xe000<=c1<=0xffff */ { |
918 | | /* BMP code point - make <d800 */ |
919 | | c1-=0x2800; |
920 | | } |
921 | | } |
922 | | #endif |
923 | | |
924 | | U_CAPI int32_t U_EXPORT2 |
925 | | u_strCompare(const UChar *s1, int32_t length1, |
926 | | const UChar *s2, int32_t length2, |
927 | 0 | UBool codePointOrder) { |
928 | | /* argument checking */ |
929 | 0 | if(s1==NULL || length1<-1 || s2==NULL || length2<-1) { |
930 | 0 | return 0; |
931 | 0 | } |
932 | 0 | return uprv_strCompare(s1, length1, s2, length2, FALSE, codePointOrder); |
933 | 0 | } |
934 | | |
935 | | /* String compare in code point order - u_strcmp() compares in code unit order. */ |
936 | | U_CAPI int32_t U_EXPORT2 |
937 | 0 | u_strcmpCodePointOrder(const UChar *s1, const UChar *s2) { |
938 | 0 | return uprv_strCompare(s1, -1, s2, -1, FALSE, TRUE); |
939 | 0 | } |
940 | | |
941 | | U_CAPI int32_t U_EXPORT2 |
942 | | u_strncmp(const UChar *s1, |
943 | | const UChar *s2, |
944 | | int32_t n) |
945 | 0 | { |
946 | 0 | if(n > 0) { |
947 | 0 | int32_t rc; |
948 | 0 | for(;;) { |
949 | 0 | rc = (int32_t)*s1 - (int32_t)*s2; |
950 | 0 | if(rc != 0 || *s1 == 0 || --n == 0) { |
951 | 0 | return rc; |
952 | 0 | } |
953 | 0 | ++s1; |
954 | 0 | ++s2; |
955 | 0 | } |
956 | 0 | } else { |
957 | 0 | return 0; |
958 | 0 | } |
959 | 0 | } |
960 | | |
961 | | U_CAPI int32_t U_EXPORT2 |
962 | 0 | u_strncmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t n) { |
963 | 0 | return uprv_strCompare(s1, n, s2, n, TRUE, TRUE); |
964 | 0 | } |
965 | | |
966 | | U_CAPI UChar* U_EXPORT2 |
967 | | u_strcpy(UChar *dst, |
968 | | const UChar *src) |
969 | 0 | { |
970 | 0 | UChar *anchor = dst; /* save a pointer to start of dst */ |
971 | |
|
972 | 0 | while((*(dst++) = *(src++)) != 0) { /* copy string 2 over */ |
973 | 0 | } |
974 | |
|
975 | 0 | return anchor; |
976 | 0 | } |
977 | | |
978 | | U_CAPI UChar* U_EXPORT2 |
979 | | u_strncpy(UChar *dst, |
980 | | const UChar *src, |
981 | | int32_t n) |
982 | 0 | { |
983 | 0 | UChar *anchor = dst; /* save a pointer to start of dst */ |
984 | | |
985 | | /* copy string 2 over */ |
986 | 0 | while(n > 0 && (*(dst++) = *(src++)) != 0) { |
987 | 0 | --n; |
988 | 0 | } |
989 | |
|
990 | 0 | return anchor; |
991 | 0 | } |
992 | | |
993 | | U_CAPI int32_t U_EXPORT2 |
994 | | u_strlen(const UChar *s) |
995 | 0 | { |
996 | | #if U_SIZEOF_WCHAR_T == U_SIZEOF_UCHAR |
997 | | return (int32_t)uprv_wcslen((const wchar_t *)s); |
998 | | #else |
999 | 0 | const UChar *t = s; |
1000 | 0 | while(*t != 0) { |
1001 | 0 | ++t; |
1002 | 0 | } |
1003 | 0 | return t - s; |
1004 | 0 | #endif |
1005 | 0 | } |
1006 | | |
1007 | | U_CAPI int32_t U_EXPORT2 |
1008 | 0 | u_countChar32(const UChar *s, int32_t length) { |
1009 | 0 | int32_t count; |
1010 | |
|
1011 | 0 | if(s==NULL || length<-1) { |
1012 | 0 | return 0; |
1013 | 0 | } |
1014 | | |
1015 | 0 | count=0; |
1016 | 0 | if(length>=0) { |
1017 | 0 | while(length>0) { |
1018 | 0 | ++count; |
1019 | 0 | if(U16_IS_LEAD(*s) && length>=2 && U16_IS_TRAIL(*(s+1))) { |
1020 | 0 | s+=2; |
1021 | 0 | length-=2; |
1022 | 0 | } else { |
1023 | 0 | ++s; |
1024 | 0 | --length; |
1025 | 0 | } |
1026 | 0 | } |
1027 | 0 | } else /* length==-1 */ { |
1028 | 0 | UChar c; |
1029 | |
|
1030 | 0 | for(;;) { |
1031 | 0 | if((c=*s++)==0) { |
1032 | 0 | break; |
1033 | 0 | } |
1034 | 0 | ++count; |
1035 | | |
1036 | | /* |
1037 | | * sufficient to look ahead one because of UTF-16; |
1038 | | * safe to look ahead one because at worst that would be the terminating NUL |
1039 | | */ |
1040 | 0 | if(U16_IS_LEAD(c) && U16_IS_TRAIL(*s)) { |
1041 | 0 | ++s; |
1042 | 0 | } |
1043 | 0 | } |
1044 | 0 | } |
1045 | 0 | return count; |
1046 | 0 | } |
1047 | | |
1048 | | U_CAPI UBool U_EXPORT2 |
1049 | 0 | u_strHasMoreChar32Than(const UChar *s, int32_t length, int32_t number) { |
1050 | |
|
1051 | 0 | if(number<0) { |
1052 | 0 | return TRUE; |
1053 | 0 | } |
1054 | 0 | if(s==NULL || length<-1) { |
1055 | 0 | return FALSE; |
1056 | 0 | } |
1057 | | |
1058 | 0 | if(length==-1) { |
1059 | | /* s is NUL-terminated */ |
1060 | 0 | UChar c; |
1061 | | |
1062 | | /* count code points until they exceed */ |
1063 | 0 | for(;;) { |
1064 | 0 | if((c=*s++)==0) { |
1065 | 0 | return FALSE; |
1066 | 0 | } |
1067 | 0 | if(number==0) { |
1068 | 0 | return TRUE; |
1069 | 0 | } |
1070 | 0 | if(U16_IS_LEAD(c) && U16_IS_TRAIL(*s)) { |
1071 | 0 | ++s; |
1072 | 0 | } |
1073 | 0 | --number; |
1074 | 0 | } |
1075 | 0 | } else { |
1076 | | /* length>=0 known */ |
1077 | 0 | const UChar *limit; |
1078 | 0 | int32_t maxSupplementary; |
1079 | | |
1080 | | /* s contains at least (length+1)/2 code points: <=2 UChars per cp */ |
1081 | 0 | if(((length+1)/2)>number) { |
1082 | 0 | return TRUE; |
1083 | 0 | } |
1084 | | |
1085 | | /* check if s does not even contain enough UChars */ |
1086 | 0 | maxSupplementary=length-number; |
1087 | 0 | if(maxSupplementary<=0) { |
1088 | 0 | return FALSE; |
1089 | 0 | } |
1090 | | /* there are maxSupplementary=length-number more UChars than asked-for code points */ |
1091 | | |
1092 | | /* |
1093 | | * count code points until they exceed and also check that there are |
1094 | | * no more than maxSupplementary supplementary code points (UChar pairs) |
1095 | | */ |
1096 | 0 | limit=s+length; |
1097 | 0 | for(;;) { |
1098 | 0 | if(s==limit) { |
1099 | 0 | return FALSE; |
1100 | 0 | } |
1101 | 0 | if(number==0) { |
1102 | 0 | return TRUE; |
1103 | 0 | } |
1104 | 0 | if(U16_IS_LEAD(*s++) && s!=limit && U16_IS_TRAIL(*s)) { |
1105 | 0 | ++s; |
1106 | 0 | if(--maxSupplementary<=0) { |
1107 | | /* too many pairs - too few code points */ |
1108 | 0 | return FALSE; |
1109 | 0 | } |
1110 | 0 | } |
1111 | 0 | --number; |
1112 | 0 | } |
1113 | 0 | } |
1114 | 0 | } |
1115 | | |
1116 | | U_CAPI UChar * U_EXPORT2 |
1117 | 0 | u_memcpy(UChar *dest, const UChar *src, int32_t count) { |
1118 | 0 | if(count > 0) { |
1119 | 0 | uprv_memcpy(dest, src, (size_t)count*U_SIZEOF_UCHAR); |
1120 | 0 | } |
1121 | 0 | return dest; |
1122 | 0 | } |
1123 | | |
1124 | | U_CAPI UChar * U_EXPORT2 |
1125 | 0 | u_memmove(UChar *dest, const UChar *src, int32_t count) { |
1126 | 0 | if(count > 0) { |
1127 | 0 | uprv_memmove(dest, src, (size_t)count*U_SIZEOF_UCHAR); |
1128 | 0 | } |
1129 | 0 | return dest; |
1130 | 0 | } |
1131 | | |
1132 | | U_CAPI UChar * U_EXPORT2 |
1133 | 0 | u_memset(UChar *dest, UChar c, int32_t count) { |
1134 | 0 | if(count > 0) { |
1135 | 0 | UChar *ptr = dest; |
1136 | 0 | UChar *limit = dest + count; |
1137 | |
|
1138 | 0 | while (ptr < limit) { |
1139 | 0 | *(ptr++) = c; |
1140 | 0 | } |
1141 | 0 | } |
1142 | 0 | return dest; |
1143 | 0 | } |
1144 | | |
1145 | | U_CAPI int32_t U_EXPORT2 |
1146 | 0 | u_memcmp(const UChar *buf1, const UChar *buf2, int32_t count) { |
1147 | 0 | if(count > 0) { |
1148 | 0 | const UChar *limit = buf1 + count; |
1149 | 0 | int32_t result; |
1150 | |
|
1151 | 0 | while (buf1 < limit) { |
1152 | 0 | result = (int32_t)(uint16_t)*buf1 - (int32_t)(uint16_t)*buf2; |
1153 | 0 | if (result != 0) { |
1154 | 0 | return result; |
1155 | 0 | } |
1156 | 0 | buf1++; |
1157 | 0 | buf2++; |
1158 | 0 | } |
1159 | 0 | } |
1160 | 0 | return 0; |
1161 | 0 | } |
1162 | | |
1163 | | U_CAPI int32_t U_EXPORT2 |
1164 | 0 | u_memcmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t count) { |
1165 | 0 | return uprv_strCompare(s1, count, s2, count, FALSE, TRUE); |
1166 | 0 | } |
1167 | | |
1168 | | /* u_unescape & support fns ------------------------------------------------- */ |
1169 | | |
1170 | | /* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */ |
1171 | | static const UChar UNESCAPE_MAP[] = { |
1172 | | /*" 0x22, 0x22 */ |
1173 | | /*' 0x27, 0x27 */ |
1174 | | /*? 0x3F, 0x3F */ |
1175 | | /*\ 0x5C, 0x5C */ |
1176 | | /*a*/ 0x61, 0x07, |
1177 | | /*b*/ 0x62, 0x08, |
1178 | | /*e*/ 0x65, 0x1b, |
1179 | | /*f*/ 0x66, 0x0c, |
1180 | | /*n*/ 0x6E, 0x0a, |
1181 | | /*r*/ 0x72, 0x0d, |
1182 | | /*t*/ 0x74, 0x09, |
1183 | | /*v*/ 0x76, 0x0b |
1184 | | }; |
1185 | | enum { UNESCAPE_MAP_LENGTH = UPRV_LENGTHOF(UNESCAPE_MAP) }; |
1186 | | |
1187 | | /* Convert one octal digit to a numeric value 0..7, or -1 on failure */ |
1188 | 0 | static int32_t _digit8(UChar c) { |
1189 | 0 | if (c >= u'0' && c <= u'7') { |
1190 | 0 | return c - u'0'; |
1191 | 0 | } |
1192 | 0 | return -1; |
1193 | 0 | } |
1194 | | |
1195 | | /* Convert one hex digit to a numeric value 0..F, or -1 on failure */ |
1196 | 0 | static int32_t _digit16(UChar c) { |
1197 | 0 | if (c >= u'0' && c <= u'9') { |
1198 | 0 | return c - u'0'; |
1199 | 0 | } |
1200 | 0 | if (c >= u'A' && c <= u'F') { |
1201 | 0 | return c - (u'A' - 10); |
1202 | 0 | } |
1203 | 0 | if (c >= u'a' && c <= u'f') { |
1204 | 0 | return c - (u'a' - 10); |
1205 | 0 | } |
1206 | 0 | return -1; |
1207 | 0 | } |
1208 | | |
1209 | | /* Parse a single escape sequence. Although this method deals in |
1210 | | * UChars, it does not use C++ or UnicodeString. This allows it to |
1211 | | * be used from C contexts. */ |
1212 | | U_CAPI UChar32 U_EXPORT2 |
1213 | | u_unescapeAt(UNESCAPE_CHAR_AT charAt, |
1214 | | int32_t *offset, |
1215 | | int32_t length, |
1216 | 0 | void *context) { |
1217 | |
|
1218 | 0 | int32_t start = *offset; |
1219 | 0 | UChar32 c; |
1220 | 0 | UChar32 result = 0; |
1221 | 0 | int8_t n = 0; |
1222 | 0 | int8_t minDig = 0; |
1223 | 0 | int8_t maxDig = 0; |
1224 | 0 | int8_t bitsPerDigit = 4; |
1225 | 0 | int32_t dig; |
1226 | 0 | UBool braces = FALSE; |
1227 | | |
1228 | | /* Check that offset is in range */ |
1229 | 0 | if (*offset < 0 || *offset >= length) { |
1230 | 0 | goto err; |
1231 | 0 | } |
1232 | | |
1233 | | /* Fetch first UChar after '\\' */ |
1234 | 0 | c = charAt((*offset)++, context); |
1235 | | |
1236 | | /* Convert hexadecimal and octal escapes */ |
1237 | 0 | switch (c) { |
1238 | 0 | case u'u': |
1239 | 0 | minDig = maxDig = 4; |
1240 | 0 | break; |
1241 | 0 | case u'U': |
1242 | 0 | minDig = maxDig = 8; |
1243 | 0 | break; |
1244 | 0 | case u'x': |
1245 | 0 | minDig = 1; |
1246 | 0 | if (*offset < length && charAt(*offset, context) == u'{') { |
1247 | 0 | ++(*offset); |
1248 | 0 | braces = TRUE; |
1249 | 0 | maxDig = 8; |
1250 | 0 | } else { |
1251 | 0 | maxDig = 2; |
1252 | 0 | } |
1253 | 0 | break; |
1254 | 0 | default: |
1255 | 0 | dig = _digit8(c); |
1256 | 0 | if (dig >= 0) { |
1257 | 0 | minDig = 1; |
1258 | 0 | maxDig = 3; |
1259 | 0 | n = 1; /* Already have first octal digit */ |
1260 | 0 | bitsPerDigit = 3; |
1261 | 0 | result = dig; |
1262 | 0 | } |
1263 | 0 | break; |
1264 | 0 | } |
1265 | 0 | if (minDig != 0) { |
1266 | 0 | while (*offset < length && n < maxDig) { |
1267 | 0 | c = charAt(*offset, context); |
1268 | 0 | dig = (bitsPerDigit == 3) ? _digit8(c) : _digit16(c); |
1269 | 0 | if (dig < 0) { |
1270 | 0 | break; |
1271 | 0 | } |
1272 | 0 | result = (result << bitsPerDigit) | dig; |
1273 | 0 | ++(*offset); |
1274 | 0 | ++n; |
1275 | 0 | } |
1276 | 0 | if (n < minDig) { |
1277 | 0 | goto err; |
1278 | 0 | } |
1279 | 0 | if (braces) { |
1280 | 0 | if (c != u'}') { |
1281 | 0 | goto err; |
1282 | 0 | } |
1283 | 0 | ++(*offset); |
1284 | 0 | } |
1285 | 0 | if (result < 0 || result >= 0x110000) { |
1286 | 0 | goto err; |
1287 | 0 | } |
1288 | | /* If an escape sequence specifies a lead surrogate, see if |
1289 | | * there is a trail surrogate after it, either as an escape or |
1290 | | * as a literal. If so, join them up into a supplementary. |
1291 | | */ |
1292 | 0 | if (*offset < length && U16_IS_LEAD(result)) { |
1293 | 0 | int32_t ahead = *offset + 1; |
1294 | 0 | c = charAt(*offset, context); |
1295 | 0 | if (c == u'\\' && ahead < length) { |
1296 | | // Calling ourselves recursively may cause a stack overflow if |
1297 | | // we have repeated escaped lead surrogates. |
1298 | | // Limit the length to 11 ("x{0000DFFF}") after ahead. |
1299 | 0 | int32_t tailLimit = ahead + 11; |
1300 | 0 | if (tailLimit > length) { |
1301 | 0 | tailLimit = length; |
1302 | 0 | } |
1303 | 0 | c = u_unescapeAt(charAt, &ahead, tailLimit, context); |
1304 | 0 | } |
1305 | 0 | if (U16_IS_TRAIL(c)) { |
1306 | 0 | *offset = ahead; |
1307 | 0 | result = U16_GET_SUPPLEMENTARY(result, c); |
1308 | 0 | } |
1309 | 0 | } |
1310 | 0 | return result; |
1311 | 0 | } |
1312 | | |
1313 | | /* Convert C-style escapes in table */ |
1314 | 0 | for (int32_t i=0; i<UNESCAPE_MAP_LENGTH; i+=2) { |
1315 | 0 | if (c == UNESCAPE_MAP[i]) { |
1316 | 0 | return UNESCAPE_MAP[i+1]; |
1317 | 0 | } else if (c < UNESCAPE_MAP[i]) { |
1318 | 0 | break; |
1319 | 0 | } |
1320 | 0 | } |
1321 | | |
1322 | | /* Map \cX to control-X: X & 0x1F */ |
1323 | 0 | if (c == u'c' && *offset < length) { |
1324 | 0 | c = charAt((*offset)++, context); |
1325 | 0 | if (U16_IS_LEAD(c) && *offset < length) { |
1326 | 0 | UChar c2 = charAt(*offset, context); |
1327 | 0 | if (U16_IS_TRAIL(c2)) { |
1328 | 0 | ++(*offset); |
1329 | 0 | c = U16_GET_SUPPLEMENTARY(c, c2); |
1330 | 0 | } |
1331 | 0 | } |
1332 | 0 | return 0x1F & c; |
1333 | 0 | } |
1334 | | |
1335 | | /* If no special forms are recognized, then consider |
1336 | | * the backslash to generically escape the next character. |
1337 | | * Deal with surrogate pairs. */ |
1338 | 0 | if (U16_IS_LEAD(c) && *offset < length) { |
1339 | 0 | UChar c2 = charAt(*offset, context); |
1340 | 0 | if (U16_IS_TRAIL(c2)) { |
1341 | 0 | ++(*offset); |
1342 | 0 | return U16_GET_SUPPLEMENTARY(c, c2); |
1343 | 0 | } |
1344 | 0 | } |
1345 | 0 | return c; |
1346 | | |
1347 | 0 | err: |
1348 | | /* Invalid escape sequence */ |
1349 | 0 | *offset = start; /* Reset to initial value */ |
1350 | 0 | return (UChar32)0xFFFFFFFF; |
1351 | 0 | } |
1352 | | |
1353 | | /* u_unescapeAt() callback to return a UChar from a char* */ |
1354 | | static UChar U_CALLCONV |
1355 | 0 | _charPtr_charAt(int32_t offset, void *context) { |
1356 | 0 | UChar c16; |
1357 | | /* It would be more efficient to access the invariant tables |
1358 | | * directly but there is no API for that. */ |
1359 | 0 | u_charsToUChars(((char*) context) + offset, &c16, 1); |
1360 | 0 | return c16; |
1361 | 0 | } |
1362 | | |
1363 | | /* Append an escape-free segment of the text; used by u_unescape() */ |
1364 | | static void _appendUChars(UChar *dest, int32_t destCapacity, |
1365 | 0 | const char *src, int32_t srcLen) { |
1366 | 0 | if (destCapacity < 0) { |
1367 | 0 | destCapacity = 0; |
1368 | 0 | } |
1369 | 0 | if (srcLen > destCapacity) { |
1370 | 0 | srcLen = destCapacity; |
1371 | 0 | } |
1372 | 0 | u_charsToUChars(src, dest, srcLen); |
1373 | 0 | } |
1374 | | |
1375 | | /* Do an invariant conversion of char* -> UChar*, with escape parsing */ |
1376 | | U_CAPI int32_t U_EXPORT2 |
1377 | 0 | u_unescape(const char *src, UChar *dest, int32_t destCapacity) { |
1378 | 0 | const char *segment = src; |
1379 | 0 | int32_t i = 0; |
1380 | 0 | char c; |
1381 | |
|
1382 | 0 | while ((c=*src) != 0) { |
1383 | | /* '\\' intentionally written as compiler-specific |
1384 | | * character constant to correspond to compiler-specific |
1385 | | * char* constants. */ |
1386 | 0 | if (c == '\\') { |
1387 | 0 | int32_t lenParsed = 0; |
1388 | 0 | UChar32 c32; |
1389 | 0 | if (src != segment) { |
1390 | 0 | if (dest != NULL) { |
1391 | 0 | _appendUChars(dest + i, destCapacity - i, |
1392 | 0 | segment, (int32_t)(src - segment)); |
1393 | 0 | } |
1394 | 0 | i += (int32_t)(src - segment); |
1395 | 0 | } |
1396 | 0 | ++src; /* advance past '\\' */ |
1397 | 0 | c32 = (UChar32)u_unescapeAt(_charPtr_charAt, &lenParsed, (int32_t)uprv_strlen(src), (void*)src); |
1398 | 0 | if (lenParsed == 0) { |
1399 | 0 | goto err; |
1400 | 0 | } |
1401 | 0 | src += lenParsed; /* advance past escape seq. */ |
1402 | 0 | if (dest != NULL && U16_LENGTH(c32) <= (destCapacity - i)) { |
1403 | 0 | U16_APPEND_UNSAFE(dest, i, c32); |
1404 | 0 | } else { |
1405 | 0 | i += U16_LENGTH(c32); |
1406 | 0 | } |
1407 | 0 | segment = src; |
1408 | 0 | } else { |
1409 | 0 | ++src; |
1410 | 0 | } |
1411 | 0 | } |
1412 | 0 | if (src != segment) { |
1413 | 0 | if (dest != NULL) { |
1414 | 0 | _appendUChars(dest + i, destCapacity - i, |
1415 | 0 | segment, (int32_t)(src - segment)); |
1416 | 0 | } |
1417 | 0 | i += (int32_t)(src - segment); |
1418 | 0 | } |
1419 | 0 | if (dest != NULL && i < destCapacity) { |
1420 | 0 | dest[i] = 0; |
1421 | 0 | } |
1422 | 0 | return i; |
1423 | | |
1424 | 0 | err: |
1425 | 0 | if (dest != NULL && destCapacity > 0) { |
1426 | 0 | *dest = 0; |
1427 | 0 | } |
1428 | 0 | return 0; |
1429 | 0 | } |
1430 | | |
1431 | | /* NUL-termination of strings ----------------------------------------------- */ |
1432 | | |
/**
 * NUL-terminate a string no matter what its type.
 * Set warning and error codes accordingly.
 *
 * Nothing happens when pErrorCode is NULL or already holds a failure.
 * Otherwise, depending on length versus destCapacity:
 *   length <  0             nothing (left to the caller)
 *   length <  destCapacity  dest[length]=0; a pending
 *                           U_STRING_NOT_TERMINATED_WARNING is cleared
 *   length == destCapacity  *pErrorCode=U_STRING_NOT_TERMINATED_WARNING
 *   length >  destCapacity  *pErrorCode=U_BUFFER_OVERFLOW_ERROR
 *
 * Every parameter is parenthesized in the expansion so that expression
 * arguments keep their meaning; arguments are still evaluated more than
 * once and therefore must not have side effects.
 */
#define __TERMINATE_STRING(dest, destCapacity, length, pErrorCode) UPRV_BLOCK_MACRO_BEGIN { \
    if((pErrorCode)!=NULL && U_SUCCESS(*(pErrorCode))) {                    \
        /* not a public function, so no complete argument checking */       \
                                                                            \
        if((length)<0) {                                                    \
            /* assume that the caller handles this */                       \
        } else if((length)<(destCapacity)) {                                \
            /* NUL-terminate the string, the NUL fits */                    \
            (dest)[(length)]=0;                                             \
            /* unset the not-terminated warning but leave all others */     \
            if(*(pErrorCode)==U_STRING_NOT_TERMINATED_WARNING) {            \
                *(pErrorCode)=U_ZERO_ERROR;                                 \
            }                                                               \
        } else if((length)==(destCapacity)) {                               \
            /* unable to NUL-terminate, but the string itself fit - set a warning code */ \
            *(pErrorCode)=U_STRING_NOT_TERMINATED_WARNING;                  \
        } else /* length>destCapacity */ {                                  \
            /* even the string itself did not fit - set an error code */    \
            *(pErrorCode)=U_BUFFER_OVERFLOW_ERROR;                          \
        }                                                                   \
    }                                                                       \
} UPRV_BLOCK_MACRO_END
1459 | | |
1460 | | U_CAPI UChar U_EXPORT2 |
1461 | 0 | u_asciiToUpper(UChar c) { |
1462 | 0 | if (u'a' <= c && c <= u'z') { |
1463 | 0 | c = c + u'A' - u'a'; |
1464 | 0 | } |
1465 | 0 | return c; |
1466 | 0 | } |
1467 | | |
1468 | | U_CAPI int32_t U_EXPORT2 |
1469 | 0 | u_terminateUChars(UChar *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) { |
1470 | 0 | __TERMINATE_STRING(dest, destCapacity, length, pErrorCode); |
1471 | 0 | return length; |
1472 | 0 | } |
1473 | | |
1474 | | U_CAPI int32_t U_EXPORT2 |
1475 | 0 | u_terminateChars(char *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) { |
1476 | 0 | __TERMINATE_STRING(dest, destCapacity, length, pErrorCode); |
1477 | 0 | return length; |
1478 | 0 | } |
1479 | | |
1480 | | U_CAPI int32_t U_EXPORT2 |
1481 | 0 | u_terminateUChar32s(UChar32 *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) { |
1482 | 0 | __TERMINATE_STRING(dest, destCapacity, length, pErrorCode); |
1483 | 0 | return length; |
1484 | 0 | } |
1485 | | |
1486 | | U_CAPI int32_t U_EXPORT2 |
1487 | 0 | u_terminateWChars(wchar_t *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) { |
1488 | 0 | __TERMINATE_STRING(dest, destCapacity, length, pErrorCode); |
1489 | 0 | return length; |
1490 | 0 | } |
1491 | | |
1492 | | // Compute the hash code for a string -------------------------------------- *** |
1493 | | |
1494 | | // Moved here from uhash.c so that UnicodeString::hashCode() does not depend |
1495 | | // on UHashtable code. |
1496 | | |
/*
  Compute the hash by iterating sparsely over about 32 (up to 63)
  characters spaced evenly through the string.  For each character,
  multiply the previous hash value by a prime number and add the new
  character in, like a linear congruential random number generator,
  producing a pseudorandom deterministic value well distributed over
  the output range.  [LIU]

  Macro parameters:
    TYPE    element type the string is read as
    STR     pointer to the string; a NULL pointer hashes to 0
    STRLEN  number of elements in the string
    DEREF   expression yielding the value of the current element; it must
            refer to the cursor by the name `p`, which this macro declares

  The expansion ends in a return statement, so the macro must be the body of
  a function returning int32_t.  STR, STRLEN and DEREF are parenthesized in
  the expansion so that expression arguments keep their meaning.
*/

#define STRING_HASH(TYPE, STR, STRLEN, DEREF) UPRV_BLOCK_MACRO_BEGIN { \
    uint32_t hash = 0;                        \
    const TYPE *p = (const TYPE*) (STR);      \
    if (p != NULL) {                          \
        int32_t len = (int32_t)(STRLEN);      \
        /* step of 1 below 64 elements, len/32 from there on */ \
        int32_t inc = ((len - 32) / 32) + 1;  \
        const TYPE *limit = p + len;          \
        while (p<limit) {                     \
            hash = (hash * 37) + (DEREF);     \
            p += inc;                         \
        }                                     \
    }                                         \
    return static_cast<int32_t>(hash);        \
} UPRV_BLOCK_MACRO_END
1520 | | |
1521 | | /* Used by UnicodeString to compute its hashcode - Not public API. */ |
1522 | | U_CAPI int32_t U_EXPORT2 |
1523 | 0 | ustr_hashUCharsN(const UChar *str, int32_t length) { |
1524 | 0 | STRING_HASH(UChar, str, length, *p); |
1525 | 0 | } |
1526 | | |
1527 | | U_CAPI int32_t U_EXPORT2 |
1528 | 0 | ustr_hashCharsN(const char *str, int32_t length) { |
1529 | 0 | STRING_HASH(uint8_t, str, length, *p); |
1530 | 0 | } |
1531 | | |
1532 | | U_CAPI int32_t U_EXPORT2 |
1533 | 0 | ustr_hashICharsN(const char *str, int32_t length) { |
1534 | 0 | STRING_HASH(char, str, length, (uint8_t)uprv_tolower(*p)); |
1535 | 0 | } |