/src/icu/source/i18n/bocsu.cpp
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | // © 2016 and later: Unicode, Inc. and others.  | 
2  |  | // License & terms of use: http://www.unicode.org/copyright.html  | 
3  |  | /*  | 
4  |  | *******************************************************************************  | 
5  |  | *   Copyright (C) 2001-2014, International Business Machines  | 
6  |  | *   Corporation and others.  All Rights Reserved.  | 
7  |  | *******************************************************************************  | 
8  |  | *   file name:  bocsu.cpp  | 
9  |  | *   encoding:   UTF-8  | 
10  |  | *   tab size:   8 (not used)  | 
11  |  | *   indentation:4  | 
12  |  | *  | 
13  |  | *   Author: Markus W. Scherer  | 
14  |  | *  | 
15  |  | *   Modification history:  | 
16  |  | *   05/18/2001  weiv    Made into separate module  | 
17  |  | */  | 
18  |  |  | 
19  |  |  | 
20  |  | #include "unicode/utypes.h"  | 
21  |  |  | 
22  |  | #if !UCONFIG_NO_COLLATION  | 
23  |  |  | 
24  |  | #include "unicode/bytestream.h"  | 
25  |  | #include "unicode/utf16.h"  | 
26  |  | #include "bocsu.h"  | 
27  |  |  | 
28  |  | /*  | 
29  |  |  * encode one difference value -0x10ffff..+0x10ffff in 1..4 bytes,  | 
30  |  |  * preserving lexical order  | 
31  |  |  */  | 
32  |  | static uint8_t *  | 
33  | 0  | u_writeDiff(int32_t diff, uint8_t *p) { | 
34  | 0  |     if(diff>=SLOPE_REACH_NEG_1) { | 
35  | 0  |         if(diff<=SLOPE_REACH_POS_1) { | 
36  | 0  |             *p++=(uint8_t)(SLOPE_MIDDLE+diff);  | 
37  | 0  |         } else if(diff<=SLOPE_REACH_POS_2) { | 
38  | 0  |             *p++=(uint8_t)(SLOPE_START_POS_2+(diff/SLOPE_TAIL_COUNT));  | 
39  | 0  |             *p++=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT);  | 
40  | 0  |         } else if(diff<=SLOPE_REACH_POS_3) { | 
41  | 0  |             p[2]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT);  | 
42  | 0  |             diff/=SLOPE_TAIL_COUNT;  | 
43  | 0  |             p[1]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT);  | 
44  | 0  |             *p=(uint8_t)(SLOPE_START_POS_3+(diff/SLOPE_TAIL_COUNT));  | 
45  | 0  |             p+=3;  | 
46  | 0  |         } else { | 
47  | 0  |             p[3]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT);  | 
48  | 0  |             diff/=SLOPE_TAIL_COUNT;  | 
49  | 0  |             p[2]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT);  | 
50  | 0  |             diff/=SLOPE_TAIL_COUNT;  | 
51  | 0  |             p[1]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT);  | 
52  | 0  |             *p=SLOPE_MAX;  | 
53  | 0  |             p+=4;  | 
54  | 0  |         }  | 
55  | 0  |     } else { | 
56  | 0  |         int32_t m;  | 
57  |  | 
  | 
58  | 0  |         if(diff>=SLOPE_REACH_NEG_2) { | 
59  | 0  |             NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m);  | 
60  | 0  |             *p++=(uint8_t)(SLOPE_START_NEG_2+diff);  | 
61  | 0  |             *p++=(uint8_t)(SLOPE_MIN+m);  | 
62  | 0  |         } else if(diff>=SLOPE_REACH_NEG_3) { | 
63  | 0  |             NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m);  | 
64  | 0  |             p[2]=(uint8_t)(SLOPE_MIN+m);  | 
65  | 0  |             NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m);  | 
66  | 0  |             p[1]=(uint8_t)(SLOPE_MIN+m);  | 
67  | 0  |             *p=(uint8_t)(SLOPE_START_NEG_3+diff);  | 
68  | 0  |             p+=3;  | 
69  | 0  |         } else { | 
70  | 0  |             NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m);  | 
71  | 0  |             p[3]=(uint8_t)(SLOPE_MIN+m);  | 
72  | 0  |             NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m);  | 
73  | 0  |             p[2]=(uint8_t)(SLOPE_MIN+m);  | 
74  | 0  |             NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m);  | 
75  | 0  |             p[1]=(uint8_t)(SLOPE_MIN+m);  | 
76  | 0  |             *p=SLOPE_MIN;  | 
77  | 0  |             p+=4;  | 
78  | 0  |         }  | 
79  | 0  |     }  | 
80  | 0  |     return p;  | 
81  | 0  | }  | 
82  |  |  | 
83  |  | /*  | 
84  |  |  * Encode the code points of a string as  | 
85  |  |  * a sequence of byte-encoded differences (slope detection),  | 
86  |  |  * preserving lexical order.  | 
87  |  |  *  | 
88  |  |  * Optimize the difference-taking for runs of Unicode text within  | 
89  |  |  * small scripts:  | 
90  |  |  *  | 
91  |  |  * Most small scripts are allocated within aligned 128-blocks of Unicode  | 
92  |  |  * code points. Lexical order is preserved if "prev" is always moved  | 
93  |  |  * into the middle of such a block.  | 
94  |  |  *  | 
95  |  |  * Additionally, "prev" is moved from anywhere in the Unihan  | 
96  |  |  * area into the middle of that area.  | 
97  |  |  * Note that the identical-level run in a sort key is generated from  | 
98  |  |  * NFD text - there are never Hangul characters included.  | 
99  |  |  */  | 
100  |  | U_CFUNC UChar32  | 
101  | 0  | u_writeIdenticalLevelRun(UChar32 prev, const UChar *s, int32_t length, icu::ByteSink &sink) { | 
102  | 0  |     char scratch[64];  | 
103  | 0  |     int32_t capacity;  | 
104  |  | 
  | 
105  | 0  |     int32_t i=0;  | 
106  | 0  |     while(i<length) { | 
107  | 0  |         char *buffer=sink.GetAppendBuffer(1, length*2, scratch, (int32_t)sizeof(scratch), &capacity);  | 
108  | 0  |         uint8_t *p;  | 
109  |  |         // We must have capacity>=SLOPE_MAX_BYTES in case u_writeDiff() writes that much,  | 
110  |  |         // but we do not want to force the sink.GetAppendBuffer() to allocate  | 
111  |  |         // for a large min_capacity because we might actually only write one byte.  | 
112  | 0  |         if(capacity<16) { | 
113  | 0  |             buffer=scratch;  | 
114  | 0  |             capacity=(int32_t)sizeof(scratch);  | 
115  | 0  |         }  | 
116  | 0  |         p=reinterpret_cast<uint8_t *>(buffer);  | 
117  | 0  |         uint8_t *lastSafe=p+capacity-SLOPE_MAX_BYTES;  | 
118  | 0  |         while(i<length && p<=lastSafe) { | 
119  | 0  |             if(prev<0x4e00 || prev>=0xa000) { | 
120  | 0  |                 prev=(prev&~0x7f)-SLOPE_REACH_NEG_1;  | 
121  | 0  |             } else { | 
122  |  |                 /*  | 
123  |  |                  * Unihan U+4e00..U+9fa5:  | 
124  |  |                  * double-bytes down from the upper end  | 
125  |  |                  */  | 
126  | 0  |                 prev=0x9fff-SLOPE_REACH_POS_2;  | 
127  | 0  |             }  | 
128  |  | 
  | 
129  | 0  |             UChar32 c;  | 
130  | 0  |             U16_NEXT(s, i, length, c);  | 
131  | 0  |             if(c==0xfffe) { | 
132  | 0  |                 *p++=2;  // merge separator  | 
133  | 0  |                 prev=0;  | 
134  | 0  |             } else { | 
135  | 0  |                 p=u_writeDiff(c-prev, p);  | 
136  | 0  |                 prev=c;  | 
137  | 0  |             }  | 
138  | 0  |         }  | 
139  | 0  |         sink.Append(buffer, (int32_t)(p-reinterpret_cast<uint8_t *>(buffer)));  | 
140  | 0  |     }  | 
141  | 0  |     return prev;  | 
142  | 0  | }  | 
143  |  |  | 
144  |  | #endif /* #if !UCONFIG_NO_COLLATION */  |