Coverage Report

Created: 2024-04-24 06:23

/src/icu/source/common/ucnvbocu.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
******************************************************************************
5
*
6
*   Copyright (C) 2002-2016, International Business Machines
7
*   Corporation and others.  All Rights Reserved.
8
*
9
******************************************************************************
10
*   file name:  ucnvbocu.cpp
11
*   encoding:   UTF-8
12
*   tab size:   8 (not used)
13
*   indentation:4
14
*
15
*   created on: 2002mar27
16
*   created by: Markus W. Scherer
17
*
18
*   This is an implementation of the Binary Ordered Compression for Unicode,
19
*   in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/
20
*/
21
22
#include "unicode/utypes.h"
23
24
#if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
25
26
#include "unicode/ucnv.h"
27
#include "unicode/ucnv_cb.h"
28
#include "unicode/utf16.h"
29
#include "putilimp.h"
30
#include "ucnv_bld.h"
31
#include "ucnv_cnv.h"
32
#include "uassert.h"
33
34
/* BOCU-1 constants and macros ---------------------------------------------- */
35
36
/*
37
 * BOCU-1 encodes the code points of a Unicode string as
38
 * a sequence of byte-encoded differences (slope detection),
39
 * preserving lexical order.
40
 *
41
 * Optimize the difference-taking for runs of Unicode text within
42
 * small scripts:
43
 *
44
 * Most small scripts are allocated within aligned 128-blocks of Unicode
45
 * code points. Lexical order is preserved if the "previous code point" state
46
 * is always moved into the middle of such a block.
47
 *
48
 * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
49
 * areas into the middle of those areas.
50
 *
51
 * C0 control codes and space are encoded with their US-ASCII bytes.
52
 * "prev" is reset for C0 controls but not for space.
53
 */
54
55
/* "prev" starts out (and is reset to) the middle of the ASCII range. */
#define BOCU1_ASCII_PREV 0x40

/*
 * Byte values that bound the difference encoding:
 * lowest usable lead byte, the byte for difference 0,
 * highest lead byte, highest trail byte, and the reset byte.
 */
#define BOCU1_MIN 0x21
#define BOCU1_MIDDLE 0x90
#define BOCU1_MAX_LEAD 0xfe
#define BOCU1_MAX_TRAIL 0xff
#define BOCU1_RESET 0xff

/* How many distinct lead byte values exist. */
#define BOCU1_COUNT (BOCU1_MAX_LEAD-BOCU1_MIN+1)

/*
 * Some C0 control byte values double as trail bytes;
 * these two values account for them in the trail byte arithmetic.
 */
#define BOCU1_TRAIL_CONTROLS_COUNT 20
#define BOCU1_TRAIL_BYTE_OFFSET (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)

/* How many distinct trail byte values exist. */
#define BOCU1_TRAIL_COUNT ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)

/*
 * Number of single-byte codes on each side of the middle
 * (difference 0, encoded as BOCU1_MIDDLE, counts as positive).
 */
#define BOCU1_SINGLE 64

/* Lead byte counts per sign for 2-, 3- and 4-byte sequences. */
#define BOCU1_LEAD_2 43
#define BOCU1_LEAD_3 3
#define BOCU1_LEAD_4 1

/* Largest/smallest difference that a single byte can express. */
#define BOCU1_REACH_POS_1 (BOCU1_SINGLE-1)
#define BOCU1_REACH_NEG_1 (-BOCU1_SINGLE)

/* Largest/smallest difference that two bytes can express. */
#define BOCU1_REACH_POS_2 (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_2 (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)

/* Largest/smallest difference that three bytes can express. */
#define BOCU1_REACH_POS_3 (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_3 (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

/* First lead byte of each positive multi-byte range. */
#define BOCU1_START_POS_2 (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
#define BOCU1_START_POS_3 (BOCU1_START_POS_2+BOCU1_LEAD_2)
#define BOCU1_START_POS_4 (BOCU1_START_POS_3+BOCU1_LEAD_3)
     /* BOCU1_START_POS_4==BOCU1_MAX_LEAD */

/* Boundary lead bytes of the negative ranges (single-byte range starts at START_NEG_2). */
#define BOCU1_START_NEG_2 (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
#define BOCU1_START_NEG_3 (BOCU1_START_NEG_2-BOCU1_LEAD_2)
#define BOCU1_START_NEG_4 (BOCU1_START_NEG_3-BOCU1_LEAD_3)
     /* BOCU1_START_NEG_4==BOCU1_MIN+1 */

/*
 * Sequence length (1..4) implied by a lead byte.
 * Not meaningful for lead==BOCU1_RESET. Evaluates its argument several times.
 */
#define BOCU1_LENGTH_FROM_LEAD(lead) \
    ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
     (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
     (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)

/*
 * Sequence length (1..4) of a packed byte sequence:
 * for 1..3 bytes the top byte holds the length itself,
 * anything with a larger top byte is a 4-byte sequence.
 */
#define BOCU1_LENGTH_FROM_PACKED(packed) \
    ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)
120
121
/*
122
 * 12 commonly used C0 control codes (and space) are only used to encode
123
 * themselves directly,
124
 * which makes BOCU-1 MIME-usable and reasonably safe for
125
 * ASCII-oriented software.
126
 *
127
 * These controls are
128
 *  0   NUL
129
 *
130
 *  7   BEL
131
 *  8   BS
132
 *
133
 *  9   TAB
134
 *  a   LF
135
 *  b   VT
136
 *  c   FF
137
 *  d   CR
138
 *
139
 *  e   SO
140
 *  f   SI
141
 *
142
 * 1a   SUB
143
 * 1b   ESC
144
 *
145
 * The other 20 C0 controls are also encoded directly (to preserve order)
146
 * but are also used as trail bytes in difference encoding
147
 * (for better compression).
148
 */
149
0
/*
 * Convert a trail value (as used in the difference arithmetic) into the byte
 * that is written out: the lowest BOCU1_TRAIL_CONTROLS_COUNT values go through
 * the bocu1TrailToByte[] table, all others are shifted by a constant offset.
 * Evaluates its argument more than once.
 */
#define BOCU1_TRAIL_TO_BYTE(t) \
    ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])
150
151
/*
152
 * Byte value map for control codes,
153
 * from external byte values 0x00..0x20
154
 * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
155
 * External byte values that are illegal as trail bytes are mapped to -1.
156
 */
157
static const int8_t
158
bocu1ByteToTrail[BOCU1_MIN]={
159
/*  0     1     2     3     4     5     6     7    */
160
    -1,   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
161
162
/*  8     9     a     b     c     d     e     f    */
163
    -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
164
165
/*  10    11    12    13    14    15    16    17   */
166
    0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
167
168
/*  18    19    1a    1b    1c    1d    1e    1f   */
169
    0x0e, 0x0f, -1,   -1,   0x10, 0x11, 0x12, 0x13,
170
171
/*  20   */
172
    -1
173
};
174
175
/*
176
 * Byte value map for control codes,
177
 * from trail byte values 0..19 (0..0x13) as used in the difference calculation
178
 * to external byte values 0x00..0x20.
179
 */
180
static const int8_t
181
bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
182
/*  0     1     2     3     4     5     6     7    */
183
    0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
184
185
/*  8     9     a     b     c     d     e     f    */
186
    0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
187
188
/*  10    11    12    13   */
189
    0x1c, 0x1d, 0x1e, 0x1f
190
};
191
192
/**
 * Division with a non-negative remainder.
 *
 * Integer division and modulo with a negative numerator yield a negative
 * remainder and a quotient that is one more than what is needed here.
 * This macro adjusts both results so that the remainder m is always >=0.
 *
 * For a non-negative n the correction branch is never taken.
 *
 * @param n Number to be split into quotient and remainder.
 *          Overwritten with the quotient.
 * @param d Divisor.
 * @param m Output variable that receives the remainder.
 */
#define NEGDIVMOD(n, d, m) UPRV_BLOCK_MACRO_BEGIN { \
    (m)=(n)%(d); \
    (n)/=(d); \
    if(0>(m)) { \
        (n)-=1; \
        (m)+=(d); \
    } \
} UPRV_BLOCK_MACRO_END
213
214
/* Shortcuts that avoid packDiff() when a difference fits one or two bytes. */

/** True if diff lies in the single-byte range BOCU1_REACH_NEG_1..BOCU1_REACH_POS_1. */
#define DIFF_IS_SINGLE(diff) \
    (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1)

/** The one byte that encodes a single-byte diff: an offset from BOCU1_MIDDLE. */
#define PACK_SINGLE_DIFF(diff) (BOCU1_MIDDLE+(diff))

/** True if diff lies in the range BOCU1_REACH_NEG_2..BOCU1_REACH_POS_2 (at most two bytes). */
#define DIFF_IS_DOUBLE(diff) \
    (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2)

/* BOCU-1 implementation functions ------------------------------------------ */

/** "prev" for most scripts: offset BOCU1_ASCII_PREV within the 128-aligned block of c. */
#define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV)
228
229
/**
230
 * Compute the next "previous" value for differencing
231
 * from the current code point.
232
 *
233
 * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below)
234
 * @return "previous code point" state value
235
 */
236
static inline int32_t
237
0
bocu1Prev(int32_t c) {
238
    /* compute new prev */
239
0
    if(/* 0x3040<=c && */ c<=0x309f) {
240
        /* Hiragana is not 128-aligned */
241
0
        return 0x3070;
242
0
    } else if(0x4e00<=c && c<=0x9fa5) {
243
        /* CJK Unihan */
244
0
        return 0x4e00-BOCU1_REACH_NEG_2;
245
0
    } else if(0xac00<=c /* && c<=0xd7a3 */) {
246
        /* Korean Hangul */
247
0
        return (0xd7a3+0xac00)/2;
248
0
    } else {
249
        /* mostly small scripts */
250
0
        return BOCU1_SIMPLE_PREV(c);
251
0
    }
252
0
}
253
254
/**
 * Next "prev" for any code point: code points outside 0x3040..0xd7a3 use
 * BOCU1_SIMPLE_PREV() inline, the rest are delegated to bocu1Prev().
 * Evaluates its argument more than once.
 */
#define BOCU1_PREV(c) \
    ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c))
256
257
/*
258
 * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.
259
 * The UConverter fields are used as follows:
260
 *
261
 * fromUnicodeStatus    encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
262
 *
263
 * toUnicodeStatus      decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
264
 * mode                 decoder's incomplete (diff<<2)|count (ignored when toULength==0)
265
 */
266
267
/* BOCU-1-from-Unicode conversion functions --------------------------------- */
268
269
/**
270
 * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
271
 * and return a packed integer with them.
272
 *
273
 * The encoding favors small absolute differences with short encodings
274
 * to compress runs of same-script characters.
275
 *
276
 * Optimized version with unrolled loops and fewer floating-point operations
277
 * than the standard packDiff().
278
 *
279
 * @param diff difference value -0x10ffff..0x10ffff
280
 * @return
281
 *      0x010000zz for 1-byte sequence zz
282
 *      0x0200yyzz for 2-byte sequence yy zz
283
 *      0x03xxyyzz for 3-byte sequence xx yy zz
284
 *      0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
285
 */
286
static int32_t
287
0
packDiff(int32_t diff) {
288
0
    int32_t result, m;
289
290
0
    U_ASSERT(!DIFF_IS_SINGLE(diff)); /* assume we won't be called where diff==BOCU1_REACH_NEG_1=-64 */
291
0
    if(diff>=BOCU1_REACH_NEG_1) {
292
        /* mostly positive differences, and single-byte negative ones */
293
#if 0   /* single-byte case handled in macros, see below */
294
        if(diff<=BOCU1_REACH_POS_1) {
295
            /* single byte */
296
            return 0x01000000|(BOCU1_MIDDLE+diff);
297
        } else
298
#endif
299
0
        if(diff<=BOCU1_REACH_POS_2) {
300
            /* two bytes */
301
0
            diff-=BOCU1_REACH_POS_1+1;
302
0
            result=0x02000000;
303
304
0
            m=diff%BOCU1_TRAIL_COUNT;
305
0
            diff/=BOCU1_TRAIL_COUNT;
306
0
            result|=BOCU1_TRAIL_TO_BYTE(m);
307
308
0
            result|=(BOCU1_START_POS_2+diff)<<8;
309
0
        } else if(diff<=BOCU1_REACH_POS_3) {
310
            /* three bytes */
311
0
            diff-=BOCU1_REACH_POS_2+1;
312
0
            result=0x03000000;
313
314
0
            m=diff%BOCU1_TRAIL_COUNT;
315
0
            diff/=BOCU1_TRAIL_COUNT;
316
0
            result|=BOCU1_TRAIL_TO_BYTE(m);
317
318
0
            m=diff%BOCU1_TRAIL_COUNT;
319
0
            diff/=BOCU1_TRAIL_COUNT;
320
0
            result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
321
322
0
            result|=(BOCU1_START_POS_3+diff)<<16;
323
0
        } else {
324
            /* four bytes */
325
0
            diff-=BOCU1_REACH_POS_3+1;
326
327
0
            m=diff%BOCU1_TRAIL_COUNT;
328
0
            diff/=BOCU1_TRAIL_COUNT;
329
0
            result=BOCU1_TRAIL_TO_BYTE(m);
330
331
0
            m=diff%BOCU1_TRAIL_COUNT;
332
0
            diff/=BOCU1_TRAIL_COUNT;
333
0
            result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
334
335
            /*
336
             * We know that / and % would deliver quotient 0 and rest=diff.
337
             * Avoid division and modulo for performance.
338
             */
339
0
            result|=BOCU1_TRAIL_TO_BYTE(diff)<<16;
340
341
0
            result|=((uint32_t)BOCU1_START_POS_4)<<24;
342
0
        }
343
0
    } else {
344
        /* two- to four-byte negative differences */
345
0
        if(diff>=BOCU1_REACH_NEG_2) {
346
            /* two bytes */
347
0
            diff-=BOCU1_REACH_NEG_1;
348
0
            result=0x02000000;
349
350
0
            NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
351
0
            result|=BOCU1_TRAIL_TO_BYTE(m);
352
353
0
            result|=(BOCU1_START_NEG_2+diff)<<8;
354
0
        } else if(diff>=BOCU1_REACH_NEG_3) {
355
            /* three bytes */
356
0
            diff-=BOCU1_REACH_NEG_2;
357
0
            result=0x03000000;
358
359
0
            NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
360
0
            result|=BOCU1_TRAIL_TO_BYTE(m);
361
362
0
            NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
363
0
            result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
364
365
0
            result|=(BOCU1_START_NEG_3+diff)<<16;
366
0
        } else {
367
            /* four bytes */
368
0
            diff-=BOCU1_REACH_NEG_3;
369
370
0
            NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
371
0
            result=BOCU1_TRAIL_TO_BYTE(m);
372
373
0
            NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
374
0
            result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
375
376
            /*
377
             * We know that NEGDIVMOD would deliver
378
             * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.
379
             * Avoid division and modulo for performance.
380
             */
381
0
            m=diff+BOCU1_TRAIL_COUNT;
382
0
            result|=BOCU1_TRAIL_TO_BYTE(m)<<16;
383
384
0
            result|=BOCU1_MIN<<24;
385
0
        }
386
0
    }
387
0
    return result;
388
0
}
389
390
391
static void U_CALLCONV
392
_Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
393
0
                             UErrorCode *pErrorCode) {
394
0
    UConverter *cnv;
395
0
    const UChar *source, *sourceLimit;
396
0
    uint8_t *target;
397
0
    int32_t targetCapacity;
398
0
    int32_t *offsets;
399
400
0
    int32_t prev, c, diff;
401
402
0
    int32_t sourceIndex, nextSourceIndex;
403
404
    /* set up the local pointers */
405
0
    cnv=pArgs->converter;
406
0
    source=pArgs->source;
407
0
    sourceLimit=pArgs->sourceLimit;
408
0
    target=(uint8_t *)pArgs->target;
409
0
    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
410
0
    offsets=pArgs->offsets;
411
412
    /* get the converter state from UConverter */
413
0
    c=cnv->fromUChar32;
414
0
    prev=(int32_t)cnv->fromUnicodeStatus;
415
0
    if(prev==0) {
416
0
        prev=BOCU1_ASCII_PREV;
417
0
    }
418
419
    /* sourceIndex=-1 if the current character began in the previous buffer */
420
0
    sourceIndex= c==0 ? 0 : -1;
421
0
    nextSourceIndex=0;
422
423
    /* conversion loop */
424
0
    if(c!=0 && targetCapacity>0) {
425
0
        goto getTrail;
426
0
    }
427
428
0
fastSingle:
429
    /* fast loop for single-byte differences */
430
    /* use only one loop counter variable, targetCapacity, not also source */
431
0
    diff=(int32_t)(sourceLimit-source);
432
0
    if(targetCapacity>diff) {
433
0
        targetCapacity=diff;
434
0
    }
435
0
    while(targetCapacity>0 && (c=*source)<0x3000) {
436
0
        if(c<=0x20) {
437
0
            if(c!=0x20) {
438
0
                prev=BOCU1_ASCII_PREV;
439
0
            }
440
0
            *target++=(uint8_t)c;
441
0
            *offsets++=nextSourceIndex++;
442
0
            ++source;
443
0
            --targetCapacity;
444
0
        } else {
445
0
            diff=c-prev;
446
0
            if(DIFF_IS_SINGLE(diff)) {
447
0
                prev=BOCU1_SIMPLE_PREV(c);
448
0
                *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
449
0
                *offsets++=nextSourceIndex++;
450
0
                ++source;
451
0
                --targetCapacity;
452
0
            } else {
453
0
                break;
454
0
            }
455
0
        }
456
0
    }
457
    /* restore real values */
458
0
    targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
459
0
    sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
460
461
    /* regular loop for all cases */
462
0
    while(source<sourceLimit) {
463
0
        if(targetCapacity>0) {
464
0
            c=*source++;
465
0
            ++nextSourceIndex;
466
467
0
            if(c<=0x20) {
468
                /*
469
                 * ISO C0 control & space:
470
                 * Encode directly for MIME compatibility,
471
                 * and reset state except for space, to not disrupt compression.
472
                 */
473
0
                if(c!=0x20) {
474
0
                    prev=BOCU1_ASCII_PREV;
475
0
                }
476
0
                *target++=(uint8_t)c;
477
0
                *offsets++=sourceIndex;
478
0
                --targetCapacity;
479
480
0
                sourceIndex=nextSourceIndex;
481
0
                continue;
482
0
            }
483
484
0
            if(U16_IS_LEAD(c)) {
485
0
getTrail:
486
0
                if(source<sourceLimit) {
487
                    /* test the following code unit */
488
0
                    UChar trail=*source;
489
0
                    if(U16_IS_TRAIL(trail)) {
490
0
                        ++source;
491
0
                        ++nextSourceIndex;
492
0
                        c=U16_GET_SUPPLEMENTARY(c, trail);
493
0
                    }
494
0
                } else {
495
                    /* no more input */
496
0
                    c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
497
0
                    break;
498
0
                }
499
0
            }
500
501
            /*
502
             * all other Unicode code points c==U+0021..U+10ffff
503
             * are encoded with the difference c-prev
504
             *
505
             * a new prev is computed from c,
506
             * placed in the middle of a 0x80-block (for most small scripts) or
507
             * in the middle of the Unihan and Hangul blocks
508
             * to statistically minimize the following difference
509
             */
510
0
            diff=c-prev;
511
0
            prev=BOCU1_PREV(c);
512
0
            if(DIFF_IS_SINGLE(diff)) {
513
0
                *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
514
0
                *offsets++=sourceIndex;
515
0
                --targetCapacity;
516
0
                sourceIndex=nextSourceIndex;
517
0
                if(c<0x3000) {
518
0
                    goto fastSingle;
519
0
                }
520
0
            } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
521
                /* optimize 2-byte case */
522
0
                int32_t m;
523
524
0
                if(diff>=0) {
525
0
                    diff-=BOCU1_REACH_POS_1+1;
526
0
                    m=diff%BOCU1_TRAIL_COUNT;
527
0
                    diff/=BOCU1_TRAIL_COUNT;
528
0
                    diff+=BOCU1_START_POS_2;
529
0
                } else {
530
0
                    diff-=BOCU1_REACH_NEG_1;
531
0
                    NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
532
0
                    diff+=BOCU1_START_NEG_2;
533
0
                }
534
0
                *target++=(uint8_t)diff;
535
0
                *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
536
0
                *offsets++=sourceIndex;
537
0
                *offsets++=sourceIndex;
538
0
                targetCapacity-=2;
539
0
                sourceIndex=nextSourceIndex;
540
0
            } else {
541
0
                int32_t length; /* will be 2..4 */
542
543
0
                diff=packDiff(diff);
544
0
                length=BOCU1_LENGTH_FROM_PACKED(diff);
545
546
                /* write the output character bytes from diff and length */
547
                /* from the first if in the loop we know that targetCapacity>0 */
548
0
                if(length<=targetCapacity) {
549
0
                    switch(length) {
550
                        /* each branch falls through to the next one */
551
0
                    case 4:
552
0
                        *target++=(uint8_t)(diff>>24);
553
0
                        *offsets++=sourceIndex;
554
0
                        U_FALLTHROUGH;
555
0
                    case 3:
556
0
                        *target++=(uint8_t)(diff>>16);
557
0
                        *offsets++=sourceIndex;
558
0
                        U_FALLTHROUGH;
559
0
                    case 2:
560
0
                        *target++=(uint8_t)(diff>>8);
561
0
                        *offsets++=sourceIndex;
562
                    /* case 1: handled above */
563
0
                        *target++=(uint8_t)diff;
564
0
                        *offsets++=sourceIndex;
565
0
                        U_FALLTHROUGH;
566
0
                    default:
567
                        /* will never occur */
568
0
                        break;
569
0
                    }
570
0
                    targetCapacity-=length;
571
0
                    sourceIndex=nextSourceIndex;
572
0
                } else {
573
0
                    uint8_t *charErrorBuffer;
574
575
                    /*
576
                     * We actually do this backwards here:
577
                     * In order to save an intermediate variable, we output
578
                     * first to the overflow buffer what does not fit into the
579
                     * regular target.
580
                     */
581
                    /* we know that 1<=targetCapacity<length<=4 */
582
0
                    length-=targetCapacity;
583
0
                    charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
584
0
                    switch(length) {
585
                        /* each branch falls through to the next one */
586
0
                    case 3:
587
0
                        *charErrorBuffer++=(uint8_t)(diff>>16);
588
0
                        U_FALLTHROUGH;
589
0
                    case 2:
590
0
                        *charErrorBuffer++=(uint8_t)(diff>>8);
591
0
                        U_FALLTHROUGH;
592
0
                    case 1:
593
0
                        *charErrorBuffer=(uint8_t)diff;
594
0
                        U_FALLTHROUGH;
595
0
                    default:
596
                        /* will never occur */
597
0
                        break;
598
0
                    }
599
0
                    cnv->charErrorBufferLength=(int8_t)length;
600
601
                    /* now output what fits into the regular target */
602
0
                    diff>>=8*length; /* length was reduced by targetCapacity */
603
0
                    switch(targetCapacity) {
604
                        /* each branch falls through to the next one */
605
0
                    case 3:
606
0
                        *target++=(uint8_t)(diff>>16);
607
0
                        *offsets++=sourceIndex;
608
0
                        U_FALLTHROUGH;
609
0
                    case 2:
610
0
                        *target++=(uint8_t)(diff>>8);
611
0
                        *offsets++=sourceIndex;
612
0
                        U_FALLTHROUGH;
613
0
                    case 1:
614
0
                        *target++=(uint8_t)diff;
615
0
                        *offsets++=sourceIndex;
616
0
                        U_FALLTHROUGH;
617
0
                    default:
618
                        /* will never occur */
619
0
                        break;
620
0
                    }
621
622
                    /* target overflow */
623
0
                    targetCapacity=0;
624
0
                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
625
0
                    break;
626
0
                }
627
0
            }
628
0
        } else {
629
            /* target is full */
630
0
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
631
0
            break;
632
0
        }
633
0
    }
634
635
    /* set the converter state back into UConverter */
636
0
    cnv->fromUChar32= c<0 ? -c : 0;
637
0
    cnv->fromUnicodeStatus=(uint32_t)prev;
638
639
    /* write back the updated pointers */
640
0
    pArgs->source=source;
641
0
    pArgs->target=(char *)target;
642
0
    pArgs->offsets=offsets;
643
0
}
644
645
/*
646
 * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling.
647
 * If a change is made in the original function, then either
648
 * change this function the same way or
649
 * re-copy the original function and remove the variables
650
 * offsets, sourceIndex, and nextSourceIndex.
651
 */
652
static void U_CALLCONV
653
_Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs,
654
0
                  UErrorCode *pErrorCode) {
655
0
    UConverter *cnv;
656
0
    const UChar *source, *sourceLimit;
657
0
    uint8_t *target;
658
0
    int32_t targetCapacity;
659
660
0
    int32_t prev, c, diff;
661
662
    /* set up the local pointers */
663
0
    cnv=pArgs->converter;
664
0
    source=pArgs->source;
665
0
    sourceLimit=pArgs->sourceLimit;
666
0
    target=(uint8_t *)pArgs->target;
667
0
    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
668
669
    /* get the converter state from UConverter */
670
0
    c=cnv->fromUChar32;
671
0
    prev=(int32_t)cnv->fromUnicodeStatus;
672
0
    if(prev==0) {
673
0
        prev=BOCU1_ASCII_PREV;
674
0
    }
675
676
    /* conversion loop */
677
0
    if(c!=0 && targetCapacity>0) {
678
0
        goto getTrail;
679
0
    }
680
681
0
fastSingle:
682
    /* fast loop for single-byte differences */
683
    /* use only one loop counter variable, targetCapacity, not also source */
684
0
    diff=(int32_t)(sourceLimit-source);
685
0
    if(targetCapacity>diff) {
686
0
        targetCapacity=diff;
687
0
    }
688
0
    while(targetCapacity>0 && (c=*source)<0x3000) {
689
0
        if(c<=0x20) {
690
0
            if(c!=0x20) {
691
0
                prev=BOCU1_ASCII_PREV;
692
0
            }
693
0
            *target++=(uint8_t)c;
694
0
        } else {
695
0
            diff=c-prev;
696
0
            if(DIFF_IS_SINGLE(diff)) {
697
0
                prev=BOCU1_SIMPLE_PREV(c);
698
0
                *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
699
0
            } else {
700
0
                break;
701
0
            }
702
0
        }
703
0
        ++source;
704
0
        --targetCapacity;
705
0
    }
706
    /* restore real values */
707
0
    targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
708
709
    /* regular loop for all cases */
710
0
    while(source<sourceLimit) {
711
0
        if(targetCapacity>0) {
712
0
            c=*source++;
713
714
0
            if(c<=0x20) {
715
                /*
716
                 * ISO C0 control & space:
717
                 * Encode directly for MIME compatibility,
718
                 * and reset state except for space, to not disrupt compression.
719
                 */
720
0
                if(c!=0x20) {
721
0
                    prev=BOCU1_ASCII_PREV;
722
0
                }
723
0
                *target++=(uint8_t)c;
724
0
                --targetCapacity;
725
0
                continue;
726
0
            }
727
728
0
            if(U16_IS_LEAD(c)) {
729
0
getTrail:
730
0
                if(source<sourceLimit) {
731
                    /* test the following code unit */
732
0
                    UChar trail=*source;
733
0
                    if(U16_IS_TRAIL(trail)) {
734
0
                        ++source;
735
0
                        c=U16_GET_SUPPLEMENTARY(c, trail);
736
0
                    }
737
0
                } else {
738
                    /* no more input */
739
0
                    c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
740
0
                    break;
741
0
                }
742
0
            }
743
744
            /*
745
             * all other Unicode code points c==U+0021..U+10ffff
746
             * are encoded with the difference c-prev
747
             *
748
             * a new prev is computed from c,
749
             * placed in the middle of a 0x80-block (for most small scripts) or
750
             * in the middle of the Unihan and Hangul blocks
751
             * to statistically minimize the following difference
752
             */
753
0
            diff=c-prev;
754
0
            prev=BOCU1_PREV(c);
755
0
            if(DIFF_IS_SINGLE(diff)) {
756
0
                *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
757
0
                --targetCapacity;
758
0
                if(c<0x3000) {
759
0
                    goto fastSingle;
760
0
                }
761
0
            } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
762
                /* optimize 2-byte case */
763
0
                int32_t m;
764
765
0
                if(diff>=0) {
766
0
                    diff-=BOCU1_REACH_POS_1+1;
767
0
                    m=diff%BOCU1_TRAIL_COUNT;
768
0
                    diff/=BOCU1_TRAIL_COUNT;
769
0
                    diff+=BOCU1_START_POS_2;
770
0
                } else {
771
0
                    diff-=BOCU1_REACH_NEG_1;
772
0
                    NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
773
0
                    diff+=BOCU1_START_NEG_2;
774
0
                }
775
0
                *target++=(uint8_t)diff;
776
0
                *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
777
0
                targetCapacity-=2;
778
0
            } else {
779
0
                int32_t length; /* will be 2..4 */
780
781
0
                diff=packDiff(diff);
782
0
                length=BOCU1_LENGTH_FROM_PACKED(diff);
783
784
                /* write the output character bytes from diff and length */
785
                /* from the first if in the loop we know that targetCapacity>0 */
786
0
                if(length<=targetCapacity) {
787
0
                    switch(length) {
788
                        /* each branch falls through to the next one */
789
0
                    case 4:
790
0
                        *target++=(uint8_t)(diff>>24);
791
0
                        U_FALLTHROUGH;
792
0
                    case 3:
793
0
                        *target++=(uint8_t)(diff>>16);
794
                    /* case 2: handled above */
795
0
                        *target++=(uint8_t)(diff>>8);
796
                    /* case 1: handled above */
797
0
                        *target++=(uint8_t)diff;
798
0
                        U_FALLTHROUGH;
799
0
                    default:
800
                        /* will never occur */
801
0
                        break;
802
0
                    }
803
0
                    targetCapacity-=length;
804
0
                } else {
805
0
                    uint8_t *charErrorBuffer;
806
807
                    /*
808
                     * We actually do this backwards here:
809
                     * In order to save an intermediate variable, we output
810
                     * first to the overflow buffer what does not fit into the
811
                     * regular target.
812
                     */
813
                    /* we know that 1<=targetCapacity<length<=4 */
814
0
                    length-=targetCapacity;
815
0
                    charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
816
0
                    switch(length) {
817
                        /* each branch falls through to the next one */
818
0
                    case 3:
819
0
                        *charErrorBuffer++=(uint8_t)(diff>>16);
820
0
                        U_FALLTHROUGH;
821
0
                    case 2:
822
0
                        *charErrorBuffer++=(uint8_t)(diff>>8);
823
0
                        U_FALLTHROUGH;
824
0
                    case 1:
825
0
                        *charErrorBuffer=(uint8_t)diff;
826
0
                        U_FALLTHROUGH;
827
0
                    default:
828
                        /* will never occur */
829
0
                        break;
830
0
                    }
831
0
                    cnv->charErrorBufferLength=(int8_t)length;
832
833
                    /* now output what fits into the regular target */
834
0
                    diff>>=8*length; /* length was reduced by targetCapacity */
835
0
                    switch(targetCapacity) {
836
                        /* each branch falls through to the next one */
837
0
                    case 3:
838
0
                        *target++=(uint8_t)(diff>>16);
839
0
                        U_FALLTHROUGH;
840
0
                    case 2:
841
0
                        *target++=(uint8_t)(diff>>8);
842
0
                        U_FALLTHROUGH;
843
0
                    case 1:
844
0
                        *target++=(uint8_t)diff;
845
0
                        U_FALLTHROUGH;
846
0
                    default:
847
                        /* will never occur */
848
0
                        break;
849
0
                    }
850
851
                    /* target overflow */
852
0
                    targetCapacity=0;
853
0
                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
854
0
                    break;
855
0
                }
856
0
            }
857
0
        } else {
858
            /* target is full */
859
0
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
860
0
            break;
861
0
        }
862
0
    }
863
864
    /* set the converter state back into UConverter */
865
0
    cnv->fromUChar32= c<0 ? -c : 0;
866
0
    cnv->fromUnicodeStatus=(uint32_t)prev;
867
868
    /* write back the updated pointers */
869
0
    pArgs->source=source;
870
0
    pArgs->target=(char *)target;
871
0
}
872
873
/* BOCU-1-to-Unicode conversion functions ----------------------------------- */
874
875
/**
876
 * Function for BOCU-1 decoder; handles multi-byte lead bytes.
877
 *
878
 * @param b lead byte;
879
 *          BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD
880
 * @return (diff<<2)|count
881
 */
882
static inline int32_t
883
0
decodeBocu1LeadByte(int32_t b) {
884
0
    int32_t diff, count;
885
886
0
    if(b>=BOCU1_START_NEG_2) {
887
        /* positive difference */
888
0
        if(b<BOCU1_START_POS_3) {
889
            /* two bytes */
890
0
            diff=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
891
0
            count=1;
892
0
        } else if(b<BOCU1_START_POS_4) {
893
            /* three bytes */
894
0
            diff=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
895
0
            count=2;
896
0
        } else {
897
            /* four bytes */
898
0
            diff=BOCU1_REACH_POS_3+1;
899
0
            count=3;
900
0
        }
901
0
    } else {
902
        /* negative difference */
903
0
        if(b>=BOCU1_START_NEG_3) {
904
            /* two bytes */
905
0
            diff=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
906
0
            count=1;
907
0
        } else if(b>BOCU1_MIN) {
908
            /* three bytes */
909
0
            diff=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
910
0
            count=2;
911
0
        } else {
912
            /* four bytes */
913
0
            diff=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
914
0
            count=3;
915
0
        }
916
0
    }
917
918
    /* return the state for decoding the trail byte(s) */
919
0
    return (diff<<2)|count;
920
0
}
921
922
/**
923
 * Function for BOCU-1 decoder; handles multi-byte trail bytes.
924
 *
925
 * @param count number of remaining trail bytes including this one
926
 * @param b trail byte
927
 * @return new delta for diff including b - <0 indicates an error
928
 *
929
 * @see decodeBocu1
930
 */
931
static inline int32_t
932
0
decodeBocu1TrailByte(int32_t count, int32_t b) {
933
0
    if(b<=0x20) {
934
        /* skip some C0 controls and make the trail byte range contiguous */
935
0
        b=bocu1ByteToTrail[b];
936
        /* b<0 for an illegal trail byte value will result in return<0 below */
937
#if BOCU1_MAX_TRAIL<0xff
938
    } else if(b>BOCU1_MAX_TRAIL) {
939
        return -99;
940
#endif
941
0
    } else {
942
0
        b-=BOCU1_TRAIL_BYTE_OFFSET;
943
0
    }
944
945
    /* add trail byte into difference and decrement count */
946
0
    if(count==1) {
947
0
        return b;
948
0
    } else if(count==2) {
949
0
        return b*BOCU1_TRAIL_COUNT;
950
0
    } else /* count==3 */ {
951
0
        return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);
952
0
    }
953
0
}
954
955
static void U_CALLCONV
956
_Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
957
0
                           UErrorCode *pErrorCode) {
958
0
    UConverter *cnv;
959
0
    const uint8_t *source, *sourceLimit;
960
0
    UChar *target;
961
0
    const UChar *targetLimit;
962
0
    int32_t *offsets;
963
964
0
    int32_t prev, count, diff, c;
965
966
0
    int8_t byteIndex;
967
0
    uint8_t *bytes;
968
969
0
    int32_t sourceIndex, nextSourceIndex;
970
971
    /* set up the local pointers */
972
0
    cnv=pArgs->converter;
973
0
    source=(const uint8_t *)pArgs->source;
974
0
    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
975
0
    target=pArgs->target;
976
0
    targetLimit=pArgs->targetLimit;
977
0
    offsets=pArgs->offsets;
978
979
    /* get the converter state from UConverter */
980
0
    prev=(int32_t)cnv->toUnicodeStatus;
981
0
    if(prev==0) {
982
0
        prev=BOCU1_ASCII_PREV;
983
0
    }
984
0
    diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
985
0
    count=diff&3;
986
0
    diff>>=2;
987
988
0
    byteIndex=cnv->toULength;
989
0
    bytes=cnv->toUBytes;
990
991
    /* sourceIndex=-1 if the current character began in the previous buffer */
992
0
    sourceIndex=byteIndex==0 ? 0 : -1;
993
0
    nextSourceIndex=0;
994
995
    /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
996
0
    if(count>0 && byteIndex>0 && target<targetLimit) {
997
0
        goto getTrail;
998
0
    }
999
1000
0
fastSingle:
1001
    /* fast loop for single-byte differences */
1002
    /* use count as the only loop counter variable */
1003
0
    diff=(int32_t)(sourceLimit-source);
1004
0
    count=(int32_t)(pArgs->targetLimit-target);
1005
0
    if(count>diff) {
1006
0
        count=diff;
1007
0
    }
1008
0
    while(count>0) {
1009
0
        if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
1010
0
            c=prev+(c-BOCU1_MIDDLE);
1011
0
            if(c<0x3000) {
1012
0
                *target++=(UChar)c;
1013
0
                *offsets++=nextSourceIndex++;
1014
0
                prev=BOCU1_SIMPLE_PREV(c);
1015
0
            } else {
1016
0
                break;
1017
0
            }
1018
0
        } else if(c<=0x20) {
1019
0
            if(c!=0x20) {
1020
0
                prev=BOCU1_ASCII_PREV;
1021
0
            }
1022
0
            *target++=(UChar)c;
1023
0
            *offsets++=nextSourceIndex++;
1024
0
        } else {
1025
0
            break;
1026
0
        }
1027
0
        ++source;
1028
0
        --count;
1029
0
    }
1030
0
    sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
1031
1032
    /* decode a sequence of single and lead bytes */
1033
0
    while(source<sourceLimit) {
1034
0
        if(target>=targetLimit) {
1035
            /* target is full */
1036
0
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1037
0
            break;
1038
0
        }
1039
1040
0
        ++nextSourceIndex;
1041
0
        c=*source++;
1042
0
        if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
1043
            /* Write a code point directly from a single-byte difference. */
1044
0
            c=prev+(c-BOCU1_MIDDLE);
1045
0
            if(c<0x3000) {
1046
0
                *target++=(UChar)c;
1047
0
                *offsets++=sourceIndex;
1048
0
                prev=BOCU1_SIMPLE_PREV(c);
1049
0
                sourceIndex=nextSourceIndex;
1050
0
                goto fastSingle;
1051
0
            }
1052
0
        } else if(c<=0x20) {
1053
            /*
1054
             * Direct-encoded C0 control code or space.
1055
             * Reset prev for C0 control codes but not for space.
1056
             */
1057
0
            if(c!=0x20) {
1058
0
                prev=BOCU1_ASCII_PREV;
1059
0
            }
1060
0
            *target++=(UChar)c;
1061
0
            *offsets++=sourceIndex;
1062
0
            sourceIndex=nextSourceIndex;
1063
0
            continue;
1064
0
        } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
1065
            /* Optimize two-byte case. */
1066
0
            if(c>=BOCU1_MIDDLE) {
1067
0
                diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
1068
0
            } else {
1069
0
                diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
1070
0
            }
1071
1072
            /* trail byte */
1073
0
            ++nextSourceIndex;
1074
0
            c=decodeBocu1TrailByte(1, *source++);
1075
0
            if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
1076
0
                bytes[0]=source[-2];
1077
0
                bytes[1]=source[-1];
1078
0
                byteIndex=2;
1079
0
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1080
0
                break;
1081
0
            }
1082
0
        } else if(c==BOCU1_RESET) {
1083
            /* only reset the state, no code point */
1084
0
            prev=BOCU1_ASCII_PREV;
1085
0
            sourceIndex=nextSourceIndex;
1086
0
            continue;
1087
0
        } else {
1088
            /*
1089
             * For multi-byte difference lead bytes, set the decoder state
1090
             * with the partial difference value from the lead byte and
1091
             * with the number of trail bytes.
1092
             */
1093
0
            bytes[0]=(uint8_t)c;
1094
0
            byteIndex=1;
1095
1096
0
            diff=decodeBocu1LeadByte(c);
1097
0
            count=diff&3;
1098
0
            diff>>=2;
1099
0
getTrail:
1100
0
            for(;;) {
1101
0
                if(source>=sourceLimit) {
1102
0
                    goto endloop;
1103
0
                }
1104
0
                ++nextSourceIndex;
1105
0
                c=bytes[byteIndex++]=*source++;
1106
1107
                /* trail byte in any position */
1108
0
                c=decodeBocu1TrailByte(count, c);
1109
0
                if(c<0) {
1110
0
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1111
0
                    goto endloop;
1112
0
                }
1113
1114
0
                diff+=c;
1115
0
                if(--count==0) {
1116
                    /* final trail byte, deliver a code point */
1117
0
                    byteIndex=0;
1118
0
                    c=prev+diff;
1119
0
                    if((uint32_t)c>0x10ffff) {
1120
0
                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1121
0
                        goto endloop;
1122
0
                    }
1123
0
                    break;
1124
0
                }
1125
0
            }
1126
0
        }
1127
1128
        /* calculate the next prev and output c */
1129
0
        prev=BOCU1_PREV(c);
1130
0
        if(c<=0xffff) {
1131
0
            *target++=(UChar)c;
1132
0
            *offsets++=sourceIndex;
1133
0
        } else {
1134
            /* output surrogate pair */
1135
0
            *target++=U16_LEAD(c);
1136
0
            if(target<targetLimit) {
1137
0
                *target++=U16_TRAIL(c);
1138
0
                *offsets++=sourceIndex;
1139
0
                *offsets++=sourceIndex;
1140
0
            } else {
1141
                /* target overflow */
1142
0
                *offsets++=sourceIndex;
1143
0
                cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
1144
0
                cnv->UCharErrorBufferLength=1;
1145
0
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1146
0
                break;
1147
0
            }
1148
0
        }
1149
0
        sourceIndex=nextSourceIndex;
1150
0
    }
1151
0
endloop:
1152
1153
0
    if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
1154
        /* set the converter state in UConverter to deal with the next character */
1155
0
        cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
1156
0
        cnv->mode=0;
1157
0
    } else {
1158
        /* set the converter state back into UConverter */
1159
0
        cnv->toUnicodeStatus=(uint32_t)prev;
1160
0
        cnv->mode=(diff<<2)|count;
1161
0
    }
1162
0
    cnv->toULength=byteIndex;
1163
1164
    /* write back the updated pointers */
1165
0
    pArgs->source=(const char *)source;
1166
0
    pArgs->target=target;
1167
0
    pArgs->offsets=offsets;
1168
0
    return;
1169
0
}
1170
1171
/*
1172
 * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling.
1173
 * If a change is made in the original function, then either
1174
 * change this function the same way or
1175
 * re-copy the original function and remove the variables
1176
 * offsets, sourceIndex, and nextSourceIndex.
1177
 */
1178
static void U_CALLCONV
1179
_Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs,
1180
0
                UErrorCode *pErrorCode) {
1181
0
    UConverter *cnv;
1182
0
    const uint8_t *source, *sourceLimit;
1183
0
    UChar *target;
1184
0
    const UChar *targetLimit;
1185
1186
0
    int32_t prev, count, diff, c;
1187
1188
0
    int8_t byteIndex;
1189
0
    uint8_t *bytes;
1190
1191
    /* set up the local pointers */
1192
0
    cnv=pArgs->converter;
1193
0
    source=(const uint8_t *)pArgs->source;
1194
0
    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1195
0
    target=pArgs->target;
1196
0
    targetLimit=pArgs->targetLimit;
1197
1198
    /* get the converter state from UConverter */
1199
0
    prev=(int32_t)cnv->toUnicodeStatus;
1200
0
    if(prev==0) {
1201
0
        prev=BOCU1_ASCII_PREV;
1202
0
    }
1203
0
    diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
1204
0
    count=diff&3;
1205
0
    diff>>=2;
1206
1207
0
    byteIndex=cnv->toULength;
1208
0
    bytes=cnv->toUBytes;
1209
1210
    /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
1211
0
    if(count>0 && byteIndex>0 && target<targetLimit) {
1212
0
        goto getTrail;
1213
0
    }
1214
1215
0
fastSingle:
1216
    /* fast loop for single-byte differences */
1217
    /* use count as the only loop counter variable */
1218
0
    diff=(int32_t)(sourceLimit-source);
1219
0
    count=(int32_t)(pArgs->targetLimit-target);
1220
0
    if(count>diff) {
1221
0
        count=diff;
1222
0
    }
1223
0
    while(count>0) {
1224
0
        if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
1225
0
            c=prev+(c-BOCU1_MIDDLE);
1226
0
            if(c<0x3000) {
1227
0
                *target++=(UChar)c;
1228
0
                prev=BOCU1_SIMPLE_PREV(c);
1229
0
            } else {
1230
0
                break;
1231
0
            }
1232
0
        } else if(c<=0x20) {
1233
0
            if(c!=0x20) {
1234
0
                prev=BOCU1_ASCII_PREV;
1235
0
            }
1236
0
            *target++=(UChar)c;
1237
0
        } else {
1238
0
            break;
1239
0
        }
1240
0
        ++source;
1241
0
        --count;
1242
0
    }
1243
1244
    /* decode a sequence of single and lead bytes */
1245
0
    while(source<sourceLimit) {
1246
0
        if(target>=targetLimit) {
1247
            /* target is full */
1248
0
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1249
0
            break;
1250
0
        }
1251
1252
0
        c=*source++;
1253
0
        if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
1254
            /* Write a code point directly from a single-byte difference. */
1255
0
            c=prev+(c-BOCU1_MIDDLE);
1256
0
            if(c<0x3000) {
1257
0
                *target++=(UChar)c;
1258
0
                prev=BOCU1_SIMPLE_PREV(c);
1259
0
                goto fastSingle;
1260
0
            }
1261
0
        } else if(c<=0x20) {
1262
            /*
1263
             * Direct-encoded C0 control code or space.
1264
             * Reset prev for C0 control codes but not for space.
1265
             */
1266
0
            if(c!=0x20) {
1267
0
                prev=BOCU1_ASCII_PREV;
1268
0
            }
1269
0
            *target++=(UChar)c;
1270
0
            continue;
1271
0
        } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
1272
            /* Optimize two-byte case. */
1273
0
            if(c>=BOCU1_MIDDLE) {
1274
0
                diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
1275
0
            } else {
1276
0
                diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
1277
0
            }
1278
1279
            /* trail byte */
1280
0
            c=decodeBocu1TrailByte(1, *source++);
1281
0
            if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
1282
0
                bytes[0]=source[-2];
1283
0
                bytes[1]=source[-1];
1284
0
                byteIndex=2;
1285
0
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1286
0
                break;
1287
0
            }
1288
0
        } else if(c==BOCU1_RESET) {
1289
            /* only reset the state, no code point */
1290
0
            prev=BOCU1_ASCII_PREV;
1291
0
            continue;
1292
0
        } else {
1293
            /*
1294
             * For multi-byte difference lead bytes, set the decoder state
1295
             * with the partial difference value from the lead byte and
1296
             * with the number of trail bytes.
1297
             */
1298
0
            bytes[0]=(uint8_t)c;
1299
0
            byteIndex=1;
1300
1301
0
            diff=decodeBocu1LeadByte(c);
1302
0
            count=diff&3;
1303
0
            diff>>=2;
1304
0
getTrail:
1305
0
            for(;;) {
1306
0
                if(source>=sourceLimit) {
1307
0
                    goto endloop;
1308
0
                }
1309
0
                c=bytes[byteIndex++]=*source++;
1310
1311
                /* trail byte in any position */
1312
0
                c=decodeBocu1TrailByte(count, c);
1313
0
                if(c<0) {
1314
0
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1315
0
                    goto endloop;
1316
0
                }
1317
1318
0
                diff+=c;
1319
0
                if(--count==0) {
1320
                    /* final trail byte, deliver a code point */
1321
0
                    byteIndex=0;
1322
0
                    c=prev+diff;
1323
0
                    if((uint32_t)c>0x10ffff) {
1324
0
                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1325
0
                        goto endloop;
1326
0
                    }
1327
0
                    break;
1328
0
                }
1329
0
            }
1330
0
        }
1331
1332
        /* calculate the next prev and output c */
1333
0
        prev=BOCU1_PREV(c);
1334
0
        if(c<=0xffff) {
1335
0
            *target++=(UChar)c;
1336
0
        } else {
1337
            /* output surrogate pair */
1338
0
            *target++=U16_LEAD(c);
1339
0
            if(target<targetLimit) {
1340
0
                *target++=U16_TRAIL(c);
1341
0
            } else {
1342
                /* target overflow */
1343
0
                cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
1344
0
                cnv->UCharErrorBufferLength=1;
1345
0
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1346
0
                break;
1347
0
            }
1348
0
        }
1349
0
    }
1350
0
endloop:
1351
1352
0
    if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
1353
        /* set the converter state in UConverter to deal with the next character */
1354
0
        cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
1355
0
        cnv->mode=0;
1356
0
    } else {
1357
        /* set the converter state back into UConverter */
1358
0
        cnv->toUnicodeStatus=(uint32_t)prev;
1359
0
        cnv->mode=(diff<<2)|count;
1360
0
    }
1361
0
    cnv->toULength=byteIndex;
1362
1363
    /* write back the updated pointers */
1364
0
    pArgs->source=(const char *)source;
1365
0
    pArgs->target=target;
1366
0
    return;
1367
0
}
1368
1369
/* miscellaneous ------------------------------------------------------------ */
1370
1371
/*
 * Implementation table for the BOCU-1 converter.
 * Only the converter type, the four conversion routines and the
 * Unicode-set getter are provided; every other slot is left NULL.
 * The initializer is positional: the slot order is dictated by the
 * UConverterImpl declaration (not visible here), so do not reorder
 * entries without checking that declaration.
 */
static const UConverterImpl _Bocu1Impl={
    UCNV_BOCU1,

    NULL, NULL,

    NULL, NULL, NULL,

    /* to-Unicode and from-Unicode conversion, each with and without offsets */
    _Bocu1ToUnicode,
    _Bocu1ToUnicodeWithOffsets,
    _Bocu1FromUnicode,
    _Bocu1FromUnicodeWithOffsets,
    NULL,

    NULL, NULL, NULL, NULL,
    ucnv_getCompleteUnicodeSet,

    NULL, NULL
};
1396
1397
/*
 * Static description of the BOCU-1 converter.
 * The initializer is positional; the field order comes from the
 * UConverterStaticData declaration (not visible here).
 */
static const UConverterStaticData _Bocu1StaticData={
    sizeof(UConverterStaticData),
    "BOCU-1",
    1214,                   /* CCSID for BOCU-1 */
    UCNV_IBM,
    UCNV_BOCU1,
    1,
    4,                      /* one UChar generates at least 1 byte and at most 4 bytes */
    { 0x1a, 0, 0, 0 },
    1,                      /* BOCU-1 never needs to write a subchar */
    FALSE,
    FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
1409
1410
/* Shared converter data for BOCU-1, built from the static data and the implementation table. */
const UConverterSharedData _Bocu1Data=UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(
    &_Bocu1StaticData,
    &_Bocu1Impl);
1412
1413
#endif