Coverage Report

Created: 2024-04-24 06:23

/src/icu/source/common/ucnv_u7.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*  
4
**********************************************************************
5
*   Copyright (C) 2002-2016, International Business Machines
6
*   Corporation and others.  All Rights Reserved.
7
**********************************************************************
8
*   file name:  ucnv_u7.c
9
*   encoding:   UTF-8
10
*   tab size:   8 (not used)
11
*   indentation:4
12
*
13
*   created on: 2002jul01
14
*   created by: Markus W. Scherer
15
*
16
*   UTF-7 converter implementation. Used to be in ucnv_utf.c.
17
*/
18
19
#include "unicode/utypes.h"
20
21
#if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
22
23
#include "cmemory.h"
24
#include "unicode/ucnv.h"
25
#include "ucnv_bld.h"
26
#include "ucnv_cnv.h"
27
#include "uassert.h"
28
29
/* UTF-7 -------------------------------------------------------------------- */
30
31
/*
32
 * UTF-7 is a stateful encoding of Unicode.
33
 * It is defined in RFC 2152. (http://www.ietf.org/rfc/rfc2152.txt)
34
 * It was intended for use in Internet email systems, using in its bytewise
35
 * encoding only a subset of 7-bit US-ASCII.
36
 * UTF-7 is deprecated in favor of UTF-8/16/32 and SCSU, but still
37
 * occasionally used.
38
 *
39
 * For converting Unicode to UTF-7, the RFC allows encoding some US-ASCII
40
 * characters directly or in base64. Especially, the characters in set O
41
 * as defined in the RFC (see below) may be encoded directly but are not
42
 * allowed in, e.g., email headers.
43
 * By default, the ICU UTF-7 converter encodes set O directly.
44
 * By choosing the option "version=1", set O will be escaped instead.
45
 * For example:
46
 *     utf7Converter=ucnv_open("UTF-7,version=1");
47
 *
48
 * For details about email headers see RFC 2047.
49
 */
50
51
/*
52
 * Tests for US-ASCII characters belonging to character classes
53
 * defined in UTF-7.
54
 *
55
 * Set D (directly encoded characters) consists of the following
56
 * characters: the upper and lower case letters A through Z
57
 * and a through z, the 10 digits 0-9, and the following nine special
58
 * characters (note that "+" and "=" are omitted):
59
 *     '(),-./:?
60
 *
61
 * Set O (optional direct characters) consists of the following
62
 * characters (note that "\" and "~" are omitted):
63
 *     !"#$%&*;<=>@[]^_`{|}
64
 *
65
 * According to the rules in RFC 2152, the byte values for the following
66
 * US-ASCII characters are not used in UTF-7 and are therefore illegal:
67
 * - all C0 control codes except for CR LF TAB
68
 * - BACKSLASH
69
 * - TILDE
70
 * - DEL
71
 * - all codes beyond US-ASCII, i.e. all >127
72
 */
73
/*
 * Set D membership. Each range test subtracts the first code of the range and
 * narrows the difference to uint8_t, so for byte-sized c a value below the
 * range wraps to a large number and fails the single unsigned comparison.
 * Code points are written numerically (not as character literals).
 */
#define inSetD(c) \
    ((c)==0x3a || (c)==0x3f ||      /* : ? */ \
     (uint8_t)((c)-0x27)<3 ||       /* '() */ \
     (uint8_t)((c)-0x2c)<4 ||       /* ,-./ */ \
     (uint8_t)((c)-0x30)<10 ||      /* digits */ \
     (uint8_t)((c)-0x41)<26 ||      /* A-Z */ \
     (uint8_t)((c)-0x61)<26         /* a-z */ \
    )

/* Set O membership, same range-test technique as inSetD. */
#define inSetO(c) \
    ((c)==0x2a || (c)==0x40 || (c)==0x5b || /* *@[ */ \
     (uint8_t)((c)-0x21)<6 ||       /* !"#$%& */ \
     (uint8_t)((c)-0x3b)<4 ||       /* ;<=> */ \
     (uint8_t)((c)-0x5d)<4 ||       /* ]^_` */ \
     (uint8_t)((c)-0x7b)<3          /* {|} */ \
    )

/* TAB, LF, CR */
#define isCRLFTAB(c) ((c)==0x09 || (c)==0x0a || (c)==0x0d)
/* TAB, LF, CR or SPACE */
#define isCRLFSPTAB(c) (isCRLFTAB(c) || (c)==0x20)

/* US-ASCII code points with a special role in UTF-7 */
#define PLUS  0x2b
#define MINUS 0x2d
#define BACKSLASH 0x5c
#define TILDE 0x7e

/* legal byte values: all US-ASCII graphic characters from space to before tilde, and CR LF TAB */
/* (0x20..0x7d is 94 values; BACKSLASH lies inside that range and is excluded explicitly) */
#define isLegalUTF7(c) (isCRLFTAB(c) || ((c)!=BACKSLASH && (uint8_t)((c)-0x20)<94))
99
100
/* encode directly sets D and O and CR LF SP TAB */
/*
 * Lookup table indexed by a US-ASCII code point (0..127); each row below covers 16 code points.
 * A non-zero entry means the fromUnicode converter writes the character as-is.
 * Entries that are 0 here: all C0 controls except TAB/LF/CR, PLUS (0x2b),
 * BACKSLASH (0x5c), TILDE (0x7e) and DEL (0x7f).
 */
static const UBool encodeDirectlyMaximum[128]={
 /* 0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,

    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0
};
115
116
/* encode directly set D and CR LF SP TAB but not set O */
/*
 * Same layout as encodeDirectlyMaximum (index = US-ASCII code point, 16 per row),
 * but the set O punctuation entries are 0 as well, so those characters
 * take the base64 path in the fromUnicode converter.
 * The fromUnicode converter selects this table when the version field of
 * fromUnicodeStatus is non-zero.
 */
static const UBool encodeDirectlyRestricted[128]={
 /* 0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,

    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,

    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0
};
131
132
/*
 * Maps a 6-bit value (0..63) to the US-ASCII code of its base64 digit.
 * Valid indexes are 0..63 only; callers mask or bound the index before use.
 */
static const uint8_t
toBase64[64]={
    /* A-Z */
    65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
    78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
    /* a-z */
    97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
    110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122,
    /* 0-9 */
    48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
    /* +/ */
    43, 47
};
145
146
/*
 * Classifies a US-ASCII byte (index 0..127) for the base64 decoder:
 *   0..63  the 6-bit value of a base64 digit
 *   -1     a legal UTF-7 byte that is not a base64 digit (ends a base64 run)
 *   -2     the minus sign, which terminates a base64 run and is absorbed
 *   -3     a byte that is illegal in UTF-7
 * The table has only 128 entries, so callers must range-check the byte first.
 */
static const int8_t
fromBase64[128]={
    /* C0 controls, -1 for legal ones (CR LF TAB), -3 for illegal ones */
    -3, -3, -3, -3, -3, -3, -3, -3, -3, -1, -1, -3, -3, -1, -3, -3,
    -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,

    /* general punctuation with + and / and a special value (-2) for - */
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -2, -1, 63,
    /* digits */
    52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,

    /* A-Z */
    -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -3, -1, -1, -1,

    /* a-z */
    -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
    41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -3, -3
};
165
166
/*
167
 * converter status values:
168
 *
169
 * toUnicodeStatus:
170
 *     24 inDirectMode (boolean)
171
 * 23..16 base64Counter (-1..7)
172
 * 15..0  bits (up to 14 bits incoming base64)
173
 *
174
 * fromUnicodeStatus:
175
 * 31..28 version (0: set O direct  1: set O escaped)
176
 *     24 inDirectMode (boolean)
177
 * 23..16 base64Counter (0..2)
178
 *  7..0  bits (6 bits outgoing base64)
179
 *
180
 */
181
182
U_CDECL_BEGIN
183
static void U_CALLCONV
184
0
_UTF7Reset(UConverter *cnv, UConverterResetChoice choice) {
185
0
    if(choice<=UCNV_RESET_TO_UNICODE) {
186
        /* reset toUnicode */
187
0
        cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */
188
0
        cnv->toULength=0;
189
0
    }
190
0
    if(choice!=UCNV_RESET_TO_UNICODE) {
191
        /* reset fromUnicode */
192
0
        cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
193
0
    }
194
0
}
195
196
static void U_CALLCONV
197
_UTF7Open(UConverter *cnv,
198
          UConverterLoadArgs *pArgs,
199
0
          UErrorCode *pErrorCode) {
200
0
    (void)pArgs;
201
0
    if(UCNV_GET_VERSION(cnv)<=1) {
202
        /* TODO(markus): Should just use cnv->options rather than copying the version number. */
203
0
        cnv->fromUnicodeStatus=UCNV_GET_VERSION(cnv)<<28;
204
0
        _UTF7Reset(cnv, UCNV_RESET_BOTH);
205
0
    } else {
206
0
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
207
0
    }
208
0
}
209
210
/*
 * Converts UTF-7 bytes from pArgs->source into UChars at pArgs->target.
 *
 * When pArgs->offsets is not NULL, one source index is written per output UChar.
 * The decoder state (inDirectMode, base64Counter, bits) is unpacked from
 * cnv->toUnicodeStatus on entry and packed back on exit; bytes of a pending
 * sequence are kept in cnv->toUBytes with their count in cnv->toULength.
 *
 * Sets *pErrorCode to U_ILLEGAL_CHAR_FOUND for an illegal byte or an incomplete
 * base64 sequence, and to U_BUFFER_OVERFLOW_ERROR when the target is full while
 * source bytes remain. source, target and offsets are written back to pArgs.
 */
static void U_CALLCONV
_UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
                          UErrorCode *pErrorCode) {
    UConverter *cnv;
    const uint8_t *source, *sourceLimit;
    UChar *target;
    const UChar *targetLimit;
    int32_t *offsets;

    uint8_t *bytes;
    uint8_t byteIndex;

    int32_t length, targetCapacity;

    /* UTF-7 state */
    uint16_t bits;
    int8_t base64Counter;
    UBool inDirectMode;

    int8_t base64Value;

    int32_t sourceIndex, nextSourceIndex;

    uint8_t b;
    /* set up the local pointers */
    cnv=pArgs->converter;

    source=(const uint8_t *)pArgs->source;
    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
    target=pArgs->target;
    targetLimit=pArgs->targetLimit;
    offsets=pArgs->offsets;
    /* get the state machine state */
    {
        uint32_t status=cnv->toUnicodeStatus;
        inDirectMode=(UBool)((status>>24)&1);
        /* the int8_t cast keeps bits 23..16 and restores the sign, so a stored 0xff reads back as -1 */
        base64Counter=(int8_t)(status>>16);
        bits=(uint16_t)status;
    }
    bytes=cnv->toUBytes;
    byteIndex=cnv->toULength;

    /* sourceIndex=-1 if the current character began in the previous buffer */
    sourceIndex=byteIndex==0 ? 0 : -1;
    nextSourceIndex=0;

    if(inDirectMode) {
directMode:
        /*
         * In Direct Mode, most US-ASCII characters are encoded directly, i.e.,
         * with their US-ASCII byte values.
         * Backslash and Tilde and most control characters are not allowed in UTF-7.
         * A plus sign starts Unicode (or "escape") Mode.
         *
         * In Direct Mode, only the sourceIndex is used.
         */
        byteIndex=0;
        /*
         * Each direct byte yields exactly one UChar, so looping over
         * min(remaining source, remaining target) needs only one counter.
         */
        length=(int32_t)(sourceLimit-source);
        targetCapacity=(int32_t)(targetLimit-target);
        if(length>targetCapacity) {
            length=targetCapacity;
        }
        while(length>0) {
            b=*source++;
            if(!isLegalUTF7(b)) {
                /* illegal */
                bytes[0]=b;
                byteIndex=1;
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                break;
            } else if(b!=PLUS) {
                /* write directly encoded character */
                *target++=b;
                if(offsets!=NULL) {
                    *offsets++=sourceIndex++;
                }
            } else /* PLUS */ {
                /* switch to Unicode mode */
                nextSourceIndex=++sourceIndex;
                inDirectMode=FALSE;
                byteIndex=0;
                bits=0;
                /* -1 marks "nothing seen yet after the +" */
                base64Counter=-1;
                goto unicodeMode;
            }
            --length;
        }
        if(source<sourceLimit && target>=targetLimit) {
            /* target is full */
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
        }
    } else {
unicodeMode:
        /*
         * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
         * The base64 sequence ends with any character that is not in the base64 alphabet.
         * A terminating minus sign is consumed.
         *
         * In Unicode Mode, the sourceIndex has the index to the start of the current
         * base64 bytes, while nextSourceIndex is precisely parallel to source,
         * keeping the index to the following byte.
         * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
         */
        while(source<sourceLimit) {
            if(target<targetLimit) {
                bytes[byteIndex++]=b=*source++;
                ++nextSourceIndex;
                base64Value = -3; /* initialize as illegal */
                /* b>=126 is tested first so that fromBase64[] (128 entries) is never indexed out of range */
                if(b>=126 || (base64Value=fromBase64[b])==-3 || base64Value==-1) {
                    /* either
                     * base64Value==-1 for any legal character except base64 and minus sign, or
                     * base64Value==-3 for illegal characters:
                     * 1. In either case, leave Unicode mode.
                     * 2.1. If we ended with an incomplete UChar or none after the +, then
                     *      generate an error for the preceding erroneous sequence and deal with
                     *      the current (possibly illegal) character next time through.
                     * 2.2. Else the current char comes after a complete UChar, which was already
                     *      pushed to the output buf, so:
                     * 2.2.1. If the current char is legal, just save it for processing next time.
                     *        It may be for example, a plus which we need to deal with in direct mode.
                     * 2.2.2. Else if the current char is illegal, we might as well deal with it here.
                     */
                    inDirectMode=TRUE;
                    if(base64Counter==-1) {
                        /* illegal: + immediately followed by something other than base64 or minus sign */
                        /* include the plus sign in the reported sequence, but not the subsequent char */
                        --source;
                        bytes[0]=PLUS;
                        byteIndex=1;
                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                        break;
                    } else if(bits!=0) {
                        /* bits are illegally left over, a UChar is incomplete */
                        /* don't include current char (legal or illegal) in error seq */
                        --source;
                        --byteIndex;
                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                        break;
                    } else {
                        /* previous UChar was complete */
                        if(base64Value==-3) {
                            /* current character is illegal, deal with it here */
                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                            break;
                        } else {
                            /* un-read the current character in case it is a plus sign */
                            --source;
                            sourceIndex=nextSourceIndex-1;
                            goto directMode;
                        }
                    }
                } else if(base64Value>=0) {
                    /* collect base64 bytes into UChars */
                    /*
                     * base64Counter counts base64 digits within a group of 8 digits
                     * (48 bits = 3 UChars); a UChar completes at counters 2, 5 and 7.
                     */
                    switch(base64Counter) {
                    case -1: /* -1 is immediately after the + */
                    case 0:
                        /* first digit of a group: start with 6 bits */
                        bits=base64Value;
                        base64Counter=1;
                        break;
                    case 1:
                    case 3:
                    case 4:
                    case 6:
                        /* accumulate 6 more bits; no UChar is complete yet */
                        bits=(uint16_t)((bits<<6)|base64Value);
                        ++base64Counter;
                        break;
                    case 2:
                        /* 12 collected bits + top 4 of this digit = 16; the low 2 bits carry over */
                        *target++=(UChar)((bits<<4)|(base64Value>>2));
                        if(offsets!=NULL) {
                            *offsets++=sourceIndex;
                            sourceIndex=nextSourceIndex-1;
                        }
                        bytes[0]=b; /* keep this byte in case an error occurs */
                        byteIndex=1;
                        bits=(uint16_t)(base64Value&3);
                        base64Counter=3;
                        break;
                    case 5:
                        /* 14 collected bits + top 2 of this digit = 16; the low 4 bits carry over */
                        *target++=(UChar)((bits<<2)|(base64Value>>4));
                        if(offsets!=NULL) {
                            *offsets++=sourceIndex;
                            sourceIndex=nextSourceIndex-1;
                        }
                        bytes[0]=b; /* keep this byte in case an error occurs */
                        byteIndex=1;
                        bits=(uint16_t)(base64Value&15);
                        base64Counter=6;
                        break;
                    case 7:
                        /* 10 collected bits + all 6 of this digit = 16; nothing carries over */
                        *target++=(UChar)((bits<<6)|base64Value);
                        if(offsets!=NULL) {
                            *offsets++=sourceIndex;
                            sourceIndex=nextSourceIndex;
                        }
                        byteIndex=0;
                        bits=0;
                        base64Counter=0;
                        break;
                    default:
                        /* will never occur */
                        break;
                    }
                } else /*base64Value==-2*/ {
                    /* minus sign terminates the base64 sequence */
                    inDirectMode=TRUE;
                    if(base64Counter==-1) {
                        /* +- i.e. a minus immediately following a plus */
                        *target++=PLUS;
                        if(offsets!=NULL) {
                            /* the offset reported is that of the plus sign, one before sourceIndex */
                            *offsets++=sourceIndex-1;
                        }
                    } else {
                        /* absorb the minus and leave the Unicode Mode */
                        if(bits!=0) {
                            /* bits are illegally left over, a UChar is incomplete */
                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                            break;
                        }
                    }
                    sourceIndex=nextSourceIndex;
                    goto directMode;
                }
            } else {
                /* target is full */
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                break;
            }
        }
    }

    if(U_SUCCESS(*pErrorCode) && pArgs->flush && source==sourceLimit && bits==0) {
        /*
         * if we are in Unicode mode, then the byteIndex might not be 0,
         * but that is ok if bits==0
         * -> we set byteIndex=0 at the end of the stream to avoid a truncated error
         * (not true for IMAP-mailbox-name where we must end in direct mode)
         */
        byteIndex=0;
    }

    /* set the converter state back into UConverter */
    /* base64Counter goes through uint8_t so that -1 occupies only bits 23..16 and cannot spill into bit 24 */
    cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
    cnv->toULength=byteIndex;

    /* write back the updated pointers */
    pArgs->source=(const char *)source;
    pArgs->target=target;
    pArgs->offsets=offsets;
    return;
}
460
461
static void U_CALLCONV
462
_UTF7FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
463
0
                            UErrorCode *pErrorCode) {
464
0
    UConverter *cnv;
465
0
    const UChar *source, *sourceLimit;
466
0
    uint8_t *target, *targetLimit;
467
0
    int32_t *offsets;
468
469
0
    int32_t length, targetCapacity, sourceIndex;
470
0
    UChar c;
471
472
    /* UTF-7 state */
473
0
    const UBool *encodeDirectly;
474
0
    uint8_t bits;
475
0
    int8_t base64Counter;
476
0
    UBool inDirectMode;
477
478
    /* set up the local pointers */
479
0
    cnv=pArgs->converter;
480
481
    /* set up the local pointers */
482
0
    source=pArgs->source;
483
0
    sourceLimit=pArgs->sourceLimit;
484
0
    target=(uint8_t *)pArgs->target;
485
0
    targetLimit=(uint8_t *)pArgs->targetLimit;
486
0
    offsets=pArgs->offsets;
487
488
    /* get the state machine state */
489
0
    {
490
0
        uint32_t status=cnv->fromUnicodeStatus;
491
0
        encodeDirectly= status<0x10000000 ? encodeDirectlyMaximum : encodeDirectlyRestricted;
492
0
        inDirectMode=(UBool)((status>>24)&1);
493
0
        base64Counter=(int8_t)(status>>16);
494
0
        bits=(uint8_t)status;
495
0
        U_ASSERT(bits<=UPRV_LENGTHOF(toBase64));
496
0
    }
497
498
    /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
499
0
    sourceIndex=0;
500
501
0
    if(inDirectMode) {
502
0
directMode:
503
0
        length=(int32_t)(sourceLimit-source);
504
0
        targetCapacity=(int32_t)(targetLimit-target);
505
0
        if(length>targetCapacity) {
506
0
            length=targetCapacity;
507
0
        }
508
0
        while(length>0) {
509
0
            c=*source++;
510
            /* currently always encode CR LF SP TAB directly */
511
0
            if(c<=127 && encodeDirectly[c]) {
512
                /* encode directly */
513
0
                *target++=(uint8_t)c;
514
0
                if(offsets!=NULL) {
515
0
                    *offsets++=sourceIndex++;
516
0
                }
517
0
            } else if(c==PLUS) {
518
                /* output +- for + */
519
0
                *target++=PLUS;
520
0
                if(target<targetLimit) {
521
0
                    *target++=MINUS;
522
0
                    if(offsets!=NULL) {
523
0
                        *offsets++=sourceIndex;
524
0
                        *offsets++=sourceIndex++;
525
0
                    }
526
                    /* realign length and targetCapacity */
527
0
                    goto directMode;
528
0
                } else {
529
0
                    if(offsets!=NULL) {
530
0
                        *offsets++=sourceIndex++;
531
0
                    }
532
0
                    cnv->charErrorBuffer[0]=MINUS;
533
0
                    cnv->charErrorBufferLength=1;
534
0
                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
535
0
                    break;
536
0
                }
537
0
            } else {
538
                /* un-read this character and switch to Unicode Mode */
539
0
                --source;
540
0
                *target++=PLUS;
541
0
                if(offsets!=NULL) {
542
0
                    *offsets++=sourceIndex;
543
0
                }
544
0
                inDirectMode=FALSE;
545
0
                base64Counter=0;
546
0
                goto unicodeMode;
547
0
            }
548
0
            --length;
549
0
        }
550
0
        if(source<sourceLimit && target>=targetLimit) {
551
            /* target is full */
552
0
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
553
0
        }
554
0
    } else {
555
0
unicodeMode:
556
0
        while(source<sourceLimit) {
557
0
            if(target<targetLimit) {
558
0
                c=*source++;
559
0
                if(c<=127 && encodeDirectly[c]) {
560
                    /* encode directly */
561
0
                    inDirectMode=TRUE;
562
563
                    /* trick: back out this character to make this easier */
564
0
                    --source;
565
566
                    /* terminate the base64 sequence */
567
0
                    if(base64Counter!=0) {
568
                        /* write remaining bits for the previous character */
569
0
                        *target++=toBase64[bits];
570
0
                        if(offsets!=NULL) {
571
0
                            *offsets++=sourceIndex-1;
572
0
                        }
573
0
                    }
574
0
                    if(fromBase64[c]!=-1) {
575
                        /* need to terminate with a minus */
576
0
                        if(target<targetLimit) {
577
0
                            *target++=MINUS;
578
0
                            if(offsets!=NULL) {
579
0
                                *offsets++=sourceIndex-1;
580
0
                            }
581
0
                        } else {
582
0
                            cnv->charErrorBuffer[0]=MINUS;
583
0
                            cnv->charErrorBufferLength=1;
584
0
                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
585
0
                            break;
586
0
                        }
587
0
                    }
588
0
                    goto directMode;
589
0
                } else {
590
                    /*
591
                     * base64 this character:
592
                     * Output 2 or 3 base64 bytes for the remaining bits of the previous character
593
                     * and the bits of this character, each implicitly in UTF-16BE.
594
                     *
595
                     * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
596
                     * character to the next. The actual 2 or 4 bits are shifted to the left edge
597
                     * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
598
                     */
599
0
                    switch(base64Counter) {
600
0
                    case 0:
601
0
                        *target++=toBase64[c>>10];
602
0
                        if(target<targetLimit) {
603
0
                            *target++=toBase64[(c>>4)&0x3f];
604
0
                            if(offsets!=NULL) {
605
0
                                *offsets++=sourceIndex;
606
0
                                *offsets++=sourceIndex++;
607
0
                            }
608
0
                        } else {
609
0
                            if(offsets!=NULL) {
610
0
                                *offsets++=sourceIndex++;
611
0
                            }
612
0
                            cnv->charErrorBuffer[0]=toBase64[(c>>4)&0x3f];
613
0
                            cnv->charErrorBufferLength=1;
614
0
                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
615
0
                        }
616
0
                        bits=(uint8_t)((c&15)<<2);
617
0
                        base64Counter=1;
618
0
                        break;
619
0
                    case 1:
620
0
                        *target++=toBase64[bits|(c>>14)];
621
0
                        if(target<targetLimit) {
622
0
                            *target++=toBase64[(c>>8)&0x3f];
623
0
                            if(target<targetLimit) {
624
0
                                *target++=toBase64[(c>>2)&0x3f];
625
0
                                if(offsets!=NULL) {
626
0
                                    *offsets++=sourceIndex;
627
0
                                    *offsets++=sourceIndex;
628
0
                                    *offsets++=sourceIndex++;
629
0
                                }
630
0
                            } else {
631
0
                                if(offsets!=NULL) {
632
0
                                    *offsets++=sourceIndex;
633
0
                                    *offsets++=sourceIndex++;
634
0
                                }
635
0
                                cnv->charErrorBuffer[0]=toBase64[(c>>2)&0x3f];
636
0
                                cnv->charErrorBufferLength=1;
637
0
                                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
638
0
                            }
639
0
                        } else {
640
0
                            if(offsets!=NULL) {
641
0
                                *offsets++=sourceIndex++;
642
0
                            }
643
0
                            cnv->charErrorBuffer[0]=toBase64[(c>>8)&0x3f];
644
0
                            cnv->charErrorBuffer[1]=toBase64[(c>>2)&0x3f];
645
0
                            cnv->charErrorBufferLength=2;
646
0
                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
647
0
                        }
648
0
                        bits=(uint8_t)((c&3)<<4);
649
0
                        base64Counter=2;
650
0
                        break;
651
0
                    case 2:
652
0
                        *target++=toBase64[bits|(c>>12)];
653
0
                        if(target<targetLimit) {
654
0
                            *target++=toBase64[(c>>6)&0x3f];
655
0
                            if(target<targetLimit) {
656
0
                                *target++=toBase64[c&0x3f];
657
0
                                if(offsets!=NULL) {
658
0
                                    *offsets++=sourceIndex;
659
0
                                    *offsets++=sourceIndex;
660
0
                                    *offsets++=sourceIndex++;
661
0
                                }
662
0
                            } else {
663
0
                                if(offsets!=NULL) {
664
0
                                    *offsets++=sourceIndex;
665
0
                                    *offsets++=sourceIndex++;
666
0
                                }
667
0
                                cnv->charErrorBuffer[0]=toBase64[c&0x3f];
668
0
                                cnv->charErrorBufferLength=1;
669
0
                                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
670
0
                            }
671
0
                        } else {
672
0
                            if(offsets!=NULL) {
673
0
                                *offsets++=sourceIndex++;
674
0
                            }
675
0
                            cnv->charErrorBuffer[0]=toBase64[(c>>6)&0x3f];
676
0
                            cnv->charErrorBuffer[1]=toBase64[c&0x3f];
677
0
                            cnv->charErrorBufferLength=2;
678
0
                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
679
0
                        }
680
0
                        bits=0;
681
0
                        base64Counter=0;
682
0
                        break;
683
0
                    default:
684
                        /* will never occur */
685
0
                        break;
686
0
                    }
687
0
                }
688
0
            } else {
689
                /* target is full */
690
0
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
691
0
                break;
692
0
            }
693
0
        }
694
0
    }
695
696
0
    if(pArgs->flush && source>=sourceLimit) {
697
        /* flush remaining bits to the target */
698
0
        if(!inDirectMode) {
699
0
            if (base64Counter!=0) {
700
0
                if(target<targetLimit) {
701
0
                    *target++=toBase64[bits];
702
0
                    if(offsets!=NULL) {
703
0
                        *offsets++=sourceIndex-1;
704
0
                    }
705
0
                } else {
706
0
                    cnv->charErrorBuffer[cnv->charErrorBufferLength++]=toBase64[bits];
707
0
                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
708
0
                }
709
0
            }
710
            /* Add final MINUS to terminate unicodeMode */
711
0
            if(target<targetLimit) {
712
0
                *target++=MINUS;
713
0
                if(offsets!=NULL) {
714
0
                    *offsets++=sourceIndex-1;
715
0
                }
716
0
            } else {
717
0
                cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
718
0
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
719
0
            }
720
0
        }
721
        /* reset the state for the next conversion */
722
0
        cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
723
0
    } else {
724
        /* set the converter state back into UConverter */
725
0
        cnv->fromUnicodeStatus=
726
0
            (cnv->fromUnicodeStatus&0xf0000000)|    /* keep version*/
727
0
            ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
728
0
    }
729
730
    /* write back the updated pointers */
731
0
    pArgs->source=source;
732
0
    pArgs->target=(char *)target;
733
0
    pArgs->offsets=offsets;
734
0
    return;
735
0
}
736
737
static const char * U_CALLCONV
738
0
_UTF7GetName(const UConverter *cnv) {
739
0
    switch(cnv->fromUnicodeStatus>>28) {
740
0
    case 1:
741
0
        return "UTF-7,version=1";
742
0
    default:
743
0
        return "UTF-7";
744
0
    }
745
0
}
746
U_CDECL_END
747
748
/*
 * Function table for the UTF-7 converter.
 * The initializer is positional; the meaning of each slot is fixed by the
 * UConverterImpl declaration, which is not part of this file.
 * NOTE(review): the to-Unicode and from-Unicode functions are each listed
 * twice, presumably for the plain and the with-offsets slots - confirm
 * against the UConverterImpl declaration.
 */
static const UConverterImpl _UTF7Impl={
749
    UCNV_UTF7,
750
751
    NULL,
752
    NULL,
753
754
    _UTF7Open,
755
    NULL,
756
    _UTF7Reset,
757
758
    _UTF7ToUnicodeWithOffsets,
759
    _UTF7ToUnicodeWithOffsets,
760
    _UTF7FromUnicodeWithOffsets,
761
    _UTF7FromUnicodeWithOffsets,
762
    NULL,
763
764
    NULL,
765
    _UTF7GetName,
766
    NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
767
    NULL,
768
    ucnv_getCompleteUnicodeSet,
769
770
    NULL,
771
    NULL
772
};
773
774
/*
 * Static description of the UTF-7 converter (name "UTF-7", type UCNV_UTF7).
 * The initializer is positional; field meanings come from the
 * UConverterStaticData declaration, which is not part of this file.
 * NOTE(review): "1, 4" presumably are the minimum and maximum bytes per
 * character - confirm against the declaration.
 */
static const UConverterStaticData _UTF7StaticData={
775
    sizeof(UConverterStaticData),
776
    "UTF-7",
777
    0, /* TODO CCSID for UTF-7 */
778
    UCNV_IBM, UCNV_UTF7,
779
    1, 4,
780
    { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
781
    FALSE, FALSE,
782
    0,
783
    0,
784
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
785
};
786
787
/*
 * Shared-data object for the UTF-7 converter, built from the static data
 * and the implementation table defined above. This symbol has external
 * linkage (it is not declared static).
 */
const UConverterSharedData _UTF7Data=
788
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF7StaticData, &_UTF7Impl);
789
790
/* IMAP mailbox name encoding ----------------------------------------------- */
791
792
/*
793
 * RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1
794
 * http://www.ietf.org/rfc/rfc2060.txt
795
 *
796
 * 5.1.3.  Mailbox International Naming Convention
797
 *
798
 * By convention, international mailbox names are specified using a
799
 * modified version of the UTF-7 encoding described in [UTF-7].  The
800
 * purpose of these modifications is to correct the following problems
801
 * with UTF-7:
802
 *
803
 *    1) UTF-7 uses the "+" character for shifting; this conflicts with
804
 *       the common use of "+" in mailbox names, in particular USENET
805
 *       newsgroup names.
806
 *
807
 *    2) UTF-7's encoding is BASE64 which uses the "/" character; this
808
 *       conflicts with the use of "/" as a popular hierarchy delimiter.
809
 *
810
 *    3) UTF-7 prohibits the unencoded usage of "\"; this conflicts with
811
 *       the use of "\" as a popular hierarchy delimiter.
812
 *
813
 *    4) UTF-7 prohibits the unencoded usage of "~"; this conflicts with
814
 *       the use of "~" in some servers as a home directory indicator.
815
 *
816
 *    5) UTF-7 permits multiple alternate forms to represent the same
817
 *       string; in particular, printable US-ASCII characters can be
818
 *       represented in encoded form.
819
 *
820
 * In modified UTF-7, printable US-ASCII characters except for "&"
821
 * represent themselves; that is, characters with octet values 0x20-0x25
822
 * and 0x27-0x7e.  The character "&" (0x26) is represented by the two-
823
 * octet sequence "&-".
824
 *
825
 * All other characters (octet values 0x00-0x1f, 0x7f-0xff, and all
826
 * Unicode 16-bit octets) are represented in modified BASE64, with a
827
 * further modification from [UTF-7] that "," is used instead of "/".
828
 * Modified BASE64 MUST NOT be used to represent any printing US-ASCII
829
 * character which can represent itself.
830
 *
831
 * "&" is used to shift to modified BASE64 and "-" to shift back to US-
832
 * ASCII.  All names start in US-ASCII, and MUST end in US-ASCII (that
833
 * is, a name that ends with a Unicode 16-bit octet MUST end with a "-
834
 * ").
835
 *
836
 * For example, here is a mailbox name which mixes English, Japanese,
837
 * and Chinese text: ~peter/mail/&ZeVnLIqe-/&U,BTFw-
838
 */
839
840
/*
841
 * Tests for US-ASCII characters belonging to character classes
842
 * defined in UTF-7.
843
 *
844
 * Set D (directly encoded characters) consists of the following
845
 * characters: the upper and lower case letters A through Z
846
 * and a through z, the 10 digits 0-9, and the following nine special
847
 * characters (note that "+" and "=" are omitted):
848
 *     '(),-./:?
849
 *
850
 * Set O (optional direct characters) consists of the following
851
 * characters (note that "\" and "~" are omitted):
852
 *     !"#$%&*;<=>@[]^_`{|}
853
 *
854
 * According to the rules in RFC 2152, the byte values for the following
855
 * US-ASCII characters are not used in UTF-7 and are therefore illegal:
856
 * - all C0 control codes except for CR LF TAB
857
 * - BACKSLASH
858
 * - TILDE
859
 * - DEL
860
 * - all codes beyond US-ASCII, i.e. all >127
861
 */
862
863
/*
 * Byte classification and base64 helpers for the IMAP mailbox name encoding.
 *
 * All of these are function-like macros: each one may evaluate its argument
 * more than once, so pass only side-effect-free expressions.
 * Every use of a macro argument is parenthesized so that an argument that is
 * itself an expression (for example "x & 0x7f") is classified as a whole.
 */

/* uses '&' not '+' to start a base64 sequence */
#define AMPERSAND 0x26
/* ',' replaces '/' as the base64 digit for the value 63 */
#define COMMA 0x2c
#define SLASH 0x2f

/* legal byte values: all US-ASCII graphic characters 0x20..0x7e */
#define isLegalIMAP(c) (0x20<=(c) && (c)<=0x7e)

/* direct-encode all of printable ASCII 0x20..0x7e except '&' 0x26 */
#define inSetDIMAP(c) (isLegalIMAP(c) && (c)!=AMPERSAND)

/* map a 6-bit value to its base64 byte; values below 63 use the toBase64 table, otherwise ',' */
#define TO_BASE64_IMAP(n) ((n)<63 ? toBase64[n] : COMMA)
/* map a byte to its base64 value: ',' is 63, '/' is rejected with -1, everything else uses the fromBase64 table */
#define FROM_BASE64_IMAP(c) ((c)==COMMA ? 63 : (c)==SLASH ? -1 : fromBase64[c])
876
877
/*
878
 * converter status values:
879
 *
880
 * toUnicodeStatus:
881
 *     24 inDirectMode (boolean)
882
 * 23..16 base64Counter (-1..7)
883
 * 15..0  bits (up to 14 bits incoming base64)
884
 *
885
 * fromUnicodeStatus:
886
 *     24 inDirectMode (boolean)
887
 * 23..16 base64Counter (0..2)
888
 *  7..0  bits (6 bits outgoing base64)
889
 *
890
 * ignore bits 31..25
891
 */
892
893
U_CDECL_BEGIN
894
/*
 * Convert IMAP mailbox name bytes (modified UTF-7) to UTF-16 code units.
 *
 * Reads bytes from pArgs->source up to pArgs->sourceLimit and writes UChars
 * to pArgs->target up to pArgs->targetLimit. If pArgs->offsets is not NULL,
 * one source index is written per output UChar. On return, the source,
 * target and offsets pointers in pArgs are updated.
 *
 * The conversion state (inDirectMode, base64Counter, bits) is unpacked from
 * cnv->toUnicodeStatus on entry and packed back on exit; the bytes of a
 * pending or failing sequence are kept in cnv->toUBytes / cnv->toULength.
 *
 * Direct mode: bytes 0x20..0x7e other than '&' are copied; '&' switches to
 * Unicode mode; any other byte sets U_ILLEGAL_CHAR_FOUND.
 * Unicode mode: base64 bytes are collected into 16-bit units; "&-" yields
 * a literal '&'; '-' ends the base64 run; a decoded unit in 0x20..0x7e,
 * leftover bits at '-', or a non-base64 byte set U_ILLEGAL_CHAR_FOUND.
 *
 * Other error codes set through pErrorCode:
 *   U_BUFFER_OVERFLOW_ERROR  target is full while source bytes remain
 *   U_TRUNCATED_CHAR_FOUND   pArgs->flush is set and the input ends while
 *                            still in Unicode mode with no pending bytes
 */
static void U_CALLCONV
895
_IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
896
0
                          UErrorCode *pErrorCode) {
897
0
    UConverter *cnv;
898
0
    const uint8_t *source, *sourceLimit;
899
0
    UChar *target;
900
0
    const UChar *targetLimit;
901
0
    int32_t *offsets;
902
903
0
    uint8_t *bytes;
904
0
    uint8_t byteIndex;
905
906
0
    int32_t length, targetCapacity;
907
908
    /* UTF-7 state (unpacked from cnv->toUnicodeStatus below) */
909
0
    uint16_t bits;
910
0
    int8_t base64Counter;
911
0
    UBool inDirectMode;
912
913
0
    int8_t base64Value;
914
915
0
    int32_t sourceIndex, nextSourceIndex;
916
917
0
    UChar c;
918
0
    uint8_t b;
919
920
    /* set up the local pointers */
921
0
    cnv=pArgs->converter;
922
923
0
    source=(const uint8_t *)pArgs->source;
924
0
    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
925
0
    target=pArgs->target;
926
0
    targetLimit=pArgs->targetLimit;
927
0
    offsets=pArgs->offsets;
928
    /* get the state machine state */
929
0
    {
930
0
        uint32_t status=cnv->toUnicodeStatus;
931
0
        inDirectMode=(UBool)((status>>24)&1);
932
0
        base64Counter=(int8_t)(status>>16);
933
0
        bits=(uint16_t)status;
934
0
    }
935
0
    bytes=cnv->toUBytes;
936
0
    byteIndex=cnv->toULength;
937
938
    /* sourceIndex=-1 if the current character began in the previous buffer */
939
0
    sourceIndex=byteIndex==0 ? 0 : -1;
940
0
    nextSourceIndex=0;
941
942
0
    if(inDirectMode) {
943
0
directMode:
944
        /*
945
         * In Direct Mode, US-ASCII characters are encoded directly, i.e.,
946
         * with their US-ASCII byte values.
947
         * An ampersand starts Unicode (or "escape") Mode.
948
         *
949
         * In Direct Mode, only the sourceIndex is used.
950
         */
951
0
        byteIndex=0;
952
0
        length=(int32_t)(sourceLimit-source);
953
0
        targetCapacity=(int32_t)(targetLimit-target);
954
0
        if(length>targetCapacity) {
955
0
            length=targetCapacity;
956
0
        }
957
0
        while(length>0) {
958
0
            b=*source++;
959
0
            if(!isLegalIMAP(b)) {
960
                /* illegal: byte outside 0x20..0x7e; report it as a one-byte sequence */
961
0
                bytes[0]=b;
962
0
                byteIndex=1;
963
0
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
964
0
                break;
965
0
            } else if(b!=AMPERSAND) {
966
                /* write directly encoded character */
967
0
                *target++=b;
968
0
                if(offsets!=NULL) {
969
0
                    *offsets++=sourceIndex++;
970
0
                }
971
0
            } else /* AMPERSAND */ {
972
                /* switch to Unicode mode */
973
0
                nextSourceIndex=++sourceIndex;
974
0
                inDirectMode=FALSE;
975
0
                byteIndex=0;
976
0
                bits=0;
977
0
                base64Counter=-1;
978
0
                goto unicodeMode;
979
0
            }
980
0
            --length;
981
0
        }
982
0
        if(source<sourceLimit && target>=targetLimit) {
983
            /* target is full */
984
0
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
985
0
        }
986
0
    } else {
987
0
unicodeMode:
988
        /*
989
         * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
990
         * The base64 sequence ends with any character that is not in the base64 alphabet.
991
         * A terminating minus sign is consumed.
992
         * US-ASCII must not be base64-ed.
993
         *
994
         * In Unicode Mode, the sourceIndex has the index to the start of the current
995
         * base64 bytes, while nextSourceIndex is precisely parallel to source,
996
         * keeping the index to the following byte.
997
         * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
998
         */
999
0
        while(source<sourceLimit) {
1000
0
            if(target<targetLimit) {
1001
0
                bytes[byteIndex++]=b=*source++;
1002
0
                ++nextSourceIndex;
1003
0
                if(b>0x7e) {
1004
                    /* illegal - test other illegal US-ASCII values by base64Value==-3 */
1005
0
                    inDirectMode=TRUE;
1006
0
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1007
0
                    break;
1008
0
                } else if((base64Value=FROM_BASE64_IMAP(b))>=0) {
1009
                    /* collect base64 bytes into UChars */
1010
0
                    switch(base64Counter) {
1011
0
                    case -1: /* -1 is immediately after the & */
1012
0
                    case 0:
1013
0
                        bits=base64Value;
1014
0
                        base64Counter=1;
1015
0
                        break;
1016
0
                    case 1:
1017
0
                    case 3:
1018
0
                    case 4:
1019
0
                    case 6:
1020
0
                        bits=(uint16_t)((bits<<6)|base64Value);
1021
0
                        ++base64Counter;
1022
0
                        break;
1023
0
                    case 2:
1024
0
                        c=(UChar)((bits<<4)|(base64Value>>2));
1025
0
                        if(isLegalIMAP(c)) {
1026
                            /* illegal: printable US-ASCII must not be base64-encoded */
1027
0
                            inDirectMode=TRUE;
1028
0
                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1029
0
                            goto endloop;
1030
0
                        }
1031
0
                        *target++=c;
1032
0
                        if(offsets!=NULL) {
1033
0
                            *offsets++=sourceIndex;
1034
0
                            sourceIndex=nextSourceIndex-1;
1035
0
                        }
1036
0
                        bytes[0]=b; /* keep this byte in case an error occurs */
1037
0
                        byteIndex=1;
1038
0
                        bits=(uint16_t)(base64Value&3);
1039
0
                        base64Counter=3;
1040
0
                        break;
1041
0
                    case 5:
1042
0
                        c=(UChar)((bits<<2)|(base64Value>>4));
1043
0
                        if(isLegalIMAP(c)) {
1044
                            /* illegal: printable US-ASCII must not be base64-encoded */
1045
0
                            inDirectMode=TRUE;
1046
0
                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1047
0
                            goto endloop;
1048
0
                        }
1049
0
                        *target++=c;
1050
0
                        if(offsets!=NULL) {
1051
0
                            *offsets++=sourceIndex;
1052
0
                            sourceIndex=nextSourceIndex-1;
1053
0
                        }
1054
0
                        bytes[0]=b; /* keep this byte in case an error occurs */
1055
0
                        byteIndex=1;
1056
0
                        bits=(uint16_t)(base64Value&15);
1057
0
                        base64Counter=6;
1058
0
                        break;
1059
0
                    case 7:
1060
0
                        c=(UChar)((bits<<6)|base64Value);
1061
0
                        if(isLegalIMAP(c)) {
1062
                            /* illegal: printable US-ASCII must not be base64-encoded */
1063
0
                            inDirectMode=TRUE;
1064
0
                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1065
0
                            goto endloop;
1066
0
                        }
1067
0
                        *target++=c;
1068
0
                        if(offsets!=NULL) {
1069
0
                            *offsets++=sourceIndex;
1070
0
                            sourceIndex=nextSourceIndex;
1071
0
                        }
1072
0
                        byteIndex=0;
1073
0
                        bits=0;
1074
0
                        base64Counter=0;
1075
0
                        break;
1076
0
                    default:
1077
                        /* will never occur */
1078
0
                        break;
1079
0
                    }
1080
0
                } else if(base64Value==-2) {
1081
                    /* minus sign terminates the base64 sequence */
1082
0
                    inDirectMode=TRUE;
1083
0
                    if(base64Counter==-1) {
1084
                        /* &- i.e. a minus immediately following an ampersand */
1085
0
                        *target++=AMPERSAND;
1086
0
                        if(offsets!=NULL) {
1087
0
                            *offsets++=sourceIndex-1;
1088
0
                        }
1089
0
                    } else {
1090
                        /* absorb the minus and leave the Unicode Mode */
1091
0
                        if(bits!=0 || (base64Counter!=0 && base64Counter!=3 && base64Counter!=6)) {
1092
                            /* bits are illegally left over, a UChar is incomplete */
1093
                            /* base64Counter other than 0, 3, 6 means non-minimal zero-padding, also illegal */
1094
0
                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1095
0
                            break;
1096
0
                        }
1097
0
                    }
1098
0
                    sourceIndex=nextSourceIndex;
1099
0
                    goto directMode;
1100
0
                } else {
1101
0
                    if(base64Counter==-1) {
1102
                        /* illegal: & immediately followed by something other than base64 or minus sign */
1103
                        /* include the ampersand in the reported sequence */
1104
0
                        --sourceIndex;
1105
0
                        bytes[0]=AMPERSAND;
1106
0
                        bytes[1]=b;
1107
0
                        byteIndex=2;
1108
0
                    }
1109
                    /* base64Value==-1 for characters that are illegal only in Unicode mode */
1110
                    /* base64Value==-3 for illegal characters */
1111
                    /* illegal */
1112
0
                    inDirectMode=TRUE;
1113
0
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1114
0
                    break;
1115
0
                }
1116
0
            } else {
1117
                /* target is full */
1118
0
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1119
0
                break;
1120
0
            }
1121
0
        }
1122
0
    }
1123
0
endloop:
1124
1125
    /*
1126
     * the end of the input stream and detection of truncated input
1127
     * are handled by the framework, but here we must check if we are in Unicode
1128
     * mode and byteIndex==0 because we must end in direct mode
1129
     *
1130
     * conditions:
1131
     *   successful
1132
     *   in Unicode mode and byteIndex==0
1133
     *   end of input and no truncated input
1134
     */
1135
0
    if( U_SUCCESS(*pErrorCode) &&
1136
0
        !inDirectMode && byteIndex==0 &&
1137
0
        pArgs->flush && source>=sourceLimit
1138
0
    ) {
1139
0
        if(base64Counter==-1) {
1140
            /* & at the very end of the input */
1141
            /* make the ampersand the reported sequence */
1142
0
            bytes[0]=AMPERSAND;
1143
0
            byteIndex=1;
1144
0
        }
1145
        /* else if(base64Counter!=-1) byteIndex remains 0 because there is no particular byte sequence */
1146
1147
0
        inDirectMode=TRUE; /* avoid looping */
1148
0
        *pErrorCode=U_TRUNCATED_CHAR_FOUND;
1149
0
    }
1150
1151
    /* set the converter state back into UConverter */
1152
0
    cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
1153
0
    cnv->toULength=byteIndex;
1154
1155
    /* write back the updated pointers */
1156
0
    pArgs->source=(const char *)source;
1157
0
    pArgs->target=target;
1158
0
    pArgs->offsets=offsets;
1159
0
    return;
1160
0
}
1161
1162
/*
 * Convert UTF-16 code units to IMAP mailbox name bytes (modified UTF-7).
 *
 * Reads UChars from pArgs->source up to pArgs->sourceLimit and writes bytes
 * to pArgs->target up to pArgs->targetLimit. If pArgs->offsets is not NULL,
 * one source index is written per byte stored in the target. On return,
 * the source, target and offsets pointers in pArgs are updated.
 *
 * The conversion state (inDirectMode, base64Counter, bits) is unpacked from
 * cnv->fromUnicodeStatus on entry. On exit it is packed back, unless
 * pArgs->flush is set and the source is exhausted; in that case the state is
 * reset to direct mode while the top four status bits are preserved.
 *
 * Direct mode: code units 0x20..0x7e other than '&' are copied as bytes;
 * '&' is written as "&-"; any other code unit writes '&' and switches to
 * Unicode mode.
 * Unicode mode: code units outside 0x20..0x7e are base64-encoded (',' is
 * used for the value 63); a code unit in 0x20..0x7e ends the run by writing
 * any remaining bits and a '-', then continues in direct mode.
 *
 * When the target fills up in the middle of a multi-byte output, the bytes
 * that do not fit are stored in cnv->charErrorBuffer and
 * U_BUFFER_OVERFLOW_ERROR is set through pErrorCode.
 */
static void U_CALLCONV
1163
_IMAPFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
1164
0
                            UErrorCode *pErrorCode) {
1165
0
    UConverter *cnv;
1166
0
    const UChar *source, *sourceLimit;
1167
0
    uint8_t *target, *targetLimit;
1168
0
    int32_t *offsets;
1169
1170
0
    int32_t length, targetCapacity, sourceIndex;
1171
0
    UChar c;
1172
0
    uint8_t b;
1173
1174
    /* UTF-7 state (unpacked from cnv->fromUnicodeStatus below) */
1175
0
    uint8_t bits;
1176
0
    int8_t base64Counter;
1177
0
    UBool inDirectMode;
1178
1179
    /* get the converter */
1180
0
    cnv=pArgs->converter;
1181
1182
    /* set up the local pointers */
1183
0
    source=pArgs->source;
1184
0
    sourceLimit=pArgs->sourceLimit;
1185
0
    target=(uint8_t *)pArgs->target;
1186
0
    targetLimit=(uint8_t *)pArgs->targetLimit;
1187
0
    offsets=pArgs->offsets;
1188
1189
    /* get the state machine state */
1190
0
    {
1191
0
        uint32_t status=cnv->fromUnicodeStatus;
1192
0
        inDirectMode=(UBool)((status>>24)&1);
1193
0
        base64Counter=(int8_t)(status>>16);
1194
0
        bits=(uint8_t)status;
1195
0
    }
1196
1197
    /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
1198
0
    sourceIndex=0;
1199
1200
0
    if(inDirectMode) {
1201
0
directMode:
1202
0
        length=(int32_t)(sourceLimit-source);
1203
0
        targetCapacity=(int32_t)(targetLimit-target);
1204
0
        if(length>targetCapacity) {
1205
0
            length=targetCapacity;
1206
0
        }
1207
0
        while(length>0) {
1208
0
            c=*source++;
1209
            /* encode 0x20..0x7e except '&' directly */
1210
0
            if(inSetDIMAP(c)) {
1211
                /* encode directly */
1212
0
                *target++=(uint8_t)c;
1213
0
                if(offsets!=NULL) {
1214
0
                    *offsets++=sourceIndex++;
1215
0
                }
1216
0
            } else if(c==AMPERSAND) {
1217
                /* output &- for & */
1218
0
                *target++=AMPERSAND;
1219
0
                if(target<targetLimit) {
1220
0
                    *target++=MINUS;
1221
0
                    if(offsets!=NULL) {
1222
0
                        *offsets++=sourceIndex;
1223
0
                        *offsets++=sourceIndex++;
1224
0
                    }
1225
                    /* realign length and targetCapacity */
1226
0
                    goto directMode;
1227
0
                } else {
1228
0
                    if(offsets!=NULL) {
1229
0
                        *offsets++=sourceIndex++;
1230
0
                    }
1231
0
                    cnv->charErrorBuffer[0]=MINUS;
1232
0
                    cnv->charErrorBufferLength=1;
1233
0
                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1234
0
                    break;
1235
0
                }
1236
0
            } else {
1237
                /* un-read this character and switch to Unicode Mode */
1238
0
                --source;
1239
0
                *target++=AMPERSAND;
1240
0
                if(offsets!=NULL) {
1241
0
                    *offsets++=sourceIndex;
1242
0
                }
1243
0
                inDirectMode=FALSE;
1244
0
                base64Counter=0;
1245
0
                goto unicodeMode;
1246
0
            }
1247
0
            --length;
1248
0
        }
1249
0
        if(source<sourceLimit && target>=targetLimit) {
1250
            /* target is full */
1251
0
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1252
0
        }
1253
0
    } else {
1254
0
unicodeMode:
1255
0
        while(source<sourceLimit) {
1256
0
            if(target<targetLimit) {
1257
0
                c=*source++;
1258
0
                if(isLegalIMAP(c)) {
1259
                    /* encode directly */
1260
0
                    inDirectMode=TRUE;
1261
1262
                    /* trick: back out this character to make this easier */
1263
0
                    --source;
1264
1265
                    /* terminate the base64 sequence */
1266
0
                    if(base64Counter!=0) {
1267
                        /* write remaining bits for the previous character */
1268
0
                        *target++=TO_BASE64_IMAP(bits);
1269
0
                        if(offsets!=NULL) {
1270
0
                            *offsets++=sourceIndex-1;
1271
0
                        }
1272
0
                    }
1273
                    /* need to terminate with a minus */
1274
0
                    if(target<targetLimit) {
1275
0
                        *target++=MINUS;
1276
0
                        if(offsets!=NULL) {
1277
0
                            *offsets++=sourceIndex-1;
1278
0
                        }
1279
0
                    } else {
1280
0
                        cnv->charErrorBuffer[0]=MINUS;
1281
0
                        cnv->charErrorBufferLength=1;
1282
0
                        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1283
0
                        break;
1284
0
                    }
1285
0
                    goto directMode;
1286
0
                } else {
1287
                    /*
1288
                     * base64 this character:
1289
                     * Output 2 or 3 base64 bytes for the remaining bits of the previous character
1290
                     * and the bits of this character, each implicitly in UTF-16BE.
1291
                     *
1292
                     * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
1293
                     * character to the next. The actual 2 or 4 bits are shifted to the left edge
1294
                     * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
1295
                     */
1296
0
                    switch(base64Counter) {
1297
0
                    case 0:
1298
0
                        b=(uint8_t)(c>>10);
1299
0
                        *target++=TO_BASE64_IMAP(b);
1300
0
                        if(target<targetLimit) {
1301
0
                            b=(uint8_t)((c>>4)&0x3f);
1302
0
                            *target++=TO_BASE64_IMAP(b);
1303
0
                            if(offsets!=NULL) {
1304
0
                                *offsets++=sourceIndex;
1305
0
                                *offsets++=sourceIndex++;
1306
0
                            }
1307
0
                        } else {
1308
0
                            if(offsets!=NULL) {
1309
0
                                *offsets++=sourceIndex++;
1310
0
                            }
1311
0
                            b=(uint8_t)((c>>4)&0x3f);
1312
0
                            cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1313
0
                            cnv->charErrorBufferLength=1;
1314
0
                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1315
0
                        }
1316
0
                        bits=(uint8_t)((c&15)<<2);
1317
0
                        base64Counter=1;
1318
0
                        break;
1319
0
                    case 1:
1320
0
                        b=(uint8_t)(bits|(c>>14));
1321
0
                        *target++=TO_BASE64_IMAP(b);
1322
0
                        if(target<targetLimit) {
1323
0
                            b=(uint8_t)((c>>8)&0x3f);
1324
0
                            *target++=TO_BASE64_IMAP(b);
1325
0
                            if(target<targetLimit) {
1326
0
                                b=(uint8_t)((c>>2)&0x3f);
1327
0
                                *target++=TO_BASE64_IMAP(b);
1328
0
                                if(offsets!=NULL) {
1329
0
                                    *offsets++=sourceIndex;
1330
0
                                    *offsets++=sourceIndex;
1331
0
                                    *offsets++=sourceIndex++;
1332
0
                                }
1333
0
                            } else {
1334
0
                                if(offsets!=NULL) {
1335
0
                                    *offsets++=sourceIndex;
1336
0
                                    *offsets++=sourceIndex++;
1337
0
                                }
1338
0
                                b=(uint8_t)((c>>2)&0x3f);
1339
0
                                cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1340
0
                                cnv->charErrorBufferLength=1;
1341
0
                                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1342
0
                            }
1343
0
                        } else {
1344
0
                            if(offsets!=NULL) {
1345
0
                                *offsets++=sourceIndex++;
1346
0
                            }
1347
0
                            b=(uint8_t)((c>>8)&0x3f);
1348
0
                            cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1349
0
                            b=(uint8_t)((c>>2)&0x3f);
1350
0
                            cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
1351
0
                            cnv->charErrorBufferLength=2;
1352
0
                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1353
0
                        }
1354
0
                        bits=(uint8_t)((c&3)<<4);
1355
0
                        base64Counter=2;
1356
0
                        break;
1357
0
                    case 2:
1358
0
                        b=(uint8_t)(bits|(c>>12));
1359
0
                        *target++=TO_BASE64_IMAP(b);
1360
0
                        if(target<targetLimit) {
1361
0
                            b=(uint8_t)((c>>6)&0x3f);
1362
0
                            *target++=TO_BASE64_IMAP(b);
1363
0
                            if(target<targetLimit) {
1364
0
                                b=(uint8_t)(c&0x3f);
1365
0
                                *target++=TO_BASE64_IMAP(b);
1366
0
                                if(offsets!=NULL) {
1367
0
                                    *offsets++=sourceIndex;
1368
0
                                    *offsets++=sourceIndex;
1369
0
                                    *offsets++=sourceIndex++;
1370
0
                                }
1371
0
                            } else {
1372
0
                                if(offsets!=NULL) {
1373
0
                                    *offsets++=sourceIndex;
1374
0
                                    *offsets++=sourceIndex++;
1375
0
                                }
1376
0
                                b=(uint8_t)(c&0x3f);
1377
0
                                cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1378
0
                                cnv->charErrorBufferLength=1;
1379
0
                                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1380
0
                            }
1381
0
                        } else {
1382
0
                            if(offsets!=NULL) {
1383
0
                                *offsets++=sourceIndex++;
1384
0
                            }
1385
0
                            b=(uint8_t)((c>>6)&0x3f);
1386
0
                            cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1387
0
                            b=(uint8_t)(c&0x3f);
1388
0
                            cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
1389
0
                            cnv->charErrorBufferLength=2;
1390
0
                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1391
0
                        }
1392
0
                        bits=0;
1393
0
                        base64Counter=0;
1394
0
                        break;
1395
0
                    default:
1396
                        /* will never occur */
1397
0
                        break;
1398
0
                    }
1399
0
                }
1400
0
            } else {
1401
                /* target is full */
1402
0
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1403
0
                break;
1404
0
            }
1405
0
        }
1406
0
    }
1407
1408
0
    if(pArgs->flush && source>=sourceLimit) {
1409
        /* flush remaining bits to the target */
1410
0
        if(!inDirectMode) {
1411
0
            if(base64Counter!=0) {
1412
0
                if(target<targetLimit) {
1413
0
                    *target++=TO_BASE64_IMAP(bits);
1414
0
                    if(offsets!=NULL) {
1415
0
                        *offsets++=sourceIndex-1;
1416
0
                    }
1417
0
                } else {
1418
0
                    cnv->charErrorBuffer[cnv->charErrorBufferLength++]=TO_BASE64_IMAP(bits);
1419
0
                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1420
0
                }
1421
0
            }
1422
            /* need to terminate with a minus */
1423
0
            if(target<targetLimit) {
1424
0
                *target++=MINUS;
1425
0
                if(offsets!=NULL) {
1426
0
                    *offsets++=sourceIndex-1;
1427
0
                }
1428
0
            } else {
1429
0
                cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
1430
0
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1431
0
            }
1432
0
        }
1433
        /* reset the state for the next conversion */
1434
0
        cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
1435
0
    } else {
1436
        /* set the converter state back into UConverter */
1437
0
        cnv->fromUnicodeStatus=
1438
0
            (cnv->fromUnicodeStatus&0xf0000000)|    /* keep version*/
1439
0
            ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
1440
0
    }
1441
1442
    /* write back the updated pointers */
1443
0
    pArgs->source=source;
1444
0
    pArgs->target=(char *)target;
1445
0
    pArgs->offsets=offsets;
1446
0
    return;
1447
0
}
1448
U_CDECL_END
1449
1450
/*
 * Function table for the IMAP mailbox name converter.
 * It reuses _UTF7Open and _UTF7Reset and plugs in the IMAP-specific
 * conversion functions; unlike _UTF7Impl it registers no name function.
 * The initializer is positional; the meaning of each slot is fixed by the
 * UConverterImpl declaration, which is not part of this file.
 * NOTE(review): the to-Unicode and from-Unicode functions are each listed
 * twice, presumably for the plain and the with-offsets slots - confirm
 * against the UConverterImpl declaration.
 */
static const UConverterImpl _IMAPImpl={
1451
    UCNV_IMAP_MAILBOX,
1452
1453
    NULL,
1454
    NULL,
1455
1456
    _UTF7Open,
1457
    NULL,
1458
    _UTF7Reset,
1459
1460
    _IMAPToUnicodeWithOffsets,
1461
    _IMAPToUnicodeWithOffsets,
1462
    _IMAPFromUnicodeWithOffsets,
1463
    _IMAPFromUnicodeWithOffsets,
1464
    NULL,
1465
1466
    NULL,
1467
    NULL,
1468
    NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
1469
    NULL,
1470
    ucnv_getCompleteUnicodeSet,
1471
    NULL,
1472
    NULL
1473
};
1474
1475
/*
 * Static description of the IMAP mailbox name converter
 * (name "IMAP-mailbox-name", type UCNV_IMAP_MAILBOX).
 * The initializer is positional; field meanings come from the
 * UConverterStaticData declaration, which is not part of this file.
 * NOTE(review): "1, 4" presumably are the minimum and maximum bytes per
 * character - confirm against the declaration.
 */
static const UConverterStaticData _IMAPStaticData={
1476
    sizeof(UConverterStaticData),
1477
    "IMAP-mailbox-name",
1478
    0, /* TODO CCSID for IMAP-mailbox-name */
1479
    UCNV_IBM, UCNV_IMAP_MAILBOX,
1480
    1, 4,
1481
    { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
1482
    FALSE, FALSE,
1483
    0,
1484
    0,
1485
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1486
};
1487
1488
/*
 * Shared-data object for the IMAP mailbox name converter, built from the
 * static data and the implementation table defined above. This symbol has
 * external linkage (it is not declared static).
 */
const UConverterSharedData _IMAPData=
1489
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_IMAPStaticData, &_IMAPImpl);
1490
1491
#endif