Coverage Report

Created: 2026-06-13 06:44

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/icu/source/common/ucnv_u7.cpp
Line
Count
Source
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*  
4
**********************************************************************
5
*   Copyright (C) 2002-2016, International Business Machines
6
*   Corporation and others.  All Rights Reserved.
7
**********************************************************************
8
*   file name:  ucnv_u7.c
9
*   encoding:   UTF-8
10
*   tab size:   8 (not used)
11
*   indentation:4
12
*
13
*   created on: 2002jul01
14
*   created by: Markus W. Scherer
15
*
16
*   UTF-7 converter implementation. Used to be in ucnv_utf.c.
17
*/
18
19
#include "unicode/utypes.h"
20
21
#if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
22
23
#include "cmemory.h"
24
#include "unicode/ucnv.h"
25
#include "ucnv_bld.h"
26
#include "ucnv_cnv.h"
27
#include "uassert.h"
28
29
/* UTF-7 -------------------------------------------------------------------- */
30
31
/*
32
 * UTF-7 is a stateful encoding of Unicode.
33
 * It is defined in RFC 2152. (http://www.ietf.org/rfc/rfc2152.txt)
34
 * It was intended for use in Internet email systems, using in its bytewise
35
 * encoding only a subset of 7-bit US-ASCII.
36
 * UTF-7 is deprecated in favor of UTF-8/16/32 and SCSU, but still
37
 * occasionally used.
38
 *
39
 * For converting Unicode to UTF-7, the RFC allows encoding some US-ASCII
40
 * characters directly or in base64. Especially, the characters in set O
41
 * as defined in the RFC (see below) may be encoded directly but are not
42
 * allowed in, e.g., email headers.
43
 * By default, the ICU UTF-7 converter encodes set O directly.
44
 * By choosing the option "version=1", set O will be escaped instead.
45
 * For example:
46
 *     utf7Converter=ucnv_open("UTF-7,version=1");
47
 *
48
 * For details about email headers see RFC 2047.
49
 */
50
51
/*
52
 * Tests for US-ASCII characters belonging to character classes
53
 * defined in UTF-7.
54
 *
55
 * Set D (directly encoded characters) consists of the following
56
 * characters: the upper and lower case letters A through Z
57
 * and a through z, the 10 digits 0-9, and the following nine special
58
 * characters (note that "+" and "=" are omitted):
59
 *     '(),-./:?
60
 *
61
 * Set O (optional direct characters) consists of the following
62
 * characters (note that "\" and "~" are omitted):
63
 *     !"#$%&*;<=>@[]^_`{|}
64
 *
65
 * According to the rules in RFC 2152, the byte values for the following
66
 * US-ASCII characters are not used in UTF-7 and are therefore illegal:
67
 * - all C0 control codes except for CR LF TAB
68
 * - BACKSLASH
69
 * - TILDE
70
 * - DEL
71
 * - all codes beyond US-ASCII, i.e. all >127
72
 */
73
/*
 * Membership tests for the RFC 2152 character sets D and O.
 * Each range test subtracts the first code of the range and truncates the
 * difference to uint8_t, so that one unsigned comparison checks both ends.
 * Because of that truncation, only use these with byte/US-ASCII values.
 * The argument is evaluated more than once.
 */
#define inSetD(c) ( \
    (uint8_t)((c)-0x61)<26 ||       /* a-z */ \
    (uint8_t)((c)-0x41)<26 ||       /* A-Z */ \
    (uint8_t)((c)-0x30)<10 ||       /* 0-9 */ \
    (uint8_t)((c)-0x27)<3 ||        /* '() */ \
    (uint8_t)((c)-0x2c)<4 ||        /* ,-./ */ \
    (c)==0x3a || (c)==0x3f          /* :? */ \
)

#define inSetO(c) ( \
    (uint8_t)((c)-0x21)<6 ||                /* !"#$%& */ \
    (uint8_t)((c)-0x3b)<4 ||                /* ;<=> */ \
    (uint8_t)((c)-0x5d)<4 ||                /* ]^_` */ \
    (uint8_t)((c)-0x7b)<3 ||                /* {|} */ \
    (c)==0x2a || (c)==0x40 || (c)==0x5b     /* *@[ */ \
)
88
89
0
/* CR=0x0d, LF=0x0a, TAB=0x09, SP=0x20; the argument is evaluated more than once */
#define isCRLFTAB(c) ((c)==0x0d || (c)==0x0a || (c)==0x09)
#define isCRLFSPTAB(c) ((c)==0x20 || (c)==0x0d || (c)==0x0a || (c)==0x09)

/* US-ASCII codes with a special role in UTF-7 */
#define PLUS  0x2b
#define MINUS 0x2d
#define BACKSLASH 0x5c
#define TILDE 0x7e

/*
 * legal byte values: the US-ASCII characters from space up to, but not
 * including, tilde (0x20..0x7d), except for backslash; plus CR LF TAB
 */
#define isLegalUTF7(c) (((uint8_t)((c)-0x20)<94 && (c)!=BACKSLASH) || isCRLFTAB(c))
99
100
/* encode directly sets D and O and CR LF SP TAB */
101
/*
 * Indexed by US-ASCII code: nonzero if the character is written as itself.
 * The zero entries are the C0 controls other than TAB/LF/CR, plus (0x2b),
 * backslash (0x5c), tilde (0x7e) and DEL (0x7f).
 */
static const UBool encodeDirectlyMaximum[128]={
 /* 0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,     /* 0x00 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     /* 0x10 */

    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,     /* 0x20 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x30 */

    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x40 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,     /* 0x50 */

    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x60 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0      /* 0x70 */
};
115
116
/* encode directly set D and CR LF SP TAB but not set O */
117
/*
 * Indexed by US-ASCII code: nonzero if the character is written as itself
 * when set O must be escaped. Nonzero only for TAB/LF/CR, space, letters,
 * digits and the set D punctuation '(),-./:?
 */
static const UBool encodeDirectlyRestricted[128]={
 /* 0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,     /* 0x00 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     /* 0x10 */

    1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,     /* 0x20 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,     /* 0x30 */

    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x40 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,     /* 0x50 */

    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x60 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0      /* 0x70 */
};
131
132
/* Maps a 6-bit value (0..63) to the US-ASCII code of its base64 digit. */
static const uint8_t toBase64[64]={
    /*  0..12: A-M */
    65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
    /* 13..25: N-Z */
    78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
    /* 26..38: a-m */
    97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
    /* 39..51: n-z */
    110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122,
    /* 52..61: 0-9 */
    48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
    /* 62, 63: + and / */
    43, 47
};
145
146
/*
 * Indexed by US-ASCII code:
 *   0..63  value of a base64 digit
 *   -1     legal character that is not a base64 digit
 *   -2     the minus sign
 *   -3     illegal character
 */
static const int8_t fromBase64[128]={
    /* 0x00..0x1f: C0 controls, -1 for legal ones (CR LF TAB), -3 for illegal ones */
    -3, -3, -3, -3, -3, -3, -3, -3, -3, -1, -1, -3, -3, -1, -3, -3,
    -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,

    /* 0x20..0x2f: general punctuation with + (62) and / (63) and -2 for the minus sign */
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -2, -1, 63,
    /* 0x30..0x3f: digits, then :;<=>? */
    52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,

    /* 0x40..0x5f: @, A-Z, then [\]^_ with backslash illegal */
    -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -3, -1, -1, -1,

    /* 0x60..0x7f: `, a-z, then {|} and the illegal tilde and DEL */
    -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
    41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -3, -3
};
165
166
/*
167
 * converter status values:
168
 *
169
 * toUnicodeStatus:
170
 *     24 inDirectMode (boolean)
171
 * 23..16 base64Counter (-1..7)
172
 * 15..0  bits (up to 14 bits incoming base64)
173
 *
174
 * fromUnicodeStatus:
175
 * 31..28 version (0: set O direct  1: set O escaped)
176
 *     24 inDirectMode (boolean)
177
 * 23..16 base64Counter (0..2)
178
 *  7..0  bits (6 bits outgoing base64)
179
 *
180
 */
181
182
U_CDECL_BEGIN
183
static void U_CALLCONV
184
100
_UTF7Reset(UConverter *cnv, UConverterResetChoice choice) {
185
100
    if(choice<=UCNV_RESET_TO_UNICODE) {
186
        /* reset toUnicode */
187
100
        cnv->toUnicodeStatus=0x1000000; /* inDirectMode=true */
188
100
        cnv->toULength=0;
189
100
    }
190
100
    if(choice!=UCNV_RESET_TO_UNICODE) {
191
        /* reset fromUnicode */
192
100
        cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=true */
193
100
    }
194
100
}
195
196
static void U_CALLCONV
197
_UTF7Open(UConverter *cnv,
198
          UConverterLoadArgs *pArgs,
199
100
          UErrorCode *pErrorCode) {
200
100
    (void)pArgs;
201
100
    if(UCNV_GET_VERSION(cnv)<=1) {
202
        /* TODO(markus): Should just use cnv->options rather than copying the version number. */
203
100
        cnv->fromUnicodeStatus=UCNV_GET_VERSION(cnv)<<28;
204
100
        _UTF7Reset(cnv, UCNV_RESET_BOTH);
205
100
    } else {
206
0
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
207
0
    }
208
100
}
209
210
static void U_CALLCONV
211
_UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
212
0
                          UErrorCode *pErrorCode) {
213
0
    UConverter *cnv;
214
0
    const uint8_t *source, *sourceLimit;
215
0
    char16_t *target;
216
0
    const char16_t *targetLimit;
217
0
    int32_t *offsets;
218
219
0
    uint8_t *bytes;
220
0
    uint8_t byteIndex;
221
222
0
    int32_t length, targetCapacity;
223
224
    /* UTF-7 state */
225
0
    uint16_t bits;
226
0
    int8_t base64Counter;
227
0
    UBool inDirectMode;
228
229
0
    int8_t base64Value;
230
231
0
    int32_t sourceIndex, nextSourceIndex;
232
233
0
    uint8_t b;
234
    /* set up the local pointers */
235
0
    cnv=pArgs->converter;
236
237
0
    source=(const uint8_t *)pArgs->source;
238
0
    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
239
0
    target=pArgs->target;
240
0
    targetLimit=pArgs->targetLimit;
241
0
    offsets=pArgs->offsets;
242
    /* get the state machine state */
243
0
    {
244
0
        uint32_t status=cnv->toUnicodeStatus;
245
0
        inDirectMode=(UBool)((status>>24)&1);
246
0
        base64Counter=(int8_t)(status>>16);
247
0
        bits=(uint16_t)status;
248
0
    }
249
0
    bytes=cnv->toUBytes;
250
0
    byteIndex=cnv->toULength;
251
252
    /* sourceIndex=-1 if the current character began in the previous buffer */
253
0
    sourceIndex=byteIndex==0 ? 0 : -1;
254
0
    nextSourceIndex=0;
255
256
0
    if(inDirectMode) {
257
0
directMode:
258
        /*
259
         * In Direct Mode, most US-ASCII characters are encoded directly, i.e.,
260
         * with their US-ASCII byte values.
261
         * Backslash and Tilde and most control characters are not allowed in UTF-7.
262
         * A plus sign starts Unicode (or "escape") Mode.
263
         *
264
         * In Direct Mode, only the sourceIndex is used.
265
         */
266
0
        byteIndex=0;
267
0
        length=(int32_t)(sourceLimit-source);
268
0
        targetCapacity=(int32_t)(targetLimit-target);
269
0
        if(length>targetCapacity) {
270
0
            length=targetCapacity;
271
0
        }
272
0
        while(length>0) {
273
0
            b=*source++;
274
0
            if(!isLegalUTF7(b)) {
275
                /* illegal */
276
0
                bytes[0]=b;
277
0
                byteIndex=1;
278
0
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
279
0
                break;
280
0
            } else if(b!=PLUS) {
281
                /* write directly encoded character */
282
0
                *target++=b;
283
0
                if(offsets!=nullptr) {
284
0
                    *offsets++=sourceIndex++;
285
0
                }
286
0
            } else /* PLUS */ {
287
                /* switch to Unicode mode */
288
0
                nextSourceIndex=++sourceIndex;
289
0
                inDirectMode=false;
290
0
                byteIndex=0;
291
0
                bits=0;
292
0
                base64Counter=-1;
293
0
                goto unicodeMode;
294
0
            }
295
0
            --length;
296
0
        }
297
0
        if(source<sourceLimit && target>=targetLimit) {
298
            /* target is full */
299
0
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
300
0
        }
301
0
    } else {
302
0
unicodeMode:
303
        /*
304
         * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
305
         * The base64 sequence ends with any character that is not in the base64 alphabet.
306
         * A terminating minus sign is consumed.
307
         *
308
         * In Unicode Mode, the sourceIndex has the index to the start of the current
309
         * base64 bytes, while nextSourceIndex is precisely parallel to source,
310
         * keeping the index to the following byte.
311
         * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
312
         */
313
0
        while(source<sourceLimit) {
314
0
            if(target<targetLimit) {
315
0
                bytes[byteIndex++]=b=*source++;
316
0
                ++nextSourceIndex;
317
0
                base64Value = -3; /* initialize as illegal */
318
0
                if(b>=126 || (base64Value=fromBase64[b])==-3 || base64Value==-1) {
319
                    /* either
320
                     * base64Value==-1 for any legal character except base64 and minus sign, or
321
                     * base64Value==-3 for illegal characters:
322
                     * 1. In either case, leave Unicode mode.
323
                     * 2.1. If we ended with an incomplete char16_t or none after the +, then
324
                     *      generate an error for the preceding erroneous sequence and deal with
325
                     *      the current (possibly illegal) character next time through.
326
                     * 2.2. Else the current char comes after a complete char16_t, which was already
327
                     *      pushed to the output buf, so:
328
                     * 2.2.1. If the current char is legal, just save it for processing next time.
329
                     *        It may be for example, a plus which we need to deal with in direct mode.
330
                     * 2.2.2. Else if the current char is illegal, we might as well deal with it here.
331
                     */
332
0
                    inDirectMode=true;
333
0
                    if(base64Counter==-1) {
334
                        /* illegal: + immediately followed by something other than base64 or minus sign */
335
                        /* include the plus sign in the reported sequence, but not the subsequent char */
336
0
                        --source;
337
0
                        bytes[0]=PLUS;
338
0
                        byteIndex=1;
339
0
                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
340
0
                        break;
341
0
                    } else if(bits!=0) {
342
                        /* bits are illegally left over, a char16_t is incomplete */
343
                        /* don't include current char (legal or illegal) in error seq */
344
0
                        --source;
345
0
                        --byteIndex;
346
0
                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
347
0
                        break;
348
0
                    } else {
349
                        /* previous char16_t was complete */
350
0
                        if(base64Value==-3) {
351
                            /* current character is illegal, deal with it here */
352
0
                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
353
0
                            break;
354
0
                        } else {
355
                            /* un-read the current character in case it is a plus sign */
356
0
                            --source;
357
0
                            sourceIndex=nextSourceIndex-1;
358
0
                            goto directMode;
359
0
                        }
360
0
                    }
361
0
                } else if(base64Value>=0) {
362
                    /* collect base64 bytes into UChars */
363
0
                    switch(base64Counter) {
364
0
                    case -1: /* -1 is immediately after the + */
365
0
                    case 0:
366
0
                        bits=base64Value;
367
0
                        base64Counter=1;
368
0
                        break;
369
0
                    case 1:
370
0
                    case 3:
371
0
                    case 4:
372
0
                    case 6:
373
0
                        bits=(uint16_t)((bits<<6)|base64Value);
374
0
                        ++base64Counter;
375
0
                        break;
376
0
                    case 2:
377
0
                        *target++=(char16_t)((bits<<4)|(base64Value>>2));
378
0
                        if(offsets!=nullptr) {
379
0
                            *offsets++=sourceIndex;
380
0
                            sourceIndex=nextSourceIndex-1;
381
0
                        }
382
0
                        bytes[0]=b; /* keep this byte in case an error occurs */
383
0
                        byteIndex=1;
384
0
                        bits=(uint16_t)(base64Value&3);
385
0
                        base64Counter=3;
386
0
                        break;
387
0
                    case 5:
388
0
                        *target++=(char16_t)((bits<<2)|(base64Value>>4));
389
0
                        if(offsets!=nullptr) {
390
0
                            *offsets++=sourceIndex;
391
0
                            sourceIndex=nextSourceIndex-1;
392
0
                        }
393
0
                        bytes[0]=b; /* keep this byte in case an error occurs */
394
0
                        byteIndex=1;
395
0
                        bits=(uint16_t)(base64Value&15);
396
0
                        base64Counter=6;
397
0
                        break;
398
0
                    case 7:
399
0
                        *target++=(char16_t)((bits<<6)|base64Value);
400
0
                        if(offsets!=nullptr) {
401
0
                            *offsets++=sourceIndex;
402
0
                            sourceIndex=nextSourceIndex;
403
0
                        }
404
0
                        byteIndex=0;
405
0
                        bits=0;
406
0
                        base64Counter=0;
407
0
                        break;
408
0
                    default:
409
                        /* will never occur */
410
0
                        break;
411
0
                    }
412
0
                } else /*base64Value==-2*/ {
413
                    /* minus sign terminates the base64 sequence */
414
0
                    inDirectMode=true;
415
0
                    if(base64Counter==-1) {
416
                        /* +- i.e. a minus immediately following a plus */
417
0
                        *target++=PLUS;
418
0
                        if(offsets!=nullptr) {
419
0
                            *offsets++=sourceIndex-1;
420
0
                        }
421
0
                    } else {
422
                        /* absorb the minus and leave the Unicode Mode */
423
0
                        if(bits!=0) {
424
                            /* bits are illegally left over, a char16_t is incomplete */
425
0
                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
426
0
                            break;
427
0
                        }
428
0
                    }
429
0
                    sourceIndex=nextSourceIndex;
430
0
                    goto directMode;
431
0
                }
432
0
            } else {
433
                /* target is full */
434
0
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
435
0
                break;
436
0
            }
437
0
        }
438
0
    }
439
440
0
    if(U_SUCCESS(*pErrorCode) && pArgs->flush && source==sourceLimit && bits==0) {
441
        /*
442
         * if we are in Unicode mode, then the byteIndex might not be 0,
443
         * but that is ok if bits==0
444
         * -> we set byteIndex=0 at the end of the stream to avoid a truncated error
445
         * (not true for IMAP-mailbox-name where we must end in direct mode)
446
         */
447
0
        byteIndex=0;
448
0
    }
449
450
    /* set the converter state back into UConverter */
451
0
    cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
452
0
    cnv->toULength=byteIndex;
453
454
    /* write back the updated pointers */
455
0
    pArgs->source=(const char *)source;
456
0
    pArgs->target=target;
457
0
    pArgs->offsets=offsets;
458
0
}
459
460
static void U_CALLCONV
461
_UTF7FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
462
0
                            UErrorCode *pErrorCode) {
463
0
    UConverter *cnv;
464
0
    const char16_t *source, *sourceLimit;
465
0
    uint8_t *target, *targetLimit;
466
0
    int32_t *offsets;
467
468
0
    int32_t length, targetCapacity, sourceIndex;
469
0
    char16_t c;
470
471
    /* UTF-7 state */
472
0
    const UBool *encodeDirectly;
473
0
    uint8_t bits;
474
0
    int8_t base64Counter;
475
0
    UBool inDirectMode;
476
477
    /* set up the local pointers */
478
0
    cnv=pArgs->converter;
479
480
    /* set up the local pointers */
481
0
    source=pArgs->source;
482
0
    sourceLimit=pArgs->sourceLimit;
483
0
    target=(uint8_t *)pArgs->target;
484
0
    targetLimit=(uint8_t *)pArgs->targetLimit;
485
0
    offsets=pArgs->offsets;
486
487
    /* get the state machine state */
488
0
    {
489
0
        uint32_t status=cnv->fromUnicodeStatus;
490
0
        encodeDirectly= status<0x10000000 ? encodeDirectlyMaximum : encodeDirectlyRestricted;
491
0
        inDirectMode=(UBool)((status>>24)&1);
492
0
        base64Counter=(int8_t)(status>>16);
493
0
        bits=(uint8_t)status;
494
0
        U_ASSERT(bits<=UPRV_LENGTHOF(toBase64));
495
0
    }
496
497
    /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
498
0
    sourceIndex=0;
499
500
0
    if(inDirectMode) {
501
0
directMode:
502
0
        length=(int32_t)(sourceLimit-source);
503
0
        targetCapacity=(int32_t)(targetLimit-target);
504
0
        if(length>targetCapacity) {
505
0
            length=targetCapacity;
506
0
        }
507
0
        while(length>0) {
508
0
            c=*source++;
509
            /* currently always encode CR LF SP TAB directly */
510
0
            if(c<=127 && encodeDirectly[c]) {
511
                /* encode directly */
512
0
                *target++=(uint8_t)c;
513
0
                if(offsets!=nullptr) {
514
0
                    *offsets++=sourceIndex++;
515
0
                }
516
0
            } else if(c==PLUS) {
517
                /* output +- for + */
518
0
                *target++=PLUS;
519
0
                if(target<targetLimit) {
520
0
                    *target++=MINUS;
521
0
                    if(offsets!=nullptr) {
522
0
                        *offsets++=sourceIndex;
523
0
                        *offsets++=sourceIndex++;
524
0
                    }
525
                    /* realign length and targetCapacity */
526
0
                    goto directMode;
527
0
                } else {
528
0
                    if(offsets!=nullptr) {
529
0
                        *offsets++=sourceIndex++;
530
0
                    }
531
0
                    cnv->charErrorBuffer[0]=MINUS;
532
0
                    cnv->charErrorBufferLength=1;
533
0
                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
534
0
                    break;
535
0
                }
536
0
            } else {
537
                /* un-read this character and switch to Unicode Mode */
538
0
                --source;
539
0
                *target++=PLUS;
540
0
                if(offsets!=nullptr) {
541
0
                    *offsets++=sourceIndex;
542
0
                }
543
0
                inDirectMode=false;
544
0
                base64Counter=0;
545
0
                goto unicodeMode;
546
0
            }
547
0
            --length;
548
0
        }
549
0
        if(source<sourceLimit && target>=targetLimit) {
550
            /* target is full */
551
0
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
552
0
        }
553
0
    } else {
554
0
unicodeMode:
555
0
        while(source<sourceLimit) {
556
0
            if(target<targetLimit) {
557
0
                c=*source++;
558
0
                if(c<=127 && encodeDirectly[c]) {
559
                    /* encode directly */
560
0
                    inDirectMode=true;
561
562
                    /* trick: back out this character to make this easier */
563
0
                    --source;
564
565
                    /* terminate the base64 sequence */
566
0
                    if(base64Counter!=0) {
567
                        /* write remaining bits for the previous character */
568
0
                        *target++=toBase64[bits];
569
0
                        if(offsets!=nullptr) {
570
0
                            *offsets++=sourceIndex-1;
571
0
                        }
572
0
                    }
573
0
                    if(fromBase64[c]!=-1) {
574
                        /* need to terminate with a minus */
575
0
                        if(target<targetLimit) {
576
0
                            *target++=MINUS;
577
0
                            if(offsets!=nullptr) {
578
0
                                *offsets++=sourceIndex-1;
579
0
                            }
580
0
                        } else {
581
0
                            cnv->charErrorBuffer[0]=MINUS;
582
0
                            cnv->charErrorBufferLength=1;
583
0
                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
584
0
                            break;
585
0
                        }
586
0
                    }
587
0
                    goto directMode;
588
0
                } else {
589
                    /*
590
                     * base64 this character:
591
                     * Output 2 or 3 base64 bytes for the remaining bits of the previous character
592
                     * and the bits of this character, each implicitly in UTF-16BE.
593
                     *
594
                     * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
595
                     * character to the next. The actual 2 or 4 bits are shifted to the left edge
596
                     * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
597
                     */
598
0
                    switch(base64Counter) {
599
0
                    case 0:
600
0
                        *target++=toBase64[c>>10];
601
0
                        if(target<targetLimit) {
602
0
                            *target++=toBase64[(c>>4)&0x3f];
603
0
                            if(offsets!=nullptr) {
604
0
                                *offsets++=sourceIndex;
605
0
                                *offsets++=sourceIndex++;
606
0
                            }
607
0
                        } else {
608
0
                            if(offsets!=nullptr) {
609
0
                                *offsets++=sourceIndex++;
610
0
                            }
611
0
                            cnv->charErrorBuffer[0]=toBase64[(c>>4)&0x3f];
612
0
                            cnv->charErrorBufferLength=1;
613
0
                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
614
0
                        }
615
0
                        bits=(uint8_t)((c&15)<<2);
616
0
                        base64Counter=1;
617
0
                        break;
618
0
                    case 1:
619
0
                        *target++=toBase64[bits|(c>>14)];
620
0
                        if(target<targetLimit) {
621
0
                            *target++=toBase64[(c>>8)&0x3f];
622
0
                            if(target<targetLimit) {
623
0
                                *target++=toBase64[(c>>2)&0x3f];
624
0
                                if(offsets!=nullptr) {
625
0
                                    *offsets++=sourceIndex;
626
0
                                    *offsets++=sourceIndex;
627
0
                                    *offsets++=sourceIndex++;
628
0
                                }
629
0
                            } else {
630
0
                                if(offsets!=nullptr) {
631
0
                                    *offsets++=sourceIndex;
632
0
                                    *offsets++=sourceIndex++;
633
0
                                }
634
0
                                cnv->charErrorBuffer[0]=toBase64[(c>>2)&0x3f];
635
0
                                cnv->charErrorBufferLength=1;
636
0
                                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
637
0
                            }
638
0
                        } else {
639
0
                            if(offsets!=nullptr) {
640
0
                                *offsets++=sourceIndex++;
641
0
                            }
642
0
                            cnv->charErrorBuffer[0]=toBase64[(c>>8)&0x3f];
643
0
                            cnv->charErrorBuffer[1]=toBase64[(c>>2)&0x3f];
644
0
                            cnv->charErrorBufferLength=2;
645
0
                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
646
0
                        }
647
0
                        bits=(uint8_t)((c&3)<<4);
648
0
                        base64Counter=2;
649
0
                        break;
650
0
                    case 2:
651
0
                        *target++=toBase64[bits|(c>>12)];
652
0
                        if(target<targetLimit) {
653
0
                            *target++=toBase64[(c>>6)&0x3f];
654
0
                            if(target<targetLimit) {
655
0
                                *target++=toBase64[c&0x3f];
656
0
                                if(offsets!=nullptr) {
657
0
                                    *offsets++=sourceIndex;
658
0
                                    *offsets++=sourceIndex;
659
0
                                    *offsets++=sourceIndex++;
660
0
                                }
661
0
                            } else {
662
0
                                if(offsets!=nullptr) {
663
0
                                    *offsets++=sourceIndex;
664
0
                                    *offsets++=sourceIndex++;
665
0
                                }
666
0
                                cnv->charErrorBuffer[0]=toBase64[c&0x3f];
667
0
                                cnv->charErrorBufferLength=1;
668
0
                                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
669
0
                            }
670
0
                        } else {
671
0
                            if(offsets!=nullptr) {
672
0
                                *offsets++=sourceIndex++;
673
0
                            }
674
0
                            cnv->charErrorBuffer[0]=toBase64[(c>>6)&0x3f];
675
0
                            cnv->charErrorBuffer[1]=toBase64[c&0x3f];
676
0
                            cnv->charErrorBufferLength=2;
677
0
                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
678
0
                        }
679
0
                        bits=0;
680
0
                        base64Counter=0;
681
0
                        break;
682
0
                    default:
683
                        /* will never occur */
684
0
                        break;
685
0
                    }
686
0
                }
687
0
            } else {
688
                /* target is full */
689
0
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
690
0
                break;
691
0
            }
692
0
        }
693
0
    }
694
695
0
    if(pArgs->flush && source>=sourceLimit) {
696
        /* flush remaining bits to the target */
697
0
        if(!inDirectMode) {
698
0
            if (base64Counter!=0) {
699
0
                if(target<targetLimit) {
700
0
                    *target++=toBase64[bits];
701
0
                    if(offsets!=nullptr) {
702
0
                        *offsets++=sourceIndex-1;
703
0
                    }
704
0
                } else {
705
0
                    cnv->charErrorBuffer[cnv->charErrorBufferLength++]=toBase64[bits];
706
0
                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
707
0
                }
708
0
            }
709
            /* Add final MINUS to terminate unicodeMode */
710
0
            if(target<targetLimit) {
711
0
                *target++=MINUS;
712
0
                if(offsets!=nullptr) {
713
0
                    *offsets++=sourceIndex-1;
714
0
                }
715
0
            } else {
716
0
                cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
717
0
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
718
0
            }
719
0
        }
720
        /* reset the state for the next conversion */
721
0
        cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=true */
722
0
    } else {
723
        /* set the converter state back into UConverter */
724
0
        cnv->fromUnicodeStatus=
725
0
            (cnv->fromUnicodeStatus&0xf0000000)|    /* keep version*/
726
0
            ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
727
0
    }
728
729
    /* write back the updated pointers */
730
0
    pArgs->source=source;
731
0
    pArgs->target=(char *)target;
732
0
    pArgs->offsets=offsets;
733
0
}
734
735
static const char * U_CALLCONV
736
0
_UTF7GetName(const UConverter *cnv) {
737
0
    switch(cnv->fromUnicodeStatus>>28) {
738
0
    case 1:
739
0
        return "UTF-7,version=1";
740
0
    default:
741
0
        return "UTF-7";
742
0
    }
743
0
}
744
U_CDECL_END
745
746
/*
 * Implementation table for the UTF-7 converter.
 *
 * The initializer is positional. It names the converter type (UCNV_UTF7),
 * the _UTF7Open and _UTF7Reset hooks, the to-Unicode function (listed twice),
 * the from-Unicode function (listed twice), _UTF7GetName, and
 * ucnv_getCompleteUnicodeSet; every other slot is nullptr.
 *
 * NOTE(review): the meaning of each slot is fixed by the UConverterImpl
 * declaration, which is not visible in this file - presumably the duplicated
 * entries are the "plain" and "with offsets" variants of each direction.
 * Confirm against the struct declaration before reordering anything.
 */
static const UConverterImpl _UTF7Impl={
747
    UCNV_UTF7,
748
749
    nullptr,
750
    nullptr,
751
752
    _UTF7Open,
753
    nullptr,
754
    _UTF7Reset,
755
756
    _UTF7ToUnicodeWithOffsets,
757
    _UTF7ToUnicodeWithOffsets,
758
    _UTF7FromUnicodeWithOffsets,
759
    _UTF7FromUnicodeWithOffsets,
760
    nullptr,
761
762
    nullptr,
763
    _UTF7GetName,
764
    nullptr, /* we don't need writeSub() because we never call a callback at fromUnicode() */
765
    nullptr,
766
    ucnv_getCompleteUnicodeSet,
767
768
    nullptr,
769
    nullptr
770
};
771
772
/*
 * Static (immutable) description of the UTF-7 converter: structure size,
 * the name "UTF-7", a placeholder CCSID of 0, the UCNV_IBM / UCNV_UTF7
 * identifiers, and a substitution byte sequence starting with 0x3f ('?')
 * that, per the comment below, is not actually used.
 *
 * NOTE(review): the initializer is positional; field names come from the
 * UConverterStaticData declaration, which is not visible here. The pair
 * "1, 4" is presumably the minimum/maximum number of bytes per character -
 * confirm against the struct declaration.
 */
static const UConverterStaticData _UTF7StaticData={
773
    sizeof(UConverterStaticData),
774
    "UTF-7",
775
    0, /* TODO CCSID for UTF-7 */
776
    UCNV_IBM, UCNV_UTF7,
777
    1, 4,
778
    { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
779
    false, false,
780
    0,
781
    0,
782
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
783
};
784
785
/*
 * Shared-data object for the UTF-7 converter. It is built by
 * UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER from the addresses of the static
 * description (_UTF7StaticData) and the implementation table (_UTF7Impl).
 * Unlike those two tables, this object is not declared static.
 */
const UConverterSharedData _UTF7Data=
786
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF7StaticData, &_UTF7Impl);
787
788
/* IMAP mailbox name encoding ----------------------------------------------- */
789
790
/*
791
 * RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1
792
 * http://www.ietf.org/rfc/rfc2060.txt
793
 *
794
 * 5.1.3.  Mailbox International Naming Convention
795
 *
796
 * By convention, international mailbox names are specified using a
797
 * modified version of the UTF-7 encoding described in [UTF-7].  The
798
 * purpose of these modifications is to correct the following problems
799
 * with UTF-7:
800
 *
801
 *    1) UTF-7 uses the "+" character for shifting; this conflicts with
802
 *       the common use of "+" in mailbox names, in particular USENET
803
 *       newsgroup names.
804
 *
805
 *    2) UTF-7's encoding is BASE64 which uses the "/" character; this
806
 *       conflicts with the use of "/" as a popular hierarchy delimiter.
807
 *
808
 *    3) UTF-7 prohibits the unencoded usage of "\"; this conflicts with
809
 *       the use of "\" as a popular hierarchy delimiter.
810
 *
811
 *    4) UTF-7 prohibits the unencoded usage of "~"; this conflicts with
812
 *       the use of "~" in some servers as a home directory indicator.
813
 *
814
 *    5) UTF-7 permits multiple alternate forms to represent the same
815
 *       string; in particular, printable US-ASCII characters can be
816
 *       represented in encoded form.
817
 *
818
 * In modified UTF-7, printable US-ASCII characters except for "&"
819
 * represent themselves; that is, characters with octet values 0x20-0x25
820
 * and 0x27-0x7e.  The character "&" (0x26) is represented by the two-
821
 * octet sequence "&-".
822
 *
823
 * All other characters (octet values 0x00-0x1f, 0x7f-0xff, and all
824
 * Unicode 16-bit octets) are represented in modified BASE64, with a
825
 * further modification from [UTF-7] that "," is used instead of "/".
826
 * Modified BASE64 MUST NOT be used to represent any printing US-ASCII
827
 * character which can represent itself.
828
 *
829
 * "&" is used to shift to modified BASE64 and "-" to shift back to US-
830
 * ASCII.  All names start in US-ASCII, and MUST end in US-ASCII (that
831
 * is, a name that ends with a Unicode 16-bit octet MUST end with a "-
832
 * ").
833
 *
834
 * For example, here is a mailbox name which mixes English, Japanese,
835
 * and Chinese text: ~peter/mail/&ZeVnLIqe-/&U,BTFw-
836
 */
837
838
/*
839
 * Tests for US-ASCII characters belonging to character classes
840
 * defined in UTF-7.
841
 *
842
 * Set D (directly encoded characters) consists of the following
843
 * characters: the upper and lower case letters A through Z
844
 * and a through z, the 10 digits 0-9, and the following nine special
845
 * characters (note that "+" and "=" are omitted):
846
 *     '(),-./:?
847
 *
848
 * Set O (optional direct characters) consists of the following
849
 * characters (note that "\" and "~" are omitted):
850
 *     !"#$%&*;<=>@[]^_`{|}
851
 *
852
 * According to the rules in RFC 2152, the byte values for the following
853
 * US-ASCII characters are not used in UTF-7 and are therefore illegal:
854
 * - all C0 control codes except for CR LF TAB
855
 * - BACKSLASH
856
 * - TILDE
857
 * - DEL
858
 * - all codes beyond US-ASCII, i.e. all >127
859
 */
860
861
/*
 * Helper macros for the IMAP mailbox-name variant of UTF-7.
 *
 * All function-like macros below may evaluate their argument more than once,
 * so never pass an expression with side effects (e.g. *p++).
 * TO_BASE64_IMAP and FROM_BASE64_IMAP index the toBase64[] / fromBase64[]
 * tables that are defined elsewhere in this file.
 */

/* uses '&' not '+' to start a base64 sequence */
#define AMPERSAND 0x26
#define COMMA 0x2c
#define SLASH 0x2f

/* legal byte values: all US-ASCII graphic characters 0x20..0x7e */
#define isLegalIMAP(c) (0x20<=(c) && (c)<=0x7e)

/*
 * direct-encode all of printable ASCII 0x20..0x7e except '&' 0x26
 *
 * The argument is parenthesized in the comparison so that an expression
 * argument (for example a|b) is compared as a whole against AMPERSAND
 * instead of being split by operator precedence.
 */
#define inSetDIMAP(c) (isLegalIMAP(c) && (c)!=AMPERSAND)

/* map a 6-bit value to its modified-base64 byte: value 63 is ',' instead of '/' */
#define TO_BASE64_IMAP(n) ((n)<63 ? toBase64[n] : COMMA)
/* map a byte to its 6-bit value: ',' is 63, '/' is rejected with -1, all else via fromBase64[] */
#define FROM_BASE64_IMAP(c) ((c)==COMMA ? 63 : (c)==SLASH ? -1 : fromBase64[c])
874
875
/*
876
 * converter status values:
877
 *
878
 * toUnicodeStatus:
879
 *     24 inDirectMode (boolean)
880
 * 23..16 base64Counter (-1..7)
881
 * 15..0  bits (up to 14 bits incoming base64)
882
 *
883
 * fromUnicodeStatus:
884
 *     24 inDirectMode (boolean)
885
 * 23..16 base64Counter (0..2)
886
 *  7..0  bits (6 bits outgoing base64)
887
 *
888
 * ignore bits 31..25
889
 */
890
891
U_CDECL_BEGIN
892
static void U_CALLCONV
893
_IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
894
0
                          UErrorCode *pErrorCode) {
895
0
    UConverter *cnv;
896
0
    const uint8_t *source, *sourceLimit;
897
0
    char16_t *target;
898
0
    const char16_t *targetLimit;
899
0
    int32_t *offsets;
900
901
0
    uint8_t *bytes;
902
0
    uint8_t byteIndex;
903
904
0
    int32_t length, targetCapacity;
905
906
    /* UTF-7 state */
907
0
    uint16_t bits;
908
0
    int8_t base64Counter;
909
0
    UBool inDirectMode;
910
911
0
    int8_t base64Value;
912
913
0
    int32_t sourceIndex, nextSourceIndex;
914
915
0
    char16_t c;
916
0
    uint8_t b;
917
918
    /* set up the local pointers */
919
0
    cnv=pArgs->converter;
920
921
0
    source=(const uint8_t *)pArgs->source;
922
0
    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
923
0
    target=pArgs->target;
924
0
    targetLimit=pArgs->targetLimit;
925
0
    offsets=pArgs->offsets;
926
    /* get the state machine state */
927
0
    {
928
0
        uint32_t status=cnv->toUnicodeStatus;
929
0
        inDirectMode=(UBool)((status>>24)&1);
930
0
        base64Counter=(int8_t)(status>>16);
931
0
        bits=(uint16_t)status;
932
0
    }
933
0
    bytes=cnv->toUBytes;
934
0
    byteIndex=cnv->toULength;
935
936
    /* sourceIndex=-1 if the current character began in the previous buffer */
937
0
    sourceIndex=byteIndex==0 ? 0 : -1;
938
0
    nextSourceIndex=0;
939
940
0
    if(inDirectMode) {
941
0
directMode:
942
        /*
943
         * In Direct Mode, US-ASCII characters are encoded directly, i.e.,
944
         * with their US-ASCII byte values.
945
         * An ampersand starts Unicode (or "escape") Mode.
946
         *
947
         * In Direct Mode, only the sourceIndex is used.
948
         */
949
0
        byteIndex=0;
950
0
        length=(int32_t)(sourceLimit-source);
951
0
        targetCapacity=(int32_t)(targetLimit-target);
952
0
        if(length>targetCapacity) {
953
0
            length=targetCapacity;
954
0
        }
955
0
        while(length>0) {
956
0
            b=*source++;
957
0
            if(!isLegalIMAP(b)) {
958
                /* illegal */
959
0
                bytes[0]=b;
960
0
                byteIndex=1;
961
0
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
962
0
                break;
963
0
            } else if(b!=AMPERSAND) {
964
                /* write directly encoded character */
965
0
                *target++=b;
966
0
                if(offsets!=nullptr) {
967
0
                    *offsets++=sourceIndex++;
968
0
                }
969
0
            } else /* AMPERSAND */ {
970
                /* switch to Unicode mode */
971
0
                nextSourceIndex=++sourceIndex;
972
0
                inDirectMode=false;
973
0
                byteIndex=0;
974
0
                bits=0;
975
0
                base64Counter=-1;
976
0
                goto unicodeMode;
977
0
            }
978
0
            --length;
979
0
        }
980
0
        if(source<sourceLimit && target>=targetLimit) {
981
            /* target is full */
982
0
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
983
0
        }
984
0
    } else {
985
0
unicodeMode:
986
        /*
987
         * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
988
         * The base64 sequence ends with any character that is not in the base64 alphabet.
989
         * A terminating minus sign is consumed.
990
         * US-ASCII must not be base64-ed.
991
         *
992
         * In Unicode Mode, the sourceIndex has the index to the start of the current
993
         * base64 bytes, while nextSourceIndex is precisely parallel to source,
994
         * keeping the index to the following byte.
995
         * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
996
         */
997
0
        while(source<sourceLimit) {
998
0
            if(target<targetLimit) {
999
0
                bytes[byteIndex++]=b=*source++;
1000
0
                ++nextSourceIndex;
1001
0
                if(b>0x7e) {
1002
                    /* illegal - test other illegal US-ASCII values by base64Value==-3 */
1003
0
                    inDirectMode=true;
1004
0
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1005
0
                    break;
1006
0
                } else if((base64Value=FROM_BASE64_IMAP(b))>=0) {
1007
                    /* collect base64 bytes into UChars */
1008
0
                    switch(base64Counter) {
1009
0
                    case -1: /* -1 is immediately after the & */
1010
0
                    case 0:
1011
0
                        bits=base64Value;
1012
0
                        base64Counter=1;
1013
0
                        break;
1014
0
                    case 1:
1015
0
                    case 3:
1016
0
                    case 4:
1017
0
                    case 6:
1018
0
                        bits=(uint16_t)((bits<<6)|base64Value);
1019
0
                        ++base64Counter;
1020
0
                        break;
1021
0
                    case 2:
1022
0
                        c=(char16_t)((bits<<4)|(base64Value>>2));
1023
0
                        if(isLegalIMAP(c)) {
1024
                            /* illegal */
1025
0
                            inDirectMode=true;
1026
0
                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1027
0
                            goto endloop;
1028
0
                        }
1029
0
                        *target++=c;
1030
0
                        if(offsets!=nullptr) {
1031
0
                            *offsets++=sourceIndex;
1032
0
                            sourceIndex=nextSourceIndex-1;
1033
0
                        }
1034
0
                        bytes[0]=b; /* keep this byte in case an error occurs */
1035
0
                        byteIndex=1;
1036
0
                        bits=(uint16_t)(base64Value&3);
1037
0
                        base64Counter=3;
1038
0
                        break;
1039
0
                    case 5:
1040
0
                        c=(char16_t)((bits<<2)|(base64Value>>4));
1041
0
                        if(isLegalIMAP(c)) {
1042
                            /* illegal */
1043
0
                            inDirectMode=true;
1044
0
                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1045
0
                            goto endloop;
1046
0
                        }
1047
0
                        *target++=c;
1048
0
                        if(offsets!=nullptr) {
1049
0
                            *offsets++=sourceIndex;
1050
0
                            sourceIndex=nextSourceIndex-1;
1051
0
                        }
1052
0
                        bytes[0]=b; /* keep this byte in case an error occurs */
1053
0
                        byteIndex=1;
1054
0
                        bits=(uint16_t)(base64Value&15);
1055
0
                        base64Counter=6;
1056
0
                        break;
1057
0
                    case 7:
1058
0
                        c=(char16_t)((bits<<6)|base64Value);
1059
0
                        if(isLegalIMAP(c)) {
1060
                            /* illegal */
1061
0
                            inDirectMode=true;
1062
0
                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1063
0
                            goto endloop;
1064
0
                        }
1065
0
                        *target++=c;
1066
0
                        if(offsets!=nullptr) {
1067
0
                            *offsets++=sourceIndex;
1068
0
                            sourceIndex=nextSourceIndex;
1069
0
                        }
1070
0
                        byteIndex=0;
1071
0
                        bits=0;
1072
0
                        base64Counter=0;
1073
0
                        break;
1074
0
                    default:
1075
                        /* will never occur */
1076
0
                        break;
1077
0
                    }
1078
0
                } else if(base64Value==-2) {
1079
                    /* minus sign terminates the base64 sequence */
1080
0
                    inDirectMode=true;
1081
0
                    if(base64Counter==-1) {
1082
                        /* &- i.e. a minus immediately following an ampersand */
1083
0
                        *target++=AMPERSAND;
1084
0
                        if(offsets!=nullptr) {
1085
0
                            *offsets++=sourceIndex-1;
1086
0
                        }
1087
0
                    } else {
1088
                        /* absorb the minus and leave the Unicode Mode */
1089
0
                        if(bits!=0 || (base64Counter!=0 && base64Counter!=3 && base64Counter!=6)) {
1090
                            /* bits are illegally left over, a char16_t is incomplete */
1091
                            /* base64Counter other than 0, 3, 6 means non-minimal zero-padding, also illegal */
1092
0
                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1093
0
                            break;
1094
0
                        }
1095
0
                    }
1096
0
                    sourceIndex=nextSourceIndex;
1097
0
                    goto directMode;
1098
0
                } else {
1099
0
                    if(base64Counter==-1) {
1100
                        /* illegal: & immediately followed by something other than base64 or minus sign */
1101
                        /* include the ampersand in the reported sequence */
1102
0
                        --sourceIndex;
1103
0
                        bytes[0]=AMPERSAND;
1104
0
                        bytes[1]=b;
1105
0
                        byteIndex=2;
1106
0
                    }
1107
                    /* base64Value==-1 for characters that are illegal only in Unicode mode */
1108
                    /* base64Value==-3 for illegal characters */
1109
                    /* illegal */
1110
0
                    inDirectMode=true;
1111
0
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1112
0
                    break;
1113
0
                }
1114
0
            } else {
1115
                /* target is full */
1116
0
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1117
0
                break;
1118
0
            }
1119
0
        }
1120
0
    }
1121
0
endloop:
1122
1123
    /*
1124
     * the end of the input stream and detection of truncated input
1125
     * are handled by the framework, but here we must check if we are in Unicode
1126
     * mode and byteIndex==0 because we must end in direct mode
1127
     *
1128
     * conditions:
1129
     *   successful
1130
     *   in Unicode mode and byteIndex==0
1131
     *   end of input and no truncated input
1132
     */
1133
0
    if( U_SUCCESS(*pErrorCode) &&
1134
0
        !inDirectMode && byteIndex==0 &&
1135
0
        pArgs->flush && source>=sourceLimit
1136
0
    ) {
1137
0
        if(base64Counter==-1) {
1138
            /* & at the very end of the input */
1139
            /* make the ampersand the reported sequence */
1140
0
            bytes[0]=AMPERSAND;
1141
0
            byteIndex=1;
1142
0
        }
1143
        /* else if(base64Counter!=-1) byteIndex remains 0 because there is no particular byte sequence */
1144
1145
0
        inDirectMode=true; /* avoid looping */
1146
0
        *pErrorCode=U_TRUNCATED_CHAR_FOUND;
1147
0
    }
1148
1149
    /* set the converter state back into UConverter */
1150
0
    cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
1151
0
    cnv->toULength=byteIndex;
1152
1153
    /* write back the updated pointers */
1154
0
    pArgs->source=(const char *)source;
1155
0
    pArgs->target=target;
1156
0
    pArgs->offsets=offsets;
1157
0
}
1158
1159
static void U_CALLCONV
1160
_IMAPFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
1161
0
                            UErrorCode *pErrorCode) {
1162
0
    UConverter *cnv;
1163
0
    const char16_t *source, *sourceLimit;
1164
0
    uint8_t *target, *targetLimit;
1165
0
    int32_t *offsets;
1166
1167
0
    int32_t length, targetCapacity, sourceIndex;
1168
0
    char16_t c;
1169
0
    uint8_t b;
1170
1171
    /* UTF-7 state */
1172
0
    uint8_t bits;
1173
0
    int8_t base64Counter;
1174
0
    UBool inDirectMode;
1175
1176
    /* set up the local pointers */
1177
0
    cnv=pArgs->converter;
1178
1179
    /* set up the local pointers */
1180
0
    source=pArgs->source;
1181
0
    sourceLimit=pArgs->sourceLimit;
1182
0
    target=(uint8_t *)pArgs->target;
1183
0
    targetLimit=(uint8_t *)pArgs->targetLimit;
1184
0
    offsets=pArgs->offsets;
1185
1186
    /* get the state machine state */
1187
0
    {
1188
0
        uint32_t status=cnv->fromUnicodeStatus;
1189
0
        inDirectMode=(UBool)((status>>24)&1);
1190
0
        base64Counter=(int8_t)(status>>16);
1191
0
        bits=(uint8_t)status;
1192
0
    }
1193
1194
    /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
1195
0
    sourceIndex=0;
1196
1197
0
    if(inDirectMode) {
1198
0
directMode:
1199
0
        length=(int32_t)(sourceLimit-source);
1200
0
        targetCapacity=(int32_t)(targetLimit-target);
1201
0
        if(length>targetCapacity) {
1202
0
            length=targetCapacity;
1203
0
        }
1204
0
        while(length>0) {
1205
0
            c=*source++;
1206
            /* encode 0x20..0x7e except '&' directly */
1207
0
            if(inSetDIMAP(c)) {
1208
                /* encode directly */
1209
0
                *target++=(uint8_t)c;
1210
0
                if(offsets!=nullptr) {
1211
0
                    *offsets++=sourceIndex++;
1212
0
                }
1213
0
            } else if(c==AMPERSAND) {
1214
                /* output &- for & */
1215
0
                *target++=AMPERSAND;
1216
0
                if(target<targetLimit) {
1217
0
                    *target++=MINUS;
1218
0
                    if(offsets!=nullptr) {
1219
0
                        *offsets++=sourceIndex;
1220
0
                        *offsets++=sourceIndex++;
1221
0
                    }
1222
                    /* realign length and targetCapacity */
1223
0
                    goto directMode;
1224
0
                } else {
1225
0
                    if(offsets!=nullptr) {
1226
0
                        *offsets++=sourceIndex++;
1227
0
                    }
1228
0
                    cnv->charErrorBuffer[0]=MINUS;
1229
0
                    cnv->charErrorBufferLength=1;
1230
0
                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1231
0
                    break;
1232
0
                }
1233
0
            } else {
1234
                /* un-read this character and switch to Unicode Mode */
1235
0
                --source;
1236
0
                *target++=AMPERSAND;
1237
0
                if(offsets!=nullptr) {
1238
0
                    *offsets++=sourceIndex;
1239
0
                }
1240
0
                inDirectMode=false;
1241
0
                base64Counter=0;
1242
0
                goto unicodeMode;
1243
0
            }
1244
0
            --length;
1245
0
        }
1246
0
        if(source<sourceLimit && target>=targetLimit) {
1247
            /* target is full */
1248
0
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1249
0
        }
1250
0
    } else {
1251
0
unicodeMode:
1252
0
        while(source<sourceLimit) {
1253
0
            if(target<targetLimit) {
1254
0
                c=*source++;
1255
0
                if(isLegalIMAP(c)) {
1256
                    /* encode directly */
1257
0
                    inDirectMode=true;
1258
1259
                    /* trick: back out this character to make this easier */
1260
0
                    --source;
1261
1262
                    /* terminate the base64 sequence */
1263
0
                    if(base64Counter!=0) {
1264
                        /* write remaining bits for the previous character */
1265
0
                        *target++=TO_BASE64_IMAP(bits);
1266
0
                        if(offsets!=nullptr) {
1267
0
                            *offsets++=sourceIndex-1;
1268
0
                        }
1269
0
                    }
1270
                    /* need to terminate with a minus */
1271
0
                    if(target<targetLimit) {
1272
0
                        *target++=MINUS;
1273
0
                        if(offsets!=nullptr) {
1274
0
                            *offsets++=sourceIndex-1;
1275
0
                        }
1276
0
                    } else {
1277
0
                        cnv->charErrorBuffer[0]=MINUS;
1278
0
                        cnv->charErrorBufferLength=1;
1279
0
                        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1280
0
                        break;
1281
0
                    }
1282
0
                    goto directMode;
1283
0
                } else {
1284
                    /*
1285
                     * base64 this character:
1286
                     * Output 2 or 3 base64 bytes for the remaining bits of the previous character
1287
                     * and the bits of this character, each implicitly in UTF-16BE.
1288
                     *
1289
                     * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
1290
                     * character to the next. The actual 2 or 4 bits are shifted to the left edge
1291
                     * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
1292
                     */
1293
0
                    switch(base64Counter) {
1294
0
                    case 0:
1295
0
                        b=(uint8_t)(c>>10);
1296
0
                        *target++=TO_BASE64_IMAP(b);
1297
0
                        if(target<targetLimit) {
1298
0
                            b=(uint8_t)((c>>4)&0x3f);
1299
0
                            *target++=TO_BASE64_IMAP(b);
1300
0
                            if(offsets!=nullptr) {
1301
0
                                *offsets++=sourceIndex;
1302
0
                                *offsets++=sourceIndex++;
1303
0
                            }
1304
0
                        } else {
1305
0
                            if(offsets!=nullptr) {
1306
0
                                *offsets++=sourceIndex++;
1307
0
                            }
1308
0
                            b=(uint8_t)((c>>4)&0x3f);
1309
0
                            cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1310
0
                            cnv->charErrorBufferLength=1;
1311
0
                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1312
0
                        }
1313
0
                        bits=(uint8_t)((c&15)<<2);
1314
0
                        base64Counter=1;
1315
0
                        break;
1316
0
                    case 1:
1317
0
                        b=(uint8_t)(bits|(c>>14));
1318
0
                        *target++=TO_BASE64_IMAP(b);
1319
0
                        if(target<targetLimit) {
1320
0
                            b=(uint8_t)((c>>8)&0x3f);
1321
0
                            *target++=TO_BASE64_IMAP(b);
1322
0
                            if(target<targetLimit) {
1323
0
                                b=(uint8_t)((c>>2)&0x3f);
1324
0
                                *target++=TO_BASE64_IMAP(b);
1325
0
                                if(offsets!=nullptr) {
1326
0
                                    *offsets++=sourceIndex;
1327
0
                                    *offsets++=sourceIndex;
1328
0
                                    *offsets++=sourceIndex++;
1329
0
                                }
1330
0
                            } else {
1331
0
                                if(offsets!=nullptr) {
1332
0
                                    *offsets++=sourceIndex;
1333
0
                                    *offsets++=sourceIndex++;
1334
0
                                }
1335
0
                                b=(uint8_t)((c>>2)&0x3f);
1336
0
                                cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1337
0
                                cnv->charErrorBufferLength=1;
1338
0
                                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1339
0
                            }
1340
0
                        } else {
1341
0
                            if(offsets!=nullptr) {
1342
0
                                *offsets++=sourceIndex++;
1343
0
                            }
1344
0
                            b=(uint8_t)((c>>8)&0x3f);
1345
0
                            cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1346
0
                            b=(uint8_t)((c>>2)&0x3f);
1347
0
                            cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
1348
0
                            cnv->charErrorBufferLength=2;
1349
0
                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1350
0
                        }
1351
0
                        bits=(uint8_t)((c&3)<<4);
1352
0
                        base64Counter=2;
1353
0
                        break;
1354
0
                    case 2:
1355
0
                        b=(uint8_t)(bits|(c>>12));
1356
0
                        *target++=TO_BASE64_IMAP(b);
1357
0
                        if(target<targetLimit) {
1358
0
                            b=(uint8_t)((c>>6)&0x3f);
1359
0
                            *target++=TO_BASE64_IMAP(b);
1360
0
                            if(target<targetLimit) {
1361
0
                                b=(uint8_t)(c&0x3f);
1362
0
                                *target++=TO_BASE64_IMAP(b);
1363
0
                                if(offsets!=nullptr) {
1364
0
                                    *offsets++=sourceIndex;
1365
0
                                    *offsets++=sourceIndex;
1366
0
                                    *offsets++=sourceIndex++;
1367
0
                                }
1368
0
                            } else {
1369
0
                                if(offsets!=nullptr) {
1370
0
                                    *offsets++=sourceIndex;
1371
0
                                    *offsets++=sourceIndex++;
1372
0
                                }
1373
0
                                b=(uint8_t)(c&0x3f);
1374
0
                                cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1375
0
                                cnv->charErrorBufferLength=1;
1376
0
                                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1377
0
                            }
1378
0
                        } else {
1379
0
                            if(offsets!=nullptr) {
1380
0
                                *offsets++=sourceIndex++;
1381
0
                            }
1382
0
                            b=(uint8_t)((c>>6)&0x3f);
1383
0
                            cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1384
0
                            b=(uint8_t)(c&0x3f);
1385
0
                            cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
1386
0
                            cnv->charErrorBufferLength=2;
1387
0
                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1388
0
                        }
1389
0
                        bits=0;
1390
0
                        base64Counter=0;
1391
0
                        break;
1392
0
                    default:
1393
                        /* will never occur */
1394
0
                        break;
1395
0
                    }
1396
0
                }
1397
0
            } else {
1398
                /* target is full */
1399
0
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1400
0
                break;
1401
0
            }
1402
0
        }
1403
0
    }
1404
1405
0
    if(pArgs->flush && source>=sourceLimit) {
1406
        /* flush remaining bits to the target */
1407
0
        if(!inDirectMode) {
1408
0
            if(base64Counter!=0) {
1409
0
                if(target<targetLimit) {
1410
0
                    *target++=TO_BASE64_IMAP(bits);
1411
0
                    if(offsets!=nullptr) {
1412
0
                        *offsets++=sourceIndex-1;
1413
0
                    }
1414
0
                } else {
1415
0
                    cnv->charErrorBuffer[cnv->charErrorBufferLength++]=TO_BASE64_IMAP(bits);
1416
0
                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1417
0
                }
1418
0
            }
1419
            /* need to terminate with a minus */
1420
0
            if(target<targetLimit) {
1421
0
                *target++=MINUS;
1422
0
                if(offsets!=nullptr) {
1423
0
                    *offsets++=sourceIndex-1;
1424
0
                }
1425
0
            } else {
1426
0
                cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
1427
0
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1428
0
            }
1429
0
        }
1430
        /* reset the state for the next conversion */
1431
0
        cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=true */
1432
0
    } else {
1433
        /* set the converter state back into UConverter */
1434
0
        cnv->fromUnicodeStatus=
1435
0
            (cnv->fromUnicodeStatus&0xf0000000)|    /* keep version*/
1436
0
            ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
1437
0
    }
1438
1439
    /* write back the updated pointers */
1440
0
    pArgs->source=source;
1441
0
    pArgs->target=(char *)target;
1442
0
    pArgs->offsets=offsets;
1443
0
}
1444
U_CDECL_END
1445
1446
/*
 * Implementation table for the IMAP mailbox-name converter.
 *
 * The initializer is positional. It names the converter type
 * (UCNV_IMAP_MAILBOX), reuses the UTF-7 hooks _UTF7Open and _UTF7Reset,
 * lists the IMAP to-Unicode function twice and the IMAP from-Unicode
 * function twice, and supplies ucnv_getCompleteUnicodeSet; every other
 * slot is nullptr.
 *
 * NOTE(review): the meaning of each slot is fixed by the UConverterImpl
 * declaration, which is not visible in this file - presumably the duplicated
 * entries are the "plain" and "with offsets" variants of each direction.
 * Confirm against the struct declaration before reordering anything.
 */
static const UConverterImpl _IMAPImpl={
1447
    UCNV_IMAP_MAILBOX,
1448
1449
    nullptr,
1450
    nullptr,
1451
1452
    _UTF7Open,
1453
    nullptr,
1454
    _UTF7Reset,
1455
1456
    _IMAPToUnicodeWithOffsets,
1457
    _IMAPToUnicodeWithOffsets,
1458
    _IMAPFromUnicodeWithOffsets,
1459
    _IMAPFromUnicodeWithOffsets,
1460
    nullptr,
1461
1462
    nullptr,
1463
    nullptr,
1464
    nullptr, /* we don't need writeSub() because we never call a callback at fromUnicode() */
1465
    nullptr,
1466
    ucnv_getCompleteUnicodeSet,
1467
    nullptr,
1468
    nullptr
1469
};
1470
1471
/*
 * Static (immutable) description of the IMAP mailbox-name converter:
 * structure size, the name "IMAP-mailbox-name", a placeholder CCSID of 0,
 * the UCNV_IBM / UCNV_IMAP_MAILBOX identifiers, and a substitution byte
 * sequence starting with 0x3f ('?') that, per the comment below, is not
 * actually used.
 *
 * NOTE(review): the initializer is positional; field names come from the
 * UConverterStaticData declaration, which is not visible here. The pair
 * "1, 4" is presumably the minimum/maximum number of bytes per character -
 * confirm against the struct declaration.
 */
static const UConverterStaticData _IMAPStaticData={
1472
    sizeof(UConverterStaticData),
1473
    "IMAP-mailbox-name",
1474
    0, /* TODO CCSID for IMAP-mailbox-name */
1475
    UCNV_IBM, UCNV_IMAP_MAILBOX,
1476
    1, 4,
1477
    { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
1478
    false, false,
1479
    0,
1480
    0,
1481
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1482
};
1483
1484
/*
 * Shared-data object for the IMAP mailbox-name converter. It is built by
 * UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER from the addresses of the static
 * description (_IMAPStaticData) and the implementation table (_IMAPImpl).
 * Unlike those two tables, this object is not declared static.
 */
const UConverterSharedData _IMAPData=
1485
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_IMAPStaticData, &_IMAPImpl);
1486
1487
#endif