/src/icu/source/common/punycode.cpp

Source (jump to first uncovered line)
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
*   Copyright (C) 2002-2011, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  punycode.cpp
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2002jan31
*   created by: Markus W. Scherer
*/


/* This ICU code derived from: */
/*
punycode.c 0.4.0 (2001-Nov-17-Sat)
http://www.cs.berkeley.edu/~amc/idn/
Adam M. Costello
http://www.nicemice.net/amc/

Disclaimer and license

    Regarding this entire document or any portion of it (including
    the pseudocode and C code), the author makes no guarantees and
    is not responsible for any damage resulting from its use.  The
    author grants irrevocable permission to anyone to use, modify,
    and distribute it in any way that does not diminish the rights
    of anyone else to use, modify, and distribute it, provided that
    redistributed derivative works do not contain misleading author or
    version information.  Derivative works need not be licensed under
    similar terms.
*/
/*
 * ICU modifications:
 * - ICU data types and coding conventions
 * - ICU string buffer handling with implicit source lengths
 *   and destination preflighting
 * - UTF-16 handling
 */

#include "unicode/utypes.h"

#if !UCONFIG_NO_IDNA

#include "unicode/ustring.h"
#include "unicode/utf.h"
#include "unicode/utf16.h"
#include "ustr_imp.h"
#include "cstring.h"
#include "cmemory.h"
#include "punycode.h"
#include "uassert.h"


/* Punycode ----------------------------------------------------------------- */

/* Punycode parameters for Bootstring */
#define BASE            36
#define TMIN            1
#define TMAX            26
#define SKEW            38
#define DAMP            700
#define INITIAL_BIAS    72
#define INITIAL_N       0x80

/* "Basic" Unicode/ASCII code points */
#define _HYPHEN         0X2d
#define DELIMITER       _HYPHEN

#define _ZERO_          0X30
#define _NINE           0x39

#define _SMALL_A        0X61
#define _SMALL_Z        0X7a

#define _CAPITAL_A      0X41
#define _CAPITAL_Z      0X5a

#define IS_BASIC(c) ((c)<0x80)
#define IS_BASIC_UPPERCASE(c) (_CAPITAL_A<=(c) && (c)<=_CAPITAL_Z)

/**
 * digitToBasic() returns the basic code point whose value
 * (when used for representing integers) is d, which must be in the
 * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is
 * nonzero, in which case the uppercase form is used.
 */
static inline char
digitToBasic(int32_t digit, UBool uppercase) {
    /*  0..25 map to ASCII a..z or A..Z */
    /* 26..35 map to ASCII 0..9         */
    if(digit<26) {
        if(uppercase) {
            return (char)(_CAPITAL_A+digit);
        } else {
            return (char)(_SMALL_A+digit);
        }
    } else {
        return (char)((_ZERO_-26)+digit);
    }
}

/**
 * basicToDigit[] contains the numeric value of a basic code
 * point (for use in representing integers) in the range 0 to
 * BASE-1, or -1 if b is does not represent a value.
 */
static const int8_t
basicToDigit[256]={
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,

    -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,

    -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,

    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};

static inline char
asciiCaseMap(char b, UBool uppercase) {
    if(uppercase) {
        if(_SMALL_A<=b && b<=_SMALL_Z) {
            b-=(_SMALL_A-_CAPITAL_A);
        }
    } else {
        if(_CAPITAL_A<=b && b<=_CAPITAL_Z) {
            b+=(_SMALL_A-_CAPITAL_A);
        }
    }
    return b;
}

/* Punycode-specific Bootstring code ---------------------------------------- */

/*
 * The following code omits the {parts} of the pseudo-algorithm in the spec
 * that are not used with the Punycode parameter set.
 */

/* Bias adaptation function. */
static int32_t
adaptBias(int32_t delta, int32_t length, UBool firstTime) {
    int32_t count;

    if(firstTime) {
        delta/=DAMP;
    } else {
        delta/=2;
    }

    delta+=delta/length;
    for(count=0; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) {
        delta/=(BASE-TMIN);
    }

    return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
}

#define MAX_CP_COUNT    200

U_CFUNC int32_t
u_strToPunycode(const UChar *src, int32_t srcLength,
                UChar *dest, int32_t destCapacity,
                const UBool *caseFlags,
                UErrorCode *pErrorCode) {

    int32_t cpBuffer[MAX_CP_COUNT];
    int32_t n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
    UChar c, c2;

    /* argument checking */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }

    if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    /*
     * Handle the basic code points and
     * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
     */
    srcCPCount=destLength=0;
    if(srcLength==-1) {
        /* NUL-terminated input */
        for(j=0; /* no condition */; ++j) {
            if((c=src[j])==0) {
                break;
            }
            if(srcCPCount==MAX_CP_COUNT) {
                /* too many input code points */
                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
                return 0;
            }
            if(IS_BASIC(c)) {
                cpBuffer[srcCPCount++]=0;
                if(destLength<destCapacity) {
                    dest[destLength]=
                        caseFlags!=NULL ?
                            asciiCaseMap((char)c, caseFlags[j]) :
                            (char)c;
                }
                ++destLength;
            } else {
                n=(caseFlags!=NULL && caseFlags[j])<<31L;
                if(U16_IS_SINGLE(c)) {
                    n|=c;
                } else if(U16_IS_LEAD(c) && U16_IS_TRAIL(c2=src[j+1])) {
                    ++j;
                    n|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
                } else {
                    /* error: unmatched surrogate */
                    *pErrorCode=U_INVALID_CHAR_FOUND;
                    return 0;
                }
                cpBuffer[srcCPCount++]=n;
            }
        }
    } else {
        /* length-specified input */
        for(j=0; j<srcLength; ++j) {
            if(srcCPCount==MAX_CP_COUNT) {
                /* too many input code points */
                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
                return 0;
            }
            c=src[j];
            if(IS_BASIC(c)) {
                cpBuffer[srcCPCount++]=0;
                if(destLength<destCapacity) {
                    dest[destLength]=
                        caseFlags!=NULL ?
                            asciiCaseMap((char)c, caseFlags[j]) :
                            (char)c;
                }
                ++destLength;
            } else {
                n=(caseFlags!=NULL && caseFlags[j])<<31L;
                if(U16_IS_SINGLE(c)) {
                    n|=c;
                } else if(U16_IS_LEAD(c) && (j+1)<srcLength && U16_IS_TRAIL(c2=src[j+1])) {
                    ++j;
                    n|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
                } else {
                    /* error: unmatched surrogate */
                    *pErrorCode=U_INVALID_CHAR_FOUND;
                    return 0;
                }
                cpBuffer[srcCPCount++]=n;
            }
        }
    }

    /* Finish the basic string - if it is not empty - with a delimiter. */
    basicLength=destLength;
    if(basicLength>0) {
        if(destLength<destCapacity) {
            dest[destLength]=DELIMITER;
        }
        ++destLength;
    }

    /*
     * handledCPCount is the number of code points that have been handled
     * basicLength is the number of basic code points
     * destLength is the number of chars that have been output
     */

    /* Initialize the state: */
    n=INITIAL_N;
    delta=0;
    bias=INITIAL_BIAS;

    /* Main encoding loop: */
    for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
        /*
         * All non-basic code points < n have been handled already.
         * Find the next larger one:
         */
        for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
            q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
            if(n<=q && q<m) {
                m=q;
            }
        }

        /*
         * Increase delta enough to advance the decoder's
         * <n,i> state to <m,0>, but guard against overflow:
         */
        if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) {
            *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
            return 0;
        }
        delta+=(m-n)*(handledCPCount+1);
        n=m;

        /* Encode a sequence of same code points n */
        for(j=0; j<srcCPCount; ++j) {
            q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
            if(q<n) {
                ++delta;
            } else if(q==n) {
                /* Represent delta as a generalized variable-length integer: */
                for(q=delta, k=BASE; /* no condition */; k+=BASE) {

                    /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt

                    t=k-bias;
                    if(t<TMIN) {
                        t=TMIN;
                    } else if(t>TMAX) {
                        t=TMAX;
                    }
                    */

                    t=k-bias;
                    if(t<TMIN) {
                        t=TMIN;
                    } else if(k>=(bias+TMAX)) {
                        t=TMAX;
                    }

                    if(q<t) {
                        break;
                    }

                    if(destLength<destCapacity) {
                        dest[destLength]=digitToBasic(t+(q-t)%(BASE-t), 0);
                    }
                    ++destLength;
                    q=(q-t)/(BASE-t);
                }

                if(destLength<destCapacity) {
                    dest[destLength]=digitToBasic(q, (UBool)(cpBuffer[j]<0));
                }
                ++destLength;
                bias=adaptBias(delta, handledCPCount+1, (UBool)(handledCPCount==basicLength));
                delta=0;
                ++handledCPCount;
            }
        }

        ++delta;
        ++n;
    }

    return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
}

U_CFUNC int32_t
u_strFromPunycode(const UChar *src, int32_t srcLength,
                  UChar *dest, int32_t destCapacity,
                  UBool *caseFlags,
                  UErrorCode *pErrorCode) {
    int32_t n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
            destCPCount, firstSupplementaryIndex, cpLength;
    UChar b;

    /* argument checking */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }

    if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    if(srcLength==-1) {
        srcLength=u_strlen(src);
    }

    /*
     * Handle the basic code points:
     * Let basicLength be the number of input code points
     * before the last delimiter, or 0 if there is none,
     * then copy the first basicLength code points to the output.
     *
     * The two following loops iterate backward.
     */
    for(j=srcLength; j>0;) {
        if(src[--j]==DELIMITER) {
            break;
        }
    }
    destLength=basicLength=destCPCount=j;
    U_ASSERT(destLength>=0);

    while(j>0) {
        b=src[--j];
        if(!IS_BASIC(b)) {
            *pErrorCode=U_INVALID_CHAR_FOUND;
            return 0;
        }

        if(j<destCapacity) {
            dest[j]=(UChar)b;

            if(caseFlags!=NULL) {
                caseFlags[j]=IS_BASIC_UPPERCASE(b);
            }
        }
    }

    /* Initialize the state: */
    n=INITIAL_N;
    i=0;
    bias=INITIAL_BIAS;
    firstSupplementaryIndex=1000000000;

    /*
     * Main decoding loop:
     * Start just after the last delimiter if any
     * basic code points were copied; start at the beginning otherwise.
     */
    for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
        /*
         * in is the index of the next character to be consumed, and
         * destCPCount is the number of code points in the output array.
         *
         * Decode a generalized variable-length integer into delta,
         * which gets added to i.  The overflow checking is easier
         * if we increase i as we go, then subtract off its starting
         * value at the end to obtain delta.
         */
        for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
            if(in>=srcLength) {
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                return 0;
            }

            digit=basicToDigit[(uint8_t)src[in++]];
            if(digit<0) {
                *pErrorCode=U_INVALID_CHAR_FOUND;
                return 0;
            }
            if(digit>(0x7fffffff-i)/w) {
                /* integer overflow */
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                return 0;
            }

            i+=digit*w;
            /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt  
            t=k-bias;
            if(t<TMIN) {
                t=TMIN;
            } else if(t>TMAX) {
                t=TMAX;
            }
            */
            t=k-bias;
            if(t<TMIN) {
                t=TMIN;
            } else if(k>=(bias+TMAX)) {
                t=TMAX;
            }
            if(digit<t) {
                break;
            }

            if(w>0x7fffffff/(BASE-t)) {
                /* integer overflow */
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                return 0;
            }
            w*=BASE-t;
        }

        /*
         * Modification from sample code:
         * Increments destCPCount here,
         * where needed instead of in for() loop tail.
         */
        ++destCPCount;
        bias=adaptBias(i-oldi, destCPCount, (UBool)(oldi==0));

        /*
         * i was supposed to wrap around from (incremented) destCPCount to 0,
         * incrementing n each time, so we'll fix that now:
         */
        if(i/destCPCount>(0x7fffffff-n)) {
            /* integer overflow */
            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
            return 0;
        }

        n+=i/destCPCount;
        i%=destCPCount;
        /* not needed for Punycode: */
        /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */

        if(n>0x10ffff || U_IS_SURROGATE(n)) {
            /* Unicode code point overflow */
            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
            return 0;
        }

        /* Insert n at position i of the output: */
        cpLength=U16_LENGTH(n);
        if(dest!=NULL && ((destLength+cpLength)<=destCapacity)) {
            int32_t codeUnitIndex;

            /*
             * Handle indexes when supplementary code points are present.
             *
             * In almost all cases, there will be only BMP code points before i
             * and even in the entire string.
             * This is handled with the same efficiency as with UTF-32.
             *
             * Only the rare cases with supplementary code points are handled
             * more slowly - but not too bad since this is an insertion anyway.
             */
            if(i<=firstSupplementaryIndex) {
                codeUnitIndex=i;
                if(cpLength>1) {
                    firstSupplementaryIndex=codeUnitIndex;
                } else {
                    ++firstSupplementaryIndex;
                }
            } else {
                codeUnitIndex=firstSupplementaryIndex;
                U16_FWD_N(dest, codeUnitIndex, destLength, i-codeUnitIndex);
            }

            /* use the UChar index codeUnitIndex instead of the code point index i */
            if(codeUnitIndex<destLength) {
                uprv_memmove(dest+codeUnitIndex+cpLength,
                             dest+codeUnitIndex,
                             (destLength-codeUnitIndex)*U_SIZEOF_UCHAR);
                if(caseFlags!=NULL) {
                    uprv_memmove(caseFlags+codeUnitIndex+cpLength,
                                 caseFlags+codeUnitIndex,
                                 destLength-codeUnitIndex);
                }
            }
            if(cpLength==1) {
                /* BMP, insert one code unit */
                dest[codeUnitIndex]=(UChar)n;
            } else {
                /* supplementary character, insert two code units */
                dest[codeUnitIndex]=U16_LEAD(n);
                dest[codeUnitIndex+1]=U16_TRAIL(n);
            }
            if(caseFlags!=NULL) {
                /* Case of last character determines uppercase flag: */
                caseFlags[codeUnitIndex]=IS_BASIC_UPPERCASE(src[in-1]);
                if(cpLength==2) {
                    caseFlags[codeUnitIndex+1]=FALSE;
                }
            }
        }
        destLength+=cpLength;
        U_ASSERT(destLength>=0);
        ++i;
    }

    return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
}

/* ### check notes on overflow handling - only necessary if not IDNA? are these Punycode functions to be public? */

#endif /* #if !UCONFIG_NO_IDNA */

Coverage Report

Created: 2025-07-11 06:23

Line	Count	Source (jump to first uncovered line)
1		// © 2016 and later: Unicode, Inc. and others.
2		// License & terms of use: http://www.unicode.org/copyright.html
3		/*
4		*******************************************************************************
5		*
6		* Copyright (C) 2002-2011, International Business Machines
7		* Corporation and others. All Rights Reserved.
8		*
9		*******************************************************************************
10		* file name: punycode.cpp
11		* encoding: UTF-8
12		* tab size: 8 (not used)
13		* indentation:4
14		*
15		* created on: 2002jan31
16		* created by: Markus W. Scherer
17		*/
18
19
20		/* This ICU code derived from: */
21		/*
22		punycode.c 0.4.0 (2001-Nov-17-Sat)
23		http://www.cs.berkeley.edu/~amc/idn/
24		Adam M. Costello
25		http://www.nicemice.net/amc/
26
27		Disclaimer and license
28
29		Regarding this entire document or any portion of it (including
30		the pseudocode and C code), the author makes no guarantees and
31		is not responsible for any damage resulting from its use. The
32		author grants irrevocable permission to anyone to use, modify,
33		and distribute it in any way that does not diminish the rights
34		of anyone else to use, modify, and distribute it, provided that
35		redistributed derivative works do not contain misleading author or
36		version information. Derivative works need not be licensed under
37		similar terms.
38		*/
39		/*
40		* ICU modifications:
41		* - ICU data types and coding conventions
42		* - ICU string buffer handling with implicit source lengths
43		* and destination preflighting
44		* - UTF-16 handling
45		*/
46
47		#include "unicode/utypes.h"
48
49		#if !UCONFIG_NO_IDNA
50
51		#include "unicode/ustring.h"
52		#include "unicode/utf.h"
53		#include "unicode/utf16.h"
54		#include "ustr_imp.h"
55		#include "cstring.h"
56		#include "cmemory.h"
57		#include "punycode.h"
58		#include "uassert.h"
59
60
61		/* Punycode ----------------------------------------------------------------- */
62
63		/* Punycode parameters for Bootstring */
64	0	#define BASE 36
65	0	#define TMIN 1
66	0	#define TMAX 26
67	0	#define SKEW 38
68	0	#define DAMP 700
69	0	#define INITIAL_BIAS 72
70	0	#define INITIAL_N 0x80
71
72		/* "Basic" Unicode/ASCII code points */
73	0	#define _HYPHEN 0X2d
74	0	#define DELIMITER _HYPHEN
75
76	0	#define _ZERO_ 0X30
77		#define _NINE 0x39
78
79	0	#define _SMALL_A 0X61
80	0	#define _SMALL_Z 0X7a
81
82	0	#define _CAPITAL_A 0X41
83	0	#define _CAPITAL_Z 0X5a
84
85	0	#define IS_BASIC(c) ((c)<0x80)
86	0	#define IS_BASIC_UPPERCASE(c) (_CAPITAL_A<=(c) && (c)<=_CAPITAL_Z)
87
88		/**
89		* digitToBasic() returns the basic code point whose value
90		* (when used for representing integers) is d, which must be in the
91		* range 0 to BASE-1. The lowercase form is used unless the uppercase flag is
92		* nonzero, in which case the uppercase form is used.
93		*/
94		static inline char
95	0	digitToBasic(int32_t digit, UBool uppercase) {
96		/* 0..25 map to ASCII a..z or A..Z */
97		/* 26..35 map to ASCII 0..9 */
98	0	if(digit<26) {
99	0	if(uppercase) {
100	0	return (char)(_CAPITAL_A+digit);
101	0	} else {
102	0	return (char)(_SMALL_A+digit);
103	0	}
104	0	} else {
105	0	return (char)((_ZERO_-26)+digit);
106	0	}
107	0	}
108
109		/**
110		* basicToDigit[] contains the numeric value of a basic code
111		* point (for use in representing integers) in the range 0 to
112		* BASE-1, or -1 if b is does not represent a value.
113		*/
114		static const int8_t
115		basicToDigit[256]={
116		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
117		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
118
119		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
120		26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
121
122		-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
123		15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
124
125		-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
126		15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
127
128		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
129		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
130
131		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
132		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
133
134		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
135		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
136
137		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
138		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
139		};
140
141		static inline char
142	0	asciiCaseMap(char b, UBool uppercase) {
143	0	if(uppercase) {
144	0	if(_SMALL_A<=b && b<=_SMALL_Z) {
145	0	b-=(_SMALL_A-_CAPITAL_A);
146	0	}
147	0	} else {
148	0	if(_CAPITAL_A<=b && b<=_CAPITAL_Z) {
149	0	b+=(_SMALL_A-_CAPITAL_A);
150	0	}
151	0	}
152	0	return b;
153	0	}
154
155		/* Punycode-specific Bootstring code ---------------------------------------- */
156
157		/*
158		* The following code omits the {parts} of the pseudo-algorithm in the spec
159		* that are not used with the Punycode parameter set.
160		*/
161
162		/* Bias adaptation function. */
163		static int32_t
164	0	adaptBias(int32_t delta, int32_t length, UBool firstTime) {
165	0	int32_t count;
166
167	0	if(firstTime) {
168	0	delta/=DAMP;
169	0	} else {
170	0	delta/=2;
171	0	}
172
173	0	delta+=delta/length;
174	0	for(count=0; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) {
175	0	delta/=(BASE-TMIN);
176	0	}
177
178	0	return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
179	0	}
180
181	0	#define MAX_CP_COUNT 200
182
183		U_CFUNC int32_t
184		u_strToPunycode(const UChar *src, int32_t srcLength,
185		UChar *dest, int32_t destCapacity,
186		const UBool *caseFlags,
187	0	UErrorCode *pErrorCode) {
188
189	0	int32_t cpBuffer[MAX_CP_COUNT];
190	0	int32_t n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
191	0	UChar c, c2;
192
193		/* argument checking */
194	0	if(pErrorCode==NULL \|\| U_FAILURE(*pErrorCode)) {
195	0	return 0;
196	0	}
197
198	0	if(src==NULL \|\| srcLength<-1 \|\| (dest==NULL && destCapacity!=0)) {
199	0	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
200	0	return 0;
201	0	}
202
203		/*
204		* Handle the basic code points and
205		* convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
206		*/
207	0	srcCPCount=destLength=0;
208	0	if(srcLength==-1) {
209		/* NUL-terminated input */
210	0	for(j=0; /* no condition */; ++j) {
211	0	if((c=src[j])==0) {
212	0	break;
213	0	}
214	0	if(srcCPCount==MAX_CP_COUNT) {
215		/* too many input code points */
216	0	*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
217	0	return 0;
218	0	}
219	0	if(IS_BASIC(c)) {
220	0	cpBuffer[srcCPCount++]=0;
221	0	if(destLength<destCapacity) {
222	0	dest[destLength]=
223	0	caseFlags!=NULL ?
224	0	asciiCaseMap((char)c, caseFlags[j]) :
225	0	(char)c;
226	0	}
227	0	++destLength;
228	0	} else {
229	0	n=(caseFlags!=NULL && caseFlags[j])<<31L;
230	0	if(U16_IS_SINGLE(c)) {
231	0	n\|=c;
232	0	} else if(U16_IS_LEAD(c) && U16_IS_TRAIL(c2=src[j+1])) {
233	0	++j;
234	0	n\|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
235	0	} else {
236		/* error: unmatched surrogate */
237	0	*pErrorCode=U_INVALID_CHAR_FOUND;
238	0	return 0;
239	0	}
240	0	cpBuffer[srcCPCount++]=n;
241	0	}
242	0	}
243	0	} else {
244		/* length-specified input */
245	0	for(j=0; j<srcLength; ++j) {
246	0	if(srcCPCount==MAX_CP_COUNT) {
247		/* too many input code points */
248	0	*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
249	0	return 0;
250	0	}
251	0	c=src[j];
252	0	if(IS_BASIC(c)) {
253	0	cpBuffer[srcCPCount++]=0;
254	0	if(destLength<destCapacity) {
255	0	dest[destLength]=
256	0	caseFlags!=NULL ?
257	0	asciiCaseMap((char)c, caseFlags[j]) :
258	0	(char)c;
259	0	}
260	0	++destLength;
261	0	} else {
262	0	n=(caseFlags!=NULL && caseFlags[j])<<31L;
263	0	if(U16_IS_SINGLE(c)) {
264	0	n\|=c;
265	0	} else if(U16_IS_LEAD(c) && (j+1)<srcLength && U16_IS_TRAIL(c2=src[j+1])) {
266	0	++j;
267	0	n\|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
268	0	} else {
269		/* error: unmatched surrogate */
270	0	*pErrorCode=U_INVALID_CHAR_FOUND;
271	0	return 0;
272	0	}
273	0	cpBuffer[srcCPCount++]=n;
274	0	}
275	0	}
276	0	}
277
278		/* Finish the basic string - if it is not empty - with a delimiter. */
279	0	basicLength=destLength;
280	0	if(basicLength>0) {
281	0	if(destLength<destCapacity) {
282	0	dest[destLength]=DELIMITER;
283	0	}
284	0	++destLength;
285	0	}
286
287		/*
288		* handledCPCount is the number of code points that have been handled
289		* basicLength is the number of basic code points
290		* destLength is the number of chars that have been output
291		*/
292
293		/* Initialize the state: */
294	0	n=INITIAL_N;
295	0	delta=0;
296	0	bias=INITIAL_BIAS;
297
298		/* Main encoding loop: */
299	0	for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
300		/*
301		* All non-basic code points < n have been handled already.
302		* Find the next larger one:
303		*/
304	0	for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
305	0	q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
306	0	if(n<=q && q<m) {
307	0	m=q;
308	0	}
309	0	}
310
311		/*
312		* Increase delta enough to advance the decoder's
313		* <n,i> state to <m,0>, but guard against overflow:
314		*/
315	0	if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) {
316	0	*pErrorCode=U_INTERNAL_PROGRAM_ERROR;
317	0	return 0;
318	0	}
319	0	delta+=(m-n)*(handledCPCount+1);
320	0	n=m;
321
322		/* Encode a sequence of same code points n */
323	0	for(j=0; j<srcCPCount; ++j) {
324	0	q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
325	0	if(q<n) {
326	0	++delta;
327	0	} else if(q==n) {
328		/* Represent delta as a generalized variable-length integer: */
329	0	for(q=delta, k=BASE; /* no condition */; k+=BASE) {
330
331		/** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
332
333		t=k-bias;
334		if(t<TMIN) {
335		t=TMIN;
336		} else if(t>TMAX) {
337		t=TMAX;
338		}
339		*/
340
341	0	t=k-bias;
342	0	if(t<TMIN) {
343	0	t=TMIN;
344	0	} else if(k>=(bias+TMAX)) {
345	0	t=TMAX;
346	0	}
347
348	0	if(q<t) {
349	0	break;
350	0	}
351
352	0	if(destLength<destCapacity) {
353	0	dest[destLength]=digitToBasic(t+(q-t)%(BASE-t), 0);
354	0	}
355	0	++destLength;
356	0	q=(q-t)/(BASE-t);
357	0	}
358
359	0	if(destLength<destCapacity) {
360	0	dest[destLength]=digitToBasic(q, (UBool)(cpBuffer[j]<0));
361	0	}
362	0	++destLength;
363	0	bias=adaptBias(delta, handledCPCount+1, (UBool)(handledCPCount==basicLength));
364	0	delta=0;
365	0	++handledCPCount;
366	0	}
367	0	}
368
369	0	++delta;
370	0	++n;
371	0	}
372
373	0	return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
374	0	}
375
376		U_CFUNC int32_t
377		u_strFromPunycode(const UChar *src, int32_t srcLength,
378		UChar *dest, int32_t destCapacity,
379		UBool *caseFlags,
380	0	UErrorCode *pErrorCode) {
381	0	int32_t n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
382	0	destCPCount, firstSupplementaryIndex, cpLength;
383	0	UChar b;
384
385		/* argument checking */
386	0	if(pErrorCode==NULL \|\| U_FAILURE(*pErrorCode)) {
387	0	return 0;
388	0	}
389
390	0	if(src==NULL \|\| srcLength<-1 \|\| (dest==NULL && destCapacity!=0)) {
391	0	*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
392	0	return 0;
393	0	}
394
395	0	if(srcLength==-1) {
396	0	srcLength=u_strlen(src);
397	0	}
398
399		/*
400		* Handle the basic code points:
401		* Let basicLength be the number of input code points
402		* before the last delimiter, or 0 if there is none,
403		* then copy the first basicLength code points to the output.
404		*
405		* The two following loops iterate backward.
406		*/
407	0	for(j=srcLength; j>0;) {
408	0	if(src[--j]==DELIMITER) {
409	0	break;
410	0	}
411	0	}
412	0	destLength=basicLength=destCPCount=j;
413	0	U_ASSERT(destLength>=0);
414
415	0	while(j>0) {
416	0	b=src[--j];
417	0	if(!IS_BASIC(b)) {
418	0	*pErrorCode=U_INVALID_CHAR_FOUND;
419	0	return 0;
420	0	}
421
422	0	if(j<destCapacity) {
423	0	dest[j]=(UChar)b;
424
425	0	if(caseFlags!=NULL) {
426	0	caseFlags[j]=IS_BASIC_UPPERCASE(b);
427	0	}
428	0	}
429	0	}
430
431		/* Initialize the state: */
432	0	n=INITIAL_N;
433	0	i=0;
434	0	bias=INITIAL_BIAS;
435	0	firstSupplementaryIndex=1000000000;
436
437		/*
438		* Main decoding loop:
439		* Start just after the last delimiter if any
440		* basic code points were copied; start at the beginning otherwise.
441		*/
442	0	for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
443		/*
444		* in is the index of the next character to be consumed, and
445		* destCPCount is the number of code points in the output array.
446		*
447		* Decode a generalized variable-length integer into delta,
448		* which gets added to i. The overflow checking is easier
449		* if we increase i as we go, then subtract off its starting
450		* value at the end to obtain delta.
451		*/
452	0	for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
453	0	if(in>=srcLength) {
454	0	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
455	0	return 0;
456	0	}
457
458	0	digit=basicToDigit[(uint8_t)src[in++]];
459	0	if(digit<0) {
460	0	*pErrorCode=U_INVALID_CHAR_FOUND;
461	0	return 0;
462	0	}
463	0	if(digit>(0x7fffffff-i)/w) {
464		/* integer overflow */
465	0	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
466	0	return 0;
467	0	}
468
469	0	i+=digit*w;
470		/** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
471		t=k-bias;
472		if(t<TMIN) {
473		t=TMIN;
474		} else if(t>TMAX) {
475		t=TMAX;
476		}
477		*/
478	0	t=k-bias;
479	0	if(t<TMIN) {
480	0	t=TMIN;
481	0	} else if(k>=(bias+TMAX)) {
482	0	t=TMAX;
483	0	}
484	0	if(digit<t) {
485	0	break;
486	0	}
487
488	0	if(w>0x7fffffff/(BASE-t)) {
489		/* integer overflow */
490	0	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
491	0	return 0;
492	0	}
493	0	w*=BASE-t;
494	0	}
495
496		/*
497		* Modification from sample code:
498		* Increments destCPCount here,
499		* where needed instead of in for() loop tail.
500		*/
501	0	++destCPCount;
502	0	bias=adaptBias(i-oldi, destCPCount, (UBool)(oldi==0));
503
504		/*
505		* i was supposed to wrap around from (incremented) destCPCount to 0,
506		* incrementing n each time, so we'll fix that now:
507		*/
508	0	if(i/destCPCount>(0x7fffffff-n)) {
509		/* integer overflow */
510	0	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
511	0	return 0;
512	0	}
513
514	0	n+=i/destCPCount;
515	0	i%=destCPCount;
516		/* not needed for Punycode: */
517		/* if (decode_digit(n) <= BASE) return punycode_invalid_input; */
518
519	0	if(n>0x10ffff \|\| U_IS_SURROGATE(n)) {
520		/* Unicode code point overflow */
521	0	*pErrorCode=U_ILLEGAL_CHAR_FOUND;
522	0	return 0;
523	0	}
524
525		/* Insert n at position i of the output: */
526	0	cpLength=U16_LENGTH(n);
527	0	if(dest!=NULL && ((destLength+cpLength)<=destCapacity)) {
528	0	int32_t codeUnitIndex;
529
530		/*
531		* Handle indexes when supplementary code points are present.
532		*
533		* In almost all cases, there will be only BMP code points before i
534		* and even in the entire string.
535		* This is handled with the same efficiency as with UTF-32.
536		*
537		* Only the rare cases with supplementary code points are handled
538		* more slowly - but not too bad since this is an insertion anyway.
539		*/
540	0	if(i<=firstSupplementaryIndex) {
541	0	codeUnitIndex=i;
542	0	if(cpLength>1) {
543	0	firstSupplementaryIndex=codeUnitIndex;
544	0	} else {
545	0	++firstSupplementaryIndex;
546	0	}
547	0	} else {
548	0	codeUnitIndex=firstSupplementaryIndex;
549	0	U16_FWD_N(dest, codeUnitIndex, destLength, i-codeUnitIndex);
550	0	}
551
552		/* use the UChar index codeUnitIndex instead of the code point index i */
553	0	if(codeUnitIndex<destLength) {
554	0	uprv_memmove(dest+codeUnitIndex+cpLength,
555	0	dest+codeUnitIndex,
556	0	(destLength-codeUnitIndex)*U_SIZEOF_UCHAR);
557	0	if(caseFlags!=NULL) {
558	0	uprv_memmove(caseFlags+codeUnitIndex+cpLength,
559	0	caseFlags+codeUnitIndex,
560	0	destLength-codeUnitIndex);
561	0	}
562	0	}
563	0	if(cpLength==1) {
564		/* BMP, insert one code unit */
565	0	dest[codeUnitIndex]=(UChar)n;
566	0	} else {
567		/* supplementary character, insert two code units */
568	0	dest[codeUnitIndex]=U16_LEAD(n);
569	0	dest[codeUnitIndex+1]=U16_TRAIL(n);
570	0	}
571	0	if(caseFlags!=NULL) {
572		/* Case of last character determines uppercase flag: */
573	0	caseFlags[codeUnitIndex]=IS_BASIC_UPPERCASE(src[in-1]);
574	0	if(cpLength==2) {
575	0	caseFlags[codeUnitIndex+1]=FALSE;
576	0	}
577	0	}
578	0	}
579	0	destLength+=cpLength;
580	0	U_ASSERT(destLength>=0);
581	0	++i;
582	0	}
583
584	0	return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
585	0	}
586
587		/* ### check notes on overflow handling - only necessary if not IDNA? are these Punycode functions to be public? */
588
589		#endif /* #if !UCONFIG_NO_IDNA */