/src/libidn2/lib/punycode.c
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | /* punycode.c - punycode encoding/decoding  | 
2  |  |    Copyright (C) 2011-2024 Simon Josefsson  | 
3  |  |  | 
4  |  |    Libidn2 is free software: you can redistribute it and/or modify it  | 
5  |  |    under the terms of either:  | 
6  |  |  | 
7  |  |      * the GNU Lesser General Public License as published by the Free  | 
8  |  |        Software Foundation; either version 3 of the License, or (at  | 
9  |  |        your option) any later version.  | 
10  |  |  | 
11  |  |    or  | 
12  |  |  | 
13  |  |      * the GNU General Public License as published by the Free  | 
14  |  |        Software Foundation; either version 2 of the License, or (at  | 
15  |  |        your option) any later version.  | 
16  |  |  | 
17  |  |    or both in parallel, as here.  | 
18  |  |  | 
19  |  |    This program is distributed in the hope that it will be useful,  | 
20  |  |    but WITHOUT ANY WARRANTY; without even the implied warranty of  | 
21  |  |    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the  | 
22  |  |    GNU General Public License for more details.  | 
23  |  |  | 
24  |  |    You should have received copies of the GNU General Public License and  | 
25  |  |    the GNU Lesser General Public License along with this program.  If  | 
26  |  |    not, see <http://www.gnu.org/licenses/>.  | 
27  |  | */  | 
28  |  |  | 
29  |  | /*  | 
30  |  |   Code copied from http://www.nicemice.net/idn/punycode-spec.gz on  | 
31  |  |   2011-01-04 with SHA-1 a966a8017f6be579d74a50a226accc7607c40133  | 
32  |  |   labeled punycode-spec 1.0.3 (2006-Mar-23-Thu).  It is modified for  | 
33  |  |   Libidn2 by Simon Josefsson and others.  Original code license:  | 
34  |  |  | 
35  |  |   punycode-spec 1.0.3 (2006-Mar-23-Thu)  | 
36  |  |   http://www.nicemice.net/idn/  | 
37  |  |   Adam M. Costello  | 
38  |  |   http://www.nicemice.net/amc/  | 
39  |  |  | 
40  |  |   B. Disclaimer and license  | 
41  |  |  | 
42  |  |     Regarding this entire document or any portion of it (including  | 
43  |  |     the pseudocode and C code), the author makes no guarantees and  | 
44  |  |     is not responsible for any damage resulting from its use.  The  | 
45  |  |     author grants irrevocable permission to anyone to use, modify,  | 
46  |  |     and distribute it in any way that does not diminish the rights  | 
47  |  |     of anyone else to use, modify, and distribute it, provided that  | 
48  |  |     redistributed derivative works do not contain misleading author or  | 
49  |  |     version information.  Derivative works need not be licensed under  | 
50  |  |     similar terms.  | 
51  |  |  | 
52  |  |   C. Punycode sample implementation  | 
53  |  |  | 
54  |  |   punycode-sample.c 2.0.0 (2004-Mar-21-Sun)  | 
55  |  |   http://www.nicemice.net/idn/  | 
56  |  |   Adam M. Costello  | 
57  |  |   http://www.nicemice.net/amc/  | 
58  |  |  | 
59  |  |   This is ANSI C code (C89) implementing Punycode 1.0.x.  | 
60  |  | */  | 
61  |  |  | 
62  |  | #include <config.h>  | 
63  |  |  | 
64  |  | #include "idn2.h"   /* IDN2_OK, ... */  | 
65  |  |  | 
66  |  | #include <string.h>  | 
67  |  |  | 
68  |  | /*** Bootstring parameters for Punycode ***/  | 
69  |  |  | 
70  |  | enum  | 
71  |  | { base = 36, tmin = 1, tmax = 26, skew = 38, damp = 700, | 
72  |  |   initial_bias = 72, initial_n = 0x80, delimiter = 0x2D  | 
73  |  | };  | 
74  |  |  | 
75  |  | /* basic(cp) tests whether cp is a basic code point: */  | 
76  | 0  | #define encode_basic(cp) ((uint32_t)(cp) < 0x80)  | 
77  |  |  | 
78  |  | /* encode_digit(d,flag) returns the basic code point whose value      */  | 
79  |  | /* (when used for representing integers) is d, which needs to be in   */  | 
80  |  | /* the range 0 to base-1.  The lowercase form is used unless flag is  */  | 
81  |  | /* nonzero, in which case the uppercase form is used.  The behavior   */  | 
82  |  | /* is undefined if flag is nonzero and digit d has no uppercase form. */  | 
83  |  |  | 
84  |  | static char  | 
85  |  | encode_digit (uint32_t d, int flag)  | 
86  | 0  | { | 
87  | 0  |   return d + 22 + 75 * (d < 26) - ((flag != 0) << 5);  | 
88  |  |   /*  0..25 map to ASCII a..z or A..Z */  | 
89  |  |   /* 26..35 map to ASCII 0..9         */  | 
90  | 0  | }  | 
91  |  |  | 
92  |  | /* basic(cp) tests whether cp is a basic code point: */  | 
93  |  | #define decode_basic(cp)             \  | 
94  | 0  |   ((cp >= 'a' && cp <= 'z') || (cp >= '0' && cp <='9')   \  | 
95  | 0  |    || (cp >= 'A' && cp <='Z') || cp == '-' || cp == '_')  | 
96  |  |  | 
97  |  | /* decode_digit(cp) returns the numeric value of a basic code */  | 
98  |  | /* point (for use in representing integers) in the range 0 to */  | 
99  |  | /* base-1, or base if cp does not represent a value.          */  | 
100  |  |  | 
101  |  | static unsigned  | 
102  |  | decode_digit (int cp)  | 
103  | 0  | { | 
104  | 0  |   if (cp >= 'a' && cp <= 'z')  | 
105  | 0  |     return cp - 'a';  | 
106  | 0  |   if (cp >= '0' && cp <= '9')  | 
107  | 0  |     return cp - '0' + 26;  | 
108  | 0  |   if (cp >= 'A' && cp <= 'Z')  | 
109  | 0  |     return cp - 'A';  | 
110  |  |  | 
111  | 0  |   return 0;  | 
112  | 0  | }  | 
113  |  |  | 
114  |  | /*** Bias adaptation function ***/  | 
115  |  |  | 
116  |  | static uint32_t  | 
117  |  | adapt (uint32_t delta, uint32_t numpoints, int firsttime)  | 
118  |  |   _GL_ATTRIBUTE_CONST;  | 
119  |  |  | 
120  |  |      static uint32_t adapt (uint32_t delta, uint32_t numpoints, int firsttime)  | 
121  | 0  | { | 
122  | 0  |   uint32_t k;  | 
123  |  | 
  | 
124  | 0  |   delta = firsttime ? delta / damp : delta >> 1;  | 
125  |  |   /* delta >> 1 is a faster way of doing delta / 2 */  | 
126  | 0  |   delta += delta / numpoints;  | 
127  |  | 
  | 
128  | 0  |   for (k = 0; delta > ((base - tmin) * tmax) / 2; k += base)  | 
129  | 0  |     { | 
130  | 0  |       delta /= base - tmin;  | 
131  | 0  |     }  | 
132  |  | 
  | 
133  | 0  |   return k + (base - tmin + 1) * delta / (delta + skew);  | 
134  | 0  | }  | 
135  |  |  | 
136  |  | /**  | 
137  |  |  * idn2_punycode_encode:  | 
138  |  |  * @input: array of input Unicode (UCS-4) code points  | 
139  |  |  * @input_length: number of code points in the @input array.  | 
140  |  |  * @output: output character array with ASCII code points.  | 
141  |  |  * @output_length: size of @output array  | 
142  |  |  *  | 
143  |  |  * Converts a sequence of code points to Punycode.  | 
144  |  |  *  | 
145  |  |  * Warning: Be aware that is rare for applications to need to perform  | 
146  |  |  *   Punycode operations, and you should consider idn2_to_ascii_8z(),  | 
147  |  |  *   idn2_to_ascii_lz(), idn2_to_ascii_4z() etc.  | 
148  |  |  *  | 
149  |  |  * Returns: On successful encoding %IDN2_OK is returned, or an error  | 
150  |  |  *   codes like %IDN2_PUNYCODE_BAD_INPUT, %IDN2_PUNYCODE_BIG_OUTPUT,  | 
151  |  |  *   or %IDN2_PUNYCODE_OVERFLOW.  | 
152  |  |  *  | 
153  |  |  * Since: 2.3.5  | 
154  |  |  **/  | 
155  |  | int  | 
156  |  | idn2_punycode_encode (const uint32_t *input,  | 
157  |  |           size_t input_length,  | 
158  |  |           char *output, size_t *output_length)  | 
159  | 0  | { | 
160  | 0  |   uint32_t our_input_length, n, delta, h, b, bias, j, m, q, k, t;  | 
161  | 0  |   size_t out, max_out;  | 
162  |  |  | 
163  |  |   /* The Punycode spec assumes that the input length is the same type */  | 
164  |  |   /* of integer as a code point, so we need to convert the size_t to  */  | 
165  |  |   /* a uint32_t, which could overflow.                           */  | 
166  |  | 
  | 
167  | 0  |   if (input_length > UINT32_MAX)  | 
168  | 0  |     return IDN2_PUNYCODE_OVERFLOW;  | 
169  | 0  |   our_input_length = (uint32_t) input_length;  | 
170  |  |  | 
171  |  |   /* Initialize the state: */  | 
172  |  | 
  | 
173  | 0  |   n = initial_n;  | 
174  | 0  |   delta = 0;  | 
175  | 0  |   out = 0;  | 
176  | 0  |   max_out = *output_length;  | 
177  | 0  |   bias = initial_bias;  | 
178  |  |  | 
179  |  |   /* Handle the basic code points: */  | 
180  |  | 
  | 
181  | 0  |   for (j = 0; j < our_input_length; ++j)  | 
182  | 0  |     { | 
183  | 0  |       if (encode_basic (input[j]))  | 
184  | 0  |   { | 
185  | 0  |     if (max_out - out < 2)  | 
186  | 0  |       return IDN2_PUNYCODE_BIG_OUTPUT;  | 
187  | 0  |     output[out++] = (char) input[j];  | 
188  | 0  |   }  | 
189  | 0  |       else if (input[j] > 0x10FFFF  | 
190  | 0  |          || (input[j] >= 0xD800 && input[j] <= 0xDBFF))  | 
191  | 0  |   return IDN2_PUNYCODE_BAD_INPUT;  | 
192  | 0  |     }  | 
193  |  |  | 
194  | 0  |   h = b = (uint32_t) out;  | 
195  |  |   /* cannot overflow because out <= our_input_length <= UINT32_MAX */  | 
196  |  |  | 
197  |  |   /* h is the number of code points that have been handled, b is the  */  | 
198  |  |   /* number of basic code points, and out is the number of ASCII code */  | 
199  |  |   /* points that have been output.                                    */  | 
200  |  | 
  | 
201  | 0  |   if (b > 0)  | 
202  | 0  |     output[out++] = delimiter;  | 
203  |  |  | 
204  |  |   /* Main encoding loop: */  | 
205  |  | 
  | 
206  | 0  |   while (h < our_input_length)  | 
207  | 0  |     { | 
208  |  |       /* All non-basic code points < n have been     */  | 
209  |  |       /* handled already.  Find the next larger one: */  | 
210  |  | 
  | 
211  | 0  |       for (m = UINT32_MAX, j = 0; j < our_input_length; ++j)  | 
212  | 0  |   { | 
213  |  |     /* if (basic(input[j])) continue; */  | 
214  |  |     /* (not needed for Punycode) */  | 
215  | 0  |     if (input[j] >= n && input[j] < m)  | 
216  | 0  |       m = input[j];  | 
217  | 0  |   }  | 
218  |  |  | 
219  |  |       /* Increase delta enough to advance the decoder's    */  | 
220  |  |       /* <n,i> state to <m,0>, but guard against overflow: */  | 
221  |  | 
  | 
222  | 0  |       if (m - n > (UINT32_MAX - delta) / (h + 1))  | 
223  | 0  |   return IDN2_PUNYCODE_OVERFLOW;  | 
224  | 0  |       delta += (m - n) * (h + 1);  | 
225  | 0  |       n = m;  | 
226  |  | 
  | 
227  | 0  |       for (j = 0; j < our_input_length; ++j)  | 
228  | 0  |   { | 
229  |  |     /* Punycode does not need to check whether input[j] is basic: */  | 
230  | 0  |     if (input[j] < n /* || basic(input[j]) */ )  | 
231  | 0  |       { | 
232  | 0  |         if (++delta == 0)  | 
233  | 0  |     return IDN2_PUNYCODE_OVERFLOW;  | 
234  | 0  |       }  | 
235  |  |  | 
236  | 0  |     if (input[j] == n)  | 
237  | 0  |       { | 
238  |  |         /* Represent delta as a generalized variable-length integer: */  | 
239  |  | 
  | 
240  | 0  |         for (q = delta, k = base;; k += base)  | 
241  | 0  |     { | 
242  | 0  |       if (out >= max_out)  | 
243  | 0  |         return IDN2_PUNYCODE_BIG_OUTPUT;  | 
244  | 0  |       t = k <= bias /* + tmin */ ? tmin :  /* +tmin not needed */  | 
245  | 0  |         k >= bias + tmax ? tmax : k - bias;  | 
246  | 0  |       if (q < t)  | 
247  | 0  |         break;  | 
248  | 0  |       output[out++] = encode_digit (t + (q - t) % (base - t), 0);  | 
249  | 0  |       q = (q - t) / (base - t);  | 
250  | 0  |     }  | 
251  |  |  | 
252  | 0  |         output[out++] = encode_digit (q, 0);  | 
253  | 0  |         bias = adapt (delta, h + 1, h == b);  | 
254  | 0  |         delta = 0;  | 
255  | 0  |         ++h;  | 
256  | 0  |       }  | 
257  | 0  |   }  | 
258  |  |  | 
259  | 0  |       ++delta, ++n;  | 
260  | 0  |     }  | 
261  |  |  | 
262  | 0  |   *output_length = out;  | 
263  | 0  |   return IDN2_OK;  | 
264  | 0  | }  | 
265  |  |  | 
266  |  | /**  | 
267  |  |  * idn2_punycode_decode:  | 
268  |  |  * @input: array of ASCII characters (0..7F)  | 
269  |  |  * @input_length: Number of ASCII characters in the @input array.  | 
270  |  |  * @output: output character array with ASCII code points.  | 
271  |  |  * @output_length: on input, guarantee that @output has room  | 
272  |  |  *   for this many code points; on output, *@output_length  | 
273  |  |  *   holds number of code points in @output.  | 
274  |  |  *  | 
275  |  |  * Converts Punycode to a sequence of code points.  | 
276  |  |  *  | 
277  |  |  * The decoder will never need to output more code points than the  | 
278  |  |  * number of ASCII code points in the input, because of the way the  | 
279  |  |  * encoding is defined.  | 
280  |  |  *  | 
281  |  |  * The number of code points output cannot exceed the maximum possible  | 
282  |  |  * value of a uint32_t, even if the supplied @output_length is greater  | 
283  |  |  * than that.  | 
284  |  |  *  | 
285  |  |  * Warning: Be aware that is rare for applications to need to perform  | 
286  |  |  *   Punycode operations, and you should consider  | 
287  |  |  *   idn2_to_unicode_8z8z(), idn2_to_unicode_lzlz() etc.  | 
288  |  |  *  | 
289  |  |  * Returns: On successful encoding %IDN2_OK is returned, or an error  | 
290  |  |  *   codes like %IDN2_PUNYCODE_BAD_INPUT, %IDN2_PUNYCODE_BIG_OUTPUT,  | 
291  |  |  *   or %IDN2_PUNYCODE_OVERFLOW.  | 
292  |  |  *  | 
293  |  |  * Since: 2.3.5  | 
294  |  |  **/  | 
295  |  | int  | 
296  |  | idn2_punycode_decode (const char *input,  | 
297  |  |           size_t input_length,  | 
298  |  |           uint32_t *output, size_t *output_length)  | 
299  | 0  | { | 
300  | 0  |   uint32_t n, out = 0, i, max_out, bias, oldi, w, k, digit, t;  | 
301  | 0  |   size_t b = 0, j, in;  | 
302  |  | 
  | 
303  | 0  |   if (!input_length)  | 
304  | 0  |     return IDN2_PUNYCODE_BAD_INPUT;  | 
305  |  |  | 
306  |  |   /* Check that all chars are basic */  | 
307  | 0  |   for (j = 0; j < input_length; ++j)  | 
308  | 0  |     { | 
309  | 0  |       if (!decode_basic (input[j]))  | 
310  | 0  |   return IDN2_PUNYCODE_BAD_INPUT;  | 
311  | 0  |       if (input[j] == delimiter)  | 
312  | 0  |   b = j;  | 
313  | 0  |     }  | 
314  |  |  | 
315  | 0  |   max_out =  | 
316  | 0  |     *output_length > UINT32_MAX ? UINT32_MAX : (uint32_t) * output_length;  | 
317  |  | 
  | 
318  | 0  |   if (input[b] == delimiter)  | 
319  | 0  |     { | 
320  |  |       /* do not accept leading or trailing delimiter  | 
321  |  |        *   - leading delim must be omitted if there is no ASCII char in u-label  | 
322  |  |        *   - trailing delim means there where no non-ASCII chars in u-label  | 
323  |  |        */  | 
324  | 0  |       if (!b || b == input_length - 1)  | 
325  | 0  |   return IDN2_PUNYCODE_BAD_INPUT;  | 
326  |  |  | 
327  | 0  |       if (b >= max_out)  | 
328  | 0  |   return IDN2_PUNYCODE_BIG_OUTPUT;  | 
329  |  |  | 
330  |  |       /* Check that all chars before last delimiter are basic chars */  | 
331  |  |       /* and copy the first b code points to the output. */  | 
332  | 0  |       for (j = 0; j < b; j++)  | 
333  | 0  |   output[out++] = input[j];  | 
334  |  | 
  | 
335  | 0  |       b += 1;     /* advance to non-basic char encoding */  | 
336  | 0  |     }  | 
337  |  |  | 
338  |  |   /* Initialize the state: */  | 
339  | 0  |   n = initial_n;  | 
340  | 0  |   i = 0;  | 
341  | 0  |   bias = initial_bias;  | 
342  |  |  | 
343  |  |   /* Main decoding loop:  Start just after the last delimiter if any  */  | 
344  |  |   /* basic code points were copied; start at the beginning otherwise. */  | 
345  | 0  |   for (in = b; in < input_length; ++out)  | 
346  | 0  |     { | 
347  |  |  | 
348  |  |       /* in is the index of the next ASCII code point to be consumed, */  | 
349  |  |       /* and out is the number of code points in the output array.    */  | 
350  |  |  | 
351  |  |       /* Decode a generalized variable-length integer into delta,  */  | 
352  |  |       /* which gets added to i.  The overflow checking is easier   */  | 
353  |  |       /* if we increase i as we go, then subtract off its starting */  | 
354  |  |       /* value at the end to obtain delta.                         */  | 
355  | 0  |       for (oldi = i, w = 1, k = base;; k += base)  | 
356  | 0  |   { | 
357  | 0  |     if (in >= input_length)  | 
358  | 0  |       return IDN2_PUNYCODE_BAD_INPUT;  | 
359  | 0  |     digit = decode_digit (input[in++]);  | 
360  | 0  |     if (digit >= base)  | 
361  | 0  |       return IDN2_PUNYCODE_BAD_INPUT;  | 
362  | 0  |     if (digit > (UINT32_MAX - i) / w)  | 
363  | 0  |       return IDN2_PUNYCODE_OVERFLOW;  | 
364  | 0  |     i += digit * w;  | 
365  | 0  |     t = k <= bias /* + tmin */ ? tmin :  /* +tmin not needed */  | 
366  | 0  |       k >= bias + tmax ? tmax : k - bias;  | 
367  | 0  |     if (digit < t)  | 
368  | 0  |       break;  | 
369  | 0  |     if (w > UINT32_MAX / (base - t))  | 
370  | 0  |       return IDN2_PUNYCODE_OVERFLOW;  | 
371  | 0  |     w *= (base - t);  | 
372  | 0  |   }  | 
373  |  |  | 
374  | 0  |       bias = adapt (i - oldi, out + 1, oldi == 0);  | 
375  |  |  | 
376  |  |       /* i was supposed to wrap around from out+1 to 0,   */  | 
377  |  |       /* incrementing n each time, so we'll fix that now: */  | 
378  | 0  |       if (i / (out + 1) > UINT32_MAX - n)  | 
379  | 0  |   return IDN2_PUNYCODE_OVERFLOW;  | 
380  | 0  |       n += i / (out + 1);  | 
381  | 0  |       if (n > 0x10FFFF || (n >= 0xD800 && n <= 0xDBFF))  | 
382  | 0  |   return IDN2_PUNYCODE_BAD_INPUT;  | 
383  | 0  |       i %= (out + 1);  | 
384  |  |  | 
385  |  |       /* Insert n at position i of the output: */  | 
386  |  |  | 
387  |  |       /* not needed for Punycode: */  | 
388  |  |       /* if (basic(n)) return IDN2_PUNYCODE_BAD_INPUT; */  | 
389  | 0  |       if (out >= max_out)  | 
390  | 0  |   return IDN2_PUNYCODE_BIG_OUTPUT;  | 
391  |  |  | 
392  | 0  |       memmove (output + i + 1, output + i, (out - i) * sizeof *output);  | 
393  | 0  |       output[i++] = n;  | 
394  | 0  |     }  | 
395  |  |  | 
396  | 0  |   *output_length = (size_t) out;  | 
397  |  |   /* cannot overflow because out <= old value of *output_length */  | 
398  | 0  |   return IDN2_OK;  | 
399  | 0  | }  | 
400  |  |  | 
401  |  | /* We are stuck exporting these old internal interfaces because old  | 
402  |  |    versions of GNUTLS used them, and maybe other software. */  | 
403  |  |  | 
404  |  | extern int _IDN2_API _idn2_punycode_decode (size_t input_length,  | 
405  |  |               const char input[],  | 
406  |  |               size_t *output_length,  | 
407  |  |               uint32_t output[]);  | 
408  |  | extern int _IDN2_API _idn2_punycode_encode (size_t input_length,  | 
409  |  |               const uint32_t input[],  | 
410  |  |               size_t *output_length,  | 
411  |  |               char output[]);  | 
412  |  |  | 
413  |  | int  | 
414  |  | _idn2_punycode_decode (size_t input_length,  | 
415  |  |            const char input[],  | 
416  |  |            size_t *output_length, uint32_t output[])  | 
417  | 0  | { | 
418  | 0  |   return idn2_punycode_decode (input, input_length, output, output_length);  | 
419  |  | 
  | 
420  | 0  | }  | 
421  |  |  | 
422  |  | int  | 
423  |  | _idn2_punycode_encode (size_t input_length,  | 
424  |  |            const uint32_t input[],  | 
425  |  |            size_t *output_length, char output[])  | 
426  | 0  | { | 
427  | 0  |   return idn2_punycode_encode (input, input_length, output, output_length);  | 
428  | 0  | }  |