Coverage Report

Created: 2026-06-30 07:27

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libpsl/src/psl.c
Line
Count
Source
1
/*
2
 * Copyright(c) 2014-2024 Tim Ruehsen
3
 *
4
 * Permission is hereby granted, free of charge, to any person obtaining a
5
 * copy of this software and associated documentation files (the "Software"),
6
 * to deal in the Software without restriction, including without limitation
7
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8
 * and/or sell copies of the Software, and to permit persons to whom the
9
 * Software is furnished to do so, subject to the following conditions:
10
 *
11
 * The above copyright notice and this permission notice shall be included in
12
 * all copies or substantial portions of the Software.
13
 *
14
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20
 * DEALINGS IN THE SOFTWARE.
21
 *
22
 * This file is part of libpsl.
23
 *
24
 * Public Suffix List routines
25
 *
26
 * Changelog
27
 * 19.03.2014  Tim Ruehsen  created from libmget/cookie.c
28
 *
29
 */
30
31
#if HAVE_CONFIG_H
32
# include <config.h>
33
#endif
34
35
#if defined(__GNUC__) && defined(__GNUC_MINOR__)
36
#       define GCC_VERSION_AT_LEAST(major, minor) ((__GNUC__ > (major)) || (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))
37
#else
38
#       define GCC_VERSION_AT_LEAST(major, minor) 0
39
#endif
40
41
/* Must be defined before <sys/stat.h> */
42
#if defined(_MSC_VER) || defined(__MINGW32__)
43
# define USE_WIN32_LARGE_FILES
44
# ifdef __MINGW32__
45
#   ifndef _FILE_OFFSET_BITS
46
#    define _FILE_OFFSET_BITS 64
47
#   endif
48
# endif
49
#endif
50
51
#include <sys/types.h>
52
#include <sys/stat.h>
53
54
#if defined(_WIN32) && (defined(WITH_LIBIDN2) || defined(WITH_LIBIDN))
55
# ifndef WIN32_LEAN_AND_MEAN
56
# define WIN32_LEAN_AND_MEAN
57
# endif
58
# include <windows.h> /* for GetACP() */
59
#endif
60
61
#if defined(_WIN32)
62
# ifdef USE_WIN32_LARGE_FILES
63
#  define struct_stat  struct _stati64
64
#  define func_sys_stat _stati64
65
# else
66
#  define struct_stat  struct _stat
67
#  define func_sys_stat _stat
68
# endif
69
#endif
70
71
#ifndef struct_stat
72
3
# define struct_stat   struct stat
73
1
# define func_sys_stat stat
74
#endif
75
76
#if defined(_MSC_VER) && ! defined(ssize_t)
77
# include <basetsd.h>
78
typedef SSIZE_T ssize_t;
79
#endif
80
81
#include <stdio.h>
82
#include <stdlib.h>
83
#include <string.h>
84
#include <ctype.h>
85
#include <time.h>
86
#include <errno.h>
87
#include <limits.h> /* for UINT_MAX */
88
89
#ifdef HAVE_NL_LANGINFO
90
# include <langinfo.h>
91
#endif
92
93
#ifdef _WIN32
94
# include <malloc.h>
95
#endif
96
97
#ifdef WITH_LIBICU
98
# include <unicode/uversion.h>
99
# include <unicode/ustring.h>
100
# include <unicode/uidna.h>
101
# include <unicode/ucnv.h>
102
#elif defined(WITH_LIBICUCORE)
103
# include <iconv.h>
104
# include <unicode/uversion.h>
105
# include <unicode/ustring.h>
106
# include <unicode/uidna.h>
107
#elif defined(WITH_LIBICU_WIN)
108
# include <icu.h>
109
#elif defined(WITH_LIBIDN2)
110
# include <iconv.h>
111
# include <idn2.h>
112
# include <unicase.h>
113
# include <unistr.h>
114
#elif defined(WITH_LIBIDN)
115
# include <iconv.h>
116
# include <stringprep.h>
117
# include <idna.h>
118
# include <unicase.h>
119
# include <unistr.h>
120
#endif
121
122
#ifdef WINICONV_CONST
123
#  define ICONV_CONST WINICONV_CONST
124
#endif
125
#ifndef ICONV_CONST
126
#  define ICONV_CONST
127
#endif
128
129
130
#include <libpsl.h>
131
132
/**
133
 * SECTION:libpsl
134
 * @short_description: Public Suffix List library functions
135
 * @title: libpsl
136
 * @stability: Stable
137
 * @include: libpsl.h
138
 *
139
 * [Public Suffix List](https://publicsuffix.org/) library functions.
140
 *
141
 */
142
143
/* Number of elements of a true array (must not be used on a pointer). */
#define countof(a) (sizeof(a) / sizeof((a)[0]))
144
145
0
/* Bit flags attached to a PSL rule (stored in psl_entry_t.flags). */
#define PRIV_PSL_FLAG_EXCEPTION (1 << 0)
#define PRIV_PSL_FLAG_WILDCARD  (1 << 1)
#define PRIV_PSL_FLAG_ICANN     (1 << 2) /* entry of ICANN section */
#define PRIV_PSL_FLAG_PRIVATE   (1 << 3) /* entry of PRIVATE section */
#define PRIV_PSL_FLAG_PLAIN     (1 << 4) /* just used for PSL syntax checking */
150
151
/* A single PSL rule, or a lookup key built from a domain name. */
typedef struct {
  char label_buf[128];   /* storage for the rule text when the entry owns it */
  const char *label;     /* rule text; either label_buf or an external string */
  unsigned short length; /* length of the rule text in bytes */
  unsigned char nlabels; /* number of labels */
  unsigned char flags;   /* PRIV_PSL_FLAG_* bits */
} psl_entry_t;
162
163
/* stripped down version libmget vector routines */
164
typedef struct {
165
  int
166
    (*cmp)(const psl_entry_t **, const psl_entry_t **); /* comparison function */
167
  psl_entry_t
168
    **entry; /* pointer to array of pointers to elements */
169
  int
170
    max,     /* allocated elements */
171
    cur;     /* number of elements in use */
172
} psl_vector_t;
173
174
struct psl_ctx_st {
175
  psl_vector_t
176
    *suffixes;
177
  unsigned char
178
    *dafsa;
179
  size_t
180
    dafsa_size;
181
  int
182
    nsuffixes,
183
    nexceptions,
184
    nwildcards;
185
  unsigned
186
    utf8 : 1; /* 1: data contains UTF-8 + punycode encoded rules */
187
};
188
189
/* include the PSL data generated by psl-make-dafsa */
190
#ifdef ENABLE_BUILTIN
191
#include "suffixes_dafsa.h"
192
#else
/* Built without embedded PSL data: define empty placeholders instead.
 * NOTE(review): presumably these mirror the symbols that the generated
 * suffixes_dafsa.h provides - confirm against that header. */
193
static const unsigned char kDafsa[] = "";
194
static time_t _psl_file_time = 0;
195
static int _psl_nsuffixes = 0;
196
static int _psl_nexceptions = 0;
197
static int _psl_nwildcards = 0;
198
static const char _psl_sha1_checksum[] = "";
199
static const char _psl_filename[] = "";
200
#endif
201
202
/* references to these PSLs will result in lookups to built-in data */
203
/* Sentinel context: only its address matters. Code in this file compares a
 * psl pointer against &builtin_psl to decide whether to use kDafsa. */
static const psl_ctx_t
204
  builtin_psl;
205
206
/* Set to PSL_DISTFILE when that macro is defined at build time, otherwise
 * to the empty string. */
#ifdef PSL_DISTFILE
207
static const char _psl_dist_filename[] = PSL_DISTFILE;
208
#else
209
static const char _psl_dist_filename[] = "";
210
#endif
211
212
static psl_vector_t *vector_alloc(int max, int (*cmp)(const psl_entry_t **, const psl_entry_t **))
213
0
{
214
0
  psl_vector_t *v;
215
216
0
  if (!(v = calloc(1, sizeof(psl_vector_t))))
217
0
    return NULL;
218
219
0
  if (!(v->entry = malloc(max * sizeof(psl_entry_t *)))) {
220
0
    free(v);
221
0
    return NULL;
222
0
  }
223
224
0
  v->max = max;
225
0
  v->cmp = cmp;
226
0
  return v;
227
0
}
228
229
/*
 * Release a vector together with every entry it owns.
 *
 * @v is the address of the vector pointer. After the call *@v is NULL, so a
 * stale pointer cannot be used or freed again by mistake. A NULL @v or a
 * NULL *@v is a no-op.
 */
static void vector_free(psl_vector_t **v)
{
  psl_vector_t *vec;
  int it;

  if (!v || !(vec = *v))
    return;

  if (vec->entry) {
    for (it = 0; it < vec->cur; it++)
      free(vec->entry[it]);

    free(vec->entry);
  }

  free(vec);
  *v = NULL; /* the double pointer exists so the caller's handle can be reset */
}
243
244
static psl_entry_t *vector_get(const psl_vector_t *v, int pos)
245
0
{
246
0
  if (pos < 0 || !v || pos >= v->cur) return NULL;
247
248
0
  return v->entry[pos];
249
0
}
250
251
/* the entries must already be sorted by v->cmp (see vector_sort()) for the binary search to work */
252
static int vector_find(const psl_vector_t *v, const psl_entry_t *elem)
253
0
{
254
0
  if (v) {
255
0
    int l, r, m;
256
0
    int res;
257
258
    /* binary search for element (exact match) */
259
0
    for (l = 0, r = v->cur - 1; l <= r;) {
260
0
      m = (l + r) / 2;
261
0
      if ((res = v->cmp(&elem, (const psl_entry_t **)&(v->entry[m]))) > 0) l = m + 1;
262
0
      else if (res < 0) r = m - 1;
263
0
      else return m;
264
0
    }
265
0
  }
266
267
0
  return -1; /* not found */
268
0
}
269
270
/*
 * Append a heap-allocated copy of @elem to @v, growing the pointer array
 * when it is full.
 *
 * Returns the index of the new entry, or -1 on NULL arguments, allocation
 * failure or when the capacity cannot grow any further.
 *
 * The vector is left untouched on failure: the capacity field is only
 * updated after realloc() has succeeded, so v->max never exceeds the number
 * of slots that are really allocated.
 */
static int vector_add(psl_vector_t *v, const psl_entry_t *elem)
{
  psl_entry_t *copy;

  if (!v || !elem)
    return -1;

  if (v->max == v->cur) {
    psl_entry_t **grown;
    int new_max;

    if (v->max > INT_MAX / 2)
      return -1; /* doubling would overflow int */

    /* a vector created with capacity 0 must still be able to grow */
    new_max = v->max > 0 ? v->max * 2 : 1;

    if ((size_t) new_max > (size_t) -1 / sizeof(*grown))
      return -1; /* byte count would overflow size_t */

    if (!(grown = realloc(v->entry, (size_t) new_max * sizeof(*grown))))
      return -1; /* v->entry is still valid and v->max still matches it */

    v->entry = grown;
    v->max = new_max;
  }

  if (!(copy = malloc(sizeof(*copy))))
    return -1;

  memcpy(copy, elem, sizeof(*copy));
  v->entry[v->cur] = copy;

  return v->cur++;
}
297
298
/*
 * Sort the entries of @v in place using v->cmp.
 * Does nothing when @v is NULL or has no comparison function.
 */
static void vector_sort(psl_vector_t *v)
{
  if (!v || !v->cmp)
    return;

  /* the array holds psl_entry_t pointers, so take the element size from the
   * array itself rather than from an unrelated pointer type */
  qsort(v->entry, v->cur, sizeof(*v->entry), (int(*)(const void *, const void *))v->cmp);
}
303
304
/* by this kind of sorting, we can easily see if a domain matches or not */
305
static int suffix_compare(const psl_entry_t *s1, const psl_entry_t *s2)
306
0
{
307
0
  int n;
308
309
0
  if ((n = s2->nlabels - s1->nlabels))
310
0
    return n; /* most labels first */
311
312
0
  if ((n = s1->length - s2->length))
313
0
    return n;  /* shorter rules first */
314
315
0
  return strncmp(s1->label ? s1->label : s1->label_buf, s2->label ? s2->label : s2->label_buf, s1->length);
316
0
}
317
318
/* needed to sort array of pointers, given to qsort() */
319
/* Adapter with the pointer-to-pointer signature stored in psl_vector_t.cmp. */
static int suffix_compare_array(const psl_entry_t **s1, const psl_entry_t **s2)
{
  const psl_entry_t *left = *s1;
  const psl_entry_t *right = *s2;

  return suffix_compare(left, right);
}
323
324
/*
 * Initialise @suffix from the first @length bytes of @rule.
 *
 * The text is copied into suffix->label_buf and NUL-terminated,
 * suffix->label is pointed at that buffer, suffix->length is set to @length
 * and suffix->nlabels to the number of dot-separated labels.
 * suffix->flags is not touched.
 *
 * Returns 0 on success. Returns -1 and sets nlabels to 0 when @length is
 * sizeof(label_buf) - 1 or more.
 *
 * The copy is bounded by @length (and stops early at a NUL), so a @rule
 * that is longer than @length can never overrun label_buf.
 */
static int suffix_init(psl_entry_t *suffix, const char *rule, size_t length)
{
  size_t i;

  suffix->label = suffix->label_buf;

  if (length >= sizeof(suffix->label_buf) - 1) {
    suffix->nlabels = 0;
    /* fprintf(stderr, "Suffix rule too long (%zd, ignored): %s\n", length, rule); */
    return -1;
  }

  suffix->length = (unsigned short) length; /* fits: length < sizeof(label_buf) */
  suffix->nlabels = 1;

  for (i = 0; i < length && rule[i]; i++) {
    if (rule[i] == '.')
      suffix->nlabels++;
    suffix->label_buf[i] = rule[i];
  }
  suffix->label_buf[i] = 0;

  return 0;
}
350
351
/*
 * Return a malloc()ed copy of the NUL-terminated string @s, or NULL when
 * the allocation fails. The caller frees the result with free().
 */
static char *psl_strdup(const char *s)
{
  size_t size = strlen(s) + 1; /* including the terminating NUL */
  char *copy = malloc(size);

  if (copy != NULL)
    memcpy(copy, s, size);

  return copy;
}
358
359
#if !defined(WITH_LIBIDN) && !defined(WITH_LIBIDN2) && !defined(WITH_LIBICU) && !defined(WITH_LIBICUCORE) && !defined(WITH_LIBICU_WIN)
360
/*
361
 * When configured without runtime IDNA support (./configure --disable-runtime), we need a pure ASCII
362
 * representation of non-ASCII characters in labels as found in UTF-8 domain names.
363
 * This is because the current DAFSA format used may only hold character values [21..127].
364
 *
365
  Code copied from http://www.nicemice.net/idn/punycode-spec.gz on
366
  2011-01-04 with SHA-1 a966a8017f6be579d74a50a226accc7607c40133
367
  labeled punycode-spec 1.0.3 (2006-Mar-24-Thu).  It is modified for
368
  libpsl by Tim Rühsen.  License on the original code:
369
370
  punycode-spec 1.0.3 (2006-Mar-23-Thu)
371
  http://www.nicemice.net/idn/
372
  Adam M. Costello
373
  http://www.nicemice.net/amc/
374
375
  B. Disclaimer and license
376
377
    Regarding this entire document or any portion of it (including
378
    the pseudocode and C code), the author makes no guarantees and
379
    is not responsible for any damage resulting from its use.  The
380
    author grants irrevocable permission to anyone to use, modify,
381
    and distribute it in any way that does not diminish the rights
382
    of anyone else to use, modify, and distribute it, provided that
383
    redistributed derivative works do not contain misleading author or
384
    version information.  Derivative works need not be licensed under
385
    similar terms.
386
387
  C. Punycode sample implementation
388
389
  punycode-sample.c 2.0.0 (2004-Mar-21-Sun)
390
  http://www.nicemice.net/idn/
391
  Adam M. Costello
392
  http://www.nicemice.net/amc/
393
394
  This is ANSI C code (C89) implementing Punycode 1.0.x.
395
 */
396
/* Result codes of punycode_encode(). */
enum punycode_status {
  punycode_success    = 0,
  punycode_bad_input  = 1, /* Input is invalid.                       */
  punycode_big_output = 2, /* Output would exceed the space provided. */
  punycode_overflow   = 3  /* Wider integers needed to process input. */
};
402
403
/* Unsigned integer type used for code points; it must hold at least 26 bits.
 * A build may override the choice by defining PUNYCODE_UINT. */
#if defined(PUNYCODE_UINT)
typedef PUNYCODE_UINT punycode_uint;
#elif UINT_MAX >= (1 << 26) - 1
typedef unsigned int punycode_uint;
#else
typedef unsigned long punycode_uint;
#endif
410
411
/*** Bootstring parameters for Punycode ***/
412
enum {
  base = 36,
  tmin = 1,
  tmax = 26,
  skew = 38,
  damp = 700,
  initial_bias = 72,
  initial_n = 0x80,
  delimiter = 0x2D /* '-' */
};
416
417
/*
 * Map a Punycode digit value to its ASCII character:
 *  0..25 become 'a'..'z' (ASCII 97..122),
 * 26..35 become '0'..'9' (ASCII 48..57).
 */
static char encode_digit(punycode_uint d)
{
  if (d < 26)
    return d + 97;

  return d + 22;
}
423
/* True when bcp is in 65..90 (ASCII 'A'..'Z'): the subtraction is done in the
 * unsigned punycode_uint type, so values below 65 wrap to large numbers. */
#define flagged(bcp) ((punycode_uint)(bcp) - 65 < 26)
424
/* Largest value of punycode_uint (-1 converted to the unsigned type). */
static const punycode_uint maxint = -1;
425
426
/*
 * Bias adaptation step of the Punycode encoder: derive the next bias from
 * the @delta just encoded, the number of code points handled so far
 * (@numpoints) and whether this was the first delta (@firsttime).
 */
static punycode_uint adapt(punycode_uint delta, punycode_uint numpoints, int firsttime)
{
  punycode_uint k = 0;

  if (firsttime)
    delta /= damp;
  else
    delta >>= 1; /* same as delta / 2 */

  delta += delta / numpoints;

  while (delta > ((base - tmin) * tmax) / 2) {
    delta /= base - tmin;
    k += base;
  }

  return k + (base - tmin + 1) * delta / (delta + skew);
}
440
441
/*
 * Encode the input_length_orig code points in input[] as Punycode.
 *
 * On entry *output_length is the capacity of output[]; on success it is
 * overwritten with the number of characters written. No terminating NUL is
 * written by this function.
 *
 * Returns punycode_success, punycode_big_output when output[] is too small,
 * or punycode_overflow when the input length or an intermediate delta does
 * not fit into punycode_uint.
 */
static enum punycode_status punycode_encode(
442
  size_t input_length_orig,
443
  const punycode_uint input[],
444
  size_t *output_length,
445
  char output[])
446
{
447
  punycode_uint input_length, n, delta, h, b, bias, j, m, q, k, t;
448
  size_t out, max_out;
449
450
  /* The Punycode spec assumes that the input length is the same type */
451
  /* of integer as a code point, so we need to convert the size_t to  */
452
  /* a punycode_uint, which could overflow.                           */
453
454
  if (input_length_orig > maxint)
455
    return punycode_overflow;
456
457
  input_length = (punycode_uint) input_length_orig;
458
459
  /* Initialize the state: */
460
461
  n = initial_n;
462
  delta = 0;
463
  out = 0;
464
  max_out = *output_length;
465
  bias = initial_bias;
466
467
  /* Handle the basic code points: */
468
  for (j = 0; j < input_length; ++j) {
469
    if (input[j] < 0x80) {
      /* require room for this character plus one more (the delimiter written below) */
470
      if (max_out - out < 2)
471
        return punycode_big_output;
472
      output[out++] = (char) input[j];
473
    }
474
    /* else if (input[j] < n) return punycode_bad_input; */
475
    /* (not needed for Punycode with unsigned code points) */
476
  }
477
478
  h = b = (punycode_uint) out;
479
  /* cannot overflow because out <= input_length <= maxint */
480
481
  /* h is the number of code points that have been handled, b is the  */
482
  /* number of basic code points, and out is the number of ASCII code */
483
  /* points that have been output.                                    */
484
485
  if (b > 0)
486
    output[out++] = delimiter;
487
488
  /* Main encoding loop: */
489
490
  while (h < input_length) {
491
    /* All non-basic code points < n have been     */
492
    /* handled already.  Find the next larger one: */
493
494
    for (m = maxint, j = 0; j < input_length; ++j) {
495
      /* if (basic(input[j])) continue; */
496
      /* (not needed for Punycode) */
497
      if (input[j] >= n && input[j] < m)
498
        m = input[j];
499
    }
500
501
    /* Increase delta enough to advance the decoder's    */
502
    /* <n,i> state to <m,0>, but guard against overflow: */
503
504
    if (m - n > (maxint - delta) / (h + 1))
505
      return punycode_overflow;
506
    delta += (m - n) * (h + 1);
507
    n = m;
508
509
    for (j = 0; j < input_length; ++j) {
510
      /* Punycode does not need to check whether input[j] is basic: */
511
      if (input[j] < n /* || basic(input[j]) */) {
512
        if (++delta == 0)
513
          return punycode_overflow;
514
      }
515
516
      if (input[j] == n) {
517
        /* Represent delta as a generalized variable-length integer: */
518
519
        for (q = delta, k = base;; k += base) {
          /* checked on every pass, so the final digit written after the loop also has room */
520
          if (out >= max_out)
521
            return punycode_big_output;
522
          t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */
523
            k >= bias + tmax ? tmax : k - bias;
524
          if (q < t)
525
            break;
526
          output[out++] = encode_digit(t + (q - t) % (base - t));
527
          q = (q - t) / (base - t);
528
        }
529
530
        output[out++] = encode_digit(q);
531
        bias = adapt(delta, h + 1, h == b);
532
        delta = 0;
533
        ++h;
534
      }
535
    }
536
537
    ++delta, ++n;
538
  }
539
540
  *output_length = out;
541
  return punycode_success;
542
}
543
544
/*
 * Decode the @inlen bytes of UTF-8 at @in into code points stored in @out.
 *
 * @outlen is the capacity of @out in elements. One element is held back, so
 * at most @outlen - 1 code points are stored; nothing is written into the
 * reserved slot. Decoding stops when the input is used up or that limit is
 * reached.
 *
 * Returns the number of code points stored, or -1 when @outlen is 0 or the
 * input contains a malformed or truncated sequence.
 */
static ssize_t utf8_to_utf32(const char *in, size_t inlen, punycode_uint *out, size_t outlen)
{
  size_t n = 0;
  const unsigned char *s = (void *)in;
  const unsigned char *e = (void *)(in + inlen);

  if (!outlen)
    return -1;

  outlen--;

  while (n < outlen) {
    size_t inleft = e - s;

    if (inleft >= 1 && (*s & 0x80) == 0) { /* 0xxxxxxx ASCII char */
      out[n++] = *s;
      s++;
    } else if (inleft >= 2 && (*s & 0xE0) == 0xC0) /* 110xxxxx 10xxxxxx */ {
      if ((s[1] & 0xC0) != 0x80)
        return -1;
      out[n++] = ((*s & 0x1F) << 6) | (s[1] & 0x3F);
      s += 2;
    } else if (inleft >= 3 && (*s & 0xF0) == 0xE0) /* 1110xxxx 10xxxxxx 10xxxxxx */ {
      if ((s[1] & 0xC0) != 0x80 || (s[2] & 0xC0) != 0x80)
        return -1;
      out[n++] = ((*s & 0x0F) << 12) | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F);
      s += 3;
    } else if (inleft >= 4 && (*s & 0xF8) == 0xF0) /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ {
      if ((s[1] & 0xC0) != 0x80 || (s[2] & 0xC0) != 0x80 || (s[3] & 0xC0) != 0x80)
        return -1;
      /* each continuation byte supplies its own 6 bits: s[1], s[2], then s[3] */
      out[n++] = ((*s & 0x07) << 18) | ((s[1] & 0x3F) << 12) | ((s[2] & 0x3F) << 6) | (s[3] & 0x3F);
      s += 4;
    } else if (!inleft) {
      break;
    } else
      return -1;
  }

  return n;
}
584
585
/* Return 1 when none of the first @n bytes of @s has the high bit set, else 0. */
static int mem_is_ascii(const char *s, size_t n)
{
  const unsigned char *bytes = (const unsigned char *) s;
  size_t i;

  for (i = 0; i < n; i++) {
    if (bytes[i] >= 128)
      return 0;
  }

  return 1;
}
593
594
static int domain_to_punycode(const char *domain, char *out, size_t outsize)
595
{
596
  size_t outlen = 0, labellen;
597
  punycode_uint input[256];
598
  const char *label, *e;
599
600
  for (e = label = domain; e;) {
601
    e = strchr(label, '.');
602
    labellen = e ? (size_t) (e - label) : strlen(label);
603
604
    if (mem_is_ascii(label, labellen)) {
605
      if (outlen + labellen + (e != NULL) >= outsize)
606
        return 1;
607
608
      memcpy(out + outlen, label, labellen);
609
      outlen += labellen;
610
    } else {
611
      ssize_t inputlen = 0;
612
613
      if (outlen + labellen + (e != NULL) + 4 >= outsize)
614
        return 1;
615
616
      if ((inputlen = utf8_to_utf32(label, labellen, input, countof(input))) < 0)
617
        return 1;
618
619
      memcpy(out + outlen, "xn--", 4);
620
      outlen += 4;
621
622
      labellen = outsize - outlen - (e != NULL) - 1; /* -1 to leave space for the trailing \0 */
623
      if (punycode_encode(inputlen, input, &labellen, out + outlen))
624
        return 1;
625
      outlen += labellen;
626
    }
627
628
    if (e) {
629
      label = e + 1;
630
      out[outlen++] = '.';
631
    }
632
    out[outlen] = 0;
633
  }
634
635
  return 0;
636
}
637
#endif
638
639
/* Return 1 for space, tab, carriage return and newline, 0 for anything else. */
static int isspace_ascii(const char c)
{
  switch (c) {
  case ' ':
  case '\t':
  case '\r':
  case '\n':
    return 1;
  default:
    return 0;
  }
}
643
644
/* Return 1 when the NUL-terminated string @s has no byte >= 128, else 0. */
static int str_is_ascii(const char *s)
{
  const unsigned char *p;

  for (p = (const unsigned char *) s; *p; p++) {
    if (*p >= 128)
      return 0;
  }

  return 1;
}
650
651
#if defined(WITH_LIBIDN)
652
/*
653
 * Work around a libidn <= 1.30 vulnerability.
654
 *
655
 * The function checks for a valid UTF-8 character sequence before
656
 * passing it to idna_to_ascii_8z().
657
 *
658
 * [1] https://lists.gnu.org/archive/html/help-libidn/2015-05/msg00002.html
659
 * [2] https://lists.gnu.org/archive/html/bug-wget/2015-06/msg00002.html
660
 * [3] https://curl.haxx.se/mail/lib-2015-06/0143.html
661
 */
662
/*
 * Structural check of a NUL-terminated UTF-8 string: every lead byte must be
 * followed by the right number of 10xxxxxx continuation bytes.
 * Returns 1 when the whole string passes, 0 otherwise.
 */
static int utf8_is_valid(const char *utf8)
{
  const unsigned char *p = (const unsigned char *) utf8;

  while (*p) {
    unsigned char lead = *p;
    int trail, i;

    if ((lead & 0x80) == 0)
      trail = 0; /* 0xxxxxxx ASCII char */
    else if ((lead & 0xE0) == 0xC0)
      trail = 1; /* 110xxxxx 10xxxxxx */
    else if ((lead & 0xF0) == 0xE0)
      trail = 2; /* 1110xxxx 10xxxxxx 10xxxxxx */
    else if ((lead & 0xF8) == 0xF0)
      trail = 3; /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
    else
      return 0;

    /* checked in order, so a NUL inside a sequence stops the scan at once */
    for (i = 1; i <= trail; i++) {
      if ((p[i] & 0xC0) != 0x80)
        return 0;
    }

    p += trail + 1;
  }

  return 1;
}
687
#endif
688
689
/* Opaque IDNA handle type. In the ICU builds psl_idna_open() returns the
 * UIDNA object cast to a pointer to this type; otherwise it returns NULL. */
typedef void *psl_idna_t;
690
691
/*
 * Open an IDNA conversion handle. Only the ICU based builds create a real
 * handle (a UTS#46 UIDNA instance); every other build returns NULL.
 * Release the handle with psl_idna_close().
 */
static psl_idna_t *psl_idna_open(void)
{
#if defined(WITH_LIBICU) || defined(WITH_LIBICUCORE) || defined(WITH_LIBICU_WIN)
  UErrorCode status = 0;

  return (void *)uidna_openUTS46(UIDNA_USE_STD3_RULES | UIDNA_NONTRANSITIONAL_TO_ASCII, &status);
#else
  return NULL;
#endif
}
699
700
/*
 * Close a handle obtained from psl_idna_open(). A NULL handle is accepted;
 * builds without ICU have nothing to release.
 */
static void psl_idna_close(psl_idna_t *idna)
{
#if defined(WITH_LIBICU) || defined(WITH_LIBICUCORE) || defined(WITH_LIBICU_WIN)
  if (idna != NULL)
    uidna_close((UIDNA *)idna);
#else
  (void) idna;
#endif
}
709
710
/*
 * Convert the UTF-8 domain name @utf8 to its ASCII (punycode) form using the
 * IDNA backend selected at build time (ICU, libidn2, libidn, or the
 * punycode fallback in this file).
 *
 * Returns 0 on success and -1 on failure. On success *@ascii receives a
 * newly allocated string; the callers in this file release it with free().
 *
 * @idna is only consulted by the ICU builds, where a NULL handle makes the
 * function return -1. The fallback build converts through a 128 byte stack
 * buffer and returns -1 when @ascii is NULL.
 */
static int psl_idna_toASCII(psl_idna_t *idna, const char *utf8, char **ascii)
711
0
{
712
0
  int ret = -1;
713
714
#if defined(WITH_LIBICU) || defined(WITH_LIBICUCORE) || defined(WITH_LIBICU_WIN)
715
  (void) idna;
716
717
  /* IDNA2008 UTS#46 punycode conversion */
718
  if (idna) {
719
    char lookupname_buf[128] = "", *lookupname = lookupname_buf;
720
    UErrorCode status = 0;
721
    UIDNAInfo info = UIDNA_INFO_INITIALIZER;
722
    UChar utf16_dst[128], utf16_src_buf[128];
723
    UChar *utf16_src = utf16_src_buf;
724
    int32_t utf16_src_length, bytes_written;
725
    int32_t utf16_dst_length;
726
727
    u_strFromUTF8(utf16_src, countof(utf16_src_buf), &utf16_src_length, utf8, -1, &status);
728
    if (!U_SUCCESS(status)) goto cleanup; /* UTF-8 to UTF-16 conversion failed */
729
730
    if (utf16_src_length >= (int) countof(utf16_src_buf)) {
731
      utf16_src = malloc((utf16_src_length + 1) * sizeof(UChar));
732
      if (!utf16_src) goto cleanup;
733
734
      u_strFromUTF8(utf16_src, utf16_src_length, NULL, utf8, -1, &status);
735
      if (!U_SUCCESS(status)) goto cleanup; /* UTF-8 to UTF-16 conversion failed */
736
737
      utf16_src[utf16_src_length] = 0; /* u_strFromUTF8() doesn't 0-terminate if dest is filled up */
738
    }
739
740
    utf16_dst_length = uidna_nameToASCII((UIDNA *)idna, utf16_src, utf16_src_length, utf16_dst, countof(utf16_dst), &info, &status);
741
    if (!U_SUCCESS(status)) goto cleanup; /* to ASCII conversion failed */
742
743
    u_strToUTF8(lookupname, sizeof(lookupname_buf), &bytes_written, utf16_dst, utf16_dst_length, &status);
744
    if (!U_SUCCESS(status)) goto cleanup; /* UTF-16 to UTF-8 conversion failed */
745
746
    if (bytes_written >= (int) sizeof(lookupname_buf)) {
747
      lookupname = malloc(bytes_written + 1);
748
      if (!lookupname) goto cleanup;
749
750
      u_strToUTF8(lookupname, bytes_written, NULL, utf16_dst, utf16_dst_length, &status);
751
      if (!U_SUCCESS(status)) goto cleanup; /* UTF-16 to UTF-8 conversion failed */
752
753
      lookupname[bytes_written] = 0; /* u_strToUTF8() doesn't 0-terminate if dest is filled up */
754
    } else {
755
      if (!(lookupname = psl_strdup(lookupname)))
756
        goto cleanup;
757
    }
758
    /* hand the heap string to the caller; if ascii is NULL it is freed in cleanup */
759
    if (ascii) {
760
      *ascii = lookupname;
761
      lookupname = NULL;
762
    }
763
764
    ret = 0;
765
766
cleanup:
767
    if (lookupname != lookupname_buf)
768
      free(lookupname);
769
    if (utf16_src != utf16_src_buf)
770
      free(utf16_src);
771
  }
772
#elif defined(WITH_LIBIDN2)
773
#if IDN2_VERSION_NUMBER >= 0x00140000
774
0
  int rc;
775
776
0
  (void) idna;
777
778
  /* IDN2_TRANSITIONAL automatically converts to lowercase
779
   * IDN2_NFC_INPUT converts to NFC before toASCII conversion
780
   * Since IDN2_TRANSITIONAL implicitly does NFC conversion, we don't need
781
   * the additional IDN2_NFC_INPUT. But just for the unlikely case that the linked
782
   * library is not matching the headers when building and it doesn't support TR46,
783
   * we provide IDN2_NFC_INPUT. */
784
785
0
  if ((rc = idn2_lookup_u8((uint8_t *)utf8, (uint8_t **)ascii, IDN2_NFC_INPUT | IDN2_NONTRANSITIONAL)) == IDN2_OK)
786
0
    ret = 0;
787
  /* else
788
    fprintf(stderr, "toASCII(%s) failed (%d): %s\n", lower, rc, idn2_strerror(rc)); */
789
#else
790
  int rc;
791
  uint8_t *lower;
792
  size_t len = u8_strlen((uint8_t *)utf8) + 1;
793
794
  /* we need a conversion to lowercase */
795
  if (!(lower = u8_tolower((uint8_t *)utf8, len, 0, UNINORM_NFKC, NULL, &len))) {
796
    /* fprintf(stderr, "u8_tolower(%s) failed (%d)\n", utf8, errno); */
797
    return -1;
798
  }
799
800
  if ((rc = idn2_lookup_u8(lower, (uint8_t **)ascii, 0)) == IDN2_OK) {
801
    ret = 0;
802
  } /* else
803
    fprintf(stderr, "toASCII(%s) failed (%d): %s\n", lower, rc, idn2_strerror(rc)); */
804
805
  free(lower);
806
#endif
807
#elif defined(WITH_LIBIDN)
808
  int rc;
809
810
  (void) idna;
811
812
  if (!utf8_is_valid(utf8)) {
813
    /* fprintf(stderr, "Invalid UTF-8 sequence not converted: '%s'\n", utf8); */
814
    return -1;
815
  }
816
817
  /* idna_to_ascii_8z() automatically converts UTF-8 to lowercase */
818
819
  if ((rc = idna_to_ascii_8z(utf8, ascii, IDNA_USE_STD3_ASCII_RULES)) == IDNA_SUCCESS) {
820
    ret = 0;
821
  } /* else
822
    fprintf(stderr, "toASCII failed (%d): %s\n", rc, idna_strerror(rc)); */
823
#else
824
  char lookupname[128];
825
826
  (void) idna;
827
828
  if (domain_to_punycode(utf8, lookupname, sizeof(lookupname)) == 0) {
829
    if (ascii)
830
      if ((*ascii = psl_strdup(lookupname)))
831
        ret = 0;
832
  }
833
#endif
834
835
0
  return ret;
836
0
}
837
838
/*
 * When the rule in @e contains non-ASCII bytes, convert it with
 * psl_idna_toASCII() and, if the converted text differs, append a second
 * entry holding that form (with the same flags) to @v.
 * Conversion and allocation failures are silently ignored.
 */
static void add_punycode_if_needed(psl_idna_t *idna, psl_vector_t *v, psl_entry_t *e)
{
  psl_entry_t converted, *stored;
  char *ace;

  if (str_is_ascii(e->label_buf))
    return;

  if (psl_idna_toASCII(idna, e->label_buf, &ace) != 0)
    return;

  /* identical text needs no second entry */
  if (strcmp(e->label_buf, ace) != 0) {
    /* fprintf(stderr, "toASCII '%s' -> '%s'\n", e->label_buf, lookupname); */
    if (suffix_init(&converted, ace, strlen(ace)) == 0) {
      converted.flags = e->flags;

      stored = vector_get(v, vector_add(v, &converted));
      if (stored)
        stored->label = stored->label_buf; /* set label to changed address */
    }
  }

  free(ace);
}
860
861
/* prototypes */
862
/* DAFSA lookup, defined outside this part of the file. is_public_suffix()
 * treats a return value of -1 as "not found" and any other value as a set of
 * PRIV_PSL_FLAG_* bits for the matched rule. */
int LookupStringInFixedSet(const unsigned char* graph, size_t length, const char* key, size_t key_length);
863
/* NOTE(review): presumably reports whether the DAFSA data holds UTF-8 rules;
 * not used in this part of the file - confirm at the definition. */
int GetUtfMode(const unsigned char *graph, size_t length);
864
865
static int is_public_suffix(const psl_ctx_t *psl, const char *domain, int type)
866
0
{
867
0
  psl_entry_t suffix;
868
0
  const char *p;
869
0
  char *punycode = NULL;
870
0
  size_t domain_len;
871
0
  int need_conversion = 0;
872
873
  /* this function should be called without leading dots, just make sure */
874
0
  if (*domain == '.')
875
0
    domain++;
876
877
  /* a single leading dot needs to be handled here, so that e.g.,
878
   * co.uk and co.uk. are both detected as publicsuffix */
879
0
  domain_len = strlen(domain);
880
0
  if (domain_len > 0 && domain[domain_len - 1] == '.')
881
0
    domain_len--;
882
883
0
  suffix.nlabels = 1;
884
885
0
  for (p = domain; p < domain + domain_len; p++) {
886
0
    if (*p == '.') {
887
0
      if (suffix.nlabels == 255) /* weird input, avoid 8bit overflow */
888
0
        return 0;
889
0
      suffix.nlabels++;
890
0
    }
891
0
    else if (*((unsigned char *)p) >= 128)
892
0
      need_conversion = 1; /* in case domain is non-ascii we need a toASCII conversion */
893
0
  }
894
895
0
  if (suffix.nlabels == 1) {
896
    /* TLD, this is the prevailing '*' match. If type excludes the '*' rule, continue.
897
     */
898
0
    if (!(type & PSL_TYPE_NO_STAR_RULE))
899
0
      return 1;
900
0
  }
901
902
0
  type &= ~PSL_TYPE_NO_STAR_RULE;
903
904
0
  if (psl->utf8 || psl == &builtin_psl)
905
0
    need_conversion = 0;
906
907
0
  if (need_conversion) {
908
0
    psl_idna_t *idna = psl_idna_open();
909
910
0
    if (psl_idna_toASCII(idna, domain, &punycode) == 0) {
911
0
      suffix.label = punycode;
912
0
      suffix.length = strlen(punycode);
913
0
    } else {
914
      /* fallback */
915
916
0
      suffix.label = domain;
917
0
      suffix.length = domain_len;
918
0
    }
919
920
0
    psl_idna_close(idna);
921
0
  } else {
922
0
    suffix.label = domain;
923
0
    suffix.length = domain_len;
924
0
  }
925
926
0
  if (psl == &builtin_psl || psl->dafsa) {
927
0
    size_t dafsa_size = psl == &builtin_psl ? sizeof(kDafsa) : psl->dafsa_size;
928
0
    const unsigned char *dafsa = psl == &builtin_psl ? kDafsa : psl->dafsa;
929
0
    int rc = LookupStringInFixedSet(dafsa, dafsa_size, suffix.label, suffix.length);
930
0
    if (rc != -1) {
931
      /* check for correct rule type */
932
0
      if (type == PSL_TYPE_ICANN && !(rc & PRIV_PSL_FLAG_ICANN))
933
0
        goto suffix_no;
934
0
      else if (type == PSL_TYPE_PRIVATE && !(rc & PRIV_PSL_FLAG_PRIVATE))
935
0
        goto suffix_no;
936
937
0
      if (rc & PRIV_PSL_FLAG_EXCEPTION)
938
0
        goto suffix_no;
939
940
      /* wildcard *.foo.bar implicitly make foo.bar a public suffix */
941
      /* definitely a match, no matter if the found rule is a wildcard or not */
942
0
      goto suffix_yes;
943
0
    }
944
0
    if ((suffix.label = strchr(suffix.label, '.'))) {
945
0
      suffix.label++;
946
0
      suffix.length = strlen(suffix.label);
947
0
      suffix.nlabels--;
948
949
0
      rc = LookupStringInFixedSet(dafsa, dafsa_size, suffix.label, suffix.length);
950
0
      if (rc != -1) {
951
        /* check for correct rule type */
952
0
        if (type == PSL_TYPE_ICANN && !(rc & PRIV_PSL_FLAG_ICANN))
953
0
          goto suffix_no;
954
0
        else if (type == PSL_TYPE_PRIVATE && !(rc & PRIV_PSL_FLAG_PRIVATE))
955
0
          goto suffix_no;
956
957
0
        if (rc & PRIV_PSL_FLAG_WILDCARD)
958
0
          goto suffix_yes;
959
0
      }
960
0
    }
961
0
  } else {
962
0
    psl_entry_t *rule = vector_get(psl->suffixes, 0);
963
964
0
    if (!rule || rule->nlabels < suffix.nlabels - 1)
965
0
      goto suffix_no;
966
967
0
    rule = vector_get(psl->suffixes, vector_find(psl->suffixes, &suffix));
968
969
0
    if (rule) {
970
      /* check for correct rule type */
971
0
      if (type == PSL_TYPE_ICANN && !(rule->flags & PRIV_PSL_FLAG_ICANN))
972
0
        goto suffix_no;
973
0
      else if (type == PSL_TYPE_PRIVATE && !(rule->flags & PRIV_PSL_FLAG_PRIVATE))
974
0
        goto suffix_no;
975
976
0
      if (rule->flags & PRIV_PSL_FLAG_EXCEPTION)
977
0
        goto suffix_no;
978
979
      /* wildcard *.foo.bar implicitly make foo.bar a public suffix */
980
      /* definitely a match, no matter if the found rule is a wildcard or not */
981
0
      goto suffix_yes;
982
0
    }
983
984
0
    if ((suffix.label = strchr(suffix.label, '.'))) {
985
0
      suffix.label++;
986
0
      suffix.length = strlen(suffix.label);
987
0
      suffix.nlabels--;
988
989
0
      rule = vector_get(psl->suffixes, vector_find(psl->suffixes, &suffix));
990
991
0
      if (rule) {
992
        /* check for correct rule type */
993
0
        if (type == PSL_TYPE_ICANN && !(rule->flags & PRIV_PSL_FLAG_ICANN))
994
0
          goto suffix_no;
995
0
        else if (type == PSL_TYPE_PRIVATE && !(rule->flags & PRIV_PSL_FLAG_PRIVATE))
996
0
          goto suffix_no;
997
998
0
        if (rule->flags & PRIV_PSL_FLAG_WILDCARD)
999
0
          goto suffix_yes;
1000
0
      }
1001
0
    }
1002
0
  }
1003
1004
0
suffix_no:
1005
0
  if (punycode)
1006
0
    free(punycode);
1007
0
  return 0;
1008
1009
0
suffix_yes:
1010
0
  if (punycode)
1011
0
    free(punycode);
1012
0
  return 1;
1013
0
}
1014
1015
/**
1016
 * psl_is_public_suffix:
1017
 * @psl: PSL context
1018
 * @domain: Domain string
1019
 *
1020
 * This function checks if @domain is a public suffix by the means of the
1021
 * [Mozilla Public Suffix List](https://publicsuffix.org).
1022
 *
1023
 * For cookie domain checking see psl_is_cookie_domain_acceptable().
1024
 *
1025
 * International @domain names have to be either in UTF-8 (lowercase + NFKC) or in ASCII/ACE format (punycode).
1026
 * Other encodings likely result in incorrect return values.
1027
 * Use helper function psl_str_to_utf8lower() for normalizing @domain.
1028
 *
1029
 * @psl is a context returned by either psl_load_file(), psl_load_fp() or
1030
 * psl_builtin().
1031
 *
1032
 * Returns: 1 if domain is a public suffix, 0 if not.
1033
 *
1034
 * Since: 0.1
1035
 */
1036
int psl_is_public_suffix(const psl_ctx_t *psl, const char *domain)
1037
0
{
1038
0
  if (!psl || !domain)
1039
0
    return 1;
1040
1041
0
  return is_public_suffix(psl, domain, PSL_TYPE_ANY);
1042
0
}
1043
1044
/**
1045
 * psl_is_public_suffix2:
1046
 * @psl: PSL context
1047
 * @domain: Domain string
1048
 * @type: Domain type
1049
 *
1050
 * This function checks if @domain is a public suffix by the means of the
1051
 * [Mozilla Public Suffix List](https://publicsuffix.org).
1052
 *
1053
 * @type specifies the PSL section where to perform the lookup. Valid values are
1054
 * %PSL_TYPE_PRIVATE, %PSL_TYPE_ICANN, %PSL_TYPE_NO_STAR_RULE, and %PSL_TYPE_ANY.
1055
 *
1056
 * %PSL_TYPE_NO_STAR_RULE switches of the 'prevailing star rule' (see
1057
 * [List](https://publicsuffix.org/list) under 'Algorithm' 2.).
1058
 * Applying the flag means that TLDs not explicitly listed in the PSL are *not* treated as public suffixes.
1059
 *
1060
 * International @domain names have to be either in UTF-8 (lowercase + NFKC) or in ASCII/ACE format (punycode).
1061
 * Other encodings likely result in incorrect return values.
1062
 * Use helper function psl_str_to_utf8lower() for normalization @domain.
1063
 *
1064
 * @psl is a context returned by either psl_load_file(), psl_load_fp() or
1065
 * psl_builtin().
1066
 *
1067
 * Returns: 1 if domain is a public suffix, 0 if not.
1068
 *
1069
 * Since: 0.1
1070
 */
1071
int psl_is_public_suffix2(const psl_ctx_t *psl, const char *domain, int type)
1072
0
{
1073
0
  if (!psl || !domain)
1074
0
    return 1;
1075
1076
0
  return is_public_suffix(psl, domain, type);
1077
0
}
1078
1079
/**
1080
 * psl_unregistrable_domain:
1081
 * @psl: PSL context
1082
 * @domain: Domain string
1083
 *
1084
 * This function finds the longest public suffix part of @domain by the means
1085
 * of the [Mozilla Public Suffix List](https://publicsuffix.org).
1086
 *
1087
 * International @domain names have to be either in UTF-8 (lowercase + NFKC) or in ASCII/ACE format (punycode).
1088
 * Other encodings likely result in incorrect return values.
1089
 * Use helper function psl_str_to_utf8lower() for normalization @domain.
1090
 *
1091
 * @psl is a context returned by either psl_load_file(), psl_load_fp() or
1092
 * psl_builtin().
1093
 *
1094
 * Returns: Pointer to longest public suffix part of @domain or %NULL if @domain
1095
 * does not contain a public suffix (or if @psl is %NULL).
1096
 *
1097
 * Since: 0.1
1098
 */
1099
const char *psl_unregistrable_domain(const psl_ctx_t *psl, const char *domain)
1100
0
{
1101
0
  int nlabels = 0;
1102
0
  const char *p;
1103
1104
0
  if (!psl || !domain)
1105
0
    return NULL;
1106
1107
  /*
1108
   * In the main loop we introduce a O(N^2) behavior to avoid code duplication.
1109
   * To avoid nasty CPU hogging, we limit the lookup to max. 8 domain labels to the right.
1110
   */
1111
0
  for (p = domain + strlen(domain) - 1; p >= domain; p--) {
1112
0
    if (*p == '.' && ++nlabels > 8) {
1113
0
      domain = p + 1;
1114
0
      break;
1115
0
    }
1116
0
  }
1117
1118
  /*
1119
   *  We check from left to right to catch special PSL entries like 'forgot.his.name':
1120
   *   'forgot.his.name' and 'name' are in the PSL while 'his.name' is not.
1121
   */
1122
1123
0
  while (!is_public_suffix(psl, domain, 0)) {
1124
0
    if ((domain = strchr(domain, '.')))
1125
0
      domain++;
1126
0
    else
1127
0
      break; /* prevent endless loop if is_public_suffix() is broken. */
1128
0
  }
1129
1130
0
  return domain;
1131
0
}
1132
1133
/**
1134
 * psl_registrable_domain:
1135
 * @psl: PSL context
1136
 * @domain: Domain string
1137
 *
1138
 * This function finds the shortest private suffix part of @domain by the means
1139
 * of the [Mozilla Public Suffix List](https://publicsuffix.org).
1140
 *
1141
 * International @domain names have to be either in UTF-8 (lowercase + NFKC) or in ASCII/ACE format (punycode).
1142
 * Other encodings likely result in incorrect return values.
1143
 * Use helper function psl_str_to_utf8lower() for normalization @domain.
1144
 *
1145
 * @psl is a context returned by either psl_load_file(), psl_load_fp() or
1146
 * psl_builtin().
1147
 *
1148
 * Returns: Pointer to shortest private suffix part of @domain or %NULL if @domain
1149
 * does not contain a private suffix (or if @psl is %NULL).
1150
 *
1151
 * Since: 0.1
1152
 */
1153
const char *psl_registrable_domain(const psl_ctx_t *psl, const char *domain)
1154
0
{
1155
0
  const char *p, *regdom = NULL;
1156
0
  int nlabels = 0;
1157
1158
0
  if (!psl || !domain || *domain == '.')
1159
0
    return NULL;
1160
1161
  /*
1162
   * In the main loop we introduce a O(N^2) behavior to avoid code duplication.
1163
   * To avoid nasty CPU hogging, we limit the lookup to max. 8 domain labels to the right.
1164
   */
1165
0
  for (p = domain + strlen(domain) - 1; p >= domain; p--) {
1166
0
    if (*p == '.' && ++nlabels > 8) {
1167
0
      domain = p + 1;
1168
0
      break;
1169
0
    }
1170
0
  }
1171
1172
  /*
1173
   *  We check from left to right to catch special PSL entries like 'forgot.his.name':
1174
   *   'forgot.his.name' and 'name' are in the PSL while 'his.name' is not.
1175
   */
1176
1177
0
  while (!is_public_suffix(psl, domain, 0)) {
1178
0
    if ((p = strchr(domain, '.'))) {
1179
0
      regdom = domain;
1180
0
      domain = p + 1;
1181
0
    } else
1182
0
      break; /* prevent endless loop if is_public_suffix() is broken. */
1183
0
  }
1184
1185
0
  return regdom;
1186
0
}
1187
1188
/**
1189
 * psl_load_file:
1190
 * @fname: Name of PSL file
1191
 *
1192
 * This function loads the public suffixes file named @fname.
1193
 * To free the allocated resources, call psl_free().
1194
 *
1195
 * The suffixes are expected to be UTF-8 encoded (lowercase + NFKC) if they are international.
1196
 *
1197
 * Returns: Pointer to a PSL context or %NULL on failure.
1198
 *
1199
 * Since: 0.1
1200
 */
1201
psl_ctx_t *psl_load_file(const char *fname)
1202
0
{
1203
0
  FILE *fp;
1204
0
  psl_ctx_t *psl = NULL;
1205
1206
0
  if (!fname)
1207
0
    return NULL;
1208
1209
0
  if ((fp = fopen(fname, "rb"))) {
1210
0
    psl = psl_load_fp(fp);
1211
0
    fclose(fp);
1212
0
  }
1213
1214
0
  return psl;
1215
0
}
1216
1217
/**
1218
 * psl_load_fp:
1219
 * @fp: %FILE pointer
1220
 *
1221
 * This function loads the public suffixes from a %FILE pointer.
1222
 * To free the allocated resources, call psl_free().
1223
 *
1224
 * The suffixes are expected to be UTF-8 encoded (lowercase + NFKC) if they are international.
1225
 *
1226
 * Returns: Pointer to a PSL context or %NULL on failure.
1227
 *
1228
 * Since: 0.1
1229
 */
1230
psl_ctx_t *psl_load_fp(FILE *fp)
1231
0
{
1232
0
  psl_ctx_t *psl;
1233
0
  psl_entry_t suffix, *suffixp;
1234
0
  char buf[256], *linep, *p;
1235
0
  int type = 0, is_dafsa;
1236
0
  psl_idna_t *idna;
1237
1238
0
  if (!fp)
1239
0
    return NULL;
1240
1241
0
  if (!(psl = calloc(1, sizeof(psl_ctx_t))))
1242
0
    return NULL;
1243
1244
  /* read first line to allow ASCII / DAFSA detection */
1245
0
  if (!(linep = fgets(buf, sizeof(buf) - 1, fp)))
1246
0
    goto fail;
1247
1248
0
  is_dafsa = strlen(buf) == 16 && !strncmp(buf, ".DAFSA@PSL_", 11);
1249
1250
0
  if (is_dafsa) {
1251
0
    void *m;
1252
0
    size_t size = 65536, n, len = 0;
1253
0
    int version = atoi(buf + 11);
1254
1255
0
    if (version != 0)
1256
0
      goto fail;
1257
1258
0
    if (!(psl->dafsa = malloc(size)))
1259
0
      goto fail;
1260
1261
0
    memcpy(psl->dafsa, buf, len);
1262
1263
0
    while ((n = fread(psl->dafsa + len, 1, size - len, fp)) > 0) {
1264
0
      len += n;
1265
0
      if (len >= size) {
1266
0
        if (!(m = realloc(psl->dafsa, size *= 2)))
1267
0
          goto fail;
1268
0
        psl->dafsa = m;
1269
0
      }
1270
0
    }
1271
1272
    /* release unused memory */
1273
0
    if ((m = realloc(psl->dafsa, len)))
1274
0
      psl->dafsa = m;
1275
0
    else if (!len)
1276
0
      psl->dafsa = NULL; /* realloc() just free'd psl->dafsa */
1277
1278
0
    psl->dafsa_size = len;
1279
0
    psl->utf8 = !!GetUtfMode(psl->dafsa, len);
1280
1281
0
    return psl;
1282
0
  }
1283
1284
0
  idna = psl_idna_open();
1285
1286
  /*
1287
   *  as of 02.11.2012, the list at https://publicsuffix.org/list/ contains ~6000 rules and 40 exceptions.
1288
   *  as of 19.02.2014, the list at https://publicsuffix.org/list/ contains ~6500 rules and 19 exceptions.
1289
   *  as of 07.10.2018, the list at https://publicsuffix.org/list/ contains ~8600 rules and 8 exceptions.
1290
   */
1291
0
  psl->suffixes = vector_alloc(8*1024, suffix_compare_array);
1292
0
  psl->utf8 = 1; /* we put UTF-8 and punycode rules in the lookup vector */
1293
1294
0
  do {
1295
0
    while (isspace_ascii(*linep)) linep++; /* ignore leading whitespace */
1296
0
    if (!*linep) continue; /* skip empty lines */
1297
1298
0
    if (*linep == '/' && linep[1] == '/') {
1299
0
      if (!type) {
1300
0
        if (strstr(linep + 2, "===BEGIN ICANN DOMAINS==="))
1301
0
          type = PRIV_PSL_FLAG_ICANN;
1302
0
        else if (!type && strstr(linep + 2, "===BEGIN PRIVATE DOMAINS==="))
1303
0
          type = PRIV_PSL_FLAG_PRIVATE;
1304
0
      }
1305
0
      else if (type == PRIV_PSL_FLAG_ICANN && strstr(linep + 2, "===END ICANN DOMAINS==="))
1306
0
        type = 0;
1307
0
      else if (type == PRIV_PSL_FLAG_PRIVATE && strstr(linep + 2, "===END PRIVATE DOMAINS==="))
1308
0
        type = 0;
1309
1310
0
      continue; /* skip comments */
1311
0
    }
1312
1313
    /* parse suffix rule */
1314
0
    for (p = linep; *linep && !isspace_ascii(*linep);) linep++;
1315
0
    *linep = 0;
1316
1317
0
    if (*p == '!') {
1318
0
      p++;
1319
0
      suffix.flags = PRIV_PSL_FLAG_EXCEPTION | type;
1320
0
      psl->nexceptions++;
1321
0
    } else if (*p == '*') {
1322
0
      if (*++p != '.') {
1323
        /* fprintf(stderr, "Unsupported kind of rule (ignored): %s\n", p - 1); */
1324
0
        continue;
1325
0
      }
1326
0
      p++;
1327
      /* wildcard *.foo.bar implicitly make foo.bar a public suffix */
1328
0
      suffix.flags = PRIV_PSL_FLAG_WILDCARD | PRIV_PSL_FLAG_PLAIN | type;
1329
0
      psl->nwildcards++;
1330
0
      psl->nsuffixes++;
1331
0
    } else {
1332
0
      suffix.flags = PRIV_PSL_FLAG_PLAIN | type;
1333
0
      psl->nsuffixes++;
1334
0
    }
1335
1336
0
    if (suffix_init(&suffix, p, linep - p) == 0) {
1337
0
      int index;
1338
1339
0
      if ((index = vector_find(psl->suffixes, &suffix)) >= 0) {
1340
        /* Found existing entry:
1341
         * Combination of exception and plain rule is ambiguous
1342
         * !foo.bar
1343
         * foo.bar
1344
         *
1345
         * Allowed:
1346
         * !foo.bar + *.foo.bar
1347
         * foo.bar + *.foo.bar
1348
         *
1349
         * We do not check here, let's do it later.
1350
         */
1351
1352
0
        suffixp = vector_get(psl->suffixes, index);
1353
0
        suffixp->flags |= suffix.flags;
1354
0
      } else {
1355
        /* New entry */
1356
0
        suffixp = vector_get(psl->suffixes, vector_add(psl->suffixes, &suffix));
1357
0
      }
1358
1359
0
      if (suffixp) {
1360
0
        suffixp->label = suffixp->label_buf; /* set label to changed address */
1361
0
        add_punycode_if_needed(idna, psl->suffixes, suffixp);
1362
0
      }
1363
0
    }
1364
0
  } while ((linep = fgets(buf, sizeof(buf), fp)));
1365
1366
0
  vector_sort(psl->suffixes);
1367
1368
0
  psl_idna_close(idna);
1369
1370
0
  return psl;
1371
1372
0
fail:
1373
0
  psl_free(psl);
1374
0
  return NULL;
1375
0
}
1376
1377
/**
1378
 * psl_free:
1379
 * @psl: PSL context pointer
1380
 *
1381
 * This function frees the the PSL context that has been retrieved via
1382
 * psl_load_fp() or psl_load_file().
1383
 *
1384
 * Since: 0.1
1385
 */
1386
void psl_free(psl_ctx_t *psl)
1387
1.90k
{
1388
1.90k
  if (psl && psl != &builtin_psl) {
1389
0
    vector_free(&psl->suffixes);
1390
0
    free(psl->dafsa);
1391
0
    free(psl);
1392
0
  }
1393
1.90k
}
1394
1395
/**
1396
 * psl_builtin:
1397
 *
1398
 * This function returns the PSL context that has been generated and built in at compile-time.
1399
 * You don't have to free the returned context explicitly.
1400
 *
1401
 * The builtin data also contains punycode entries, one for each international domain name.
1402
 *
1403
 * If the generation of built-in data has been disabled during compilation, %NULL will be returned.
1404
 * When using the builtin psl context, you can provide UTF-8 (lowercase + NFKC) or ASCII/ACE (punycode)
1405
 * representations of domains to functions like psl_is_public_suffix().
1406
 *
1407
 * Returns: Pointer to the built in PSL data or %NULL if this data is not available.
1408
 *
1409
 * Since: 0.1
1410
 */
1411
const psl_ctx_t *psl_builtin(void)
1412
1
{
1413
1
#ifdef ENABLE_BUILTIN
1414
1
  return &builtin_psl;
1415
#else
1416
  return NULL;
1417
#endif
1418
1
}
1419
1420
/**
1421
 * psl_suffix_count:
1422
 * @psl: PSL context pointer
1423
 *
1424
 * This function returns number of public suffixes maintained by @psl.
1425
 * The number of exceptions within the Public Suffix List are not included.
1426
 *
1427
 * If the information is not available, the return value is -1 (since 0.19).
1428
 * This is the case with DAFSA blobs or if @psl is %NULL.
1429
 *
1430
 * Returns: Number of public suffixes entries in PSL context or -1 if this information is not available.
1431
 *
1432
 * Since: 0.1
1433
 */
1434
int psl_suffix_count(const psl_ctx_t *psl)
1435
0
{
1436
0
  if (psl == &builtin_psl)
1437
0
    return _psl_nsuffixes;
1438
0
  else if (psl)
1439
0
    return psl->dafsa ? -1 : psl->nsuffixes;
1440
0
  else
1441
0
    return -1;
1442
0
}
1443
1444
/**
1445
 * psl_suffix_exception_count:
1446
 * @psl: PSL context pointer
1447
 *
1448
 * This function returns number of public suffix exceptions maintained by @psl.
1449
 *
1450
 * If the information is not available, the return value is -1 (since 0.19).
1451
 * This is the case with DAFSA blobs or if @psl is %NULL.
1452
 *
1453
 * Returns: Number of public suffix exceptions in PSL context or -1 if this information is not available.
1454
 *
1455
 * Since: 0.1
1456
 */
1457
int psl_suffix_exception_count(const psl_ctx_t *psl)
1458
0
{
1459
0
  if (psl == &builtin_psl)
1460
0
    return _psl_nexceptions;
1461
0
  else if (psl)
1462
0
    return psl->dafsa ? -1 : psl->nexceptions;
1463
0
  else
1464
0
    return -1;
1465
0
}
1466
1467
/**
1468
 * psl_suffix_wildcard_count:
1469
 * @psl: PSL context pointer
1470
 *
1471
 * This function returns number of public suffix wildcards maintained by @psl.
1472
 *
1473
 * If the information is not available, the return value is -1 (since 0.19).
1474
 * This is the case with DAFSA blobs or if @psl is %NULL.
1475
 *
1476
 * Returns: Number of public suffix wildcards in PSL context or -1 if this information is not available.
1477
 *
1478
 * Since: 0.10.0
1479
 */
1480
int psl_suffix_wildcard_count(const psl_ctx_t *psl)
1481
0
{
1482
0
  if (psl == &builtin_psl)
1483
0
    return _psl_nwildcards;
1484
0
  else if (psl)
1485
0
    return psl->dafsa ? -1 : psl->nwildcards;
1486
0
  else
1487
0
    return -1;
1488
0
}
1489
1490
/**
1491
 * psl_builtin_file_time:
1492
 *
1493
 * This function returns the mtime of the Public Suffix List file that has been built in.
1494
 *
1495
 * If the generation of built-in data has been disabled during compilation, 0 will be returned.
1496
 *
1497
 * Returns: time_t value or 0.
1498
 *
1499
 * Since: 0.1
1500
 */
1501
time_t psl_builtin_file_time(void)
1502
0
{
1503
0
  return _psl_file_time;
1504
0
}
1505
1506
/**
1507
 * psl_builtin_sha1sum:
1508
 *
1509
 * This function returns the SHA1 checksum of the Public Suffix List file that has been built in.
1510
 * The returned string is in lowercase hex encoding, e.g. "2af1e9e3044eda0678bb05949d7cca2f769901d8".
1511
 *
1512
 * If the generation of built-in data has been disabled during compilation, an empty string will be returned.
1513
 *
1514
 * Returns: String containing SHA1 checksum or an empty string.
1515
 *
1516
 * Since: 0.1
1517
 */
1518
const char *psl_builtin_sha1sum(void)
1519
0
{
1520
0
  return _psl_sha1_checksum;
1521
0
}
1522
1523
/**
1524
 * psl_builtin_filename:
1525
 *
1526
 * This function returns the file name of the Public Suffix List file that has been built in.
1527
 *
1528
 * If the generation of built-in data has been disabled during compilation, an empty string will be returned.
1529
 *
1530
 * Returns: String containing the PSL file name or an empty string.
1531
 *
1532
 * Since: 0.1
1533
 */
1534
const char *psl_builtin_filename(void)
1535
0
{
1536
0
  return _psl_filename;
1537
0
}
1538
1539
/**
1540
 * psl_builtin_outdated:
1541
 *
1542
 * This function checks if the built-in data is older than the file it has been created from.
1543
 * If it is, it might be a good idea for the application to reload the PSL.
1544
 * The mtime is taken as reference.
1545
 *
1546
 * If the PSL file does not exist, it is assumed that the built-in data is not outdated.
1547
 *
1548
 * Returns: 1 if the built-in is outdated, 0 otherwise.
1549
 *
1550
 * Since: 0.10.0
1551
 */
1552
int psl_builtin_outdated(void)
1553
0
{
1554
0
  struct_stat st;
1555
1556
0
  if (func_sys_stat(_psl_filename, &st) == 0 && st.st_mtime > _psl_file_time)
1557
0
    return 1;
1558
1559
0
  return 0;
1560
0
}
1561
1562
/**
1563
 * psl_dist_filename:
1564
 *
1565
 * This function returns the file name of the distribution/system PSL data file.
1566
 * This file will be considered by psl_latest().
1567
 *
1568
 * Return the filename that is set by ./configure --with-psl-distfile, or an empty string.
1569
 *
1570
 * Returns: String containing a PSL file name or an empty string.
1571
 *
1572
 * Since: 0.16
1573
 */
1574
const char *psl_dist_filename(void)
1575
0
{
1576
0
  return _psl_dist_filename;
1577
0
}
1578
1579
/**
1580
 * psl_get_version:
1581
 *
1582
 * Get libpsl version.
1583
 *
1584
 * Returns: String containing version of libpsl.
1585
 *
1586
 * Since: 0.2.5
1587
 **/
1588
const char *psl_get_version(void)
1589
0
{
1590
#ifdef WITH_LIBICU
1591
  return PACKAGE_VERSION " (+libicu/" U_ICU_VERSION ")";
1592
#elif defined(WITH_LIBICUCORE)
1593
  return PACKAGE_VERSION " (+libicucore/" U_ICU_VERSION ")";
1594
#elif defined(WITH_LIBICU_WIN)
1595
  return PACKAGE_VERSION " (+icu.lib/Windows)";
1596
#elif defined(WITH_LIBIDN2)
1597
0
  return PACKAGE_VERSION " (+libidn2/" IDN2_VERSION ")";
1598
#elif defined(WITH_LIBIDN)
1599
  return PACKAGE_VERSION " (+libidn/" STRINGPREP_VERSION ")";
1600
#else
1601
  return PACKAGE_VERSION " (no IDNA support)";
1602
#endif
1603
0
}
1604
1605
/**
1606
 * psl_check_version_number:
1607
 * @version: Version number (hex) to check against.
1608
 *
1609
 * Check the given version number is at minimum the current library version number.
1610
 * The version number must be a hexadecimal number like 0x000a01 (V0.10.1).
1611
 *
1612
 * Returns: Returns the library version number if the given version number is at least
1613
 * the version of the library, else return 0; If the argument is 0, the function returns
1614
 * the library version number without performing a check.
1615
 *
1616
 * Since: 0.11.0
1617
 **/
1618
int psl_check_version_number(int version)
1619
0
{
1620
0
  if (version) {
1621
0
    int major = version >> 16;
1622
0
    int minor = (version >> 8) & 0xFF;
1623
0
    int patch = version & 0xFF;
1624
1625
0
    if (major < PSL_VERSION_MAJOR
1626
0
      || (major == PSL_VERSION_MAJOR && minor < PSL_VERSION_MINOR)
1627
0
      || (major == PSL_VERSION_MAJOR && minor == PSL_VERSION_MINOR && patch < PSL_VERSION_PATCH))
1628
0
    {
1629
0
      return 0;
1630
0
    }
1631
0
  }
1632
1633
0
  return PSL_VERSION_NUMBER;
1634
0
}
1635
/*
 * Return true if 's' is a valid dotted quad, else false.
 *
 * A dotted quad consists of four decimal numbers in the range 0..255,
 * each written with one to three digits and separated by single dots.
 * Nothing may follow the fourth number.
 *
 * Assume that characters '0'..'9' have consecutive byte values.
 * credit:
 *    inspired by Paul Vixie
 */
static int is_ip4(const char *s)
{
  int i, n, ndigits;
  unsigned char c = 0;

  for (i = 0; i < 4; i++) {
    /* collect at most three digits; a fourth digit is left for the separator check below */
    n = 0;
    for (ndigits = 0; ndigits < 3; ndigits++) {
      c = (unsigned char) *s;
      if (c < '0' || c > '9')
        break;
      n = n * 10 + (c - '0');
      s++;
    }

    if (ndigits == 0 || n > 255)
      return 0;

    /* character following the number: '.' between numbers, end of string after the last one */
    c = (unsigned char) *s++;

    if (i < 3 && c != '.')
      return 0;
  }

  return !c;
}
1671
1672
/*
 * Return the value (0..15) of the hexadecimal digit 'c', or -1 if 'c' is
 * not a hexadecimal digit. Upper and lower case letters are accepted.
 */
static int hexval(unsigned c)
{
  unsigned letter;

  /* unsigned arithmetic: anything below '0' wraps to a large value and fails the test */
  if (c - '0' < 10)
    return c - '0';

  /* setting bit 5 maps 'A'..'F' onto 'a'..'f' */
  letter = (c | 32) - 'a';

  return letter < 6 ? (int) (letter + 10) : -1;
}
1679
1680
/*
 * Return true if 's' is a textual IPv6 address, else false.
 *
 * Original code taken from musl inet_pton(),
 *   which has a standard MIT license (https://git.musl-libc.org/cgit/musl/tree/COPYRIGHT).
 * Amended and simplified to our needs: the address is only validated,
 * the parsed group values are not stored.
 */
static int is_ip6(const char *s)
{
  int i, j, brk = -1, need_v4 = 0;

  /* a leading ':' is only valid as part of '::' */
  if (*s == ':' && *++s != ':') return 0;

  /* 'i' counts the groups seen so far, 'brk' is the group index of '::' (or -1) */
  for (i = 0; ; i++) {
    if (s[0] == ':' && brk < 0) {
      brk = i;
      if (!*++s) break;
      /* '::' after seven groups must end the address, else there would be more than eight groups */
      if (i == 7) return 0;
      continue;
    }
    /* a group has one to four hex digits */
    for (j = 0; j < 4 && hexval(s[j]) >= 0; j++)
      ;
    if (j == 0) return 0;
    if (!s[j] && (brk >= 0 || i == 7)) break;
    if (i == 7) return 0;
    if (s[j] != ':') {
      /* an embedded IPv4 address is only allowed as the last 32 bits */
      if (s[j] != '.' || (i < 6 && brk < 0)) return 0;
      need_v4 = 1;
      break;
    }
    s += j + 1;
  }

  /* 's' points to the start of the trailing dotted quad here */
  if (need_v4 && !is_ip4(s)) return 0;
  return 1;
}
1714
1715
/* Return 1 if hostname is an IPv4 or IPv6 address in text form, else 0. */
static int isip(const char *hostname)
{
  if (is_ip4(hostname))
    return 1;

  return is_ip6(hostname) != 0;
}
1720
1721
/**
1722
 * psl_is_cookie_domain_acceptable:
1723
 * @psl: PSL context pointer
1724
 * @hostname: The request hostname.
1725
 * @cookie_domain: The domain value from a cookie
1726
 *
1727
 * This helper function checks whether @cookie_domain is an acceptable cookie domain value for the request
1728
 * @hostname.
1729
 *
1730
 * For international domain names both, @hostname and @cookie_domain, have to be either in UTF-8 (lowercase + NFKC)
1731
 * or in ASCII/ACE (punycode) format. Other encodings or mixing UTF-8 and punycode likely result in incorrect return values.
1732
 *
1733
 * Use helper function psl_str_to_utf8lower() for normalization of @hostname and @cookie_domain.
1734
 *
1735
 * Hint for Windows users:
1736
 * Please make sure the calling application has called WSAStartup() before calling psl_is_cookie_domain_acceptable().
1737
 *
1738
 * Examples:
1739
 * 1. Cookie domain 'example.com' would be acceptable for hostname 'www.example.com',
1740
 * but '.com' or 'com' would NOT be acceptable since 'com' is a public suffix.
1741
 *
1742
 * 2. Cookie domain 'his.name' would be acceptable for hostname 'remember.his.name',
1743
 *  but NOT for 'forgot.his.name' since 'forgot.his.name' is a public suffix.
1744
 *
1745
 * Returns: 1 if acceptable, 0 if not acceptable.
1746
 *
1747
 * Since: 0.1
1748
 */
1749
int psl_is_cookie_domain_acceptable(const psl_ctx_t *psl, const char *hostname, const char *cookie_domain)
1750
3
{
1751
3
  const char *p;
1752
3
  size_t hostname_length, cookie_domain_length;
1753
1754
3
  if (!psl || !hostname || !cookie_domain)
1755
0
    return 0;
1756
1757
3
  while (*cookie_domain == '.')
1758
0
    cookie_domain++;
1759
1760
3
  if (!strcmp(hostname, cookie_domain))
1761
0
    return 1; /* an exact match is acceptable (and pretty common) */
1762
1763
3
  if (isip(hostname))
1764
0
    return 0; /* Hostname is an IP address and these must match fully (RFC 6265, 5.1.3) */
1765
1766
3
  cookie_domain_length = strlen(cookie_domain);
1767
3
  hostname_length = strlen(hostname);
1768
1769
3
  if (cookie_domain_length >= hostname_length)
1770
3
    return 0; /* cookie_domain is too long */
1771
1772
0
  p = hostname + hostname_length - cookie_domain_length;
1773
0
  if (!strcmp(p, cookie_domain) && p[-1] == '.') {
1774
    /* OK, cookie_domain matches, but it must be longer than the longest public suffix in 'hostname' */
1775
1776
0
    if (!(p = psl_unregistrable_domain(psl, hostname)))
1777
0
      return 1;
1778
1779
0
    if (cookie_domain_length > strlen(p))
1780
0
      return 1;
1781
0
  }
1782
1783
0
  return 0;
1784
0
}
1785
1786
/**
 * psl_free_string:
 * @str: pointer to lowercase string returned by psl_str_to_utf8lower()
 *
 * This function free()'s the memory allocated by psl_str_to_utf8lower() when
 * returning a lowercase string. Passing %NULL is allowed and does nothing.
 *
 * Since: 0.19
 */
void psl_free_string(char *str)
{
  free(str); /* free(NULL) is a no-op, no guard needed */
}
1800
1801
#if defined(WITH_LIBIDN2) || defined(WITH_LIBIDN) || defined(WITH_LIBICUCORE)
1802
/*
 * Return 1 if the encoding name 's' is "UTF-8" (compared case-insensitively), else 0.
 * Avoid using strcasecmp() or _stricmp().
 *
 * The comparison short-circuits at the first mismatch, so no byte behind the
 * terminating 0 of a shorter string is read.
 */
static int isUTF8(const char *s) {
  return (s[0] == 'u' || s[0] == 'U')
    && (s[1] == 't' || s[1] == 'T')
    && (s[2] == 'f' || s[2] == 'F')
    && s[3] == '-'
    && s[4] == '8'
    && s[5] == 0;
}
1809
1810
static char *idn_u8_tolower(const char *buf, size_t len, const char *locale)
1811
0
{
1812
#if defined(WITH_LIBICUCORE)
1813
  if (len > INT_MAX)
1814
    return NULL;
1815
1816
  int32_t src_len = (int32_t)len;
1817
  if (src_len > 0 && buf[src_len - 1] == 0)
1818
    src_len--;
1819
1820
  UErrorCode status = U_ZERO_ERROR;
1821
  int32_t utf16_src_len;
1822
  u_strFromUTF8(NULL, 0, &utf16_src_len, buf, src_len, &status);
1823
  if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
1824
    return NULL;
1825
1826
  UChar *utf16_src = malloc((size_t)utf16_src_len * sizeof(UChar));
1827
  if (!utf16_src)
1828
    return NULL;
1829
1830
  status = U_ZERO_ERROR;
1831
  u_strFromUTF8(utf16_src, utf16_src_len, NULL, buf, src_len, &status);
1832
  if (U_FAILURE(status)) {
1833
    free(utf16_src);
1834
    return NULL;
1835
  }
1836
1837
  status = U_ZERO_ERROR;
1838
  int32_t utf16_lower_len = u_strToLower(NULL, 0, utf16_src, utf16_src_len, locale, &status);
1839
  if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
1840
    free(utf16_src);
1841
    return NULL;
1842
  }
1843
1844
  UChar *utf16_lower = malloc((size_t)utf16_lower_len * sizeof(UChar));
1845
  if (!utf16_lower) {
1846
    free(utf16_src);
1847
    return NULL;
1848
  }
1849
1850
  status = U_ZERO_ERROR;
1851
  u_strToLower(utf16_lower, utf16_lower_len, utf16_src, utf16_src_len, locale, &status);
1852
  free(utf16_src);
1853
  if (U_FAILURE(status)) {
1854
    free(utf16_lower);
1855
    return NULL;
1856
  }
1857
1858
  status = U_ZERO_ERROR;
1859
  int32_t utf8_lower_len;
1860
  u_strToUTF8(NULL, 0, &utf8_lower_len, utf16_lower, utf16_lower_len, &status);
1861
  if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
1862
    free(utf16_lower);
1863
    return NULL;
1864
  }
1865
1866
  char *result = malloc((size_t)utf8_lower_len + 1);
1867
  if (!result) {
1868
    free(utf16_lower);
1869
    return NULL;
1870
  }
1871
1872
  status = U_ZERO_ERROR;
1873
  u_strToUTF8(result, utf8_lower_len + 1, NULL, utf16_lower, utf16_lower_len, &status);
1874
  free(utf16_lower);
1875
  if (U_FAILURE(status)) {
1876
    free(result);
1877
    result = NULL;
1878
  }
1879
1880
  return result;
1881
#else
1882
0
  (void) locale;
1883
1884
0
  return (char *)u8_tolower((uint8_t *)buf, len, 0, UNINORM_NFKC, NULL, &len);
1885
0
#endif
1886
0
}
1887
#endif
1888
1889
/**
1890
 * psl_str_to_utf8lower:
1891
 * @str: string to convert
1892
 * @encoding: charset encoding of @str, e.g. 'iso-8859-1' or %NULL
1893
 * @locale: locale of @str for to lowercase conversion, e.g. 'de' or %NULL
1894
 * @lower: return value containing the converted string
1895
 *
1896
 * This helper function converts a string to UTF-8 lowercase + NFKC representation.
1897
 * Lowercase + NFKC UTF-8 is needed as input to the domain checking functions.
1898
 *
1899
 * @lower stays unchanged on error.
1900
 *
1901
 * When returning PSL_SUCCESS, the return value 'lower' must be freed after usage.
1902
 *
1903
 * Returns: psl_error_t value.
1904
 *   PSL_SUCCESS: Success
1905
 *   PSL_ERR_INVALID_ARG: @str is a %NULL value.
1906
 *   PSL_ERR_CONVERTER: Failed to open the unicode converter with name @encoding
1907
 *   PSL_ERR_TO_UTF16: Failed to convert @str to unicode
1908
 *   PSL_ERR_TO_LOWER: Failed to convert unicode to lowercase
1909
 *   PSL_ERR_TO_UTF8: Failed to convert unicode to UTF-8
1910
 *   PSL_ERR_NO_MEM: Failed to allocate memory
1911
 *
1912
 * Since: 0.4
1913
 */
1914
psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding, const char *locale, char **lower)
1915
6
{
1916
6
  int ret = PSL_ERR_INVALID_ARG;
1917
1918
6
  (void) encoding;
1919
6
  (void) locale;
1920
1921
6
  if (!str)
1922
0
    return PSL_ERR_INVALID_ARG;
1923
1924
  /* shortcut to avoid costly conversion */
1925
6
  if (str_is_ascii(str)) {
1926
6
    if (lower) {
1927
6
      char *p, *tmp;
1928
1929
6
      if (!(tmp = psl_strdup(str)))
1930
0
        return PSL_ERR_NO_MEM;
1931
1932
6
      *lower = tmp;
1933
1934
      /* convert ASCII string to lowercase */
1935
12
      for (p = *lower; *p; p++)
1936
6
        if (isupper(*p))
1937
3
          *p = tolower(*p);
1938
6
    }
1939
6
    return PSL_SUCCESS;
1940
6
  }
1941
1942
#if defined(WITH_LIBICU) || defined(WITH_LIBICU_WIN)
1943
#define STACK_STRLENGTH 256
1944
  do {
1945
  UErrorCode status = 0;
1946
  UChar *utf16_dst, *utf16_lower;
1947
  char *utf8_lower;
1948
  int32_t utf16_dst_length, utf16_dst_size, utf16_lower_size, utf8_lower_size;
1949
  UConverter *uconv;
1950
  UChar utf16_dst_buf[STACK_STRLENGTH * 2 + 1];
1951
  UChar utf16_lower_buf[STACK_STRLENGTH * 2 + 1];
1952
  char utf8_lower_buf[STACK_STRLENGTH * 6 + 1];
1953
  size_t str_length = strlen(str);
1954
1955
  if (str_length <= STACK_STRLENGTH) {
1956
    utf16_dst_size = countof(utf16_dst_buf);
1957
    utf16_lower_size = countof(utf16_lower_buf);
1958
    utf8_lower_size = countof(utf8_lower_buf);
1959
    utf16_dst   = utf16_dst_buf;
1960
    utf16_lower = utf16_lower_buf;
1961
    utf8_lower  = utf8_lower_buf;
1962
  } else {
1963
    utf16_dst_size = utf16_lower_size = str_length * 2 + 1;
1964
    utf8_lower_size = str_length * 6 + 1;
1965
    utf16_dst   = malloc(sizeof(UChar) * utf16_dst_size);
1966
    utf16_lower = malloc(sizeof(UChar) * utf16_lower_size);
1967
    utf8_lower  = malloc(sizeof(char) * utf8_lower_size);
1968
1969
    if (!utf16_dst || !utf16_lower || !utf8_lower) {
1970
      ret = PSL_ERR_NO_MEM;
1971
      goto out;
1972
    }
1973
  }
1974
1975
  uconv = ucnv_open(encoding, &status);
1976
  if (U_SUCCESS(status)) {
1977
    utf16_dst_length = ucnv_toUChars(uconv, utf16_dst, utf16_dst_size, str, str_length, &status);
1978
    ucnv_close(uconv);
1979
1980
    if (U_SUCCESS(status)) {
1981
      int32_t utf16_lower_length = u_strToLower(utf16_lower, utf16_lower_size, utf16_dst, utf16_dst_length, locale, &status);
1982
      if (U_SUCCESS(status)) {
1983
        u_strToUTF8(utf8_lower, utf8_lower_size, NULL, utf16_lower, utf16_lower_length, &status);
1984
        if (U_SUCCESS(status)) {
1985
          ret = PSL_SUCCESS;
1986
          if (lower) {
1987
            char *tmp = psl_strdup(utf8_lower);
1988
1989
            if (tmp)
1990
              *lower = tmp;
1991
            else
1992
              ret = PSL_ERR_NO_MEM;
1993
          }
1994
        } else {
1995
          ret = PSL_ERR_TO_UTF8;
1996
          /* fprintf(stderr, "Failed to convert UTF-16 to UTF-8 (status %d)\n", status); */
1997
        }
1998
      } else {
1999
        ret = PSL_ERR_TO_LOWER;
2000
        /* fprintf(stderr, "Failed to convert UTF-16 to lowercase (status %d)\n", status); */
2001
      }
2002
    } else {
2003
      ret = PSL_ERR_TO_UTF16;
2004
      /* fprintf(stderr, "Failed to convert string to UTF-16 (status %d)\n", status); */
2005
    }
2006
  } else {
2007
    ret = PSL_ERR_CONVERTER;
2008
    /* fprintf(stderr, "Failed to open converter for '%s' (status %d)\n", encoding, status); */
2009
  }
2010
out:
2011
  if (utf16_dst != utf16_dst_buf)
2012
    free(utf16_dst);
2013
  if (utf16_lower != utf16_lower_buf)
2014
    free(utf16_lower);
2015
  if (utf8_lower != utf8_lower_buf)
2016
    free(utf8_lower);
2017
2018
  } while (0);
2019
#elif defined(WITH_LIBIDN2) || defined(WITH_LIBIDN) || defined(WITH_LIBICUCORE)
2020
0
  do {
2021
    /* find out local charset encoding */
2022
0
    if (!encoding) {
2023
0
#ifdef HAVE_NL_LANGINFO
2024
0
      encoding = nl_langinfo(CODESET);
2025
#elif defined _WIN32
2026
      static char buf[16];
2027
      snprintf(buf, sizeof(buf), "CP%u", GetACP());
2028
      encoding = buf;
2029
#endif
2030
0
      if (!encoding || !*encoding)
2031
0
        encoding = "ASCII";
2032
0
    }
2033
2034
    /* convert to UTF-8 */
2035
0
    if (!isUTF8(encoding)) {
2036
0
      iconv_t cd = iconv_open("utf-8", encoding);
2037
2038
0
      if (cd != (iconv_t)-1) {
2039
0
        char *tmp = (char *)str; /* iconv won't change where str points to, but changes tmp itself */
2040
0
        size_t tmp_len = strlen(str) + 1;
2041
0
        size_t dst_len = tmp_len * 6, dst_len_tmp = dst_len;
2042
0
        char *dst = malloc(dst_len + 1), *dst_tmp = dst;
2043
2044
0
        if (!dst) {
2045
0
          ret = PSL_ERR_NO_MEM;
2046
0
        }
2047
0
        else if (iconv(cd, (ICONV_CONST char **)&tmp, &tmp_len, &dst_tmp, &dst_len_tmp) != (size_t)-1
2048
0
          && iconv(cd, NULL, NULL, &dst_tmp, &dst_len_tmp) != (size_t)-1)
2049
0
        {
2050
          /* start size for u8_tolower internal memory allocation.
2051
           * u8_tolower() does not terminate the result string. we have 0 byte included in above tmp_len
2052
           * and thus in len. */
2053
0
          size_t len = dst_len - dst_len_tmp;
2054
2055
0
          if ((tmp = idn_u8_tolower(dst, len, locale))) {
2056
0
            ret = PSL_SUCCESS;
2057
0
            if (lower) {
2058
0
              *lower = tmp;
2059
0
              tmp = NULL;
2060
0
            } else
2061
0
              free(tmp);
2062
0
          } else {
2063
0
            ret = PSL_ERR_TO_LOWER;
2064
            /* fprintf(stderr, "Failed to convert UTF-8 to lowercase (errno %d)\n", errno); */
2065
0
          }
2066
0
        } else {
2067
0
          ret = PSL_ERR_TO_UTF8;
2068
          /* fprintf(stderr, "Failed to convert '%s' string into '%s' (%d)\n", src_encoding, dst_encoding, errno); */
2069
0
        }
2070
2071
0
        free(dst);
2072
0
        iconv_close(cd);
2073
0
      } else {
2074
0
        ret = PSL_ERR_TO_UTF8;
2075
        /* fprintf(stderr, "Failed to prepare encoding '%s' into '%s' (%d)\n", src_encoding, dst_encoding, errno); */
2076
0
      }
2077
0
    } else {
2078
      /* we need a conversion to lowercase */
2079
0
      char *tmp;
2080
2081
      /* start size for u8_tolower internal memory allocation.
2082
       * u8_tolower() does not terminate the result string, so include terminating 0 byte in len. */
2083
0
      size_t len = strlen(str) + 1;
2084
2085
0
      if ((tmp = idn_u8_tolower(str, len, locale))) {
2086
0
        ret = PSL_SUCCESS;
2087
0
        if (lower) {
2088
0
          *lower = tmp;
2089
0
          tmp = NULL;
2090
0
        } else
2091
0
          free(tmp);
2092
0
      } else {
2093
0
        ret = PSL_ERR_TO_LOWER;
2094
        /* fprintf(stderr, "Failed to convert UTF-8 to lowercase (errno %d)\n", errno); */
2095
0
      }
2096
0
    }
2097
2098
0
  } while (0);
2099
0
#endif
2100
2101
0
  return ret;
2102
6
}
2103
2104
/* if file is newer than the builtin data, insert it reverse sorted by mtime */
2105
static int insert_file(const char *fname, const char **psl_fname, time_t *psl_mtime, int n)
2106
3
{
2107
3
  struct_stat st;
2108
3
  int it;
2109
2110
3
  if (fname && *fname && func_sys_stat(fname, &st) == 0 && st.st_mtime > _psl_file_time) {
2111
    /* add file name and mtime to end of array */
2112
0
    psl_fname[n] = fname;
2113
0
    psl_mtime[n++] = st.st_mtime;
2114
2115
    /* move the new entry to it's correct position */
2116
0
    for (it = n - 2; it >= 0 && st.st_mtime > psl_mtime[it]; it--) {
2117
0
      psl_fname[it + 1] = psl_fname[it];
2118
0
      psl_mtime[it + 1] = psl_mtime[it];
2119
0
      psl_fname[it] = fname;
2120
0
      psl_mtime[it] = st.st_mtime;
2121
0
    }
2122
0
  }
2123
2124
3
  return n;
2125
3
}
2126
2127
/**
2128
 * psl_latest:
2129
 * @fname: Name of PSL file or %NULL
2130
 *
2131
 * This function loads the latest available PSL data from either
2132
 * - @fname (application specific filename, may be %NULL)
2133
 * - location specified at build time (filename from ./configure --with-psl-distfile)
2134
 * - built-in PSL data (generated from ./configure --with-psl-file)
2135
 * - location of built-in data (filename from ./configure --with-psl-file)
2136
 *
2137
 * If none of the above is available, the function returns %NULL.
2138
 *
2139
 * To free the allocated resources, call psl_free().
2140
 *
2141
 * Returns: Pointer to a PSL context or %NULL on failure.
2142
 *
2143
 * Since: 0.16
2144
 */
2145
psl_ctx_t *psl_latest(const char *fname)
2146
1
{
2147
1
  psl_ctx_t *psl;
2148
1
  const char *psl_fname[3];
2149
1
  time_t psl_mtime[3];
2150
1
  int it, ntimes;
2151
2152
1
  psl_fname[0] = NULL; /* silence gcc 6.2 false warning */
2153
2154
  /* create array of PSL files reverse sorted by mtime (latest first) */
2155
1
  ntimes = insert_file(fname, psl_fname, psl_mtime, 0);
2156
1
  ntimes = insert_file(_psl_dist_filename, psl_fname, psl_mtime, ntimes);
2157
1
  ntimes = insert_file(_psl_filename, psl_fname, psl_mtime, ntimes);
2158
2159
  /* load PSL data from the latest file, falling back to the second recent, ... */
2160
1
  for (psl = NULL, it = 0; it < ntimes; it++) {
2161
0
    if (psl_mtime[it] > _psl_file_time)
2162
0
      if ((psl = psl_load_file(psl_fname[it])))
2163
0
        break;
2164
0
  }
2165
2166
  /* if file loading failed or there is no file newer than the builtin data,
2167
   * then return the builtin data. */
2168
1
  return psl ? psl : (psl_ctx_t *) psl_builtin();
2169
1
}