Coverage Report

Created: 2026-01-25 07:18

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/gettext-0.26/gettext-tools/src/read-po-lex.c
Line
Count
Source
1
/* GNU gettext - internationalization aids
2
   Copyright (C) 1995-2024 Free Software Foundation, Inc.
3
4
   This file was written by Peter Miller <millerp@canb.auug.org.au>.
5
   Multibyte character handling by Bruno Haible <haible@clisp.cons.org>.
6
7
   This program is free software: you can redistribute it and/or modify
8
   it under the terms of the GNU General Public License as published by
9
   the Free Software Foundation; either version 3 of the License, or
10
   (at your option) any later version.
11
12
   This program is distributed in the hope that it will be useful,
13
   but WITHOUT ANY WARRANTY; without even the implied warranty of
14
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
   GNU General Public License for more details.
16
17
   You should have received a copy of the GNU General Public License
18
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
19
20
21
#ifdef HAVE_CONFIG_H
22
# include "config.h"
23
#endif
24
25
/* Specification.  */
26
#include "read-po-lex.h"
27
28
#include <errno.h>
29
#include <limits.h>
30
#include <stdio.h>
31
#include <stdlib.h>
32
#include <string.h>
33
#include <stdarg.h>
34
35
#if HAVE_ICONV
36
# include <iconv.h>
37
#endif
38
39
#include <error.h>
40
#include "attribute.h"
41
#include "c-ctype.h"
42
#include "uniwidth.h"
43
#include "gettext.h"
44
#include "po-charset.h"
45
#include "xalloc.h"
46
#include "xvasprintf.h"
47
#include "xstrerror.h"
48
#include "po-error.h"
49
#include "xerror-handler.h"
50
#include "xmalloca.h"
51
#if !IN_LIBGETTEXTPO
52
# include "basename-lgpl.h"
53
# include "progname.h"
54
#endif
55
#include "c-strstr.h"
56
#include "pos.h"
57
#include "message.h"
58
#include "str-list.h"
59
#include "read-po.h"
60
#include "read-po-internal.h"
61
#include "read-po-gram.h"
62
63
3.16M
#define _(str) gettext(str)
64
65
#if HAVE_DECL_GETC_UNLOCKED
66
# undef getc
67
127M
# define getc getc_unlocked
68
#endif
69
70
71
/* Error handling during the parsing of a PO file.
72
   These functions can access ps->gram_pos and ps->gram_pos_column.  */
73
74
void
75
po_gram_error (struct po_parser_state *ps, const char *fmt, ...)
76
312k
{
77
312k
  va_list ap;
78
312k
  char *buffer;
79
80
312k
  va_start (ap, fmt);
81
312k
  if (vasprintf (&buffer, fmt, ap) < 0)
82
0
    ps->catr->xeh->xerror (CAT_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
83
0
                           _("memory exhausted"));
84
312k
  va_end (ap);
85
312k
  ps->catr->xeh->xerror (CAT_SEVERITY_ERROR, NULL,
86
312k
                         ps->gram_pos.file_name, ps->gram_pos.line_number,
87
312k
                         ps->gram_pos_column + 1, false, buffer);
88
312k
  free (buffer);
89
90
312k
  if (*(ps->catr->xeh->error_message_count_p) >= gram_max_allowed_errors)
91
0
    ps->catr->xeh->xerror (CAT_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
92
0
                           _("too many errors, aborting"));
93
312k
}
94
95
void
96
po_gram_error_at_line (abstract_catalog_reader_ty *catr, const lex_pos_ty *pp,
97
                       const char *fmt, ...)
98
2.99M
{
99
2.99M
  va_list ap;
100
2.99M
  char *buffer;
101
102
2.99M
  va_start (ap, fmt);
103
2.99M
  if (vasprintf (&buffer, fmt, ap) < 0)
104
0
    catr->xeh->xerror (CAT_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
105
0
                       _("memory exhausted"));
106
2.99M
  va_end (ap);
107
2.99M
  catr->xeh->xerror (CAT_SEVERITY_ERROR, NULL, pp->file_name, pp->line_number,
108
2.99M
                     (size_t)(-1), false, buffer);
109
2.99M
  free (buffer);
110
111
2.99M
  if (*(catr->xeh->error_message_count_p) >= gram_max_allowed_errors)
112
0
    catr->xeh->xerror (CAT_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
113
0
                       _("too many errors, aborting"));
114
2.99M
}
115
116
117
/* Charset handling while parsing PO files.  */
118
119
/* Initialize the PO file's encoding.  */
120
static void
121
po_lex_charset_init (struct po_parser_state *ps)
122
8.54k
{
123
8.54k
  ps->po_lex_charset = NULL;
124
8.54k
  ps->catr->po_lex_isolate_start = NULL;
125
8.54k
  ps->catr->po_lex_isolate_end = NULL;
126
8.54k
#if HAVE_ICONV
127
8.54k
  ps->po_lex_iconv = (iconv_t)(-1);
128
8.54k
#endif
129
8.54k
  ps->po_lex_weird_cjk = false;
130
8.54k
}
131
132
/* Set the PO file's encoding from the header entry.
133
   If is_pot_role is true, "charset=CHARSET" is expected and does not deserve
134
   a warning.  */
135
void
136
po_lex_charset_set (struct po_parser_state *ps,
137
                    const char *header_entry,
138
                    const char *filename, bool is_pot_role)
139
33.5k
{
140
  /* Verify the validity of CHARSET.  It is necessary
141
     1. for the correct treatment of multibyte characters containing
142
        0x5C bytes in the PO lexer,
143
     2. so that at run time, gettext() can call iconv() to convert
144
        msgstr.  */
145
33.5k
  const char *charsetstr = c_strstr (header_entry, "charset=");
146
147
33.5k
  if (charsetstr != NULL)
148
11.5k
    {
149
11.5k
      size_t len;
150
11.5k
      char *charset;
151
11.5k
      const char *canon_charset;
152
153
11.5k
      charsetstr += strlen ("charset=");
154
11.5k
      len = strcspn (charsetstr, " \t\n");
155
11.5k
      charset = (char *) xmalloca (len + 1);
156
11.5k
      memcpy (charset, charsetstr, len);
157
11.5k
      charset[len] = '\0';
158
159
11.5k
      canon_charset = po_charset_canonicalize (charset);
160
11.5k
      if (canon_charset == NULL)
161
11.0k
        {
162
          /* Don't warn for POT files, because POT files usually contain
163
             only ASCII msgids.  */
164
11.0k
          size_t filenamelen = strlen (filename);
165
166
11.0k
          if (!(strcmp (charset, "CHARSET") == 0
167
1.31k
                && ((filenamelen >= 4
168
1.31k
                     && memcmp (filename + filenamelen - 4, ".pot", 4) == 0)
169
1.31k
                    || is_pot_role)))
170
11.0k
            {
171
11.0k
              char *warning_message =
172
11.0k
                xasprintf (_("\
173
11.0k
Charset \"%s\" is not a portable encoding name.\n\
174
11.0k
Message conversion to user's charset might not work.\n"),
175
11.0k
                           charset);
176
11.0k
              ps->catr->xeh->xerror (CAT_SEVERITY_WARNING, NULL,
177
11.0k
                                     filename, (size_t)(-1), (size_t)(-1), true,
178
11.0k
                                     warning_message);
179
11.0k
              free (warning_message);
180
11.0k
            }
181
11.0k
        }
182
568
      else
183
568
        {
184
568
          const char *envval;
185
186
568
          ps->po_lex_charset = canon_charset;
187
188
568
          if (strcmp (canon_charset, "UTF-8") == 0)
189
0
            {
190
0
              ps->catr->po_lex_isolate_start = "\xE2\x81\xA8";
191
0
              ps->catr->po_lex_isolate_end = "\xE2\x81\xA9";
192
0
            }
193
568
          else if (strcmp (canon_charset, "GB18030") == 0)
194
0
            {
195
0
              ps->catr->po_lex_isolate_start = "\x81\x36\xAC\x34";
196
0
              ps->catr->po_lex_isolate_end = "\x81\x36\xAC\x35";
197
0
            }
198
568
          else
199
568
            {
200
              /* The other encodings don't contain U+2068, U+2069.  */
201
568
              ps->catr->po_lex_isolate_start = NULL;
202
568
              ps->catr->po_lex_isolate_end = NULL;
203
568
            }
204
205
568
#if HAVE_ICONV
206
568
          if (ps->po_lex_iconv != (iconv_t)(-1))
207
233
            iconv_close (ps->po_lex_iconv);
208
568
#endif
209
210
          /* The old Solaris/openwin msgfmt and GNU msgfmt <= 0.10.35
211
             don't know about multibyte encodings, and require a spurious
212
             backslash after every multibyte character whose last byte is
213
             0x5C.  Some programs, like vim, distribute PO files in this
214
             broken format.  GNU msgfmt must continue to support this old
215
             PO file format when the Makefile requests it.  */
216
568
          envval = getenv ("OLD_PO_FILE_INPUT");
217
568
          if (envval != NULL && *envval != '\0')
218
0
            {
219
              /* Assume the PO file is in old format, with extraneous
220
                 backslashes.  */
221
0
#if HAVE_ICONV
222
0
              ps->po_lex_iconv = (iconv_t)(-1);
223
0
#endif
224
0
              ps->po_lex_weird_cjk = false;
225
0
            }
226
568
          else
227
568
            {
228
              /* Use iconv() to parse multibyte characters.  */
229
568
#if HAVE_ICONV
230
568
              ps->po_lex_iconv = iconv_open ("UTF-8", ps->po_lex_charset);
231
568
              if (ps->po_lex_iconv == (iconv_t)(-1))
232
0
                {
233
0
                  const char *progname;
234
0
                  char *warning_message;
235
0
                  const char *recommendation;
236
0
                  const char *note;
237
0
                  char *whole_message;
238
239
0
# if IN_LIBGETTEXTPO
240
0
                  progname = "libgettextpo";
241
# else
242
                  progname = last_component (program_name);
243
# endif
244
245
0
                  warning_message =
246
0
                    xasprintf (_("\
247
0
Charset \"%s\" is not supported. %s relies on iconv(),\n\
248
0
and iconv() does not support \"%s\".\n"),
249
0
                               ps->po_lex_charset, progname, ps->po_lex_charset);
250
251
0
# if !defined _LIBICONV_VERSION || (_LIBICONV_VERSION == 0x10b && defined __APPLE__)
252
0
                  recommendation = _("\
253
0
Installing GNU libiconv and then reinstalling GNU gettext\n\
254
0
would fix this problem.\n");
255
# else
256
                  recommendation = "";
257
# endif
258
259
                  /* Test for a charset which has double-byte characters
260
                     ending in 0x5C.  For these encodings, the string parser
261
                     is likely to be confused if it can't see the character
262
                     boundaries.  */
263
0
                  ps->po_lex_weird_cjk = po_is_charset_weird_cjk (ps->po_lex_charset);
264
0
                  if (po_is_charset_weird (ps->po_lex_charset)
265
0
                      && !ps->po_lex_weird_cjk)
266
0
                    note = _("Continuing anyway, expect parse errors.");
267
0
                  else
268
0
                    note = _("Continuing anyway.");
269
270
0
                  whole_message =
271
0
                    xasprintf ("%s%s%s\n",
272
0
                               warning_message, recommendation, note);
273
274
0
                  ps->catr->xeh->xerror (CAT_SEVERITY_WARNING, NULL,
275
0
                                         filename, (size_t)(-1), (size_t)(-1),
276
0
                                         true, whole_message);
277
278
0
                  free (whole_message);
279
0
                  free (warning_message);
280
0
                }
281
#else
282
              /* Test for a charset which has double-byte characters
283
                 ending in 0x5C.  For these encodings, the string parser
284
                 is likely to be confused if it can't see the character
285
                 boundaries.  */
286
              ps->po_lex_weird_cjk = po_is_charset_weird_cjk (ps->po_lex_charset);
287
              if (po_is_charset_weird (ps->po_lex_charset) && !ps->po_lex_weird_cjk)
288
                {
289
                  const char *progname;
290
                  char *warning_message;
291
                  const char *recommendation;
292
                  const char *note;
293
                  char *whole_message;
294
295
# if IN_LIBGETTEXTPO
296
                  progname = "libgettextpo";
297
# else
298
                  progname = last_component (program_name);
299
# endif
300
301
                  warning_message =
302
                    xasprintf (_("\
303
Charset \"%s\" is not supported. %s relies on iconv().\n\
304
This version was built without iconv().\n"),
305
                               ps->po_lex_charset, progname);
306
307
                  recommendation = _("\
308
Installing GNU libiconv and then reinstalling GNU gettext\n\
309
would fix this problem.\n");
310
311
                  note = _("Continuing anyway, expect parse errors.");
312
313
                  whole_message =
314
                    xasprintf ("%s%s%s\n",
315
                               warning_message, recommendation, note);
316
317
                  ps->catr->xeh->xerror (CAT_SEVERITY_WARNING, NULL,
318
                                         filename, (size_t)(-1), (size_t)(-1),
319
                                         true, whole_message);
320
321
                  free (whole_message);
322
                  free (warning_message);
323
                }
324
#endif
325
568
            }
326
568
        }
327
11.5k
      freea (charset);
328
11.5k
    }
329
21.9k
  else
330
21.9k
    {
331
      /* Don't warn for POT files, because POT files usually contain
332
         only ASCII msgids.  */
333
21.9k
      size_t filenamelen = strlen (filename);
334
335
21.9k
      if (!(filenamelen >= 4
336
21.9k
            && memcmp (filename + filenamelen - 4, ".pot", 4) == 0))
337
21.9k
        ps->catr->xeh->xerror (CAT_SEVERITY_WARNING,
338
21.9k
                               NULL, filename, (size_t)(-1), (size_t)(-1), true,
339
21.9k
                               _("\
340
21.9k
Charset missing in header.\n\
341
21.9k
Message conversion to user's charset will not work.\n"));
342
21.9k
    }
343
33.5k
}
344
345
/* Finish up with the PO file's encoding.  */
346
static void
347
po_lex_charset_close (struct po_parser_state *ps)
348
8.54k
{
349
8.54k
  ps->po_lex_charset = NULL;
350
8.54k
  ps->catr->po_lex_isolate_start = NULL;
351
8.54k
  ps->catr->po_lex_isolate_end = NULL;
352
8.54k
#if HAVE_ICONV
353
8.54k
  if (ps->po_lex_iconv != (iconv_t)(-1))
354
335
    {
355
335
      iconv_close (ps->po_lex_iconv);
356
335
      ps->po_lex_iconv = (iconv_t)(-1);
357
335
    }
358
8.54k
#endif
359
8.54k
  ps->po_lex_weird_cjk = false;
360
8.54k
}
361
362
363
/* The lowest level of PO file parsing converts bytes to multibyte characters.
364
   This is needed
365
   1. for C compatibility: ISO C 99 section 5.1.1.2 says that the first
366
      translation phase maps bytes to characters.
367
   2. to keep track of the current column, for the sake of precise error
368
      location. Emacs compile.el interprets the column in error messages
369
      by default as a screen column number, not as character number.
370
   3. to avoid skipping backslash-newline in the midst of a multibyte
371
      character. If XY is a multibyte character,  X \ newline Y  is invalid.
372
 */
373
374
/* Copy N bytes from SRC to DST, one byte at a time in ascending address
   order.  Meant for very small N (typically 0 or 1), where a plain loop is
   cheaper than a call to memcpy.  Does nothing when N is 0.  */
static inline void
memcpy_small (void *dst, const void *src, size_t n)
{
  char *out = (char *) dst;
  const char *in = (const char *) src;
  size_t i;

  for (i = 0; i < n; i++)
    out[i] = in[i];
}
388
389
/* EOF (not a real character) is represented with bytes = 0 and
390
   uc_valid = false.  */
391
static inline bool
392
mb_iseof (const mbchar_t mbc)
393
372M
{
394
372M
  return (mbc->bytes == 0);
395
372M
}
396
397
/* Access the current character.  */
398
static inline const char *
399
mb_ptr (const mbchar_t mbc)
400
156M
{
401
156M
  return mbc->buf;
402
156M
}
403
static inline size_t
404
mb_len (const mbchar_t mbc)
405
251M
{
406
251M
  return mbc->bytes;
407
251M
}
408
409
/* Comparison of characters.  */
410
411
static inline bool
412
mb_iseq (const mbchar_t mbc, char sc)
413
517M
{
414
  /* Note: It is wrong to compare only mbc->uc, because when the encoding is
415
     SHIFT_JIS, mbc->buf[0] == '\\' corresponds to mbc->uc == 0x00A5, but we
416
     want to treat it as an escape character, although it looks like a Yen
417
     sign.  */
418
#if HAVE_ICONV && 0
419
  if (mbc->uc_valid)
420
    return (mbc->uc == sc); /* wrong! */
421
  else
422
#endif
423
517M
    return (mbc->bytes == 1 && mbc->buf[0] == sc);
424
517M
}
425
426
MAYBE_UNUSED static inline bool
427
mb_isnul (const mbchar_t mbc)
428
0
{
429
0
#if HAVE_ICONV
430
0
  if (mbc->uc_valid)
431
0
    return (mbc->uc == 0);
432
0
  else
433
0
#endif
434
0
    return (mbc->bytes == 1 && mbc->buf[0] == 0);
435
0
}
436
437
MAYBE_UNUSED static inline int
438
mb_cmp (const mbchar_t mbc1, const mbchar_t mbc2)
439
0
{
440
0
#if HAVE_ICONV
441
0
  if (mbc1->uc_valid && mbc2->uc_valid)
442
0
    return (int) mbc1->uc - (int) mbc2->uc;
443
0
  else
444
0
#endif
445
0
    return (mbc1->bytes == mbc2->bytes
446
0
            ? memcmp (mbc1->buf, mbc2->buf, mbc1->bytes)
447
0
            : mbc1->bytes < mbc2->bytes
448
0
              ? (memcmp (mbc1->buf, mbc2->buf, mbc1->bytes) > 0 ? 1 : -1)
449
0
              : (memcmp (mbc1->buf, mbc2->buf, mbc2->bytes) >= 0 ? 1 : -1));
450
0
}
451
452
MAYBE_UNUSED static inline bool
453
mb_equal (const mbchar_t mbc1, const mbchar_t mbc2)
454
0
{
455
0
#if HAVE_ICONV
456
0
  if (mbc1->uc_valid && mbc2->uc_valid)
457
0
    return mbc1->uc == mbc2->uc;
458
0
  else
459
0
#endif
460
0
    return (mbc1->bytes == mbc2->bytes
461
0
            && memcmp (mbc1->buf, mbc2->buf, mbc1->bytes) == 0);
462
0
}
463
464
/* <ctype.h>, <wctype.h> classification.  */
465
466
MAYBE_UNUSED static inline bool
467
mb_isascii (const mbchar_t mbc)
468
0
{
469
0
#if HAVE_ICONV
470
0
  if (mbc->uc_valid)
471
0
    return (mbc->uc >= 0x0000 && mbc->uc <= 0x007F);
472
0
  else
473
0
#endif
474
0
    return (mbc->bytes == 1
475
0
#if CHAR_MIN < 0x00 /* to avoid gcc warning */
476
0
            && mbc->buf[0] >= 0x00
477
0
#endif
478
0
#if CHAR_MAX > 0x7F /* to avoid gcc warning */
479
0
            && mbc->buf[0] <= 0x7F
480
0
#endif
481
0
           );
482
0
}
483
484
/* Extra <wchar.h> function.  */
485
486
/* Unprintable characters appear as a small box of width 1.  */
487
95.0M
#define MB_UNPRINTABLE_WIDTH 1
488
489
static int
490
mb_width (struct po_parser_state *ps, const mbchar_t mbc)
491
133M
{
492
133M
#if HAVE_ICONV
493
133M
  if (mbc->uc_valid)
494
4.33M
    {
495
4.33M
      ucs4_t uc = mbc->uc;
496
4.33M
      const char *encoding =
497
4.33M
        (ps->po_lex_iconv != (iconv_t)(-1) ? ps->po_lex_charset : "");
498
4.33M
      int w = uc_width (uc, encoding);
499
      /* For unprintable characters, arbitrarily return 0 for control
500
         characters (except tab) and MB_UNPRINTABLE_WIDTH otherwise.  */
501
4.33M
      if (w >= 0)
502
4.15M
        return w;
503
181k
      if (uc >= 0x0000 && uc <= 0x001F)
504
175k
        {
505
175k
          if (uc == 0x0009)
506
10.3k
            return 8 - (ps->gram_pos_column & 7);
507
165k
          return 0;
508
175k
        }
509
5.50k
      if ((uc >= 0x007F && uc <= 0x009F) || (uc >= 0x2028 && uc <= 0x2029))
510
5.50k
        return 0;
511
0
      return MB_UNPRINTABLE_WIDTH;
512
5.50k
    }
513
128M
  else
514
128M
#endif
515
128M
    {
516
128M
      if (mbc->bytes == 1)
517
128M
        {
518
128M
          if (
519
128M
#if CHAR_MIN < 0x00 /* to avoid gcc warning */
520
128M
              mbc->buf[0] >= 0x00 &&
521
99.9M
#endif
522
99.9M
              mbc->buf[0] <= 0x1F)
523
33.2M
            {
524
33.2M
              if (mbc->buf[0] == 0x09)
525
3.37M
                return 8 - (ps->gram_pos_column & 7);
526
29.9M
              return 0;
527
33.2M
            }
528
95.7M
          if (mbc->buf[0] == 0x7F)
529
642k
            return 0;
530
95.7M
        }
531
95.0M
      return MB_UNPRINTABLE_WIDTH;
532
128M
    }
533
133M
}
534
535
/* Output.  */
536
MAYBE_UNUSED static inline void
537
mb_putc (const mbchar_t mbc, FILE *stream)
538
0
{
539
0
  fwrite (mbc->buf, 1, mbc->bytes, stream);
540
0
}
541
542
/* Assignment.  */
543
MAYBE_UNUSED static inline void
544
mb_setascii (mbchar_t mbc, char sc)
545
0
{
546
0
  mbc->bytes = 1;
547
0
#if HAVE_ICONV
548
0
  mbc->uc_valid = 1;
549
0
  mbc->uc = sc;
550
0
#endif
551
0
  mbc->buf[0] = sc;
552
0
}
553
554
/* Copying a character.  */
555
static inline void
556
mb_copy (mbchar_t new_mbc, const mbchar_t old_mbc)
557
8.79M
{
558
8.79M
  memcpy_small (&new_mbc->buf[0], &old_mbc->buf[0], old_mbc->bytes);
559
8.79M
  new_mbc->bytes = old_mbc->bytes;
560
8.79M
#if HAVE_ICONV
561
8.79M
  if ((new_mbc->uc_valid = old_mbc->uc_valid))
562
233k
    new_mbc->uc = old_mbc->uc;
563
8.79M
#endif
564
8.79M
}
565
566
567
/* Multibyte character input.  */
568
569
static inline void
570
mbfile_init (mbfile_t mbf, FILE *stream)
571
8.54k
{
572
8.54k
  mbf->fp = stream;
573
8.54k
  mbf->eof_seen = false;
574
8.54k
  mbf->pushback_count = 0;
575
8.54k
  mbf->bufcount = 0;
576
8.54k
}
577
578
/* Read the next multibyte character from mbf and put it into mbc.
579
   If a read error occurs, errno is set and ferror (mbf->fp) becomes true.  */
580
static void
581
mbfile_getc (struct po_parser_state *ps, mbchar_t mbc, mbfile_t mbf)
582
131M
{
583
131M
  size_t bytes;
584
585
  /* Return character pushed back, if there is one.  */
586
131M
  if (mbf->pushback_count > 0)
587
4.39M
    {
588
4.39M
      mbf->pushback_count--;
589
4.39M
      mb_copy (mbc, &mbf->pushback[mbf->pushback_count]);
590
4.39M
      return;
591
4.39M
    }
592
593
  /* If EOF has already been seen, don't use getc.  This matters if
594
     mbf->fp is connected to an interactive tty.  */
595
127M
  if (mbf->eof_seen)
596
6.84k
    goto eof;
597
598
  /* Before using iconv, we need at least one byte.  */
599
127M
  if (mbf->bufcount == 0)
600
127M
    {
601
127M
      int c = getc (mbf->fp);
602
127M
      if (c == EOF)
603
8.46k
        {
604
8.46k
          mbf->eof_seen = true;
605
8.46k
          goto eof;
606
8.46k
        }
607
127M
      mbf->buf[0] = (unsigned char) c;
608
127M
      mbf->bufcount++;
609
127M
    }
610
611
127M
#if HAVE_ICONV
612
127M
  if (ps->po_lex_iconv != (iconv_t)(-1))
613
4.43M
    {
614
      /* Use iconv on an increasing number of bytes.  Read only as many
615
         bytes from mbf->fp as needed.  This is needed to give reasonable
616
         interactive behaviour when mbf->fp is connected to an interactive
617
         tty.  */
618
4.43M
      for (;;)
619
4.87M
        {
620
4.87M
          unsigned char scratchbuf[64];
621
4.87M
          const char *inptr = &mbf->buf[0];
622
4.87M
          size_t insize = mbf->bufcount;
623
4.87M
          char *outptr = (char *) &scratchbuf[0];
624
4.87M
          size_t outsize = sizeof (scratchbuf);
625
626
4.87M
          size_t res = iconv (ps->po_lex_iconv,
627
4.87M
                              (ICONV_CONST char **) &inptr, &insize,
628
4.87M
                              &outptr, &outsize);
629
          /* We expect that a character has been produced if and only if
630
             some input bytes have been consumed.  */
631
4.87M
          if ((insize < mbf->bufcount) != (outsize < sizeof (scratchbuf)))
632
0
            abort ();
633
4.87M
          if (outsize == sizeof (scratchbuf))
634
658k
            {
635
              /* No character has been produced.  Must be an error.  */
636
658k
              if (res != (size_t)(-1))
637
0
                abort ();
638
639
658k
              if (errno == EILSEQ)
640
211k
                {
641
                  /* An invalid multibyte sequence was encountered.  */
642
                  /* Return a single byte.  */
643
211k
                  if (ps->signal_eilseq)
644
149k
                    po_gram_error (ps, _("invalid multibyte sequence"));
645
211k
                  bytes = 1;
646
211k
                  mbc->uc_valid = false;
647
211k
                  break;
648
211k
                }
649
446k
              else if (errno == EINVAL)
650
446k
                {
651
                  /* An incomplete multibyte character.  */
652
446k
                  int c;
653
654
446k
                  if (mbf->bufcount == MBCHAR_BUF_SIZE)
655
0
                    {
656
                      /* An overlong incomplete multibyte sequence was
657
                         encountered.  */
658
                      /* Return a single byte.  */
659
0
                      bytes = 1;
660
0
                      mbc->uc_valid = false;
661
0
                      break;
662
0
                    }
663
664
                  /* Read one more byte and retry iconv.  */
665
446k
                  c = getc (mbf->fp);
666
446k
                  if (c == EOF)
667
74
                    {
668
74
                      mbf->eof_seen = true;
669
74
                      if (ferror (mbf->fp))
670
0
                        goto eof;
671
74
                      if (ps->signal_eilseq)
672
62
                        po_gram_error (ps, _("incomplete multibyte sequence at end of file"));
673
74
                      bytes = mbf->bufcount;
674
74
                      mbc->uc_valid = false;
675
74
                      break;
676
74
                    }
677
446k
                  mbf->buf[mbf->bufcount++] = (unsigned char) c;
678
446k
                  if (c == '\n')
679
3.55k
                    {
680
3.55k
                      if (ps->signal_eilseq)
681
2.61k
                        po_gram_error (ps, _("incomplete multibyte sequence at end of line"));
682
3.55k
                      bytes = mbf->bufcount - 1;
683
3.55k
                      mbc->uc_valid = false;
684
3.55k
                      break;
685
3.55k
                    }
686
446k
                }
687
0
              else
688
0
                {
689
0
                  int err = errno;
690
0
                  ps->catr->xeh->xerror (CAT_SEVERITY_FATAL_ERROR,
691
0
                                         NULL, NULL, 0, 0, false,
692
0
                                         xstrerror (_("iconv failure"), err));
693
0
                }
694
658k
            }
695
4.21M
          else
696
4.21M
            {
697
4.21M
              size_t outbytes = sizeof (scratchbuf) - outsize;
698
4.21M
              bytes = mbf->bufcount - insize;
699
700
              /* We expect that one character has been produced.  */
701
4.21M
              if (bytes == 0)
702
0
                abort ();
703
4.21M
              if (outbytes == 0)
704
0
                abort ();
705
              /* Convert it from UTF-8 to UCS-4.  */
706
4.21M
              if (u8_mbtoucr (&mbc->uc, scratchbuf, outbytes) < (int) outbytes)
707
0
                {
708
                  /* scratchbuf contains an out-of-range Unicode character
709
                     (> 0x10ffff).  */
710
0
                  if (ps->signal_eilseq)
711
0
                    po_gram_error (ps, _("invalid multibyte sequence"));
712
0
                  mbc->uc_valid = false;
713
0
                  break;
714
0
                }
715
4.21M
              mbc->uc_valid = true;
716
4.21M
              break;
717
4.21M
            }
718
4.87M
        }
719
4.43M
    }
720
122M
  else
721
122M
#endif
722
122M
    {
723
122M
      if (ps->po_lex_weird_cjk
724
          /* Special handling of encodings with CJK structure.  */
725
0
          && (unsigned char) mbf->buf[0] >= 0x80)
726
0
        {
727
0
          if (mbf->bufcount == 1)
728
0
            {
729
              /* Read one more byte.  */
730
0
              int c = getc (mbf->fp);
731
0
              if (c == EOF)
732
0
                {
733
0
                  if (ferror (mbf->fp))
734
0
                    {
735
0
                      mbf->eof_seen = true;
736
0
                      goto eof;
737
0
                    }
738
0
                }
739
0
              else
740
0
                {
741
0
                  mbf->buf[1] = (unsigned char) c;
742
0
                  mbf->bufcount++;
743
0
                }
744
0
            }
745
0
          if (mbf->bufcount >= 2 && (unsigned char) mbf->buf[1] >= 0x30)
746
            /* Return a double byte.  */
747
0
            bytes = 2;
748
0
          else
749
            /* Return a single byte.  */
750
0
            bytes = 1;
751
0
        }
752
122M
      else
753
122M
        {
754
          /* Return a single byte.  */
755
122M
          bytes = 1;
756
122M
        }
757
122M
#if HAVE_ICONV
758
122M
      mbc->uc_valid = false;
759
122M
#endif
760
122M
    }
761
762
  /* Return the multibyte sequence mbf->buf[0..bytes-1].  */
763
127M
  memcpy_small (&mbc->buf[0], &mbf->buf[0], bytes);
764
127M
  mbc->bytes = bytes;
765
766
127M
  mbf->bufcount -= bytes;
767
127M
  if (mbf->bufcount > 0)
768
131k
    {
769
      /* It's not worth calling memmove() for so few bytes.  */
770
131k
      unsigned int count = mbf->bufcount;
771
131k
      char *p = &mbf->buf[0];
772
773
131k
      do
774
131k
        {
775
131k
          *p = *(p + bytes);
776
131k
          p++;
777
131k
        }
778
131k
      while (--count > 0);
779
131k
    }
780
127M
  return;
781
782
15.3k
eof:
783
  /* An mbchar_t with bytes == 0 is used to indicate EOF.  */
784
15.3k
  mbc->bytes = 0;
785
15.3k
#if HAVE_ICONV
786
15.3k
  mbc->uc_valid = false;
787
15.3k
#endif
788
15.3k
  return;
789
127M
}
790
791
static void
792
mbfile_ungetc (const mbchar_t mbc, mbfile_t mbf)
793
4.39M
{
794
4.39M
  if (mbf->pushback_count >= MBFILE_MAX_PUSHBACK)
795
0
    abort ();
796
4.39M
  mb_copy (&mbf->pushback[mbf->pushback_count], mbc);
797
4.39M
  mbf->pushback_count++;
798
4.39M
}
799
800
801
/* Prepare lexical analysis.  */
802
void
803
lex_start (struct po_parser_state *ps,
804
           FILE *fp, const char *real_filename, const char *logical_filename)
805
8.54k
{
806
  /* Ignore the logical_filename, because PO file entries already have
807
     their file names attached.  But use real_filename for error messages.  */
808
8.54k
  ps->gram_pos.file_name = xstrdup (real_filename);
809
810
8.54k
  mbfile_init (ps->mbf, fp);
811
812
8.54k
  ps->gram_pos.line_number = 1;
813
8.54k
  ps->gram_pos_column = 0;
814
8.54k
  ps->signal_eilseq = true;
815
8.54k
  ps->po_lex_obsolete = false;
816
8.54k
  ps->po_lex_previous = false;
817
8.54k
  po_lex_charset_init (ps);
818
8.54k
  ps->buf = NULL;
819
8.54k
  ps->bufmax = 0;
820
8.54k
}
821
822
/* Terminate lexical analysis.  */
823
void
824
lex_end (struct po_parser_state *ps)
825
8.54k
{
826
8.54k
  ps->gram_pos.file_name = NULL;
827
8.54k
  ps->gram_pos.line_number = 0;
828
8.54k
  po_lex_charset_close (ps);
829
8.54k
  free (ps->buf);
830
8.54k
}
831
832
833
/* Read a single character, collapsing the Windows CRLF line terminator
834
   to a single LF.
835
   Supports 1 character of pushback (via mbfile_ungetc).  */
836
static void
837
mbfile_getc_normalized (struct po_parser_state *ps, mbchar_t mbc, mbfile_t mbf)
838
131M
{
839
131M
  mbfile_getc (ps, mbc, ps->mbf);
840
131M
  if (!mb_iseof (mbc) && mb_iseq (mbc, '\r'))
841
125k
    {
842
125k
      mbchar_t mbc2;
843
844
125k
      mbfile_getc (ps, mbc2, ps->mbf);
845
125k
      if (!mb_iseof (mbc2))
846
125k
        {
847
125k
          if (mb_iseq (mbc2, '\n'))
848
            /* Eliminate the CR.  */
849
4.81k
            mb_copy (mbc, mbc2);
850
120k
          else
851
120k
            {
852
120k
              mbfile_ungetc (mbc2, ps->mbf);
853
              /* If we get here, the caller can still do
854
                   mbfile_ungetc (mbc, ps->mbf);
855
                 since mbfile_getc supports 2 characters of pushback.  */
856
120k
            }
857
125k
        }
858
125k
    }
859
131M
}
860
861
862
/* Read a single character, dealing with backslash-newline.
863
   Also keep track of the current line number and column number.  */
864
static void
865
lex_getc (struct po_parser_state *ps, mbchar_t mbc)
866
131M
{
867
131M
  for (;;)
868
131M
    {
869
131M
      mbfile_getc_normalized (ps, mbc, ps->mbf);
870
871
131M
      if (mb_iseof (mbc))
872
15.0k
        {
873
15.0k
          if (ferror (ps->mbf->fp))
874
0
           bomb:
875
0
            {
876
0
              int err = errno;
877
0
              ps->catr->xeh->xerror (CAT_SEVERITY_FATAL_ERROR,
878
0
                                     NULL, NULL, 0, 0, false,
879
0
                                     xstrerror (xasprintf (_("error while reading \"%s\""),
880
0
                                                           ps->gram_pos.file_name),
881
0
                                                err));
882
0
            }
883
15.0k
          break;
884
15.0k
        }
885
886
131M
      if (mb_iseq (mbc, '\n'))
887
1.81M
        {
888
1.81M
          ps->gram_pos.line_number++;
889
1.81M
          ps->gram_pos_column = 0;
890
1.81M
          break;
891
1.81M
        }
892
893
129M
      ps->gram_pos_column += mb_width (ps, mbc);
894
895
129M
      if (mb_iseq (mbc, '\\'))
896
314k
        {
897
314k
          mbchar_t mbc2;
898
899
314k
          mbfile_getc_normalized (ps, mbc2, ps->mbf);
900
901
314k
          if (mb_iseof (mbc2))
902
158
            {
903
158
              if (ferror (ps->mbf->fp))
904
0
                goto bomb;
905
158
              break;
906
158
            }
907
908
314k
          if (!mb_iseq (mbc2, '\n'))
909
311k
            {
910
311k
              mbfile_ungetc (mbc2, ps->mbf);
911
311k
              break;
912
311k
            }
913
914
3.18k
          ps->gram_pos.line_number++;
915
3.18k
          ps->gram_pos_column = 0;
916
3.18k
        }
917
129M
      else
918
129M
        break;
919
129M
    }
920
131M
}
921
922
923
static void
924
lex_ungetc (struct po_parser_state *ps, const mbchar_t mbc)
925
3.96M
{
926
3.96M
  if (!mb_iseof (mbc))
927
3.96M
    {
928
3.96M
      if (mb_iseq (mbc, '\n'))
929
        /* Decrement the line number, but don't care about the column.  */
930
189k
        ps->gram_pos.line_number--;
931
3.77M
      else
932
        /* Decrement the column number.  Also works well enough for tabs.  */
933
3.77M
        ps->gram_pos_column -= mb_width (ps, mbc);
934
935
3.96M
      mbfile_ungetc (mbc, ps->mbf);
936
3.96M
    }
937
3.96M
}
938
939
940
static int
941
keyword_p (struct po_parser_state *ps, const char *s)
942
2.97M
{
943
2.97M
  if (!ps->po_lex_previous)
944
2.94M
    {
945
2.94M
      if (!strcmp (s, "domain"))
946
5.01k
        return DOMAIN;
947
2.94M
      if (!strcmp (s, "msgid"))
948
109k
        return MSGID;
949
2.83M
      if (!strcmp (s, "msgid_plural"))
950
4.54k
        return MSGID_PLURAL;
951
2.82M
      if (!strcmp (s, "msgstr"))
952
93.0k
        return MSGSTR;
953
2.73M
      if (!strcmp (s, "msgctxt"))
954
10.7k
        return MSGCTXT;
955
2.73M
    }
956
26.1k
  else
957
26.1k
    {
958
      /* Inside a "#|" context, the keywords have a different meaning.  */
959
26.1k
      if (!strcmp (s, "msgid"))
960
6.28k
        return PREV_MSGID;
961
19.9k
      if (!strcmp (s, "msgid_plural"))
962
369
        return PREV_MSGID_PLURAL;
963
19.5k
      if (!strcmp (s, "msgctxt"))
964
2.64k
        return PREV_MSGCTXT;
965
19.5k
    }
966
2.74M
  po_gram_error_at_line (ps->catr, &ps->gram_pos,
967
2.74M
                         _("keyword \"%s\" unknown"), s);
968
2.74M
  return NAME;
969
2.97M
}
970
971
972
static int
973
control_sequence (struct po_parser_state *ps)
974
87.4k
{
975
87.4k
  mbchar_t mbc;
976
87.4k
  int val;
977
87.4k
  int max;
978
979
87.4k
  lex_getc (ps, mbc);
980
87.4k
  if (mb_len (mbc) == 1)
981
87.1k
    switch (mb_ptr (mbc) [0])
982
87.1k
      {
983
346
      case 'n':
984
346
        return '\n';
985
986
1.17k
      case 't':
987
1.17k
        return '\t';
988
989
897
      case 'b':
990
897
        return '\b';
991
992
3.86k
      case 'r':
993
3.86k
        return '\r';
994
995
2.62k
      case 'f':
996
2.62k
        return '\f';
997
998
275
      case 'v':
999
275
        return '\v';
1000
1001
309
      case 'a':
1002
309
        return '\a';
1003
1004
20.0k
      case '\\':
1005
20.6k
      case '"':
1006
20.6k
        return mb_ptr (mbc) [0];
1007
1008
6.18k
      case '0': case '1': case '2': case '3':
1009
8.05k
      case '4': case '5': case '6': case '7':
1010
8.05k
        val = 0;
1011
8.05k
        max = 0;
1012
8.05k
        for (;;)
1013
16.2k
          {
1014
16.2k
            char c = mb_ptr (mbc) [0];
1015
            /* Warning: not portable, can't depend on '0'..'7' ordering.  */
1016
16.2k
            val = val * 8 + (c - '0');
1017
16.2k
            if (++max == 3)
1018
3.63k
              break;
1019
12.5k
            lex_getc (ps, mbc);
1020
12.5k
            if (mb_len (mbc) == 1)
1021
12.5k
              switch (mb_ptr (mbc) [0])
1022
12.5k
                {
1023
6.72k
                case '0': case '1': case '2': case '3':
1024
8.15k
                case '4': case '5': case '6': case '7':
1025
8.15k
                  continue;
1026
1027
4.39k
                default:
1028
4.39k
                  break;
1029
12.5k
                }
1030
4.42k
            lex_ungetc (ps, mbc);
1031
4.42k
            break;
1032
12.5k
          }
1033
8.05k
        return val;
1034
1035
26.8k
      case 'x':
1036
26.8k
        lex_getc (ps, mbc);
1037
26.8k
        if (mb_iseof (mbc) || mb_len (mbc) != 1
1038
26.7k
            || !c_isxdigit (mb_ptr (mbc) [0]))
1039
5.84k
          break;
1040
1041
21.0k
        val = 0;
1042
21.0k
        for (;;)
1043
102k
          {
1044
102k
            char c = mb_ptr (mbc) [0];
1045
102k
            val *= 16;
1046
102k
            if (c_isdigit (c))
1047
              /* Warning: not portable, can't depend on '0'..'9' ordering */
1048
68.3k
              val += c - '0';
1049
34.2k
            else if (c_isupper (c))
1050
              /* Warning: not portable, can't depend on 'A'..'F' ordering */
1051
16.4k
              val += c - 'A' + 10;
1052
17.7k
            else
1053
              /* Warning: not portable, can't depend on 'a'..'f' ordering */
1054
17.7k
              val += c - 'a' + 10;
1055
1056
102k
            lex_getc (ps, mbc);
1057
102k
            if (mb_len (mbc) == 1)
1058
102k
              switch (mb_ptr (mbc) [0])
1059
102k
                {
1060
41.0k
                case '0': case '1': case '2': case '3': case '4':
1061
58.9k
                case '5': case '6': case '7': case '8': case '9':
1062
74.2k
                case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1063
81.5k
                case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1064
81.5k
                  continue;
1065
1066
20.8k
                default:
1067
20.8k
                  break;
1068
102k
                }
1069
21.0k
            lex_ungetc (ps, mbc);
1070
21.0k
            break;
1071
102k
          }
1072
21.0k
        return val;
1073
1074
      /* FIXME: \u and \U are not handled.  */
1075
87.1k
      }
1076
28.2k
  lex_ungetc (ps, mbc);
1077
28.2k
  po_gram_error (ps, _("invalid control sequence"));
1078
28.2k
  return ' ';
1079
87.4k
}
1080
1081
1082
/* Return the next token in the PO file.  The return codes are defined
1083
   in "read-po-gram.h".  Associated data is put in 'po_gram_lval'.  */
1084
int
1085
po_gram_lex (union PO_GRAM_STYPE *lval, struct po_parser_state *ps)
1086
40.2M
{
1087
  /* Cache ps->buf and ps->bufmax in local variables.  */
1088
40.2M
  char *buf = ps->buf;
1089
40.2M
  size_t bufmax = ps->bufmax;
1090
1091
40.2M
  mbchar_t mbc;
1092
40.2M
  size_t bufpos;
1093
1094
40.2M
  for (;;)
1095
43.5M
    {
1096
43.5M
      lex_getc (ps, mbc);
1097
1098
43.5M
      if (mb_iseof (mbc))
1099
        /* Yacc want this for end of file.  */
1100
8.54k
        return 0;
1101
1102
43.5M
      if (mb_len (mbc) == 1)
1103
43.4M
        switch (mb_ptr (mbc) [0])
1104
43.4M
          {
1105
648k
          case '\n':
1106
648k
            ps->po_lex_obsolete = false;
1107
648k
            ps->po_lex_previous = false;
1108
            /* Ignore whitespace, not relevant for the grammar.  */
1109
648k
            break;
1110
1111
576k
          case ' ':
1112
2.56M
          case '\t':
1113
2.62M
          case '\r':
1114
2.66M
          case '\f':
1115
2.69M
          case '\v':
1116
            /* Ignore whitespace, not relevant for the grammar.  */
1117
2.69M
            break;
1118
1119
796k
          case '#':
1120
796k
            lex_getc (ps, mbc);
1121
796k
            if (mb_iseq (mbc, '~'))
1122
              /* A pseudo-comment beginning with #~ is found.  This is
1123
                 not a comment.  It is the format for obsolete entries.
1124
                 We simply discard the "#~" prefix.  The following
1125
                 characters are expected to be well formed.  */
1126
6.40k
              {
1127
6.40k
                ps->po_lex_obsolete = true;
1128
                /* A pseudo-comment beginning with #~| denotes a previous
1129
                   untranslated string in an obsolete entry.  This does not
1130
                   make much sense semantically, and is implemented here
1131
                   for completeness only.  */
1132
6.40k
                lex_getc (ps, mbc);
1133
6.40k
                if (mb_iseq (mbc, '|'))
1134
95
                  ps->po_lex_previous = true;
1135
6.31k
                else
1136
6.31k
                  lex_ungetc (ps, mbc);
1137
6.40k
                break;
1138
6.40k
              }
1139
789k
            if (mb_iseq (mbc, '|'))
1140
              /* A pseudo-comment beginning with #| is found.  This is
1141
                 the previous untranslated string.  We discard the "#|"
1142
                 prefix, but change the keywords and string returns
1143
                 accordingly.  */
1144
6.74k
              {
1145
6.74k
                ps->po_lex_previous = true;
1146
6.74k
                break;
1147
6.74k
              }
1148
1149
            /* Accumulate comments into a buffer.  If we have been asked
1150
               to pass comments, generate a COMMENT token, otherwise
1151
               discard it.  */
1152
783k
            ps->signal_eilseq = false;
1153
783k
            if (ps->catr->pass_comments)
1154
783k
              {
1155
783k
                bufpos = 0;
1156
783k
                for (;;)
1157
32.2M
                  {
1158
32.3M
                    while (bufpos + mb_len (mbc) >= bufmax)
1159
134k
                      {
1160
134k
                        bufmax += 100;
1161
134k
                        buf = xrealloc (buf, bufmax);
1162
134k
                        ps->bufmax = bufmax;
1163
134k
                        ps->buf = buf;
1164
134k
                      }
1165
32.2M
                    if (mb_iseof (mbc) || mb_iseq (mbc, '\n'))
1166
783k
                      break;
1167
1168
31.4M
                    memcpy_small (&buf[bufpos], mb_ptr (mbc), mb_len (mbc));
1169
31.4M
                    bufpos += mb_len (mbc);
1170
1171
31.4M
                    lex_getc (ps, mbc);
1172
31.4M
                  }
1173
783k
                buf[bufpos] = '\0';
1174
1175
783k
                lval->string.string = buf;
1176
783k
                lval->string.pos = ps->gram_pos;
1177
783k
                lval->string.obsolete = ps->po_lex_obsolete;
1178
783k
                ps->po_lex_obsolete = false;
1179
783k
                ps->signal_eilseq = true;
1180
783k
                return COMMENT;
1181
783k
              }
1182
0
            else
1183
0
              {
1184
                /* We do this in separate loop because collecting large
1185
                   comments while they get not passed to the upper layers
1186
                   is not very efficient.  */
1187
0
                while (!mb_iseof (mbc) && !mb_iseq (mbc, '\n'))
1188
0
                  lex_getc (ps, mbc);
1189
0
                ps->po_lex_obsolete = false;
1190
0
                ps->signal_eilseq = true;
1191
0
              }
1192
0
            break;
1193
1194
633k
          case '"':
1195
            /* Accumulate a string.  */
1196
633k
            bufpos = 0;
1197
633k
            for (;;)
1198
29.0M
              {
1199
29.0M
                lex_getc (ps, mbc);
1200
29.1M
                while (bufpos + mb_len (mbc) >= bufmax)
1201
107k
                  {
1202
107k
                    bufmax += 100;
1203
107k
                    buf = xrealloc (buf, bufmax);
1204
107k
                    ps->bufmax = bufmax;
1205
107k
                    ps->buf = buf;
1206
107k
                  }
1207
29.0M
                if (mb_iseof (mbc))
1208
1.89k
                  {
1209
1.89k
                    po_gram_error_at_line (ps->catr, &ps->gram_pos,
1210
1.89k
                                           _("end-of-file within string"));
1211
1.89k
                    break;
1212
1.89k
                  }
1213
29.0M
                if (mb_iseq (mbc, '\n'))
1214
195k
                  {
1215
195k
                    po_gram_error_at_line (ps->catr, &ps->gram_pos,
1216
195k
                                           _("end-of-line within string"));
1217
195k
                    break;
1218
195k
                  }
1219
28.8M
                if (mb_iseq (mbc, '"'))
1220
435k
                  break;
1221
28.3M
                if (mb_iseq (mbc, '\\'))
1222
87.4k
                  {
1223
87.4k
                    buf[bufpos++] = control_sequence (ps);
1224
87.4k
                    continue;
1225
87.4k
                  }
1226
1227
                /* Add mbc to the accumulator.  */
1228
28.2M
                memcpy_small (&buf[bufpos], mb_ptr (mbc), mb_len (mbc));
1229
28.2M
                bufpos += mb_len (mbc);
1230
28.2M
              }
1231
633k
            buf[bufpos] = '\0';
1232
1233
            /* Strings cannot contain the msgctxt separator, because it cannot
1234
               be faithfully represented in the msgid of a .mo file.  */
1235
633k
            if (strchr (buf, MSGCTXT_SEPARATOR) != NULL)
1236
11.1k
              po_gram_error_at_line (ps->catr, &ps->gram_pos,
1237
11.1k
                                     _("context separator <EOT> within string"));
1238
1239
            /* FIXME: Treatment of embedded \000 chars is incorrect.  */
1240
633k
            lval->string.string = xstrdup (buf);
1241
633k
            lval->string.pos = ps->gram_pos;
1242
633k
            lval->string.obsolete = ps->po_lex_obsolete;
1243
633k
            return (ps->po_lex_previous ? PREV_STRING : STRING);
1244
1245
499k
          case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1246
737k
          case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1247
1.42M
          case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1248
1.69M
          case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1249
1.76M
          case 'y': case 'z':
1250
2.09M
          case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1251
2.32M
          case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1252
2.57M
          case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1253
2.84M
          case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1254
2.89M
          case 'Y': case 'Z':
1255
2.97M
          case '_': case '$':
1256
2.97M
            bufpos = 0;
1257
2.97M
            for (;;)
1258
19.6M
              {
1259
19.6M
                char c = mb_ptr (mbc) [0];
1260
19.6M
                if (bufpos + 1 >= bufmax)
1261
52.7k
                  {
1262
52.7k
                    bufmax += 100;
1263
52.7k
                    buf = xrealloc (buf, bufmax);
1264
52.7k
                    ps->bufmax = bufmax;
1265
52.7k
                    ps->buf = buf;
1266
52.7k
                  }
1267
19.6M
                buf[bufpos++] = c;
1268
19.6M
                lex_getc (ps, mbc);
1269
19.6M
                if (mb_len (mbc) == 1)
1270
19.6M
                  switch (mb_ptr (mbc) [0])
1271
19.6M
                    {
1272
2.94M
                    default:
1273
2.94M
                      break;
1274
3.44M
                    case 'a': case 'b': case 'c': case 'd': case 'e':
1275
4.24M
                    case 'f': case 'g': case 'h': case 'i': case 'j':
1276
4.81M
                    case 'k': case 'l': case 'm': case 'n': case 'o':
1277
6.07M
                    case 'p': case 'q': case 'r': case 's': case 't':
1278
7.03M
                    case 'u': case 'v': case 'w': case 'x': case 'y':
1279
7.42M
                    case 'z':
1280
7.70M
                    case 'A': case 'B': case 'C': case 'D': case 'E':
1281
8.06M
                    case 'F': case 'G': case 'H': case 'I': case 'J':
1282
8.61M
                    case 'K': case 'L': case 'M': case 'N': case 'O':
1283
9.65M
                    case 'P': case 'Q': case 'R': case 'S': case 'T':
1284
10.2M
                    case 'U': case 'V': case 'W': case 'X': case 'Y':
1285
10.2M
                    case 'Z':
1286
10.2M
                    case '_': case '$':
1287
16.4M
                    case '0': case '1': case '2': case '3': case '4':
1288
16.6M
                    case '5': case '6': case '7': case '8': case '9':
1289
16.6M
                      continue;
1290
19.6M
                    }
1291
2.97M
                break;
1292
19.6M
              }
1293
2.97M
            lex_ungetc (ps, mbc);
1294
1295
2.97M
            buf[bufpos] = '\0';
1296
1297
2.97M
            {
1298
2.97M
              int k = keyword_p (ps, buf);
1299
2.97M
              if (k == NAME)
1300
2.74M
                {
1301
2.74M
                  lval->string.string = xstrdup (buf);
1302
2.74M
                  lval->string.pos = ps->gram_pos;
1303
2.74M
                  lval->string.obsolete = ps->po_lex_obsolete;
1304
2.74M
                }
1305
232k
              else
1306
232k
                {
1307
232k
                  lval->pos.pos = ps->gram_pos;
1308
232k
                  lval->pos.obsolete = ps->po_lex_obsolete;
1309
232k
                }
1310
2.97M
              return k;
1311
2.97M
            }
1312
1313
713k
          case '0': case '1': case '2': case '3': case '4':
1314
933k
          case '5': case '6': case '7': case '8': case '9':
1315
933k
            bufpos = 0;
1316
933k
            for (;;)
1317
6.60M
              {
1318
6.60M
                char c = mb_ptr (mbc) [0];
1319
6.60M
                if (bufpos + 1 >= bufmax)
1320
32.5k
                  {
1321
32.5k
                    bufmax += 100;
1322
32.5k
                    buf = xrealloc (buf, bufmax + 1);
1323
32.5k
                    ps->bufmax = bufmax;
1324
32.5k
                    ps->buf = buf;
1325
32.5k
                  }
1326
6.60M
                buf[bufpos++] = c;
1327
6.60M
                lex_getc (ps, mbc);
1328
6.60M
                if (mb_len (mbc) == 1)
1329
6.60M
                  switch (mb_ptr (mbc) [0])
1330
6.60M
                    {
1331
927k
                    default:
1332
927k
                      break;
1333
1334
4.55M
                    case '0': case '1': case '2': case '3': case '4':
1335
5.67M
                    case '5': case '6': case '7': case '8': case '9':
1336
5.67M
                      continue;
1337
6.60M
                    }
1338
933k
                break;
1339
6.60M
              }
1340
933k
            lex_ungetc (ps, mbc);
1341
1342
933k
            buf[bufpos] = '\0';
1343
1344
933k
            lval->number.number = atol (buf);
1345
933k
            lval->number.pos = ps->gram_pos;
1346
933k
            lval->number.obsolete = ps->po_lex_obsolete;
1347
933k
            return NUMBER;
1348
1349
68.1k
          case '[':
1350
68.1k
            lval->pos.pos = ps->gram_pos;
1351
68.1k
            lval->pos.obsolete = ps->po_lex_obsolete;
1352
68.1k
            return '[';
1353
1354
39.0k
          case ']':
1355
39.0k
            lval->pos.pos = ps->gram_pos;
1356
39.0k
            lval->pos.obsolete = ps->po_lex_obsolete;
1357
39.0k
            return ']';
1358
1359
34.6M
          default:
1360
            /* This will cause a syntax error.  */
1361
34.6M
            return JUNK;
1362
43.4M
          }
1363
124k
      else
1364
        /* This will cause a syntax error.  */
1365
124k
        return JUNK;
1366
43.5M
    }
1367
40.2M
}