/src/gettext-0.26/gettext-tools/src/read-po-lex.c
Line | Count | Source |
1 | | /* GNU gettext - internationalization aids |
2 | | Copyright (C) 1995-2024 Free Software Foundation, Inc. |
3 | | |
4 | | This file was written by Peter Miller <millerp@canb.auug.org.au>. |
5 | | Multibyte character handling by Bruno Haible <haible@clisp.cons.org>. |
6 | | |
7 | | This program is free software: you can redistribute it and/or modify |
8 | | it under the terms of the GNU General Public License as published by |
9 | | the Free Software Foundation; either version 3 of the License, or |
10 | | (at your option) any later version. |
11 | | |
12 | | This program is distributed in the hope that it will be useful, |
13 | | but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 | | GNU General Public License for more details. |
16 | | |
17 | | You should have received a copy of the GNU General Public License |
18 | | along with this program. If not, see <https://www.gnu.org/licenses/>. */ |
19 | | |
20 | | |
21 | | #ifdef HAVE_CONFIG_H |
22 | | # include "config.h" |
23 | | #endif |
24 | | |
25 | | /* Specification. */ |
26 | | #include "read-po-lex.h" |
27 | | |
28 | | #include <errno.h> |
29 | | #include <limits.h> |
30 | | #include <stdio.h> |
31 | | #include <stdlib.h> |
32 | | #include <string.h> |
33 | | #include <stdarg.h> |
34 | | |
35 | | #if HAVE_ICONV |
36 | | # include <iconv.h> |
37 | | #endif |
38 | | |
39 | | #include <error.h> |
40 | | #include "attribute.h" |
41 | | #include "c-ctype.h" |
42 | | #include "uniwidth.h" |
43 | | #include "gettext.h" |
44 | | #include "po-charset.h" |
45 | | #include "xalloc.h" |
46 | | #include "xvasprintf.h" |
47 | | #include "xstrerror.h" |
48 | | #include "po-error.h" |
49 | | #include "xerror-handler.h" |
50 | | #include "xmalloca.h" |
51 | | #if !IN_LIBGETTEXTPO |
52 | | # include "basename-lgpl.h" |
53 | | # include "progname.h" |
54 | | #endif |
55 | | #include "c-strstr.h" |
56 | | #include "pos.h" |
57 | | #include "message.h" |
58 | | #include "str-list.h" |
59 | | #include "read-po.h" |
60 | | #include "read-po-internal.h" |
61 | | #include "read-po-gram.h" |
62 | | |
63 | 3.16M | #define _(str) gettext(str) |
64 | | |
65 | | #if HAVE_DECL_GETC_UNLOCKED |
66 | | # undef getc |
67 | 127M | # define getc getc_unlocked |
68 | | #endif |
69 | | |
70 | | |
71 | | /* Error handling during the parsing of a PO file. |
72 | | These functions can access ps->gram_pos and ps->gram_pos_column. */ |
73 | | |
74 | | void |
75 | | po_gram_error (struct po_parser_state *ps, const char *fmt, ...) |
76 | 312k | { |
77 | 312k | va_list ap; |
78 | 312k | char *buffer; |
79 | | |
80 | 312k | va_start (ap, fmt); |
81 | 312k | if (vasprintf (&buffer, fmt, ap) < 0) |
82 | 0 | ps->catr->xeh->xerror (CAT_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false, |
83 | 0 | _("memory exhausted")); |
84 | 312k | va_end (ap); |
85 | 312k | ps->catr->xeh->xerror (CAT_SEVERITY_ERROR, NULL, |
86 | 312k | ps->gram_pos.file_name, ps->gram_pos.line_number, |
87 | 312k | ps->gram_pos_column + 1, false, buffer); |
88 | 312k | free (buffer); |
89 | | |
90 | 312k | if (*(ps->catr->xeh->error_message_count_p) >= gram_max_allowed_errors) |
91 | 0 | ps->catr->xeh->xerror (CAT_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false, |
92 | 0 | _("too many errors, aborting")); |
93 | 312k | } |
94 | | |
95 | | void |
96 | | po_gram_error_at_line (abstract_catalog_reader_ty *catr, const lex_pos_ty *pp, |
97 | | const char *fmt, ...) |
98 | 2.99M | { |
99 | 2.99M | va_list ap; |
100 | 2.99M | char *buffer; |
101 | | |
102 | 2.99M | va_start (ap, fmt); |
103 | 2.99M | if (vasprintf (&buffer, fmt, ap) < 0) |
104 | 0 | catr->xeh->xerror (CAT_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false, |
105 | 0 | _("memory exhausted")); |
106 | 2.99M | va_end (ap); |
107 | 2.99M | catr->xeh->xerror (CAT_SEVERITY_ERROR, NULL, pp->file_name, pp->line_number, |
108 | 2.99M | (size_t)(-1), false, buffer); |
109 | 2.99M | free (buffer); |
110 | | |
111 | 2.99M | if (*(catr->xeh->error_message_count_p) >= gram_max_allowed_errors) |
112 | 0 | catr->xeh->xerror (CAT_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false, |
113 | 0 | _("too many errors, aborting")); |
114 | 2.99M | } |
115 | | |
116 | | |
117 | | /* Charset handling while parsing PO files. */ |
118 | | |
119 | | /* Initialize the PO file's encoding. */ |
120 | | static void |
121 | | po_lex_charset_init (struct po_parser_state *ps) |
122 | 8.54k | { |
123 | 8.54k | ps->po_lex_charset = NULL; |
124 | 8.54k | ps->catr->po_lex_isolate_start = NULL; |
125 | 8.54k | ps->catr->po_lex_isolate_end = NULL; |
126 | 8.54k | #if HAVE_ICONV |
127 | 8.54k | ps->po_lex_iconv = (iconv_t)(-1); |
128 | 8.54k | #endif |
129 | 8.54k | ps->po_lex_weird_cjk = false; |
130 | 8.54k | } |
131 | | |
/* Set the PO file's encoding from the header entry.
   HEADER_ENTRY is searched for "charset="; the charset name extends up to the
   next space, tab, newline or end of string.
   FILENAME is used for locating warnings and for recognizing ".pot" files.
   If is_pot_role is true, "charset=CHARSET" is expected and does not deserve
   a warning.
   On success this stores the canonical charset name in ps->po_lex_charset,
   sets the bidi isolate strings in ps->catr, and (with iconv) opens a
   converter from that charset to UTF-8 in ps->po_lex_iconv.  Problems are
   reported as warnings only; the function never fails.  */
void
po_lex_charset_set (struct po_parser_state *ps,
                    const char *header_entry,
                    const char *filename, bool is_pot_role)
{
  /* Verify the validity of CHARSET.  It is necessary
     1. for the correct treatment of multibyte characters containing
        0x5C bytes in the PO lexer,
     2. so that at run time, gettext() can call iconv() to convert
        msgstr.  */
  const char *charsetstr = c_strstr (header_entry, "charset=");

  if (charsetstr != NULL)
    {
      size_t len;
      char *charset;
      const char *canon_charset;

      /* Extract a NUL-terminated copy of the charset name.  It is released
         with freea() at the end of this branch.  */
      charsetstr += strlen ("charset=");
      len = strcspn (charsetstr, " \t\n");
      charset = (char *) xmalloca (len + 1);
      memcpy (charset, charsetstr, len);
      charset[len] = '\0';

      canon_charset = po_charset_canonicalize (charset);
      if (canon_charset == NULL)
        {
          /* Don't warn for POT files, because POT files usually contain
             only ASCII msgids.  */
          size_t filenamelen = strlen (filename);

          /* The placeholder name "CHARSET" is tolerated only when the file
             name ends in ".pot" or the caller declared the POT role.  */
          if (!(strcmp (charset, "CHARSET") == 0
                && ((filenamelen >= 4
                     && memcmp (filename + filenamelen - 4, ".pot", 4) == 0)
                    || is_pot_role)))
            {
              char *warning_message =
                xasprintf (_("\
Charset \"%s\" is not a portable encoding name.\n\
Message conversion to user's charset might not work.\n"),
                           charset);
              ps->catr->xeh->xerror (CAT_SEVERITY_WARNING, NULL,
                                     filename, (size_t)(-1), (size_t)(-1), true,
                                     warning_message);
              free (warning_message);
            }
        }
      else
        {
          const char *envval;

          ps->po_lex_charset = canon_charset;

          /* Byte sequences for U+2068 and U+2069 in the two encodings that
             can represent them.  */
          if (strcmp (canon_charset, "UTF-8") == 0)
            {
              ps->catr->po_lex_isolate_start = "\xE2\x81\xA8";
              ps->catr->po_lex_isolate_end = "\xE2\x81\xA9";
            }
          else if (strcmp (canon_charset, "GB18030") == 0)
            {
              ps->catr->po_lex_isolate_start = "\x81\x36\xAC\x34";
              ps->catr->po_lex_isolate_end = "\x81\x36\xAC\x35";
            }
          else
            {
              /* The other encodings don't contain U+2068, U+2069.  */
              ps->catr->po_lex_isolate_start = NULL;
              ps->catr->po_lex_isolate_end = NULL;
            }

#if HAVE_ICONV
          /* Release a converter left over from an earlier call; the
             descriptor is reassigned in both branches below.  */
          if (ps->po_lex_iconv != (iconv_t)(-1))
            iconv_close (ps->po_lex_iconv);
#endif

          /* The old Solaris/openwin msgfmt and GNU msgfmt <= 0.10.35
             don't know about multibyte encodings, and require a spurious
             backslash after every multibyte character whose last byte is
             0x5C.  Some programs, like vim, distribute PO files in this
             broken format.  GNU msgfmt must continue to support this old
             PO file format when the Makefile requests it.  */
          envval = getenv ("OLD_PO_FILE_INPUT");
          if (envval != NULL && *envval != '\0')
            {
              /* Assume the PO file is in old format, with extraneous
                 backslashes.  */
#if HAVE_ICONV
              ps->po_lex_iconv = (iconv_t)(-1);
#endif
              ps->po_lex_weird_cjk = false;
            }
          else
            {
              /* Use iconv() to parse multibyte characters.  */
#if HAVE_ICONV
              ps->po_lex_iconv = iconv_open ("UTF-8", ps->po_lex_charset);
              if (ps->po_lex_iconv == (iconv_t)(-1))
                {
                  /* No converter available: warn, then continue with
                     byte-wise parsing.  */
                  const char *progname;
                  char *warning_message;
                  const char *recommendation;
                  const char *note;
                  char *whole_message;

# if IN_LIBGETTEXTPO
                  progname = "libgettextpo";
# else
                  progname = last_component (program_name);
# endif

                  warning_message =
                    xasprintf (_("\
Charset \"%s\" is not supported. %s relies on iconv(),\n\
and iconv() does not support \"%s\".\n"),
                               ps->po_lex_charset, progname, ps->po_lex_charset);

# if !defined _LIBICONV_VERSION || (_LIBICONV_VERSION == 0x10b && defined __APPLE__)
                  recommendation = _("\
Installing GNU libiconv and then reinstalling GNU gettext\n\
would fix this problem.\n");
# else
                  recommendation = "";
# endif

                  /* Test for a charset which has double-byte characters
                     ending in 0x5C.  For these encodings, the string parser
                     is likely to be confused if it can't see the character
                     boundaries.  */
                  ps->po_lex_weird_cjk = po_is_charset_weird_cjk (ps->po_lex_charset);
                  if (po_is_charset_weird (ps->po_lex_charset)
                      && !ps->po_lex_weird_cjk)
                    note = _("Continuing anyway, expect parse errors.");
                  else
                    note = _("Continuing anyway.");

                  whole_message =
                    xasprintf ("%s%s%s\n",
                               warning_message, recommendation, note);

                  ps->catr->xeh->xerror (CAT_SEVERITY_WARNING, NULL,
                                         filename, (size_t)(-1), (size_t)(-1),
                                         true, whole_message);

                  free (whole_message);
                  free (warning_message);
                }
#else
              /* Test for a charset which has double-byte characters
                 ending in 0x5C.  For these encodings, the string parser
                 is likely to be confused if it can't see the character
                 boundaries.  */
              ps->po_lex_weird_cjk = po_is_charset_weird_cjk (ps->po_lex_charset);
              if (po_is_charset_weird (ps->po_lex_charset) && !ps->po_lex_weird_cjk)
                {
                  const char *progname;
                  char *warning_message;
                  const char *recommendation;
                  const char *note;
                  char *whole_message;

# if IN_LIBGETTEXTPO
                  progname = "libgettextpo";
# else
                  progname = last_component (program_name);
# endif

                  warning_message =
                    xasprintf (_("\
Charset \"%s\" is not supported. %s relies on iconv().\n\
This version was built without iconv().\n"),
                               ps->po_lex_charset, progname);

                  recommendation = _("\
Installing GNU libiconv and then reinstalling GNU gettext\n\
would fix this problem.\n");

                  note = _("Continuing anyway, expect parse errors.");

                  whole_message =
                    xasprintf ("%s%s%s\n",
                               warning_message, recommendation, note);

                  ps->catr->xeh->xerror (CAT_SEVERITY_WARNING, NULL,
                                         filename, (size_t)(-1), (size_t)(-1),
                                         true, whole_message);

                  free (whole_message);
                  free (warning_message);
                }
#endif
            }
        }
      freea (charset);
    }
  else
    {
      /* Don't warn for POT files, because POT files usually contain
         only ASCII msgids.  */
      size_t filenamelen = strlen (filename);

      /* Here only the ".pot" file name suffix suppresses the warning;
         is_pot_role is not consulted in this branch.  */
      if (!(filenamelen >= 4
            && memcmp (filename + filenamelen - 4, ".pot", 4) == 0))
        ps->catr->xeh->xerror (CAT_SEVERITY_WARNING,
                               NULL, filename, (size_t)(-1), (size_t)(-1), true,
                               _("\
Charset missing in header.\n\
Message conversion to user's charset will not work.\n"));
    }
}
344 | | |
345 | | /* Finish up with the PO file's encoding. */ |
346 | | static void |
347 | | po_lex_charset_close (struct po_parser_state *ps) |
348 | 8.54k | { |
349 | 8.54k | ps->po_lex_charset = NULL; |
350 | 8.54k | ps->catr->po_lex_isolate_start = NULL; |
351 | 8.54k | ps->catr->po_lex_isolate_end = NULL; |
352 | 8.54k | #if HAVE_ICONV |
353 | 8.54k | if (ps->po_lex_iconv != (iconv_t)(-1)) |
354 | 335 | { |
355 | 335 | iconv_close (ps->po_lex_iconv); |
356 | 335 | ps->po_lex_iconv = (iconv_t)(-1); |
357 | 335 | } |
358 | 8.54k | #endif |
359 | 8.54k | ps->po_lex_weird_cjk = false; |
360 | 8.54k | } |
361 | | |
362 | | |
363 | | /* The lowest level of PO file parsing converts bytes to multibyte characters. |
364 | | This is needed |
365 | | 1. for C compatibility: ISO C 99 section 5.1.1.2 says that the first |
366 | | translation phase maps bytes to characters. |
367 | | 2. to keep track of the current column, for the sake of precise error |
368 | | location. Emacs compile.el interprets the column in error messages |
369 | | by default as a screen column number, not as character number. |
370 | | 3. to avoid skipping backslash-newline in the midst of a multibyte |
371 | | character. If XY is a multibyte character, X \ newline Y is invalid. |
372 | | */ |
373 | | |
/* Copy N bytes from SRC to DST, one byte at a time, front to back.
   Written so that the common cases N == 0 and N == 1 stay cheap.  */
static inline void
memcpy_small (void *dst, const void *src, size_t n)
{
  const char *from = (const char *) src;
  char *to = (char *) dst;

  if (n == 0)
    return;

  /* First byte, then any remaining ones.  */
  *to = *from;
  for (n--; n > 0; n--)
    *++to = *++from;
}
388 | | |
389 | | /* EOF (not a real character) is represented with bytes = 0 and |
390 | | uc_valid = false. */ |
391 | | static inline bool |
392 | | mb_iseof (const mbchar_t mbc) |
393 | 372M | { |
394 | 372M | return (mbc->bytes == 0); |
395 | 372M | } |
396 | | |
397 | | /* Access the current character. */ |
398 | | static inline const char * |
399 | | mb_ptr (const mbchar_t mbc) |
400 | 156M | { |
401 | 156M | return mbc->buf; |
402 | 156M | } |
403 | | static inline size_t |
404 | | mb_len (const mbchar_t mbc) |
405 | 251M | { |
406 | 251M | return mbc->bytes; |
407 | 251M | } |
408 | | |
409 | | /* Comparison of characters. */ |
410 | | |
411 | | static inline bool |
412 | | mb_iseq (const mbchar_t mbc, char sc) |
413 | 517M | { |
414 | | /* Note: It is wrong to compare only mbc->uc, because when the encoding is |
415 | | SHIFT_JIS, mbc->buf[0] == '\\' corresponds to mbc->uc == 0x00A5, but we |
416 | | want to treat it as an escape character, although it looks like a Yen |
417 | | sign. */ |
418 | | #if HAVE_ICONV && 0 |
419 | | if (mbc->uc_valid) |
420 | | return (mbc->uc == sc); /* wrong! */ |
421 | | else |
422 | | #endif |
423 | 517M | return (mbc->bytes == 1 && mbc->buf[0] == sc); |
424 | 517M | } |
425 | | |
426 | | MAYBE_UNUSED static inline bool |
427 | | mb_isnul (const mbchar_t mbc) |
428 | 0 | { |
429 | 0 | #if HAVE_ICONV |
430 | 0 | if (mbc->uc_valid) |
431 | 0 | return (mbc->uc == 0); |
432 | 0 | else |
433 | 0 | #endif |
434 | 0 | return (mbc->bytes == 1 && mbc->buf[0] == 0); |
435 | 0 | } |
436 | | |
437 | | MAYBE_UNUSED static inline int |
438 | | mb_cmp (const mbchar_t mbc1, const mbchar_t mbc2) |
439 | 0 | { |
440 | 0 | #if HAVE_ICONV |
441 | 0 | if (mbc1->uc_valid && mbc2->uc_valid) |
442 | 0 | return (int) mbc1->uc - (int) mbc2->uc; |
443 | 0 | else |
444 | 0 | #endif |
445 | 0 | return (mbc1->bytes == mbc2->bytes |
446 | 0 | ? memcmp (mbc1->buf, mbc2->buf, mbc1->bytes) |
447 | 0 | : mbc1->bytes < mbc2->bytes |
448 | 0 | ? (memcmp (mbc1->buf, mbc2->buf, mbc1->bytes) > 0 ? 1 : -1) |
449 | 0 | : (memcmp (mbc1->buf, mbc2->buf, mbc2->bytes) >= 0 ? 1 : -1)); |
450 | 0 | } |
451 | | |
452 | | MAYBE_UNUSED static inline bool |
453 | | mb_equal (const mbchar_t mbc1, const mbchar_t mbc2) |
454 | 0 | { |
455 | 0 | #if HAVE_ICONV |
456 | 0 | if (mbc1->uc_valid && mbc2->uc_valid) |
457 | 0 | return mbc1->uc == mbc2->uc; |
458 | 0 | else |
459 | 0 | #endif |
460 | 0 | return (mbc1->bytes == mbc2->bytes |
461 | 0 | && memcmp (mbc1->buf, mbc2->buf, mbc1->bytes) == 0); |
462 | 0 | } |
463 | | |
464 | | /* <ctype.h>, <wctype.h> classification. */ |
465 | | |
466 | | MAYBE_UNUSED static inline bool |
467 | | mb_isascii (const mbchar_t mbc) |
468 | 0 | { |
469 | 0 | #if HAVE_ICONV |
470 | 0 | if (mbc->uc_valid) |
471 | 0 | return (mbc->uc >= 0x0000 && mbc->uc <= 0x007F); |
472 | 0 | else |
473 | 0 | #endif |
474 | 0 | return (mbc->bytes == 1 |
475 | 0 | #if CHAR_MIN < 0x00 /* to avoid gcc warning */ |
476 | 0 | && mbc->buf[0] >= 0x00 |
477 | 0 | #endif |
478 | 0 | #if CHAR_MAX > 0x7F /* to avoid gcc warning */ |
479 | 0 | && mbc->buf[0] <= 0x7F |
480 | 0 | #endif |
481 | 0 | ); |
482 | 0 | } |
483 | | |
484 | | /* Extra <wchar.h> function. */ |
485 | | |
486 | | /* Unprintable characters appear as a small box of width 1. */ |
487 | 95.0M | #define MB_UNPRINTABLE_WIDTH 1 |
488 | | |
489 | | static int |
490 | | mb_width (struct po_parser_state *ps, const mbchar_t mbc) |
491 | 133M | { |
492 | 133M | #if HAVE_ICONV |
493 | 133M | if (mbc->uc_valid) |
494 | 4.33M | { |
495 | 4.33M | ucs4_t uc = mbc->uc; |
496 | 4.33M | const char *encoding = |
497 | 4.33M | (ps->po_lex_iconv != (iconv_t)(-1) ? ps->po_lex_charset : ""); |
498 | 4.33M | int w = uc_width (uc, encoding); |
499 | | /* For unprintable characters, arbitrarily return 0 for control |
500 | | characters (except tab) and MB_UNPRINTABLE_WIDTH otherwise. */ |
501 | 4.33M | if (w >= 0) |
502 | 4.15M | return w; |
503 | 181k | if (uc >= 0x0000 && uc <= 0x001F) |
504 | 175k | { |
505 | 175k | if (uc == 0x0009) |
506 | 10.3k | return 8 - (ps->gram_pos_column & 7); |
507 | 165k | return 0; |
508 | 175k | } |
509 | 5.50k | if ((uc >= 0x007F && uc <= 0x009F) || (uc >= 0x2028 && uc <= 0x2029)) |
510 | 5.50k | return 0; |
511 | 0 | return MB_UNPRINTABLE_WIDTH; |
512 | 5.50k | } |
513 | 128M | else |
514 | 128M | #endif |
515 | 128M | { |
516 | 128M | if (mbc->bytes == 1) |
517 | 128M | { |
518 | 128M | if ( |
519 | 128M | #if CHAR_MIN < 0x00 /* to avoid gcc warning */ |
520 | 128M | mbc->buf[0] >= 0x00 && |
521 | 99.9M | #endif |
522 | 99.9M | mbc->buf[0] <= 0x1F) |
523 | 33.2M | { |
524 | 33.2M | if (mbc->buf[0] == 0x09) |
525 | 3.37M | return 8 - (ps->gram_pos_column & 7); |
526 | 29.9M | return 0; |
527 | 33.2M | } |
528 | 95.7M | if (mbc->buf[0] == 0x7F) |
529 | 642k | return 0; |
530 | 95.7M | } |
531 | 95.0M | return MB_UNPRINTABLE_WIDTH; |
532 | 128M | } |
533 | 133M | } |
534 | | |
535 | | /* Output. */ |
536 | | MAYBE_UNUSED static inline void |
537 | | mb_putc (const mbchar_t mbc, FILE *stream) |
538 | 0 | { |
539 | 0 | fwrite (mbc->buf, 1, mbc->bytes, stream); |
540 | 0 | } |
541 | | |
542 | | /* Assignment. */ |
543 | | MAYBE_UNUSED static inline void |
544 | | mb_setascii (mbchar_t mbc, char sc) |
545 | 0 | { |
546 | 0 | mbc->bytes = 1; |
547 | 0 | #if HAVE_ICONV |
548 | 0 | mbc->uc_valid = 1; |
549 | 0 | mbc->uc = sc; |
550 | 0 | #endif |
551 | 0 | mbc->buf[0] = sc; |
552 | 0 | } |
553 | | |
554 | | /* Copying a character. */ |
555 | | static inline void |
556 | | mb_copy (mbchar_t new_mbc, const mbchar_t old_mbc) |
557 | 8.79M | { |
558 | 8.79M | memcpy_small (&new_mbc->buf[0], &old_mbc->buf[0], old_mbc->bytes); |
559 | 8.79M | new_mbc->bytes = old_mbc->bytes; |
560 | 8.79M | #if HAVE_ICONV |
561 | 8.79M | if ((new_mbc->uc_valid = old_mbc->uc_valid)) |
562 | 233k | new_mbc->uc = old_mbc->uc; |
563 | 8.79M | #endif |
564 | 8.79M | } |
565 | | |
566 | | |
567 | | /* Multibyte character input. */ |
568 | | |
569 | | static inline void |
570 | | mbfile_init (mbfile_t mbf, FILE *stream) |
571 | 8.54k | { |
572 | 8.54k | mbf->fp = stream; |
573 | 8.54k | mbf->eof_seen = false; |
574 | 8.54k | mbf->pushback_count = 0; |
575 | 8.54k | mbf->bufcount = 0; |
576 | 8.54k | } |
577 | | |
/* Read the next multibyte character from mbf and put it into mbc.
   If a read error occurs, errno is set and ferror (mbf->fp) becomes true.
   End of file (and a read error) is reported as a character with
   mbc->bytes == 0.  Pushed-back characters are returned first, in LIFO
   order.  When an iconv descriptor is open in PS, the character boundaries
   are determined by iconv and mbc->uc holds the Unicode value if
   mbc->uc_valid is true; otherwise single bytes (or byte pairs, for the
   weird CJK encodings) are returned with mbc->uc_valid == false.  */
static void
mbfile_getc (struct po_parser_state *ps, mbchar_t mbc, mbfile_t mbf)
{
  /* Number of bytes of mbf->buf that form the returned character.  */
  size_t bytes;

  /* Return character pushed back, if there is one.  */
  if (mbf->pushback_count > 0)
    {
      mbf->pushback_count--;
      mb_copy (mbc, &mbf->pushback[mbf->pushback_count]);
      return;
    }

  /* If EOF has already been seen, don't use getc.  This matters if
     mbf->fp is connected to an interactive tty.  */
  if (mbf->eof_seen)
    goto eof;

  /* Before using iconv, we need at least one byte.  */
  if (mbf->bufcount == 0)
    {
      int c = getc (mbf->fp);
      if (c == EOF)
        {
          mbf->eof_seen = true;
          goto eof;
        }
      mbf->buf[0] = (unsigned char) c;
      mbf->bufcount++;
    }

#if HAVE_ICONV
  if (ps->po_lex_iconv != (iconv_t)(-1))
    {
      /* Use iconv on an increasing number of bytes.  Read only as many
         bytes from mbf->fp as needed.  This is needed to give reasonable
         interactive behaviour when mbf->fp is connected to an interactive
         tty.  */
      for (;;)
        {
          unsigned char scratchbuf[64];
          const char *inptr = &mbf->buf[0];
          size_t insize = mbf->bufcount;
          char *outptr = (char *) &scratchbuf[0];
          size_t outsize = sizeof (scratchbuf);

          size_t res = iconv (ps->po_lex_iconv,
                              (ICONV_CONST char **) &inptr, &insize,
                              &outptr, &outsize);
          /* We expect that a character has been produced if and only if
             some input bytes have been consumed.  */
          if ((insize < mbf->bufcount) != (outsize < sizeof (scratchbuf)))
            abort ();
          if (outsize == sizeof (scratchbuf))
            {
              /* No character has been produced.  Must be an error.  */
              if (res != (size_t)(-1))
                abort ();

              if (errno == EILSEQ)
                {
                  /* An invalid multibyte sequence was encountered.  */
                  /* Return a single byte.  */
                  if (ps->signal_eilseq)
                    po_gram_error (ps, _("invalid multibyte sequence"));
                  bytes = 1;
                  mbc->uc_valid = false;
                  break;
                }
              else if (errno == EINVAL)
                {
                  /* An incomplete multibyte character.  */
                  int c;

                  if (mbf->bufcount == MBCHAR_BUF_SIZE)
                    {
                      /* An overlong incomplete multibyte sequence was
                         encountered.  */
                      /* Return a single byte.  */
                      bytes = 1;
                      mbc->uc_valid = false;
                      break;
                    }

                  /* Read one more byte and retry iconv.  */
                  c = getc (mbf->fp);
                  if (c == EOF)
                    {
                      mbf->eof_seen = true;
                      if (ferror (mbf->fp))
                        goto eof;
                      /* Plain EOF: return all buffered bytes as one
                         invalid character.  */
                      if (ps->signal_eilseq)
                        po_gram_error (ps, _("incomplete multibyte sequence at end of file"));
                      bytes = mbf->bufcount;
                      mbc->uc_valid = false;
                      break;
                    }
                  mbf->buf[mbf->bufcount++] = (unsigned char) c;
                  if (c == '\n')
                    {
                      /* Return the bytes before the newline as one invalid
                         character; the newline stays in mbf->buf for the
                         next call.  */
                      if (ps->signal_eilseq)
                        po_gram_error (ps, _("incomplete multibyte sequence at end of line"));
                      bytes = mbf->bufcount - 1;
                      mbc->uc_valid = false;
                      break;
                    }
                }
              else
                {
                  /* NOTE(review): presumably the handler does not return
                     for CAT_SEVERITY_FATAL_ERROR; if it did, the loop would
                     retry iconv on the same bytes -- confirm.  */
                  int err = errno;
                  ps->catr->xeh->xerror (CAT_SEVERITY_FATAL_ERROR,
                                         NULL, NULL, 0, 0, false,
                                         xstrerror (_("iconv failure"), err));
                }
            }
          else
            {
              size_t outbytes = sizeof (scratchbuf) - outsize;
              bytes = mbf->bufcount - insize;

              /* We expect that one character has been produced.  */
              if (bytes == 0)
                abort ();
              if (outbytes == 0)
                abort ();
              /* Convert it from UTF-8 to UCS-4.  */
              if (u8_mbtoucr (&mbc->uc, scratchbuf, outbytes) < (int) outbytes)
                {
                  /* scratchbuf contains an out-of-range Unicode character
                     (> 0x10ffff).  */
                  if (ps->signal_eilseq)
                    po_gram_error (ps, _("invalid multibyte sequence"));
                  mbc->uc_valid = false;
                  break;
                }
              mbc->uc_valid = true;
              break;
            }
        }
    }
  else
#endif
    {
      if (ps->po_lex_weird_cjk
          /* Special handling of encodings with CJK structure.  */
          && (unsigned char) mbf->buf[0] >= 0x80)
        {
          if (mbf->bufcount == 1)
            {
              /* Read one more byte.  */
              int c = getc (mbf->fp);
              if (c == EOF)
                {
                  /* Only a read error ends the input here; on plain EOF the
                     lead byte is returned as a single byte below.  */
                  if (ferror (mbf->fp))
                    {
                      mbf->eof_seen = true;
                      goto eof;
                    }
                }
              else
                {
                  mbf->buf[1] = (unsigned char) c;
                  mbf->bufcount++;
                }
            }
          if (mbf->bufcount >= 2 && (unsigned char) mbf->buf[1] >= 0x30)
            /* Return a double byte.  */
            bytes = 2;
          else
            /* Return a single byte.  */
            bytes = 1;
        }
      else
        {
          /* Return a single byte.  */
          bytes = 1;
        }
#if HAVE_ICONV
      mbc->uc_valid = false;
#endif
    }

  /* Return the multibyte sequence mbf->buf[0..bytes-1].  */
  memcpy_small (&mbc->buf[0], &mbf->buf[0], bytes);
  mbc->bytes = bytes;

  /* Shift the bytes that were read but not consumed to the start of
     mbf->buf.  */
  mbf->bufcount -= bytes;
  if (mbf->bufcount > 0)
    {
      /* It's not worth calling memmove() for so few bytes.  */
      unsigned int count = mbf->bufcount;
      char *p = &mbf->buf[0];

      do
        {
          *p = *(p + bytes);
          p++;
        }
      while (--count > 0);
    }
  return;

eof:
  /* An mbchar_t with bytes == 0 is used to indicate EOF.  */
  mbc->bytes = 0;
#if HAVE_ICONV
  mbc->uc_valid = false;
#endif
  return;
}
790 | | |
791 | | static void |
792 | | mbfile_ungetc (const mbchar_t mbc, mbfile_t mbf) |
793 | 4.39M | { |
794 | 4.39M | if (mbf->pushback_count >= MBFILE_MAX_PUSHBACK) |
795 | 0 | abort (); |
796 | 4.39M | mb_copy (&mbf->pushback[mbf->pushback_count], mbc); |
797 | 4.39M | mbf->pushback_count++; |
798 | 4.39M | } |
799 | | |
800 | | |
801 | | /* Prepare lexical analysis. */ |
802 | | void |
803 | | lex_start (struct po_parser_state *ps, |
804 | | FILE *fp, const char *real_filename, const char *logical_filename) |
805 | 8.54k | { |
806 | | /* Ignore the logical_filename, because PO file entries already have |
807 | | their file names attached. But use real_filename for error messages. */ |
808 | 8.54k | ps->gram_pos.file_name = xstrdup (real_filename); |
809 | | |
810 | 8.54k | mbfile_init (ps->mbf, fp); |
811 | | |
812 | 8.54k | ps->gram_pos.line_number = 1; |
813 | 8.54k | ps->gram_pos_column = 0; |
814 | 8.54k | ps->signal_eilseq = true; |
815 | 8.54k | ps->po_lex_obsolete = false; |
816 | 8.54k | ps->po_lex_previous = false; |
817 | 8.54k | po_lex_charset_init (ps); |
818 | 8.54k | ps->buf = NULL; |
819 | 8.54k | ps->bufmax = 0; |
820 | 8.54k | } |
821 | | |
822 | | /* Terminate lexical analysis. */ |
823 | | void |
824 | | lex_end (struct po_parser_state *ps) |
825 | 8.54k | { |
826 | 8.54k | ps->gram_pos.file_name = NULL; |
827 | 8.54k | ps->gram_pos.line_number = 0; |
828 | 8.54k | po_lex_charset_close (ps); |
829 | 8.54k | free (ps->buf); |
830 | 8.54k | } |
831 | | |
832 | | |
833 | | /* Read a single character, collapsing the Windows CRLF line terminator |
834 | | to a single LF. |
835 | | Supports 1 character of pushback (via mbfile_ungetc). */ |
836 | | static void |
837 | | mbfile_getc_normalized (struct po_parser_state *ps, mbchar_t mbc, mbfile_t mbf) |
838 | 131M | { |
839 | 131M | mbfile_getc (ps, mbc, ps->mbf); |
840 | 131M | if (!mb_iseof (mbc) && mb_iseq (mbc, '\r')) |
841 | 125k | { |
842 | 125k | mbchar_t mbc2; |
843 | | |
844 | 125k | mbfile_getc (ps, mbc2, ps->mbf); |
845 | 125k | if (!mb_iseof (mbc2)) |
846 | 125k | { |
847 | 125k | if (mb_iseq (mbc2, '\n')) |
848 | | /* Eliminate the CR. */ |
849 | 4.81k | mb_copy (mbc, mbc2); |
850 | 120k | else |
851 | 120k | { |
852 | 120k | mbfile_ungetc (mbc2, ps->mbf); |
853 | | /* If we get here, the caller can still do |
854 | | mbfile_ungetc (mbc, ps->mbf); |
855 | | since mbfile_getc supports 2 characters of pushback. */ |
856 | 120k | } |
857 | 125k | } |
858 | 125k | } |
859 | 131M | } |
860 | | |
861 | | |
862 | | /* Read a single character, dealing with backslash-newline. |
863 | | Also keep track of the current line number and column number. */ |
864 | | static void |
865 | | lex_getc (struct po_parser_state *ps, mbchar_t mbc) |
866 | 131M | { |
867 | 131M | for (;;) |
868 | 131M | { |
869 | 131M | mbfile_getc_normalized (ps, mbc, ps->mbf); |
870 | | |
871 | 131M | if (mb_iseof (mbc)) |
872 | 15.0k | { |
873 | 15.0k | if (ferror (ps->mbf->fp)) |
874 | 0 | bomb: |
875 | 0 | { |
876 | 0 | int err = errno; |
877 | 0 | ps->catr->xeh->xerror (CAT_SEVERITY_FATAL_ERROR, |
878 | 0 | NULL, NULL, 0, 0, false, |
879 | 0 | xstrerror (xasprintf (_("error while reading \"%s\""), |
880 | 0 | ps->gram_pos.file_name), |
881 | 0 | err)); |
882 | 0 | } |
883 | 15.0k | break; |
884 | 15.0k | } |
885 | | |
886 | 131M | if (mb_iseq (mbc, '\n')) |
887 | 1.81M | { |
888 | 1.81M | ps->gram_pos.line_number++; |
889 | 1.81M | ps->gram_pos_column = 0; |
890 | 1.81M | break; |
891 | 1.81M | } |
892 | | |
893 | 129M | ps->gram_pos_column += mb_width (ps, mbc); |
894 | | |
895 | 129M | if (mb_iseq (mbc, '\\')) |
896 | 314k | { |
897 | 314k | mbchar_t mbc2; |
898 | | |
899 | 314k | mbfile_getc_normalized (ps, mbc2, ps->mbf); |
900 | | |
901 | 314k | if (mb_iseof (mbc2)) |
902 | 158 | { |
903 | 158 | if (ferror (ps->mbf->fp)) |
904 | 0 | goto bomb; |
905 | 158 | break; |
906 | 158 | } |
907 | | |
908 | 314k | if (!mb_iseq (mbc2, '\n')) |
909 | 311k | { |
910 | 311k | mbfile_ungetc (mbc2, ps->mbf); |
911 | 311k | break; |
912 | 311k | } |
913 | | |
914 | 3.18k | ps->gram_pos.line_number++; |
915 | 3.18k | ps->gram_pos_column = 0; |
916 | 3.18k | } |
917 | 129M | else |
918 | 129M | break; |
919 | 129M | } |
920 | 131M | } |
921 | | |
922 | | |
923 | | static void |
924 | | lex_ungetc (struct po_parser_state *ps, const mbchar_t mbc) |
925 | 3.96M | { |
926 | 3.96M | if (!mb_iseof (mbc)) |
927 | 3.96M | { |
928 | 3.96M | if (mb_iseq (mbc, '\n')) |
929 | | /* Decrement the line number, but don't care about the column. */ |
930 | 189k | ps->gram_pos.line_number--; |
931 | 3.77M | else |
932 | | /* Decrement the column number. Also works well enough for tabs. */ |
933 | 3.77M | ps->gram_pos_column -= mb_width (ps, mbc); |
934 | | |
935 | 3.96M | mbfile_ungetc (mbc, ps->mbf); |
936 | 3.96M | } |
937 | 3.96M | } |
938 | | |
939 | | |
940 | | static int |
941 | | keyword_p (struct po_parser_state *ps, const char *s) |
942 | 2.97M | { |
943 | 2.97M | if (!ps->po_lex_previous) |
944 | 2.94M | { |
945 | 2.94M | if (!strcmp (s, "domain")) |
946 | 5.01k | return DOMAIN; |
947 | 2.94M | if (!strcmp (s, "msgid")) |
948 | 109k | return MSGID; |
949 | 2.83M | if (!strcmp (s, "msgid_plural")) |
950 | 4.54k | return MSGID_PLURAL; |
951 | 2.82M | if (!strcmp (s, "msgstr")) |
952 | 93.0k | return MSGSTR; |
953 | 2.73M | if (!strcmp (s, "msgctxt")) |
954 | 10.7k | return MSGCTXT; |
955 | 2.73M | } |
956 | 26.1k | else |
957 | 26.1k | { |
958 | | /* Inside a "#|" context, the keywords have a different meaning. */ |
959 | 26.1k | if (!strcmp (s, "msgid")) |
960 | 6.28k | return PREV_MSGID; |
961 | 19.9k | if (!strcmp (s, "msgid_plural")) |
962 | 369 | return PREV_MSGID_PLURAL; |
963 | 19.5k | if (!strcmp (s, "msgctxt")) |
964 | 2.64k | return PREV_MSGCTXT; |
965 | 19.5k | } |
966 | 2.74M | po_gram_error_at_line (ps->catr, &ps->gram_pos, |
967 | 2.74M | _("keyword \"%s\" unknown"), s); |
968 | 2.74M | return NAME; |
969 | 2.97M | } |
970 | | |
971 | | |
972 | | static int |
973 | | control_sequence (struct po_parser_state *ps) |
974 | 87.4k | { |
975 | 87.4k | mbchar_t mbc; |
976 | 87.4k | int val; |
977 | 87.4k | int max; |
978 | | |
979 | 87.4k | lex_getc (ps, mbc); |
980 | 87.4k | if (mb_len (mbc) == 1) |
981 | 87.1k | switch (mb_ptr (mbc) [0]) |
982 | 87.1k | { |
983 | 346 | case 'n': |
984 | 346 | return '\n'; |
985 | | |
986 | 1.17k | case 't': |
987 | 1.17k | return '\t'; |
988 | | |
989 | 897 | case 'b': |
990 | 897 | return '\b'; |
991 | | |
992 | 3.86k | case 'r': |
993 | 3.86k | return '\r'; |
994 | | |
995 | 2.62k | case 'f': |
996 | 2.62k | return '\f'; |
997 | | |
998 | 275 | case 'v': |
999 | 275 | return '\v'; |
1000 | | |
1001 | 309 | case 'a': |
1002 | 309 | return '\a'; |
1003 | | |
1004 | 20.0k | case '\\': |
1005 | 20.6k | case '"': |
1006 | 20.6k | return mb_ptr (mbc) [0]; |
1007 | | |
1008 | 6.18k | case '0': case '1': case '2': case '3': |
1009 | 8.05k | case '4': case '5': case '6': case '7': |
1010 | 8.05k | val = 0; |
1011 | 8.05k | max = 0; |
1012 | 8.05k | for (;;) |
1013 | 16.2k | { |
1014 | 16.2k | char c = mb_ptr (mbc) [0]; |
1015 | | /* Warning: not portable, can't depend on '0'..'7' ordering. */ |
1016 | 16.2k | val = val * 8 + (c - '0'); |
1017 | 16.2k | if (++max == 3) |
1018 | 3.63k | break; |
1019 | 12.5k | lex_getc (ps, mbc); |
1020 | 12.5k | if (mb_len (mbc) == 1) |
1021 | 12.5k | switch (mb_ptr (mbc) [0]) |
1022 | 12.5k | { |
1023 | 6.72k | case '0': case '1': case '2': case '3': |
1024 | 8.15k | case '4': case '5': case '6': case '7': |
1025 | 8.15k | continue; |
1026 | | |
1027 | 4.39k | default: |
1028 | 4.39k | break; |
1029 | 12.5k | } |
1030 | 4.42k | lex_ungetc (ps, mbc); |
1031 | 4.42k | break; |
1032 | 12.5k | } |
1033 | 8.05k | return val; |
1034 | | |
1035 | 26.8k | case 'x': |
1036 | 26.8k | lex_getc (ps, mbc); |
1037 | 26.8k | if (mb_iseof (mbc) || mb_len (mbc) != 1 |
1038 | 26.7k | || !c_isxdigit (mb_ptr (mbc) [0])) |
1039 | 5.84k | break; |
1040 | | |
1041 | 21.0k | val = 0; |
1042 | 21.0k | for (;;) |
1043 | 102k | { |
1044 | 102k | char c = mb_ptr (mbc) [0]; |
1045 | 102k | val *= 16; |
1046 | 102k | if (c_isdigit (c)) |
1047 | | /* Warning: not portable, can't depend on '0'..'9' ordering */ |
1048 | 68.3k | val += c - '0'; |
1049 | 34.2k | else if (c_isupper (c)) |
1050 | | /* Warning: not portable, can't depend on 'A'..'F' ordering */ |
1051 | 16.4k | val += c - 'A' + 10; |
1052 | 17.7k | else |
1053 | | /* Warning: not portable, can't depend on 'a'..'f' ordering */ |
1054 | 17.7k | val += c - 'a' + 10; |
1055 | | |
1056 | 102k | lex_getc (ps, mbc); |
1057 | 102k | if (mb_len (mbc) == 1) |
1058 | 102k | switch (mb_ptr (mbc) [0]) |
1059 | 102k | { |
1060 | 41.0k | case '0': case '1': case '2': case '3': case '4': |
1061 | 58.9k | case '5': case '6': case '7': case '8': case '9': |
1062 | 74.2k | case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': |
1063 | 81.5k | case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': |
1064 | 81.5k | continue; |
1065 | | |
1066 | 20.8k | default: |
1067 | 20.8k | break; |
1068 | 102k | } |
1069 | 21.0k | lex_ungetc (ps, mbc); |
1070 | 21.0k | break; |
1071 | 102k | } |
1072 | 21.0k | return val; |
1073 | | |
1074 | | /* FIXME: \u and \U are not handled. */ |
1075 | 87.1k | } |
1076 | 28.2k | lex_ungetc (ps, mbc); |
1077 | 28.2k | po_gram_error (ps, _("invalid control sequence")); |
1078 | 28.2k | return ' '; |
1079 | 87.4k | } |
1080 | | |
1081 | | |
1082 | | /* Return the next token in the PO file. The return codes are defined |
1083 | | in "read-po-gram.h". Associated data is put in 'po_gram_lval'. */ |
1084 | | int |
1085 | | po_gram_lex (union PO_GRAM_STYPE *lval, struct po_parser_state *ps) |
1086 | 40.2M | { |
1087 | | /* Cache ps->buf and ps->bufmax in local variables. */ |
1088 | 40.2M | char *buf = ps->buf; |
1089 | 40.2M | size_t bufmax = ps->bufmax; |
1090 | | |
1091 | 40.2M | mbchar_t mbc; |
1092 | 40.2M | size_t bufpos; |
1093 | | |
1094 | 40.2M | for (;;) |
1095 | 43.5M | { |
1096 | 43.5M | lex_getc (ps, mbc); |
1097 | | |
1098 | 43.5M | if (mb_iseof (mbc)) |
1099 | | /* Yacc want this for end of file. */ |
1100 | 8.54k | return 0; |
1101 | | |
1102 | 43.5M | if (mb_len (mbc) == 1) |
1103 | 43.4M | switch (mb_ptr (mbc) [0]) |
1104 | 43.4M | { |
1105 | 648k | case '\n': |
1106 | 648k | ps->po_lex_obsolete = false; |
1107 | 648k | ps->po_lex_previous = false; |
1108 | | /* Ignore whitespace, not relevant for the grammar. */ |
1109 | 648k | break; |
1110 | | |
1111 | 576k | case ' ': |
1112 | 2.56M | case '\t': |
1113 | 2.62M | case '\r': |
1114 | 2.66M | case '\f': |
1115 | 2.69M | case '\v': |
1116 | | /* Ignore whitespace, not relevant for the grammar. */ |
1117 | 2.69M | break; |
1118 | | |
1119 | 796k | case '#': |
1120 | 796k | lex_getc (ps, mbc); |
1121 | 796k | if (mb_iseq (mbc, '~')) |
1122 | | /* A pseudo-comment beginning with #~ is found. This is |
1123 | | not a comment. It is the format for obsolete entries. |
1124 | | We simply discard the "#~" prefix. The following |
1125 | | characters are expected to be well formed. */ |
1126 | 6.40k | { |
1127 | 6.40k | ps->po_lex_obsolete = true; |
1128 | | /* A pseudo-comment beginning with #~| denotes a previous |
1129 | | untranslated string in an obsolete entry. This does not |
1130 | | make much sense semantically, and is implemented here |
1131 | | for completeness only. */ |
1132 | 6.40k | lex_getc (ps, mbc); |
1133 | 6.40k | if (mb_iseq (mbc, '|')) |
1134 | 95 | ps->po_lex_previous = true; |
1135 | 6.31k | else |
1136 | 6.31k | lex_ungetc (ps, mbc); |
1137 | 6.40k | break; |
1138 | 6.40k | } |
1139 | 789k | if (mb_iseq (mbc, '|')) |
1140 | | /* A pseudo-comment beginning with #| is found. This is |
1141 | | the previous untranslated string. We discard the "#|" |
1142 | | prefix, but change the keywords and string returns |
1143 | | accordingly. */ |
1144 | 6.74k | { |
1145 | 6.74k | ps->po_lex_previous = true; |
1146 | 6.74k | break; |
1147 | 6.74k | } |
1148 | | |
1149 | | /* Accumulate comments into a buffer. If we have been asked |
1150 | | to pass comments, generate a COMMENT token, otherwise |
1151 | | discard it. */ |
1152 | 783k | ps->signal_eilseq = false; |
1153 | 783k | if (ps->catr->pass_comments) |
1154 | 783k | { |
1155 | 783k | bufpos = 0; |
1156 | 783k | for (;;) |
1157 | 32.2M | { |
1158 | 32.3M | while (bufpos + mb_len (mbc) >= bufmax) |
1159 | 134k | { |
1160 | 134k | bufmax += 100; |
1161 | 134k | buf = xrealloc (buf, bufmax); |
1162 | 134k | ps->bufmax = bufmax; |
1163 | 134k | ps->buf = buf; |
1164 | 134k | } |
1165 | 32.2M | if (mb_iseof (mbc) || mb_iseq (mbc, '\n')) |
1166 | 783k | break; |
1167 | | |
1168 | 31.4M | memcpy_small (&buf[bufpos], mb_ptr (mbc), mb_len (mbc)); |
1169 | 31.4M | bufpos += mb_len (mbc); |
1170 | | |
1171 | 31.4M | lex_getc (ps, mbc); |
1172 | 31.4M | } |
1173 | 783k | buf[bufpos] = '\0'; |
1174 | | |
1175 | 783k | lval->string.string = buf; |
1176 | 783k | lval->string.pos = ps->gram_pos; |
1177 | 783k | lval->string.obsolete = ps->po_lex_obsolete; |
1178 | 783k | ps->po_lex_obsolete = false; |
1179 | 783k | ps->signal_eilseq = true; |
1180 | 783k | return COMMENT; |
1181 | 783k | } |
1182 | 0 | else |
1183 | 0 | { |
1184 | | /* We do this in separate loop because collecting large |
1185 | | comments while they get not passed to the upper layers |
1186 | | is not very efficient. */ |
1187 | 0 | while (!mb_iseof (mbc) && !mb_iseq (mbc, '\n')) |
1188 | 0 | lex_getc (ps, mbc); |
1189 | 0 | ps->po_lex_obsolete = false; |
1190 | 0 | ps->signal_eilseq = true; |
1191 | 0 | } |
1192 | 0 | break; |
1193 | | |
1194 | 633k | case '"': |
1195 | | /* Accumulate a string. */ |
1196 | 633k | bufpos = 0; |
1197 | 633k | for (;;) |
1198 | 29.0M | { |
1199 | 29.0M | lex_getc (ps, mbc); |
1200 | 29.1M | while (bufpos + mb_len (mbc) >= bufmax) |
1201 | 107k | { |
1202 | 107k | bufmax += 100; |
1203 | 107k | buf = xrealloc (buf, bufmax); |
1204 | 107k | ps->bufmax = bufmax; |
1205 | 107k | ps->buf = buf; |
1206 | 107k | } |
1207 | 29.0M | if (mb_iseof (mbc)) |
1208 | 1.89k | { |
1209 | 1.89k | po_gram_error_at_line (ps->catr, &ps->gram_pos, |
1210 | 1.89k | _("end-of-file within string")); |
1211 | 1.89k | break; |
1212 | 1.89k | } |
1213 | 29.0M | if (mb_iseq (mbc, '\n')) |
1214 | 195k | { |
1215 | 195k | po_gram_error_at_line (ps->catr, &ps->gram_pos, |
1216 | 195k | _("end-of-line within string")); |
1217 | 195k | break; |
1218 | 195k | } |
1219 | 28.8M | if (mb_iseq (mbc, '"')) |
1220 | 435k | break; |
1221 | 28.3M | if (mb_iseq (mbc, '\\')) |
1222 | 87.4k | { |
1223 | 87.4k | buf[bufpos++] = control_sequence (ps); |
1224 | 87.4k | continue; |
1225 | 87.4k | } |
1226 | | |
1227 | | /* Add mbc to the accumulator. */ |
1228 | 28.2M | memcpy_small (&buf[bufpos], mb_ptr (mbc), mb_len (mbc)); |
1229 | 28.2M | bufpos += mb_len (mbc); |
1230 | 28.2M | } |
1231 | 633k | buf[bufpos] = '\0'; |
1232 | | |
1233 | | /* Strings cannot contain the msgctxt separator, because it cannot |
1234 | | be faithfully represented in the msgid of a .mo file. */ |
1235 | 633k | if (strchr (buf, MSGCTXT_SEPARATOR) != NULL) |
1236 | 11.1k | po_gram_error_at_line (ps->catr, &ps->gram_pos, |
1237 | 11.1k | _("context separator <EOT> within string")); |
1238 | | |
1239 | | /* FIXME: Treatment of embedded \000 chars is incorrect. */ |
1240 | 633k | lval->string.string = xstrdup (buf); |
1241 | 633k | lval->string.pos = ps->gram_pos; |
1242 | 633k | lval->string.obsolete = ps->po_lex_obsolete; |
1243 | 633k | return (ps->po_lex_previous ? PREV_STRING : STRING); |
1244 | | |
1245 | 499k | case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': |
1246 | 737k | case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': |
1247 | 1.42M | case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': |
1248 | 1.69M | case 's': case 't': case 'u': case 'v': case 'w': case 'x': |
1249 | 1.76M | case 'y': case 'z': |
1250 | 2.09M | case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': |
1251 | 2.32M | case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': |
1252 | 2.57M | case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': |
1253 | 2.84M | case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': |
1254 | 2.89M | case 'Y': case 'Z': |
1255 | 2.97M | case '_': case '$': |
1256 | 2.97M | bufpos = 0; |
1257 | 2.97M | for (;;) |
1258 | 19.6M | { |
1259 | 19.6M | char c = mb_ptr (mbc) [0]; |
1260 | 19.6M | if (bufpos + 1 >= bufmax) |
1261 | 52.7k | { |
1262 | 52.7k | bufmax += 100; |
1263 | 52.7k | buf = xrealloc (buf, bufmax); |
1264 | 52.7k | ps->bufmax = bufmax; |
1265 | 52.7k | ps->buf = buf; |
1266 | 52.7k | } |
1267 | 19.6M | buf[bufpos++] = c; |
1268 | 19.6M | lex_getc (ps, mbc); |
1269 | 19.6M | if (mb_len (mbc) == 1) |
1270 | 19.6M | switch (mb_ptr (mbc) [0]) |
1271 | 19.6M | { |
1272 | 2.94M | default: |
1273 | 2.94M | break; |
1274 | 3.44M | case 'a': case 'b': case 'c': case 'd': case 'e': |
1275 | 4.24M | case 'f': case 'g': case 'h': case 'i': case 'j': |
1276 | 4.81M | case 'k': case 'l': case 'm': case 'n': case 'o': |
1277 | 6.07M | case 'p': case 'q': case 'r': case 's': case 't': |
1278 | 7.03M | case 'u': case 'v': case 'w': case 'x': case 'y': |
1279 | 7.42M | case 'z': |
1280 | 7.70M | case 'A': case 'B': case 'C': case 'D': case 'E': |
1281 | 8.06M | case 'F': case 'G': case 'H': case 'I': case 'J': |
1282 | 8.61M | case 'K': case 'L': case 'M': case 'N': case 'O': |
1283 | 9.65M | case 'P': case 'Q': case 'R': case 'S': case 'T': |
1284 | 10.2M | case 'U': case 'V': case 'W': case 'X': case 'Y': |
1285 | 10.2M | case 'Z': |
1286 | 10.2M | case '_': case '$': |
1287 | 16.4M | case '0': case '1': case '2': case '3': case '4': |
1288 | 16.6M | case '5': case '6': case '7': case '8': case '9': |
1289 | 16.6M | continue; |
1290 | 19.6M | } |
1291 | 2.97M | break; |
1292 | 19.6M | } |
1293 | 2.97M | lex_ungetc (ps, mbc); |
1294 | | |
1295 | 2.97M | buf[bufpos] = '\0'; |
1296 | | |
1297 | 2.97M | { |
1298 | 2.97M | int k = keyword_p (ps, buf); |
1299 | 2.97M | if (k == NAME) |
1300 | 2.74M | { |
1301 | 2.74M | lval->string.string = xstrdup (buf); |
1302 | 2.74M | lval->string.pos = ps->gram_pos; |
1303 | 2.74M | lval->string.obsolete = ps->po_lex_obsolete; |
1304 | 2.74M | } |
1305 | 232k | else |
1306 | 232k | { |
1307 | 232k | lval->pos.pos = ps->gram_pos; |
1308 | 232k | lval->pos.obsolete = ps->po_lex_obsolete; |
1309 | 232k | } |
1310 | 2.97M | return k; |
1311 | 2.97M | } |
1312 | | |
1313 | 713k | case '0': case '1': case '2': case '3': case '4': |
1314 | 933k | case '5': case '6': case '7': case '8': case '9': |
1315 | 933k | bufpos = 0; |
1316 | 933k | for (;;) |
1317 | 6.60M | { |
1318 | 6.60M | char c = mb_ptr (mbc) [0]; |
1319 | 6.60M | if (bufpos + 1 >= bufmax) |
1320 | 32.5k | { |
1321 | 32.5k | bufmax += 100; |
1322 | 32.5k | buf = xrealloc (buf, bufmax + 1); |
1323 | 32.5k | ps->bufmax = bufmax; |
1324 | 32.5k | ps->buf = buf; |
1325 | 32.5k | } |
1326 | 6.60M | buf[bufpos++] = c; |
1327 | 6.60M | lex_getc (ps, mbc); |
1328 | 6.60M | if (mb_len (mbc) == 1) |
1329 | 6.60M | switch (mb_ptr (mbc) [0]) |
1330 | 6.60M | { |
1331 | 927k | default: |
1332 | 927k | break; |
1333 | | |
1334 | 4.55M | case '0': case '1': case '2': case '3': case '4': |
1335 | 5.67M | case '5': case '6': case '7': case '8': case '9': |
1336 | 5.67M | continue; |
1337 | 6.60M | } |
1338 | 933k | break; |
1339 | 6.60M | } |
1340 | 933k | lex_ungetc (ps, mbc); |
1341 | | |
1342 | 933k | buf[bufpos] = '\0'; |
1343 | | |
1344 | 933k | lval->number.number = atol (buf); |
1345 | 933k | lval->number.pos = ps->gram_pos; |
1346 | 933k | lval->number.obsolete = ps->po_lex_obsolete; |
1347 | 933k | return NUMBER; |
1348 | | |
1349 | 68.1k | case '[': |
1350 | 68.1k | lval->pos.pos = ps->gram_pos; |
1351 | 68.1k | lval->pos.obsolete = ps->po_lex_obsolete; |
1352 | 68.1k | return '['; |
1353 | | |
1354 | 39.0k | case ']': |
1355 | 39.0k | lval->pos.pos = ps->gram_pos; |
1356 | 39.0k | lval->pos.obsolete = ps->po_lex_obsolete; |
1357 | 39.0k | return ']'; |
1358 | | |
1359 | 34.6M | default: |
1360 | | /* This will cause a syntax error. */ |
1361 | 34.6M | return JUNK; |
1362 | 43.4M | } |
1363 | 124k | else |
1364 | | /* This will cause a syntax error. */ |
1365 | 124k | return JUNK; |
1366 | 43.5M | } |
1367 | 40.2M | } |