/src/binutils-gdb/gas/app.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* This is the Assembler Pre-Processor |
2 | | Copyright (C) 1987-2025 Free Software Foundation, Inc. |
3 | | |
4 | | This file is part of GAS, the GNU Assembler. |
5 | | |
6 | | GAS is free software; you can redistribute it and/or modify |
7 | | it under the terms of the GNU General Public License as published by |
8 | | the Free Software Foundation; either version 3, or (at your option) |
9 | | any later version. |
10 | | |
11 | | GAS is distributed in the hope that it will be useful, but WITHOUT |
12 | | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
13 | | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public |
14 | | License for more details. |
15 | | |
16 | | You should have received a copy of the GNU General Public License |
17 | | along with GAS; see the file COPYING. If not, write to the Free |
18 | | Software Foundation, 51 Franklin Street - Fifth Floor, Boston, MA |
19 | | 02110-1301, USA. */ |
20 | | |
21 | | /* Modified by Allen Wirfs-Brock, Instantiations Inc 2/90. */ |
22 | | /* App, the assembler pre-processor. This pre-processor strips out |
23 | | excess spaces, turns single-quoted characters into a decimal |
24 | | constant, and turns the # in # <number> <filename> <garbage> into a |
25 | | .linefile. This needs better error-handling. */ |
26 | | |
27 | | #include "as.h" |
28 | | |
/* Compilers that do not define __STDC__ as 1 get `const' defined away,
   unless `const' is already a macro.  */
29 | | #if (__STDC__ != 1) |
30 | | #ifndef const |
31 | | #define const /* empty */ |
32 | | #endif |
33 | | #endif |
34 | | |
/* When nonzero at the time do_scrub_begin runs, 'h' and 'H' are
   classified as LEX_IS_H instead of as ordinary symbol components.  */
35 | | #ifdef H_TICK_HEX |
36 | | int enable_h_tick_hex = 0; |
37 | | #endif |
38 | | |
39 | | #ifdef TC_M68K |
40 | | /* Whether we are scrubbing in m68k MRI mode. This is different from |
41 | | flag_m68k_mri, because the two flags will be affected by the .mri |
42 | | pseudo-op at different times. */ |
43 | | static int scrub_m68k_mri; |
44 | | |
45 | | /* The pseudo-op which switches in and out of MRI mode. See the |
46 | | comment in do_scrub_chars. */ |
47 | | static const char mri_pseudo[] = ".mri 0"; |
48 | | static const char *mri_state; |
49 | | static char mri_last_ch; |
50 | | #else |
/* Without m68k support MRI scrubbing is never active; making this a
   constant lets tests such as `scrub_m68k_mri && ...' fold away.  */
51 | 1.05M | #define scrub_m68k_mri 0 |
52 | | #endif |
53 | | |
54 | | #if defined TC_ARM && defined OBJ_ELF |
55 | | /* The pseudo-op for which we need to special-case `@' characters. |
56 | | See the comment in do_scrub_chars. */ |
57 | | static const char symver_pseudo[] = ".symver"; |
58 | | static const char * symver_state; |
59 | | #endif |
60 | | |
61 | | /* The pseudo-op (without leading dot) at which we want to (perhaps just |
62 | | temporarily) stop processing. See the comments in do_scrub_chars(). */ |
63 | | static const char end_pseudo[] = "end "; |
/* NULL while no match is in progress; otherwise points at the next
   character of end_pseudo that do_scrub_chars expects to see.  */
64 | | static const char * end_state; |
65 | | |
66 | | /* Whether, considering the state at start of assembly, NO_PSEUDO_DOT is |
67 | | active. */ |
68 | | static bool no_pseudo_dot; |
69 | | |
/* Part of the scrubber state that app_push saves and app_pop restores.  */
70 | | static char last_char; |
71 | | |
/* Lexical classes stored in the lex[] table below.  A class of zero means
   the character needs no special treatment by the scrubber.  The values
   7 and 12..15 are either unused or only defined for particular targets.  */
#define LEX_IS_SYMBOL_COMPONENT		1
#define LEX_IS_WHITESPACE		2
#define LEX_IS_LINE_SEPARATOR		3
#define LEX_IS_COMMENT_START		4
#define LEX_IS_LINE_COMMENT_START	5
#define LEX_IS_TWOCHAR_COMMENT_1ST	6
#define LEX_IS_STRINGQUOTE		8
#define LEX_IS_COLON			9
#define LEX_IS_NEWLINE			10
#define LEX_IS_ONECHAR_QUOTE		11
#ifdef TC_V850
#define LEX_IS_DOUBLEDASH_1ST		12
#endif
#ifdef DOUBLEBAR_PARALLEL
#define LEX_IS_DOUBLEBAR_1ST		13
#endif
#define LEX_IS_PARALLEL_SEPARATOR	14
#ifdef H_TICK_HEX
#define LEX_IS_H			15
#endif

/* Class predicates.  C is used directly as an index into lex[], so it
   must be in the range 0..255.  */
#define IS_SYMBOL_COMPONENT(c)		(lex[c] == LEX_IS_SYMBOL_COMPONENT)
#define IS_WHITESPACE(c)		(lex[c] == LEX_IS_WHITESPACE)
#define IS_LINE_SEPARATOR(c)		(lex[c] == LEX_IS_LINE_SEPARATOR)
#define IS_PARALLEL_SEPARATOR(c)	(lex[c] == LEX_IS_PARALLEL_SEPARATOR)
#define IS_COMMENT(c)			(lex[c] == LEX_IS_COMMENT_START)
#define IS_LINE_COMMENT(c)		(lex[c] == LEX_IS_LINE_COMMENT_START)
#define IS_TWOCHAR_COMMENT_1ST(c)	(lex[c] == LEX_IS_TWOCHAR_COMMENT_1ST)
#define IS_NEWLINE(c)			(lex[c] == LEX_IS_NEWLINE)

/* Temporary helpers for the initializer; all are #undef'd again below.
   LEX_SYM<N> (c) marks the N consecutive codes starting at C.  */
#define LEX_SYM(c)		[c] = LEX_IS_SYMBOL_COMPONENT
#define LEX_LETTER(up, lo)	LEX_SYM (up), LEX_SYM (lo)
#define LEX_SYM2(c)		LEX_SYM (c), LEX_SYM ((c) + 1)
#define LEX_SYM4(c)		LEX_SYM2 (c), LEX_SYM2 ((c) + 2)
#define LEX_SYM8(c)		LEX_SYM4 (c), LEX_SYM4 ((c) + 4)
#define LEX_SYM16(c)		LEX_SYM8 (c), LEX_SYM8 ((c) + 8)
#define LEX_SYM32(c)		LEX_SYM16 (c), LEX_SYM16 ((c) + 16)
#define LEX_SYM64(c)		LEX_SYM32 (c), LEX_SYM32 ((c) + 32)
#define LEX_SYM128(c)		LEX_SYM64 (c), LEX_SYM64 ((c) + 64)

/* Static defaults for the character classes.  do_scrub_begin layers the
   target specific quote, comment and separator characters on top.  */
static char lex[256] = {
  /* Blanks and line structure.  */
  [' '] = LEX_IS_WHITESPACE,
  ['\t'] = LEX_IS_WHITESPACE,
#ifdef CR_EOL
  ['\r'] = LEX_IS_LINE_SEPARATOR,
#else
  ['\r'] = LEX_IS_WHITESPACE,
#endif
  ['\n'] = LEX_IS_NEWLINE,
  [':'] = LEX_IS_COLON,

  /* Punctuation that may appear in symbol names.  */
  LEX_SYM ('$'),
  LEX_SYM ('.'),
  LEX_SYM ('_'),

  /* Letters, spelled out individually rather than as a code range.  */
  LEX_LETTER ('A', 'a'), LEX_LETTER ('B', 'b'), LEX_LETTER ('C', 'c'),
  LEX_LETTER ('D', 'd'), LEX_LETTER ('E', 'e'), LEX_LETTER ('F', 'f'),
  LEX_LETTER ('G', 'g'), LEX_LETTER ('H', 'h'), LEX_LETTER ('I', 'i'),
  LEX_LETTER ('J', 'j'), LEX_LETTER ('K', 'k'), LEX_LETTER ('L', 'l'),
  LEX_LETTER ('M', 'm'), LEX_LETTER ('N', 'n'), LEX_LETTER ('O', 'o'),
  LEX_LETTER ('P', 'p'), LEX_LETTER ('Q', 'q'), LEX_LETTER ('R', 'r'),
  LEX_LETTER ('S', 's'), LEX_LETTER ('T', 't'), LEX_LETTER ('U', 'u'),
  LEX_LETTER ('V', 'v'), LEX_LETTER ('W', 'w'), LEX_LETTER ('X', 'x'),
  LEX_LETTER ('Y', 'y'), LEX_LETTER ('Z', 'z'),

  /* Decimal digits.  */
  LEX_SYM ('0'), LEX_SYM ('1'), LEX_SYM ('2'), LEX_SYM ('3'), LEX_SYM ('4'),
  LEX_SYM ('5'), LEX_SYM ('6'), LEX_SYM ('7'), LEX_SYM ('8'), LEX_SYM ('9'),

  /* Every code with the top bit set (128..255) is a symbol component.  */
  LEX_SYM128 (128),
};

#undef LEX_SYM128
#undef LEX_SYM64
#undef LEX_SYM32
#undef LEX_SYM16
#undef LEX_SYM8
#undef LEX_SYM4
#undef LEX_SYM2
#undef LEX_LETTER
#undef LEX_SYM
167 | | |
168 | | void |
169 | | do_scrub_begin (int m68k_mri ATTRIBUTE_UNUSED) |
170 | 28 | { |
171 | 28 | const char *p; |
172 | | |
173 | | /* Latch this once at start. xtensa uses a hook function, yet context isn't |
174 | | meaningful for scrubbing (or else we'd need to sync scrubber behavior as |
175 | | state changes). */ |
176 | 28 | if (lex['/'] == 0) |
177 | 1 | no_pseudo_dot = NO_PSEUDO_DOT; |
178 | | |
179 | | #ifdef TC_M68K |
180 | | scrub_m68k_mri = m68k_mri; |
181 | | |
182 | | if (! m68k_mri) |
183 | | #endif |
184 | 28 | { |
185 | 28 | lex['"'] = LEX_IS_STRINGQUOTE; |
186 | | |
187 | 28 | #if ! defined (TC_HPPA) |
188 | 28 | lex['\''] = LEX_IS_ONECHAR_QUOTE; |
189 | 28 | #endif |
190 | | |
191 | | #ifdef SINGLE_QUOTE_STRINGS |
192 | | lex['\''] = LEX_IS_STRINGQUOTE; |
193 | | #endif |
194 | 28 | } |
195 | | |
196 | | /* Note: if any other character can be LEX_IS_STRINGQUOTE, the loop |
197 | | in state 5 of do_scrub_chars must be changed. */ |
198 | | |
199 | | /* Note that these override the previous defaults, e.g. if ';' is a |
200 | | comment char, then it isn't a line separator. */ |
201 | | |
202 | 28 | #ifdef tc_symbol_chars |
203 | | /* This macro permits the processor to specify all characters which |
204 | | may appears in an operand. This will prevent the scrubber from |
205 | | discarding meaningful whitespace in certain cases. The i386 |
206 | | backend uses this to support prefixes, which can confuse the |
207 | | scrubber as to whether it is parsing operands or opcodes. */ |
208 | 168 | for (p = tc_symbol_chars; *p; ++p) |
209 | 140 | lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT; |
210 | 28 | #endif |
211 | | |
212 | | /* The m68k backend wants to be able to change comment_chars. */ |
213 | | #ifndef tc_comment_chars |
214 | | #define tc_comment_chars comment_chars |
215 | | #endif |
216 | 56 | for (p = tc_comment_chars; *p; p++) |
217 | 28 | lex[(unsigned char) *p] = LEX_IS_COMMENT_START; |
218 | | |
219 | | /* While counter intuitive to have more special purpose line comment chars |
220 | | override more general purpose ordinary ones, logic in do_scrub_chars() |
221 | | depends on this ordering. */ |
222 | 84 | for (p = line_comment_chars; *p; p++) |
223 | 56 | lex[(unsigned char) *p] = LEX_IS_LINE_COMMENT_START; |
224 | | |
225 | 28 | #ifndef tc_line_separator_chars |
226 | 28 | #define tc_line_separator_chars line_separator_chars |
227 | 28 | #endif |
228 | 56 | for (p = tc_line_separator_chars; *p; p++) |
229 | 28 | lex[(unsigned char) *p] = LEX_IS_LINE_SEPARATOR; |
230 | | |
231 | | #ifdef tc_parallel_separator_chars |
232 | | /* This macro permits the processor to specify all characters which |
233 | | separate parallel insns on the same line. */ |
234 | | for (p = tc_parallel_separator_chars; *p; p++) |
235 | | lex[(unsigned char) *p] = LEX_IS_PARALLEL_SEPARATOR; |
236 | | #endif |
237 | | |
238 | | /* Only allow slash-star comments if slash is not in use. Certain |
239 | | other cases are dealt with in LEX_IS_LINE_COMMENT_START handling. |
240 | | FIXME: This isn't right. We should always permit them. */ |
241 | 28 | if (lex['/'] == 0) |
242 | 0 | lex['/'] = LEX_IS_TWOCHAR_COMMENT_1ST; |
243 | | |
244 | | #ifdef TC_M68K |
245 | | if (m68k_mri) |
246 | | { |
247 | | lex['\''] = LEX_IS_STRINGQUOTE; |
248 | | lex[';'] = LEX_IS_COMMENT_START; |
249 | | lex['*'] = LEX_IS_LINE_COMMENT_START; |
250 | | /* The MRI documentation says '!' is LEX_IS_COMMENT_START, but |
251 | | then it can't be used in an expression. */ |
252 | | lex['!'] = LEX_IS_LINE_COMMENT_START; |
253 | | } |
254 | | #endif |
255 | | |
256 | | #ifdef TC_V850 |
257 | | lex['-'] = LEX_IS_DOUBLEDASH_1ST; |
258 | | #endif |
259 | | #ifdef DOUBLEBAR_PARALLEL |
260 | | lex['|'] = LEX_IS_DOUBLEBAR_1ST; |
261 | | #endif |
262 | | #ifdef TC_D30V |
263 | | /* Must do this is we want VLIW instruction with "->" or "<-". */ |
264 | | lex['-'] = LEX_IS_SYMBOL_COMPONENT; |
265 | | #endif |
266 | | |
267 | | #ifdef H_TICK_HEX |
268 | | if (enable_h_tick_hex) |
269 | | { |
270 | | lex['h'] = LEX_IS_H; |
271 | | lex['H'] = LEX_IS_H; |
272 | | } |
273 | | #endif |
274 | 28 | } |
275 | | |
276 | | /* Saved state of the scrubber. */ |
/* Current state of the do_scrub_chars state machine; the state numbers
   are listed in the comment at the top of that function.  */
277 | | static int state; |
/* State to switch to once the string being copied (state 5) or the
   pending out_string (state -1) has been dealt with.  */
278 | | static int old_state; |
/* Text still to be emitted, one character per iteration, in state -1.  */
279 | | static const char *out_string; |
/* NOTE(review): presumably backing storage for out_string; the code that
   fills it is outside what is visible here -- confirm.  */
280 | | static char out_buf[20]; |
/* Incremented each time a backslash-newline is seen inside a string.  */
281 | | static int add_newlines; |
/* When non-NULL, do_scrub_chars resumes from these saved_input_len bytes
   instead of first fetching fresh input.  */
282 | | static char *saved_input; |
283 | | static size_t saved_input_len; |
/* Buffer handed to the `get' callback to be filled with raw input.  */
284 | | static char input_buffer[32 * 1024]; |
285 | | |
286 | | /* Data structure for saving the state of app across #include's. Note that |
287 | | app is called asynchronously to the parsing of the .include's, so our |
288 | | state at the time .include is interpreted is completely unrelated. |
289 | | That's why we have to save it all. */ |
290 | | |
/* One snapshot of the scrubber state, filled in by app_push and consumed
   by app_pop.  Each member mirrors the file-scope variable of the same
   name.  */
291 | | struct app_save |
292 | | { |
293 | | int state; |
294 | | int old_state; |
295 | | const char * out_string; |
296 | | char out_buf[sizeof (out_buf)]; |
297 | | int add_newlines; |
/* NULL, or a heap copy of the pending input made by app_push and freed
   by app_pop.  saved_input_len is only set when this is non-NULL.  */
298 | | char * saved_input; |
299 | | size_t saved_input_len; |
300 | | const char * end_state; |
301 | | #ifdef TC_M68K |
302 | | int scrub_m68k_mri; |
303 | | const char * mri_state; |
304 | | char mri_last_ch; |
305 | | #endif |
306 | | #if defined TC_ARM && defined OBJ_ELF |
307 | | const char * symver_state; |
308 | | #endif |
309 | | char last_char; |
310 | | }; |
311 | | |
312 | | char * |
313 | | app_push (void) |
314 | 1.38k | { |
315 | 1.38k | struct app_save *saved; |
316 | | |
317 | 1.38k | saved = XNEW (struct app_save); |
318 | 1.38k | saved->state = state; |
319 | 1.38k | saved->old_state = old_state; |
320 | 1.38k | saved->out_string = out_string; |
321 | 1.38k | memcpy (saved->out_buf, out_buf, sizeof (out_buf)); |
322 | 1.38k | saved->add_newlines = add_newlines; |
323 | 1.38k | if (saved_input == NULL) |
324 | 1.04k | saved->saved_input = NULL; |
325 | 337 | else |
326 | 337 | { |
327 | 337 | saved->saved_input = XNEWVEC (char, saved_input_len); |
328 | 337 | memcpy (saved->saved_input, saved_input, saved_input_len); |
329 | 337 | saved->saved_input_len = saved_input_len; |
330 | 337 | } |
331 | 1.38k | saved->end_state = end_state; |
332 | | #ifdef TC_M68K |
333 | | saved->scrub_m68k_mri = scrub_m68k_mri; |
334 | | saved->mri_state = mri_state; |
335 | | saved->mri_last_ch = mri_last_ch; |
336 | | #endif |
337 | | #if defined TC_ARM && defined OBJ_ELF |
338 | | saved->symver_state = symver_state; |
339 | | #endif |
340 | 1.38k | saved->last_char = last_char; |
341 | | |
342 | | /* do_scrub_begin() is not useful, just wastes time. */ |
343 | | |
344 | 1.38k | state = 0; |
345 | 1.38k | saved_input = NULL; |
346 | 1.38k | add_newlines = 0; |
347 | | |
348 | 1.38k | return (char *) saved; |
349 | 1.38k | } |
350 | | |
351 | | void |
352 | | app_pop (char *arg) |
353 | 1.38k | { |
354 | 1.38k | struct app_save *saved = (struct app_save *) arg; |
355 | | |
356 | | /* There is no do_scrub_end (). */ |
357 | 1.38k | state = saved->state; |
358 | 1.38k | old_state = saved->old_state; |
359 | 1.38k | out_string = saved->out_string; |
360 | 1.38k | memcpy (out_buf, saved->out_buf, sizeof (out_buf)); |
361 | 1.38k | add_newlines = saved->add_newlines; |
362 | 1.38k | if (saved->saved_input == NULL) |
363 | 1.04k | saved_input = NULL; |
364 | 337 | else |
365 | 337 | { |
366 | 337 | gas_assert (saved->saved_input_len <= sizeof (input_buffer)); |
367 | 337 | memcpy (input_buffer, saved->saved_input, saved->saved_input_len); |
368 | 337 | saved_input = input_buffer; |
369 | 337 | saved_input_len = saved->saved_input_len; |
370 | 337 | free (saved->saved_input); |
371 | 337 | } |
372 | 1.38k | end_state = saved->end_state; |
373 | | #ifdef TC_M68K |
374 | | scrub_m68k_mri = saved->scrub_m68k_mri; |
375 | | mri_state = saved->mri_state; |
376 | | mri_last_ch = saved->mri_last_ch; |
377 | | #endif |
378 | | #if defined TC_ARM && defined OBJ_ELF |
379 | | symver_state = saved->symver_state; |
380 | | #endif |
381 | 1.38k | last_char = saved->last_char; |
382 | | |
383 | 1.38k | free (arg); |
384 | 1.38k | } |
385 | | |
386 | | /* @@ This assumes that \n &c are the same on host and target. This is not |
387 | | necessarily true. */ |
388 | | |
/* Translate CH, the character following a backslash, into the character
   that the escape sequence denotes.  Only the escapes listed in the table
   are translated; any other value, including EOF, is returned
   unchanged.  */

static int
process_escape (int ch)
{
  static const struct
  {
    char letter;	/* Character after the backslash.  */
    char value;		/* What the escape stands for.  */
  }
  escapes[] =
  {
    { 'b', '\b' },
    { 'f', '\f' },
    { 'n', '\n' },
    { 'r', '\r' },
    { 't', '\t' },
    { '\'', '\'' },
    { '"', '\"' },
  };
  size_t i;

  for (i = 0; i < sizeof (escapes) / sizeof (escapes[0]); i++)
    if (ch == escapes[i].letter)
      return escapes[i].value;

  return ch;
}
412 | | |
/* Upper bound on the number of multibyte character warnings issued over
   the whole run, and the number issued so far.  */
#define MULTIBYTE_WARN_COUNT_LIMIT 10
static unsigned int multibyte_warn_count = 0;

/* Look for bytes with the top bit set in the half-open range [START, END).

   If WARN is false, return true as soon as one such byte is found and
   false if there is none; nothing is reported.

   If WARN is true, issue a warning for each such byte until a total of
   MULTIBYTE_WARN_COUNT_LIMIT warnings has been reached, at which point one
   final "suppressed" notice is given and scanning stops.  Return true if
   at least one warning was issued by this call.  Once the limit has been
   reached, later calls with WARN true return false without scanning.

   An empty or inverted range yields false.  */

bool
scan_for_multibyte_characters (const unsigned char * start,
			       const unsigned char * end,
			       bool warn)
{
  if (end <= start)
    return false;

  /* The counter stops at exactly MULTIBYTE_WARN_COUNT_LIMIT when the
     "suppressed" notice is printed, so this has to be >= rather than >.
     Otherwise the next call would carry on warning, and would then never
     hit the equality test that ends the loop below.  */
  if (warn && multibyte_warn_count >= MULTIBYTE_WARN_COUNT_LIMIT)
    return false;

  bool found = false;

  while (start < end)
    {
      unsigned char c = *start++;

      /* Plain 7-bit characters are of no interest.  */
      if (c <= 0x7f)
	continue;

      if (!warn)
	return true;

      found = true;

      const char * filename;
      unsigned int lineno;

      /* Use as much location information as is available.  */
      filename = as_where (& lineno);
      if (filename == NULL)
	as_warn (_("multibyte character (%#x) encountered in input"), c);
      else if (lineno == 0)
	as_warn (_("multibyte character (%#x) encountered in %s"), c, filename);
      else
	as_warn (_("multibyte character (%#x) encountered in %s at or near line %u"), c, filename, lineno);

      if (++ multibyte_warn_count >= MULTIBYTE_WARN_COUNT_LIMIT)
	{
	  as_warn (_("further multibyte character warnings suppressed"));
	  break;
	}
    }

  return found;
}
461 | | |
462 | | /* This function is called to process input characters. The GET |
463 | | parameter is used to retrieve more input characters. GET should |
464 | | set its parameter to point to a buffer, and return the length of |
465 | | the buffer; it should return 0 at end of file. The scrubbed output |
466 | | characters are put into the buffer starting at TOSTART; the TOSTART |
467 | | buffer is TOLEN bytes in length. The function returns the number |
468 | | of scrubbed characters put into TOSTART. This will be TOLEN unless |
469 | | end of file was seen. This function is arranged as a state |
470 | | machine, and saves its state so that it may return at any point. |
471 | | This is the way the old code used to work. */ |
472 | | |
473 | | size_t |
474 | | do_scrub_chars (size_t (*get) (char *, size_t), char *tostart, size_t tolen, |
475 | | bool check_multibyte) |
476 | 1.89k | { |
477 | 1.89k | char *to = tostart; |
478 | 1.89k | char *toend = tostart + tolen; |
479 | 1.89k | char *from; |
480 | 1.89k | char *fromend; |
481 | 1.89k | size_t fromlen; |
482 | 1.89k | int ch, ch2 = 0; |
483 | | /* Character that started the string we're working on. */ |
484 | 1.89k | static char quotechar; |
485 | | |
486 | | /*State 0: beginning of normal line |
487 | | 1: After first whitespace on line (flush more white) |
488 | | 2: After first non-white (opcode) on line (keep 1white) |
489 | | 3: after second white on line (into operands) (flush white) |
490 | | 4: after putting out a .linefile, put out digits |
491 | | 5: parsing a string, then go to old-state |
492 | | 6: putting out \ escape in a "d string. |
493 | | 7: no longer used |
494 | | 8: no longer used |
495 | | 9: After seeing symbol char in state 3 (keep 1white after symchar) |
496 | | 10: After seeing whitespace in state 9 (keep white before symchar) |
497 | | 11: After seeing a symbol character in state 0 (eg a label definition) |
498 | | -1: output string in out_string and go to the state in old_state |
499 | | 12: no longer used |
500 | | #ifdef DOUBLEBAR_PARALLEL |
501 | | 13: After seeing a vertical bar, looking for a second |
502 | | vertical bar as a parallel expression separator. |
503 | | #endif |
504 | | #ifdef TC_PREDICATE_START_CHAR |
505 | | 14: After seeing a predicate start character at state 0, looking |
506 | | for a predicate end character as predicate. |
507 | | 15: After seeing a predicate start character at state 1, looking |
508 | | for a predicate end character as predicate. |
509 | | #endif |
510 | | #ifdef TC_Z80 |
511 | | 16: After seeing an 'a' or an 'A' at the start of a symbol |
512 | | 17: After seeing an 'f' or an 'F' in state 16 |
513 | | #endif |
514 | | */ |
515 | | |
516 | | /* I added states 9 and 10 because the MIPS ECOFF assembler uses |
517 | | constructs like ``.loc 1 20''. This was turning into ``.loc |
518 | | 120''. States 9 and 10 ensure that a space is never dropped in |
519 | | between characters which could appear in an identifier. Ian |
520 | | Taylor, ian@cygnus.com. |
521 | | |
522 | | I added state 11 so that something like "Lfoo add %r25,%r26,%r27" works |
523 | | correctly on the PA (and any other target where colons are optional). |
524 | | Jeff Law, law@cs.utah.edu. |
525 | | |
526 | | I added state 13 so that something like "cmp r1, r2 || trap #1" does not |
527 | | get squashed into "cmp r1,r2||trap#1", with the all important space |
528 | | between the 'trap' and the '#1' being eliminated. nickc@cygnus.com */ |
529 | | |
530 | | /* This macro gets the next input character. */ |
531 | | |
532 | 1.89k | #define GET() \ |
533 | 12.6M | (from < fromend \ |
534 | 12.6M | ? * (unsigned char *) (from++) \ |
535 | 12.6M | : (saved_input = NULL, \ |
536 | 2.54k | fromlen = (*get) (input_buffer, sizeof input_buffer), \ |
537 | 2.54k | from = input_buffer, \ |
538 | 2.54k | fromend = from + fromlen, \ |
539 | 2.54k | (fromlen == 0 \ |
540 | 2.54k | ? EOF \ |
541 | 2.54k | : * (unsigned char *) (from++)))) |
542 | | |
543 | | /* This macro pushes a character back on the input stream. */ |
544 | | |
545 | 437k | #define UNGET(uch) (*--from = (uch)) |
546 | | |
547 | | /* This macro puts a character into the output buffer. If this |
548 | | character fills the output buffer, this macro jumps to the label |
549 | | TOFULL. We use this rather ugly approach because we need to |
550 | | handle two different termination conditions: EOF on the input |
551 | | stream, and a full output buffer. It would be simpler if we |
552 | | always read in the entire input stream before processing it, but |
553 | | I don't want to make such a significant change to the assembler's |
554 | | memory usage. */ |
555 | | |
556 | 1.89k | #define PUT(pch) \ |
557 | 6.80M | do \ |
558 | 6.80M | { \ |
559 | 6.80M | *to++ = (pch); \ |
560 | 6.80M | if (to >= toend) \ |
561 | 6.80M | goto tofull; \ |
562 | 6.80M | } \ |
563 | 6.80M | while (0) |
564 | | |
565 | 1.89k | if (saved_input != NULL) |
566 | 612 | { |
567 | 612 | from = saved_input; |
568 | 612 | fromend = from + saved_input_len; |
569 | 612 | } |
570 | 1.28k | else |
571 | 1.28k | { |
572 | 1.28k | fromlen = (*get) (input_buffer, sizeof input_buffer); |
573 | 1.28k | if (fromlen == 0) |
574 | 27 | return 0; |
575 | 1.25k | from = input_buffer; |
576 | 1.25k | fromend = from + fromlen; |
577 | | |
578 | 1.25k | if (check_multibyte) |
579 | 0 | (void) scan_for_multibyte_characters ((const unsigned char *) from, |
580 | 0 | (const unsigned char* ) fromend, |
581 | 0 | true /* Generate warnings. */); |
582 | 1.25k | } |
583 | | |
584 | 6.08M | while (1) |
585 | 6.08M | { |
586 | | /* The cases in this switch end with continue, in order to |
587 | | branch back to the top of this while loop and generate the |
588 | | next output character in the appropriate state. */ |
589 | 6.08M | switch (state) |
590 | 6.08M | { |
591 | 43.4k | case -1: |
592 | 43.4k | ch = *out_string++; |
593 | 43.4k | if (*out_string == '\0') |
594 | 19.2k | { |
595 | 19.2k | state = old_state; |
596 | 19.2k | old_state = 3; |
597 | 19.2k | } |
598 | 43.4k | PUT (ch); |
599 | 43.4k | continue; |
600 | | |
601 | 43.4k | case 4: |
602 | 6.19k | ch = GET (); |
603 | 6.19k | if (ch == EOF) |
604 | 0 | goto fromeof; |
605 | 6.19k | else if (ch >= '0' && ch <= '9') |
606 | 4.10k | PUT (ch); |
607 | 2.09k | else |
608 | 2.09k | { |
609 | 2.17k | while (ch != EOF && IS_WHITESPACE (ch)) |
610 | 87 | ch = GET (); |
611 | 2.09k | if (ch == '"') |
612 | 1.66k | { |
613 | 1.66k | quotechar = ch; |
614 | 1.66k | state = 5; |
615 | 1.66k | old_state = 3; |
616 | 1.66k | PUT (ch); |
617 | 1.66k | } |
618 | 425 | else |
619 | 425 | { |
620 | 15.2k | while (ch != EOF && ch != '\n') |
621 | 14.8k | ch = GET (); |
622 | 425 | state = 0; |
623 | 425 | PUT (ch); |
624 | 425 | } |
625 | 2.09k | } |
626 | 6.19k | continue; |
627 | | |
628 | 890k | case 5: |
629 | | /* We are going to copy everything up to a quote character, |
630 | | with special handling for a backslash. We try to |
631 | | optimize the copying in the simple case without using the |
632 | | GET and PUT macros. */ |
633 | 890k | { |
634 | 890k | char *s; |
635 | 890k | ptrdiff_t len; |
636 | | |
637 | 14.6M | for (s = from; s < fromend; s++) |
638 | 14.6M | { |
639 | 14.6M | ch = *s; |
640 | 14.6M | if (ch == '\\' |
641 | 14.6M | || ch == quotechar |
642 | 14.6M | || ch == '\n') |
643 | 890k | break; |
644 | 14.6M | } |
645 | 890k | len = s - from; |
646 | 890k | if (len > toend - to) |
647 | 209 | len = toend - to; |
648 | 890k | if (len > 0) |
649 | 221k | { |
650 | 221k | memcpy (to, from, len); |
651 | 221k | to += len; |
652 | 221k | from += len; |
653 | 221k | if (to >= toend) |
654 | 243 | goto tofull; |
655 | 221k | } |
656 | 890k | } |
657 | | |
658 | 890k | ch = GET (); |
659 | 890k | if (ch == EOF) |
660 | 212 | { |
661 | | /* This buffer is here specifically so |
662 | | that the UNGET below will work. */ |
663 | 212 | static char one_char_buf[1]; |
664 | | |
665 | 212 | as_warn (_("end of file in string; '%c' inserted"), quotechar); |
666 | 212 | state = old_state; |
667 | 212 | from = fromend = one_char_buf + 1; |
668 | 212 | fromlen = 1; |
669 | 212 | UNGET ('\n'); |
670 | 212 | PUT (quotechar); |
671 | 212 | } |
672 | 890k | else if (ch == quotechar) |
673 | 452k | { |
674 | 452k | state = old_state; |
675 | 452k | PUT (ch); |
676 | 452k | } |
677 | 437k | else if (TC_STRING_ESCAPES && ch == '\\') |
678 | 22.3k | { |
679 | 22.3k | state = 6; |
680 | 22.3k | PUT (ch); |
681 | 22.3k | } |
682 | 415k | else if (scrub_m68k_mri && ch == '\n') |
683 | 0 | { |
684 | | /* Just quietly terminate the string. This permits lines like |
685 | | bne label loop if we haven't reach end yet. */ |
686 | 0 | state = old_state; |
687 | 0 | UNGET (ch); |
688 | 0 | PUT ('\''); |
689 | 0 | } |
690 | 415k | else |
691 | 415k | { |
692 | 415k | PUT (ch); |
693 | 415k | } |
694 | 890k | continue; |
695 | | |
696 | 890k | case 6: |
697 | 22.3k | state = 5; |
698 | 22.3k | ch = GET (); |
699 | 22.3k | switch (ch) |
700 | 22.3k | { |
701 | | /* Handle strings broken across lines, by turning '\n' into |
702 | | '\\' and 'n'. */ |
703 | 368 | case '\n': |
704 | 368 | UNGET ('n'); |
705 | 368 | add_newlines++; |
706 | 368 | PUT ('\\'); |
707 | 368 | continue; |
708 | | |
709 | 368 | case EOF: |
710 | 0 | as_warn (_("end of file in string; '%c' inserted"), quotechar); |
711 | 0 | PUT (quotechar); |
712 | 0 | continue; |
713 | | |
714 | | /* These two are used inside macros. */ |
715 | 31 | case '@': |
716 | 82 | case '+': |
717 | 82 | break; |
718 | | |
719 | 167 | case '"': |
720 | 3.11k | case '\\': |
721 | 3.27k | case 'b': |
722 | 3.40k | case 'f': |
723 | 4.07k | case 'n': |
724 | 4.32k | case 'r': |
725 | 4.40k | case 't': |
726 | 4.45k | case 'v': |
727 | 4.54k | case 'x': |
728 | 4.57k | case 'X': |
729 | 4.68k | case '0': |
730 | 4.73k | case '1': |
731 | 4.88k | case '2': |
732 | 5.49k | case '3': |
733 | 5.57k | case '4': |
734 | 5.60k | case '5': |
735 | 5.64k | case '6': |
736 | 5.69k | case '7': |
737 | 5.69k | break; |
738 | | |
739 | 16.2k | default: |
740 | | #ifdef ONLY_STANDARD_ESCAPES |
741 | | as_warn (_("unknown escape '\\%c' in string; ignored"), ch); |
742 | | #endif |
743 | 16.2k | break; |
744 | 22.3k | } |
745 | 21.9k | PUT (ch); |
746 | 21.9k | continue; |
747 | | |
748 | | #ifdef DOUBLEBAR_PARALLEL |
749 | | case 13: |
750 | | ch = GET (); |
751 | | if (ch != '|') |
752 | | abort (); |
753 | | |
754 | | /* Reset back to state 1 and pretend that we are parsing a |
755 | | line from just after the first white space. */ |
756 | | state = 1; |
757 | | PUT ('|'); |
758 | | continue; |
759 | | #endif |
760 | | #ifdef TC_Z80 |
761 | | case 16: |
762 | | /* We have seen an 'a' at the start of a symbol, look for an 'f'. */ |
763 | | ch = GET (); |
764 | | if (ch == 'f' || ch == 'F') |
765 | | { |
766 | | state = 17; |
767 | | PUT (ch); |
768 | | } |
769 | | else |
770 | | { |
771 | | if (ch != EOF) |
772 | | UNGET (ch); |
773 | | state = 9; |
774 | | break; |
775 | | } |
776 | | /* Fall through. */ |
777 | | case 17: |
778 | | /* We have seen "af" at the start of a symbol, |
779 | | a ' here is a part of that symbol. */ |
780 | | ch = GET (); |
781 | | state = 9; |
782 | | if (ch == '\'') |
783 | | /* Change to avoid warning about unclosed string. */ |
784 | | PUT ('`'); |
785 | | else if (ch != EOF) |
786 | | UNGET (ch); |
787 | | break; |
788 | | #endif |
789 | 6.08M | } |
790 | | |
791 | | /* OK, we are somewhere in states 0 through 4 or 9 through 11. */ |
792 | | |
793 | | /* flushchar: */ |
794 | 5.12M | ch = GET (); |
795 | | |
796 | | #ifdef TC_PREDICATE_START_CHAR |
797 | | if (ch == TC_PREDICATE_START_CHAR && (state == 0 || state == 1)) |
798 | | { |
799 | | state += 14; |
800 | | PUT (ch); |
801 | | continue; |
802 | | } |
803 | | else if (state == 14 || state == 15) |
804 | | { |
805 | | if (ch == TC_PREDICATE_END_CHAR) |
806 | | { |
807 | | state -= 14; |
808 | | PUT (ch); |
809 | | ch = GET (); |
810 | | } |
811 | | else |
812 | | { |
813 | | PUT (ch); |
814 | | continue; |
815 | | } |
816 | | } |
817 | | #endif |
818 | | |
819 | 5.36M | recycle: |
820 | | |
821 | | /* We need to watch out for .end directives: We should in particular not |
822 | | issue diagnostics for anything after an active one. */ |
823 | 5.36M | if (ch == EOF) |
824 | 1.15k | end_state = NULL; |
825 | 5.36M | else if (end_state == NULL) |
826 | 5.24M | { |
827 | 5.24M | if ((state == 0 || state == 1) |
828 | 5.24M | && (ch == '.' |
829 | 824k | || (no_pseudo_dot && ch == end_pseudo[0]))) |
830 | 101k | end_state = end_pseudo + (ch != '.'); |
831 | 5.24M | } |
832 | 122k | else if (ch != '\0' |
833 | 122k | && (*end_state == ch |
834 | | /* Avoid triggering on directives like .endif or .endr. */ |
835 | 121k | || (*end_state == ' ' && !IS_SYMBOL_COMPONENT (ch)))) |
836 | 20.6k | { |
837 | 20.6k | if (IS_NEWLINE (ch) || IS_LINE_SEPARATOR (ch)) |
838 | 0 | goto end_end; |
839 | 20.6k | ++end_state; |
840 | 20.6k | } |
841 | 101k | else if (*end_state != '\0') |
842 | | /* We did not get the expected character, or we didn't |
843 | | get a valid terminating character after seeing the |
844 | | entire pseudo-op, so we must go back to the beginning. */ |
845 | 101k | end_state = NULL; |
846 | 53 | else if (IS_NEWLINE (ch) || IS_LINE_SEPARATOR (ch)) |
847 | 53 | { |
848 | 53 | end_end: |
849 | | /* We've read the entire pseudo-op. If this is the end of the line, |
850 | | bail out now by (ab)using the output-full path. This allows the |
851 | | caller to process input up to here and terminate processing if this |
852 | | directive is actually active (not on the false branch of a |
853 | | conditional and not in a macro definition). */ |
854 | 53 | end_state = NULL; |
855 | 53 | state = 0; |
856 | 53 | PUT (ch); |
857 | 53 | goto tofull; |
858 | 53 | } |
859 | | |
860 | | #if defined TC_ARM && defined OBJ_ELF |
861 | | /* We need to watch out for .symver directives. See the comment later |
862 | | in this function. */ |
863 | | if (ch == EOF) |
864 | | symver_state = NULL; |
865 | | else if (symver_state == NULL) |
866 | | { |
867 | | if ((state == 0 || state == 1) |
868 | | && strchr (tc_comment_chars, '@') != NULL |
869 | | && ch == symver_pseudo[0]) |
870 | | symver_state = symver_pseudo + 1; |
871 | | } |
872 | | else |
873 | | { |
874 | | /* We advance to the next state if we find the right |
875 | | character. */ |
876 | | if (ch != '\0' && (*symver_state == ch)) |
877 | | ++symver_state; |
878 | | else if (*symver_state != '\0') |
879 | | /* We did not get the expected character, or we didn't |
880 | | get a valid terminating character after seeing the |
881 | | entire pseudo-op, so we must go back to the beginning. */ |
882 | | symver_state = NULL; |
883 | | else |
884 | | { |
885 | | /* We've read the entire pseudo-op. If this is the end |
886 | | of the line, go back to the beginning. */ |
887 | | if (IS_NEWLINE (ch) || IS_LINE_SEPARATOR (ch)) |
888 | | symver_state = NULL; |
889 | | } |
890 | | } |
891 | | #endif /* TC_ARM && OBJ_ELF */ |
892 | | |
893 | | #ifdef TC_M68K |
894 | | /* We want to have pseudo-ops which control whether we are in |
895 | | MRI mode or not. Unfortunately, since m68k MRI mode affects |
896 | | the scrubber, that means that we need a special purpose |
897 | | recognizer here. */ |
898 | | if (ch == EOF) |
899 | | mri_state = NULL; |
900 | | else if (mri_state == NULL) |
901 | | { |
902 | | if ((state == 0 || state == 1) |
903 | | && ch == mri_pseudo[0]) |
904 | | mri_state = mri_pseudo + 1; |
905 | | } |
906 | | else |
907 | | { |
908 | | /* We advance to the next state if we find the right |
909 | | character, or if we need a space character and we get any |
910 | | whitespace character, or if we need a '0' and we get a |
911 | | '1' (this is so that we only need one state to handle |
912 | | ``.mri 0'' and ``.mri 1''). */ |
913 | | if (ch != '\0' |
914 | | && (*mri_state == ch |
915 | | || (*mri_state == ' ' |
916 | | && IS_WHITESPACE (ch)) |
917 | | || (*mri_state == '0' |
918 | | && ch == '1'))) |
919 | | { |
920 | | mri_last_ch = ch; |
921 | | ++mri_state; |
922 | | } |
923 | | else if (*mri_state != '\0' |
924 | | || (!IS_WHITESPACE (ch) |
925 | | && !IS_LINE_SEPARATOR (ch) |
926 | | && !IS_NEWLINE (ch))) |
927 | | { |
928 | | /* We did not get the expected character, or we didn't |
929 | | get a valid terminating character after seeing the |
930 | | entire pseudo-op, so we must go back to the |
931 | | beginning. */ |
932 | | mri_state = NULL; |
933 | | } |
934 | | else |
935 | | { |
936 | | /* We've read the entire pseudo-op. mri_last_ch is |
937 | | either '0' or '1' indicating whether to enter or |
938 | | leave MRI mode. */ |
939 | | do_scrub_begin (mri_last_ch == '1'); |
940 | | mri_state = NULL; |
941 | | |
942 | | /* We continue handling the character as usual. The |
943 | | main gas reader must also handle the .mri pseudo-op |
944 | | to control expression parsing and the like. */ |
945 | | } |
946 | | } |
947 | | #endif |
948 | | |
949 | 5.36M | if (ch == EOF) |
950 | 1.15k | { |
951 | 1.15k | if (state != 0) |
952 | 18 | { |
953 | 18 | as_warn (_("end of file not at end of a line; newline inserted")); |
954 | 18 | state = 0; |
955 | 18 | PUT ('\n'); |
956 | 18 | } |
957 | 1.15k | goto fromeof; |
958 | 1.15k | } |
959 | | |
960 | 5.36M | switch (lex[ch]) |
961 | 5.36M | { |
962 | 457k | case LEX_IS_WHITESPACE: |
963 | 457k | do |
964 | 495k | { |
965 | 495k | ch = GET (); |
966 | 495k | } |
967 | 495k | while (ch != EOF && IS_WHITESPACE (ch)); |
968 | 457k | if (ch == EOF) |
969 | 1 | goto fromeof; |
970 | | |
971 | 457k | if (state == 0) |
972 | 83.4k | { |
973 | | /* Preserve a single whitespace character at the |
974 | | beginning of a line. */ |
975 | 83.4k | state = 1; |
976 | 83.4k | UNGET (ch); |
977 | 83.4k | PUT (' '); |
978 | 83.4k | break; |
979 | 83.4k | } |
980 | | |
981 | | #ifdef KEEP_WHITE_AROUND_COLON |
982 | | if (lex[ch] == LEX_IS_COLON) |
983 | | { |
984 | | /* Only keep this white if there's no white *after* the |
985 | | colon. */ |
986 | | ch2 = GET (); |
987 | | if (ch2 != EOF) |
988 | | UNGET (ch2); |
989 | | if (!IS_WHITESPACE (ch2)) |
990 | | { |
991 | | state = 9; |
992 | | UNGET (ch); |
993 | | PUT (' '); |
994 | | break; |
995 | | } |
996 | | } |
997 | | #endif |
998 | | |
999 | | /* Prune trailing whitespace. */ |
1000 | 373k | if (IS_COMMENT (ch) |
1001 | 373k | || (IS_LINE_COMMENT (ch) |
1002 | 373k | && (state < 1 || strchr (tc_comment_chars, ch))) |
1003 | 373k | || IS_NEWLINE (ch) |
1004 | 373k | || IS_LINE_SEPARATOR (ch) |
1005 | 373k | || IS_PARALLEL_SEPARATOR (ch)) |
1006 | 8.38k | { |
1007 | 8.38k | if (scrub_m68k_mri) |
1008 | 0 | { |
1009 | | /* In MRI mode, we keep these spaces. */ |
1010 | 0 | UNGET (ch); |
1011 | 0 | PUT (' '); |
1012 | 0 | break; |
1013 | 0 | } |
1014 | 8.38k | goto recycle; |
1015 | 8.38k | } |
1016 | | #ifdef DOUBLESLASH_LINE_COMMENTS |
1017 | | if (IS_TWOCHAR_COMMENT_1ST (ch)) |
1018 | | { |
1019 | | ch2 = GET (); |
1020 | | if (ch2 != EOF) |
1021 | | UNGET (ch2); |
1022 | | if (ch2 == '/') |
1023 | | goto recycle; |
1024 | | } |
1025 | | #endif |
1026 | | |
1027 | | /* If we're in state 2 or 11, we've seen a non-white |
1028 | | character followed by whitespace. If the next character |
1029 | | is ':', this is whitespace after a label name which we |
1030 | | normally must ignore. In MRI mode, though, spaces are |
1031 | | not permitted between the label and the colon. */ |
1032 | 365k | if ((state == 2 || state == 11) |
1033 | 365k | && lex[ch] == LEX_IS_COLON |
1034 | 365k | && ! scrub_m68k_mri) |
1035 | 131 | { |
1036 | 131 | state = 1; |
1037 | 131 | PUT (ch); |
1038 | 131 | break; |
1039 | 131 | } |
1040 | | |
1041 | 365k | switch (state) |
1042 | 365k | { |
1043 | 1.10k | case 1: |
1044 | | /* We can arrive here if we leave a leading whitespace |
1045 | | character at the beginning of a line. */ |
1046 | 1.10k | goto recycle; |
1047 | 49.5k | case 2: |
1048 | 49.5k | state = 3; |
1049 | 49.5k | if (to + 1 < toend) |
1050 | 49.5k | { |
1051 | | /* Optimize common case by skipping UNGET/GET. */ |
1052 | 49.5k | PUT (' '); /* Sp after opco */ |
1053 | 49.5k | goto recycle; |
1054 | 49.5k | } |
1055 | 0 | UNGET (ch); |
1056 | 0 | PUT (' '); |
1057 | 0 | break; |
1058 | 14.6k | case 3: |
1059 | 14.6k | #ifndef TC_KEEP_OPERAND_SPACES |
1060 | | /* For TI C6X, we keep these spaces as they may separate |
1061 | | functional unit specifiers from operands. */ |
1062 | 14.6k | if (scrub_m68k_mri) |
1063 | 0 | #endif |
1064 | 0 | { |
1065 | | /* In MRI mode, we keep these spaces. */ |
1066 | 0 | UNGET (ch); |
1067 | 0 | PUT (' '); |
1068 | 0 | break; |
1069 | 0 | } |
1070 | 14.6k | goto recycle; /* Sp in operands */ |
1071 | 165k | case 9: |
1072 | 165k | case 10: |
1073 | 165k | #ifndef TC_KEEP_OPERAND_SPACES |
1074 | 165k | if (scrub_m68k_mri) |
1075 | 0 | #endif |
1076 | 0 | { |
1077 | | /* In MRI mode, we keep these spaces. */ |
1078 | 0 | state = 3; |
1079 | 0 | UNGET (ch); |
1080 | 0 | PUT (' '); |
1081 | 0 | break; |
1082 | 0 | } |
1083 | 165k | state = 10; /* Sp after symbol char */ |
1084 | 165k | goto recycle; |
1085 | 134k | case 11: |
1086 | 134k | if (LABELS_WITHOUT_COLONS || flag_m68k_mri) |
1087 | 0 | state = 1; |
1088 | 134k | else |
1089 | 134k | { |
1090 | | /* We know that ch is not ':', since we tested that |
1091 | | case above. Therefore this is not a label, so it |
1092 | | must be the opcode, and we've just seen the |
1093 | | whitespace after it. */ |
1094 | 134k | state = 3; |
1095 | 134k | } |
1096 | 134k | UNGET (ch); |
1097 | 134k | PUT (' '); /* Sp after label definition. */ |
1098 | 134k | break; |
1099 | 134k | default: |
1100 | 0 | BAD_CASE (state); |
1101 | 365k | } |
1102 | 134k | break; |
1103 | | |
1104 | 134k | case LEX_IS_TWOCHAR_COMMENT_1ST: |
1105 | 0 | ch2 = GET (); |
1106 | 0 | if (ch2 == '*') |
1107 | 0 | { |
1108 | 375 | twochar_comment: |
1109 | 375 | for (;;) |
1110 | 5.94k | { |
1111 | 5.94k | do |
1112 | 1.38M | { |
1113 | 1.38M | ch2 = GET (); |
1114 | 1.38M | if (ch2 != EOF && IS_NEWLINE (ch2)) |
1115 | 6.86k | add_newlines++; |
1116 | 1.38M | } |
1117 | 1.38M | while (ch2 != EOF && ch2 != '*'); |
1118 | | |
1119 | 11.9k | while (ch2 == '*') |
1120 | 5.99k | ch2 = GET (); |
1121 | | |
1122 | 5.94k | if (ch2 == EOF || ch2 == '/') |
1123 | 375 | break; |
1124 | | |
1125 | | /* This UNGET will ensure that we count newlines |
1126 | | correctly. */ |
1127 | 5.56k | UNGET (ch2); |
1128 | 5.56k | } |
1129 | | |
1130 | 375 | if (ch2 == EOF) |
1131 | 1 | as_warn (_("end of file in multiline comment")); |
1132 | | |
1133 | 375 | ch = ' '; |
1134 | 375 | goto recycle; |
1135 | 0 | } |
1136 | | #ifdef DOUBLESLASH_LINE_COMMENTS |
1137 | | else if (ch2 == '/') |
1138 | | { |
1139 | | do |
1140 | | { |
1141 | | ch = GET (); |
1142 | | } |
1143 | | while (ch != EOF && !IS_NEWLINE (ch)); |
1144 | | if (ch == EOF) |
1145 | | as_warn ("end of file in comment; newline inserted"); |
1146 | | state = 0; |
1147 | | PUT ('\n'); |
1148 | | break; |
1149 | | } |
1150 | | #endif |
1151 | 0 | else |
1152 | 0 | { |
1153 | 0 | if (ch2 != EOF) |
1154 | 0 | UNGET (ch2); |
1155 | 0 | if (state == 9 || state == 10) |
1156 | 0 | state = 3; |
1157 | 0 | PUT (ch); |
1158 | 0 | } |
1159 | 0 | break; |
1160 | | |
1161 | 451k | case LEX_IS_STRINGQUOTE: |
1162 | 451k | quotechar = ch; |
1163 | 451k | if (state == 10) |
1164 | 834 | { |
1165 | | /* Preserve the whitespace in foo "bar". */ |
1166 | 834 | UNGET (ch); |
1167 | 834 | state = 3; |
1168 | 834 | PUT (' '); |
1169 | | |
1170 | | /* PUT didn't jump out. We could just break, but we |
1171 | | know what will happen, so optimize a bit. */ |
1172 | 834 | ch = GET (); |
1173 | 834 | old_state = 9; |
1174 | 834 | } |
1175 | 450k | else if (state == 3) |
1176 | 6.96k | old_state = 9; |
1177 | 443k | else if (state == 0) |
1178 | 6.29k | old_state = 11; /* Now seeing label definition. */ |
1179 | 437k | else |
1180 | 437k | old_state = state; |
1181 | 451k | state = 5; |
1182 | 451k | PUT (ch); |
1183 | 451k | break; |
1184 | | |
1185 | 451k | case LEX_IS_ONECHAR_QUOTE: |
1186 | | #ifdef H_TICK_HEX |
1187 | | if (state == 9 && enable_h_tick_hex) |
1188 | | { |
1189 | | char c; |
1190 | | |
1191 | | c = GET (); |
1192 | | as_warn ("'%c found after symbol", c); |
1193 | | UNGET (c); |
1194 | | } |
1195 | | #endif |
1196 | 18.2k | if (state == 10) |
1197 | 132 | { |
1198 | | /* Preserve the whitespace in foo 'b'. */ |
1199 | 132 | UNGET (ch); |
1200 | 132 | state = 3; |
1201 | 132 | PUT (' '); |
1202 | 132 | break; |
1203 | 132 | } |
1204 | 18.1k | ch = GET (); |
1205 | 18.1k | if (ch == EOF) |
1206 | 1 | { |
1207 | 1 | as_warn (_("end of file after a one-character quote; \\0 inserted")); |
1208 | 1 | ch = 0; |
1209 | 1 | } |
1210 | 18.1k | if (ch == '\\') |
1211 | 127 | { |
1212 | 127 | ch = GET (); |
1213 | 127 | if (ch == EOF) |
1214 | 0 | { |
1215 | 0 | as_warn (_("end of file in escape character")); |
1216 | 0 | ch = '\\'; |
1217 | 0 | } |
1218 | 127 | else |
1219 | 127 | ch = process_escape (ch); |
1220 | 127 | } |
1221 | 18.1k | sprintf (out_buf, "%d", (int) (unsigned char) ch); |
1222 | | |
1223 | | /* None of these 'x constants for us. We want 'x'. */ |
1224 | 18.1k | if ((ch = GET ()) != '\'') |
1225 | 17.3k | { |
1226 | | #ifdef REQUIRE_CHAR_CLOSE_QUOTE |
1227 | | as_warn (_("missing close quote; (assumed)")); |
1228 | | #else |
1229 | 17.3k | if (ch != EOF) |
1230 | 17.3k | UNGET (ch); |
1231 | 17.3k | #endif |
1232 | 17.3k | } |
1233 | 18.1k | if (strlen (out_buf) == 1) |
1234 | 1.01k | { |
1235 | 1.01k | PUT (out_buf[0]); |
1236 | 1.01k | break; |
1237 | 1.01k | } |
1238 | 17.1k | if (state == 9) |
1239 | 8.01k | old_state = 3; |
1240 | 9.10k | else |
1241 | 9.10k | old_state = state; |
1242 | 17.1k | state = -1; |
1243 | 17.1k | out_string = out_buf; |
1244 | 17.1k | PUT (*out_string++); |
1245 | 17.1k | break; |
1246 | | |
1247 | 35.3k | case LEX_IS_COLON: |
1248 | | #ifdef KEEP_WHITE_AROUND_COLON |
1249 | | state = 9; |
1250 | | #else |
1251 | 35.3k | if (state == 9 || state == 10) |
1252 | 8.93k | state = 3; |
1253 | 26.3k | else if (state != 3) |
1254 | 23.6k | state = 1; |
1255 | 35.3k | #endif |
1256 | 35.3k | PUT (ch); |
1257 | 35.3k | break; |
1258 | | |
1259 | 617k | case LEX_IS_NEWLINE: |
1260 | | /* Roll out a bunch of newlines from inside comments, etc. */ |
1261 | 617k | if (add_newlines) |
1262 | 7.23k | { |
1263 | 7.23k | --add_newlines; |
1264 | 7.23k | UNGET (ch); |
1265 | 7.23k | } |
1266 | | /* Fall through. */ |
1267 | | |
1268 | 695k | case LEX_IS_LINE_SEPARATOR: |
1269 | 695k | state = 0; |
1270 | 695k | PUT (ch); |
1271 | 695k | break; |
1272 | | |
1273 | 695k | case LEX_IS_PARALLEL_SEPARATOR: |
1274 | 0 | state = 1; |
1275 | 0 | PUT (ch); |
1276 | 0 | break; |
1277 | | |
1278 | | #ifdef TC_V850 |
1279 | | case LEX_IS_DOUBLEDASH_1ST: |
1280 | | ch2 = GET (); |
1281 | | if (ch2 != '-') |
1282 | | { |
1283 | | if (ch2 != EOF) |
1284 | | UNGET (ch2); |
1285 | | goto de_fault; |
1286 | | } |
1287 | | /* Read and skip to end of line. */ |
1288 | | do |
1289 | | { |
1290 | | ch = GET (); |
1291 | | } |
1292 | | while (ch != EOF && ch != '\n'); |
1293 | | |
1294 | | if (ch == EOF) |
1295 | | as_warn (_("end of file in comment; newline inserted")); |
1296 | | |
1297 | | state = 0; |
1298 | | PUT ('\n'); |
1299 | | break; |
1300 | | #endif |
1301 | | #ifdef DOUBLEBAR_PARALLEL |
1302 | | case LEX_IS_DOUBLEBAR_1ST: |
1303 | | ch2 = GET (); |
1304 | | if (ch2 != EOF) |
1305 | | UNGET (ch2); |
1306 | | if (ch2 != '|') |
1307 | | goto de_fault; |
1308 | | |
1309 | | /* Handle '||' in two states as invoking PUT twice might |
1310 | | result in the first one jumping out of this loop. We'd |
1311 | | then lose track of the state and one '|' char. */ |
1312 | | state = 13; |
1313 | | PUT ('|'); |
1314 | | break; |
1315 | | #endif |
1316 | 46.9k | case LEX_IS_LINE_COMMENT_START: |
1317 | | /* FIXME-someday: The two character comment stuff was badly |
1318 | | thought out. On i386, we want '/' as line comment start |
1319 | | AND we want C style comments. hence this hack. The |
1320 | | whole lexical process should be reworked. xoxorich. */ |
1321 | 46.9k | if (ch == '/') |
1322 | 26.6k | { |
1323 | 26.6k | ch2 = GET (); |
1324 | 26.6k | if (ch2 == '*') |
1325 | 375 | goto twochar_comment; |
1326 | 26.2k | if (ch2 != EOF) |
1327 | 26.2k | UNGET (ch2); |
1328 | 26.2k | } |
1329 | | |
1330 | 46.5k | if (state == 0 || state == 1) /* Only comment at start of line. */ |
1331 | 5.92k | { |
1332 | 5.92k | int startch; |
1333 | | |
1334 | 5.92k | startch = ch; |
1335 | | |
1336 | 5.92k | do |
1337 | 5.93k | { |
1338 | 5.93k | ch = GET (); |
1339 | 5.93k | } |
1340 | 5.93k | while (ch != EOF && IS_WHITESPACE (ch)); |
1341 | | |
1342 | 5.92k | if (ch == EOF) |
1343 | 0 | { |
1344 | 0 | as_warn (_("end of file in comment; newline inserted")); |
1345 | 0 | PUT ('\n'); |
1346 | 0 | break; |
1347 | 0 | } |
1348 | | |
1349 | 5.92k | if (ch < '0' || ch > '9' || state != 0 || startch != '#') |
1350 | 3.83k | { |
1351 | | /* Not a cpp line. */ |
1352 | 283k | while (ch != EOF && !IS_NEWLINE (ch)) |
1353 | 280k | ch = GET (); |
1354 | 3.83k | if (ch == EOF) |
1355 | 0 | { |
1356 | 0 | as_warn (_("end of file in comment; newline inserted")); |
1357 | 0 | PUT ('\n'); |
1358 | 0 | } |
1359 | 3.83k | else /* IS_NEWLINE (ch) */ |
1360 | 3.83k | { |
1361 | | /* To process non-zero add_newlines. */ |
1362 | 3.83k | UNGET (ch); |
1363 | 3.83k | } |
1364 | 3.83k | state = 0; |
1365 | 3.83k | break; |
1366 | 3.83k | } |
1367 | | /* Looks like `# 123 "filename"' from cpp. */ |
1368 | 2.09k | UNGET (ch); |
1369 | 2.09k | old_state = 4; |
1370 | 2.09k | state = -1; |
1371 | 2.09k | if (scrub_m68k_mri) |
1372 | 0 | out_string = "\tlinefile "; |
1373 | 2.09k | else |
1374 | 2.09k | out_string = "\t.linefile "; |
1375 | 2.09k | PUT (*out_string++); |
1376 | 2.09k | break; |
1377 | 2.09k | } |
1378 | | |
1379 | | #ifdef TC_D10V |
1380 | | /* All insns end in a char for which LEX_IS_SYMBOL_COMPONENT is true. |
1381 | | Trap is the only short insn that has a first operand that is |
1382 | | neither register nor label. |
1383 | | We must prevent exef0f ||trap #1 to degenerate to exef0f ||trap#1 . |
1384 | | We can't make '#' LEX_IS_SYMBOL_COMPONENT because it is |
1385 | | already LEX_IS_LINE_COMMENT_START. However, it is the |
1386 | | only character in line_comment_chars for d10v, hence we |
1387 | | can recognize it as such. */ |
1388 | | /* An alternative approach would be to reset the state to 1 when |
1389 | | we see '||', '<'- or '->', but that seems to be overkill. */ |
1390 | | if (state == 10) |
1391 | | PUT (' '); |
1392 | | #endif |
1393 | | /* We have a line comment character which is not at the |
1394 | | start of a line. If this is also a normal comment |
1395 | | character, fall through. Otherwise treat it as a default |
1396 | | character. */ |
1397 | 40.6k | if (strchr (tc_comment_chars, ch) == NULL) |
1398 | 24.7k | goto de_fault; |
1399 | 15.8k | if (scrub_m68k_mri |
1400 | 15.8k | && (ch == '!' || ch == '*' || ch == '#')) |
1401 | 0 | goto de_fault; |
1402 | | /* Fall through. */ |
1403 | 15.8k | case LEX_IS_COMMENT_START: |
1404 | | #if defined TC_ARM && defined OBJ_ELF |
1405 | | /* On the ARM, `@' is the comment character. |
1406 | | Unfortunately this is also a special character in ELF .symver |
1407 | | directives (and .type, though we deal with those another way). |
1408 | | So we check if this line is such a directive, and treat |
1409 | | the character as default if so. This is a hack. */ |
1410 | | if ((symver_state != NULL) && (*symver_state == 0)) |
1411 | | goto de_fault; |
1412 | | #endif |
1413 | | |
1414 | | /* Care is needed not to damage occurrences of \<comment-char> |
1415 | | by stripping the <comment-char> onwards. Yuck. */ |
1416 | 15.8k | if ((to > tostart ? to[-1] : last_char) == '\\') |
1417 | | /* Do not treat the <comment-char> as a start-of-comment. */ |
1418 | 55 | goto de_fault; |
1419 | | |
1420 | | #ifdef WARN_COMMENTS |
1421 | | if (!found_comment) |
1422 | | found_comment_file = as_where (&found_comment); |
1423 | | #endif |
1424 | 15.8k | do |
1425 | 3.68M | { |
1426 | 3.68M | ch = GET (); |
1427 | 3.68M | } |
1428 | 3.68M | while (ch != EOF && !IS_NEWLINE (ch)); |
1429 | 15.8k | if (ch == EOF) |
1430 | 3 | as_warn (_("end of file in comment; newline inserted")); |
1431 | 15.8k | state = 0; |
1432 | 15.8k | PUT ('\n'); |
1433 | 15.8k | break; |
1434 | | |
1435 | | #ifdef H_TICK_HEX |
1436 | | case LEX_IS_H: |
1437 | | /* Look for strings like H'[0-9A-Fa-f] and if found, replace |
1438 | | the H' with 0x to make them gas-style hex characters. */ |
1439 | | if (enable_h_tick_hex) |
1440 | | { |
1441 | | char quot; |
1442 | | |
1443 | | quot = GET (); |
1444 | | if (quot == '\'') |
1445 | | { |
1446 | | UNGET ('x'); |
1447 | | ch = '0'; |
1448 | | } |
1449 | | else |
1450 | | UNGET (quot); |
1451 | | } |
1452 | | #endif |
1453 | | /* Fall through. */ |
1454 | | |
1455 | 1.16M | case LEX_IS_SYMBOL_COMPONENT: |
1456 | 1.16M | if (state == 10) |
1457 | 155k | { |
1458 | | /* This is a symbol character following another symbol |
1459 | | character, with whitespace in between. We skipped |
1460 | | the whitespace earlier, so output it now. */ |
1461 | 155k | UNGET (ch); |
1462 | 155k | state = 3; |
1463 | 155k | PUT (' '); |
1464 | 155k | break; |
1465 | 155k | } |
1466 | | |
1467 | | #ifdef TC_Z80 |
1468 | | /* "af'" is a symbol containing '\''. */ |
1469 | | if (state == 3 && (ch == 'a' || ch == 'A')) |
1470 | | { |
1471 | | state = 16; |
1472 | | PUT (ch); |
1473 | | ch = GET (); |
1474 | | if (ch == 'f' || ch == 'F') |
1475 | | { |
1476 | | state = 17; |
1477 | | PUT (ch); |
1478 | | break; |
1479 | | } |
1480 | | else |
1481 | | { |
1482 | | state = 9; |
1483 | | if (ch == EOF || !IS_SYMBOL_COMPONENT (ch)) |
1484 | | { |
1485 | | if (ch != EOF) |
1486 | | UNGET (ch); |
1487 | | break; |
1488 | | } |
1489 | | } |
1490 | | } |
1491 | | #endif |
1492 | 1.00M | if (state == 3) |
1493 | 396k | state = 9; |
1494 | | |
1495 | | /* This is a common case. Quickly copy CH and all the |
1496 | | following symbol component or normal characters. */ |
1497 | 1.00M | if (to + 1 < toend |
1498 | | #ifdef TC_M68K |
1499 | | && mri_state == NULL |
1500 | | #endif |
1501 | | #if defined TC_ARM && defined OBJ_ELF |
1502 | | && symver_state == NULL |
1503 | | #endif |
1504 | 1.00M | && end_state == NULL) |
1505 | 883k | { |
1506 | 883k | char *s; |
1507 | 883k | ptrdiff_t len; |
1508 | | |
1509 | 19.8M | for (s = from; s < fromend; s++) |
1510 | 19.8M | { |
1511 | 19.8M | int type; |
1512 | | |
1513 | 19.8M | ch2 = *(unsigned char *) s; |
1514 | 19.8M | type = lex[ch2]; |
1515 | 19.8M | if (type != 0 |
1516 | 19.8M | && type != LEX_IS_SYMBOL_COMPONENT) |
1517 | 883k | break; |
1518 | 19.8M | } |
1519 | | |
1520 | 883k | if (s > from) |
1521 | | /* Handle the last character normally, for |
1522 | | simplicity. */ |
1523 | 742k | --s; |
1524 | | |
1525 | 883k | len = s - from; |
1526 | | |
1527 | 883k | if (len > (toend - to) - 1) |
1528 | 210 | len = (toend - to) - 1; |
1529 | | |
1530 | 883k | if (len > 0) |
1531 | 673k | { |
1532 | 673k | PUT (ch); |
1533 | 673k | memcpy (to, from, len); |
1534 | 673k | to += len; |
1535 | 673k | from += len; |
1536 | 673k | if (to >= toend) |
1537 | 215 | goto tofull; |
1538 | 672k | ch = GET (); |
1539 | 672k | } |
1540 | 883k | } |
1541 | | |
1542 | | /* Fall through. */ |
1543 | 3.50M | default: |
1544 | 3.52M | de_fault: |
1545 | | /* Some relatively `normal' character. */ |
1546 | 3.52M | if (state == 0) |
1547 | 376k | { |
1548 | 376k | state = 11; /* Now seeing label definition. */ |
1549 | 376k | } |
1550 | 3.15M | else if (state == 1) |
1551 | 67.2k | { |
1552 | 67.2k | state = 2; /* Ditto. */ |
1553 | 67.2k | } |
1554 | 3.08M | else if (state == 9) |
1555 | 441k | { |
1556 | 441k | if (!IS_SYMBOL_COMPONENT (ch)) |
1557 | 52.3k | state = 3; |
1558 | 441k | } |
1559 | 2.64M | else if (state == 10) |
1560 | 8.95k | { |
1561 | 8.95k | if (ch == '\\') |
1562 | 722 | { |
1563 | | /* Special handling for backslash: a backslash may |
1564 | | be the beginning of a formal parameter (of a |
1565 | | macro) following another symbol character, with |
1566 | | whitespace in between. If that is the case, we |
1567 | | output a space before the parameter. Strictly |
1568 | | speaking, correct handling depends upon what the |
1569 | | macro parameter expands into; if the parameter |
1570 | | expands into something which does not start with |
1571 | | an operand character, then we don't want to keep |
1572 | | the space. We don't have enough information to |
1573 | | make the right choice, so here we are making the |
1574 | | choice which is more likely to be correct. */ |
1575 | 722 | if (to + 1 >= toend) |
1576 | 0 | { |
1577 | | /* If we're near the end of the buffer, save the |
1578 | | character for the next time round. Otherwise |
1579 | | we'll lose our state. */ |
1580 | 0 | UNGET (ch); |
1581 | 0 | goto tofull; |
1582 | 0 | } |
1583 | 722 | *to++ = ' '; |
1584 | 722 | } |
1585 | | |
1586 | 8.95k | state = 3; |
1587 | 8.95k | } |
1588 | 3.52M | PUT (ch); |
1589 | 3.52M | break; |
1590 | 5.36M | } |
1591 | 5.36M | } |
1592 | | |
1593 | | /*NOTREACHED*/ |
1594 | | |
1595 | 1.15k | fromeof: |
1596 | | /* We have reached the end of the input. */ |
1597 | 1.15k | if (to > tostart) |
1598 | 1.15k | last_char = to[-1]; |
1599 | 1.15k | return to - tostart; |
1600 | | |
1601 | 707 | tofull: |
1602 | | /* The output buffer is full. Save any input we have not yet |
1603 | | processed. */ |
1604 | 707 | if (fromend > from) |
1605 | 612 | { |
1606 | 612 | saved_input = from; |
1607 | 612 | saved_input_len = fromend - from; |
1608 | 612 | } |
1609 | 95 | else |
1610 | 95 | saved_input = NULL; |
1611 | | |
1612 | 707 | if (to > tostart) |
1613 | 707 | last_char = to[-1]; |
1614 | 707 | return to - tostart; |
1615 | 1.86k | } |
1616 | | |
1617 | | /* Return the amount of pending input: the number of characters the scrubber is still holding, i.e. any input saved by do_scrub_chars when its output buffer filled up, plus the remaining length of out_string while state is -1. Takes no arguments; returns 0 when nothing is pending. */ |
1618 | | |
1619 | | size_t |
1620 | | do_scrub_pending (void) |
1621 | 2.56k | { |
1622 | 2.56k | size_t len = 0; |
1623 | 2.56k | if (saved_input) /* Non-NULL only when the output-full exit of do_scrub_chars left unprocessed input behind. */ |
1624 | 51 | len += saved_input_len; |
1625 | 2.56k | if (state == -1) /* do_scrub_chars enters state -1 with out_string pointing at text it still has to output. */ |
1626 | 0 | len += strlen (out_string); /* Count what is left of that string. */ |
1627 | 2.56k | return len; |
1628 | 2.56k | } |