/src/binutils-gdb/gas/app.c
Line | Count | Source |
1 | | /* This is the Assembler Pre-Processor |
2 | | Copyright (C) 1987-2026 Free Software Foundation, Inc. |
3 | | |
4 | | This file is part of GAS, the GNU Assembler. |
5 | | |
6 | | GAS is free software; you can redistribute it and/or modify |
7 | | it under the terms of the GNU General Public License as published by |
8 | | the Free Software Foundation; either version 3, or (at your option) |
9 | | any later version. |
10 | | |
11 | | GAS is distributed in the hope that it will be useful, but WITHOUT |
12 | | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
13 | | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public |
14 | | License for more details. |
15 | | |
16 | | You should have received a copy of the GNU General Public License |
17 | | along with GAS; see the file COPYING. If not, write to the Free |
18 | | Software Foundation, 51 Franklin Street - Fifth Floor, Boston, MA |
19 | | 02110-1301, USA. */ |
20 | | |
21 | | /* Modified by Allen Wirfs-Brock, Instantiations Inc 2/90. */ |
22 | | /* App, the assembler pre-processor. This pre-processor strips out |
23 | | excess spaces, turns single-quoted characters into a decimal |
24 | | constant, and turns the # in # <number> <filename> <garbage> into a |
25 | | .linefile. This needs better error-handling. */ |
26 | | |
27 | | #include "as.h" |
28 | | |
/* If the compiler does not report __STDC__ as 1, make `const' expand
   to nothing (unless something has already defined it).  */
#if (__STDC__ != 1)
#ifndef const
#define const  /* empty */
#endif
#endif

#ifdef H_TICK_HEX
/* When nonzero at the time do_scrub_begin runs, 'h' and 'H' are
   classified as LEX_IS_H instead of as plain symbol components.  */
int enable_h_tick_hex = 0;
#endif

#ifdef TC_M68K
/* Whether we are scrubbing in m68k MRI mode.  This is different from
   flag_m68k_mri, because the two flags will be affected by the .mri
   pseudo-op at different times.  */
static int scrub_m68k_mri;

/* The pseudo-op which switches in and out of MRI mode.  See the
   comment in do_scrub_chars.  */
static const char mri_pseudo[] = ".mri 0";
/* Position in mri_pseudo of the next character to match, or NULL when
   no match is in progress.  */
static const char *mri_state;
/* The most recent input character accepted while matching mri_pseudo.  */
static char mri_last_ch;
#else
/* Without TC_M68K every test of scrub_m68k_mri is a constant zero.  */
#define scrub_m68k_mri 0
#endif

#if defined TC_ARM && defined OBJ_ELF
/* The pseudo-op for which we need to special-case `@' characters.
   See the comment in do_scrub_chars.  */
static const char symver_pseudo[] = ".symver";
/* Position in symver_pseudo of the next character to match, or NULL
   when no match is in progress.  */
static const char *symver_state;
#endif

/* The pseudo-op (without leading dot) at which we want to (perhaps just
   temporarily) stop processing.  See the comments in do_scrub_chars().  */
static const char end_pseudo[] = "end ";
/* Position in end_pseudo of the next character to match, or NULL when
   no match is in progress.  */
static const char *end_state;

/* Whether, considering the state at start of assembly, NO_PSEUDO_DOT is
   active.  Set by do_scrub_begin.  */
static bool no_pseudo_dot;

/* NOTE(review): saved and restored by app_push/app_pop; its other uses
   lie outside the part of the file examined here.  Presumably the
   previous character handled by the scrubber -- confirm.  */
static char last_char;
71 | | |
/* Character classes stored in lex[].  An entry of zero means the
   character has no special class.  The value 7 is not assigned, and
   12, 13 and 15 exist only when the corresponding target macro is
   defined.  */
#define LEX_IS_SYMBOL_COMPONENT		1
#define LEX_IS_WHITESPACE		2
#define LEX_IS_LINE_SEPARATOR		3
#define LEX_IS_COMMENT_START		4
#define LEX_IS_LINE_COMMENT_START	5
#define LEX_IS_TWOCHAR_COMMENT_1ST	6
#define LEX_IS_STRINGQUOTE		8
#define LEX_IS_COLON			9
#define LEX_IS_NEWLINE			10
#define LEX_IS_ONECHAR_QUOTE		11
#ifdef TC_V850
#define LEX_IS_DOUBLEDASH_1ST		12
#endif
#ifdef DOUBLEBAR_PARALLEL
#define LEX_IS_DOUBLEBAR_1ST		13
#endif
#define LEX_IS_PARALLEL_SEPARATOR	14
#ifdef H_TICK_HEX
#define LEX_IS_H			15
#endif

/* Class predicates.  C is used directly as an index into lex[], which
   has 256 entries, so it must be a value in 0..255; callers have to
   filter out EOF before using these.  */
#define IS_SYMBOL_COMPONENT(c)		(lex[c] == LEX_IS_SYMBOL_COMPONENT)
#define IS_WHITESPACE(c)		(lex[c] == LEX_IS_WHITESPACE)
#define IS_LINE_SEPARATOR(c)		(lex[c] == LEX_IS_LINE_SEPARATOR)
#define IS_PARALLEL_SEPARATOR(c)	(lex[c] == LEX_IS_PARALLEL_SEPARATOR)
#define IS_COMMENT(c)			(lex[c] == LEX_IS_COMMENT_START)
#define IS_LINE_COMMENT(c)		(lex[c] == LEX_IS_LINE_COMMENT_START)
#define IS_TWOCHAR_COMMENT_1ST(c)	(lex[c] == LEX_IS_TWOCHAR_COMMENT_1ST)
#define IS_NEWLINE(c)			(lex[c] == LEX_IS_NEWLINE)
100 | | |
/* Character class of every possible byte value.  The static
   initializer covers only the target-independent defaults: blank, tab
   and (unless CR_EOL) carriage return are whitespace, '\n' is a
   newline, ':' is a colon, and '$', '.', '_', ASCII letters and digits
   and every byte from 128 to 255 are symbol components.  Everything
   else starts out as zero.  do_scrub_begin adds the target-specific
   classes at run time.  */
static char lex[256] = {
  [' ']  = LEX_IS_WHITESPACE,
  ['\t'] = LEX_IS_WHITESPACE,
#ifdef CR_EOL
  ['\r'] = LEX_IS_LINE_SEPARATOR,
#else
  ['\r'] = LEX_IS_WHITESPACE,
#endif
  ['\n'] = LEX_IS_NEWLINE,
  [':'] = LEX_IS_COLON,
  ['$'] = LEX_IS_SYMBOL_COMPONENT,
  ['.'] = LEX_IS_SYMBOL_COMPONENT,
  ['_'] = LEX_IS_SYMBOL_COMPONENT,
  ['A'] = LEX_IS_SYMBOL_COMPONENT, ['a'] = LEX_IS_SYMBOL_COMPONENT,
  ['B'] = LEX_IS_SYMBOL_COMPONENT, ['b'] = LEX_IS_SYMBOL_COMPONENT,
  ['C'] = LEX_IS_SYMBOL_COMPONENT, ['c'] = LEX_IS_SYMBOL_COMPONENT,
  ['D'] = LEX_IS_SYMBOL_COMPONENT, ['d'] = LEX_IS_SYMBOL_COMPONENT,
  ['E'] = LEX_IS_SYMBOL_COMPONENT, ['e'] = LEX_IS_SYMBOL_COMPONENT,
  ['F'] = LEX_IS_SYMBOL_COMPONENT, ['f'] = LEX_IS_SYMBOL_COMPONENT,
  ['G'] = LEX_IS_SYMBOL_COMPONENT, ['g'] = LEX_IS_SYMBOL_COMPONENT,
  ['H'] = LEX_IS_SYMBOL_COMPONENT, ['h'] = LEX_IS_SYMBOL_COMPONENT,
  ['I'] = LEX_IS_SYMBOL_COMPONENT, ['i'] = LEX_IS_SYMBOL_COMPONENT,
  ['J'] = LEX_IS_SYMBOL_COMPONENT, ['j'] = LEX_IS_SYMBOL_COMPONENT,
  ['K'] = LEX_IS_SYMBOL_COMPONENT, ['k'] = LEX_IS_SYMBOL_COMPONENT,
  ['L'] = LEX_IS_SYMBOL_COMPONENT, ['l'] = LEX_IS_SYMBOL_COMPONENT,
  ['M'] = LEX_IS_SYMBOL_COMPONENT, ['m'] = LEX_IS_SYMBOL_COMPONENT,
  ['N'] = LEX_IS_SYMBOL_COMPONENT, ['n'] = LEX_IS_SYMBOL_COMPONENT,
  ['O'] = LEX_IS_SYMBOL_COMPONENT, ['o'] = LEX_IS_SYMBOL_COMPONENT,
  ['P'] = LEX_IS_SYMBOL_COMPONENT, ['p'] = LEX_IS_SYMBOL_COMPONENT,
  ['Q'] = LEX_IS_SYMBOL_COMPONENT, ['q'] = LEX_IS_SYMBOL_COMPONENT,
  ['R'] = LEX_IS_SYMBOL_COMPONENT, ['r'] = LEX_IS_SYMBOL_COMPONENT,
  ['S'] = LEX_IS_SYMBOL_COMPONENT, ['s'] = LEX_IS_SYMBOL_COMPONENT,
  ['T'] = LEX_IS_SYMBOL_COMPONENT, ['t'] = LEX_IS_SYMBOL_COMPONENT,
  ['U'] = LEX_IS_SYMBOL_COMPONENT, ['u'] = LEX_IS_SYMBOL_COMPONENT,
  ['V'] = LEX_IS_SYMBOL_COMPONENT, ['v'] = LEX_IS_SYMBOL_COMPONENT,
  ['W'] = LEX_IS_SYMBOL_COMPONENT, ['w'] = LEX_IS_SYMBOL_COMPONENT,
  ['X'] = LEX_IS_SYMBOL_COMPONENT, ['x'] = LEX_IS_SYMBOL_COMPONENT,
  ['Y'] = LEX_IS_SYMBOL_COMPONENT, ['y'] = LEX_IS_SYMBOL_COMPONENT,
  ['Z'] = LEX_IS_SYMBOL_COMPONENT, ['z'] = LEX_IS_SYMBOL_COMPONENT,
  ['0'] = LEX_IS_SYMBOL_COMPONENT,
  ['1'] = LEX_IS_SYMBOL_COMPONENT,
  ['2'] = LEX_IS_SYMBOL_COMPONENT,
  ['3'] = LEX_IS_SYMBOL_COMPONENT,
  ['4'] = LEX_IS_SYMBOL_COMPONENT,
  ['5'] = LEX_IS_SYMBOL_COMPONENT,
  ['6'] = LEX_IS_SYMBOL_COMPONENT,
  ['7'] = LEX_IS_SYMBOL_COMPONENT,
  ['8'] = LEX_IS_SYMBOL_COMPONENT,
  ['9'] = LEX_IS_SYMBOL_COMPONENT,
  /* INITn (B) marks the n consecutive entries starting at index B as
     symbol components; each macro doubles the span of the previous
     one.  INIT128 (128) therefore covers indices 128 through 255.  */
#define INIT2(n)      [n] = LEX_IS_SYMBOL_COMPONENT, \
                [(n) + 1] = LEX_IS_SYMBOL_COMPONENT
#define INIT4(n)   INIT2 (n),  INIT2 ((n) +  2)
#define INIT8(n)   INIT4 (n),  INIT4 ((n) +  4)
#define INIT16(n)  INIT8 (n),  INIT8 ((n) +  8)
#define INIT32(n)  INIT16 (n), INIT16 ((n) + 16)
#define INIT64(n)  INIT32 (n), INIT32 ((n) + 32)
#define INIT128(n) INIT64 (n), INIT64 ((n) + 64)
  INIT128 (128),
#undef INIT128
#undef INIT64
#undef INIT32
#undef INIT16
#undef INIT8
#undef INIT4
#undef INIT2
};
167 | | |
168 | | void |
169 | | do_scrub_begin (int m68k_mri ATTRIBUTE_UNUSED) |
170 | 207 | { |
171 | 207 | const char *p; |
172 | | |
173 | | /* Latch this once at start. xtensa uses a hook function, yet context isn't |
174 | | meaningful for scrubbing (or else we'd need to sync scrubber behavior as |
175 | | state changes). */ |
176 | 207 | if (lex['/'] == 0) |
177 | 1 | no_pseudo_dot = NO_PSEUDO_DOT; |
178 | | |
179 | | #ifdef TC_M68K |
180 | | scrub_m68k_mri = m68k_mri; |
181 | | |
182 | | if (! m68k_mri) |
183 | | #endif |
184 | 207 | { |
185 | 207 | lex['"'] = LEX_IS_STRINGQUOTE; |
186 | | |
187 | 207 | #if ! defined (TC_HPPA) |
188 | 207 | lex['\''] = LEX_IS_ONECHAR_QUOTE; |
189 | 207 | #endif |
190 | | |
191 | | #ifdef SINGLE_QUOTE_STRINGS |
192 | | lex['\''] = LEX_IS_STRINGQUOTE; |
193 | | #endif |
194 | 207 | } |
195 | | |
196 | | /* Note: if any other character can be LEX_IS_STRINGQUOTE, the loop |
197 | | in state 5 of do_scrub_chars must be changed. */ |
198 | | |
199 | | /* Note that these override the previous defaults, e.g. if ';' is a |
200 | | comment char, then it isn't a line separator. */ |
201 | | |
202 | 207 | #ifdef tc_symbol_chars |
203 | | /* This macro permits the processor to specify all characters which |
204 | | may appears in an operand. This will prevent the scrubber from |
205 | | discarding meaningful whitespace in certain cases. The i386 |
206 | | backend uses this to support prefixes, which can confuse the |
207 | | scrubber as to whether it is parsing operands or opcodes. */ |
208 | 1.24k | for (p = tc_symbol_chars; *p; ++p) |
209 | 1.03k | lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT; |
210 | 207 | #endif |
211 | | |
212 | | /* The m68k backend wants to be able to change comment_chars. */ |
213 | | #ifndef tc_comment_chars |
214 | | #define tc_comment_chars comment_chars |
215 | | #endif |
216 | 414 | for (p = tc_comment_chars; *p; p++) |
217 | 207 | lex[(unsigned char) *p] = LEX_IS_COMMENT_START; |
218 | | |
219 | | /* While counter intuitive to have more special purpose line comment chars |
220 | | override more general purpose ordinary ones, logic in do_scrub_chars() |
221 | | depends on this ordering. */ |
222 | 621 | for (p = line_comment_chars; *p; p++) |
223 | 414 | lex[(unsigned char) *p] = LEX_IS_LINE_COMMENT_START; |
224 | | |
225 | 207 | #ifndef tc_line_separator_chars |
226 | 207 | #define tc_line_separator_chars line_separator_chars |
227 | 207 | #endif |
228 | 414 | for (p = tc_line_separator_chars; *p; p++) |
229 | 207 | lex[(unsigned char) *p] = LEX_IS_LINE_SEPARATOR; |
230 | | |
231 | | #ifdef tc_parallel_separator_chars |
232 | | /* This macro permits the processor to specify all characters which |
233 | | separate parallel insns on the same line. */ |
234 | | for (p = tc_parallel_separator_chars; *p; p++) |
235 | | lex[(unsigned char) *p] = LEX_IS_PARALLEL_SEPARATOR; |
236 | | #endif |
237 | | |
238 | | /* Only allow slash-star comments if slash is not in use. Certain |
239 | | other cases are dealt with in LEX_IS_LINE_COMMENT_START handling. |
240 | | FIXME: This isn't right. We should always permit them. */ |
241 | 207 | if (lex['/'] == 0) |
242 | 0 | lex['/'] = LEX_IS_TWOCHAR_COMMENT_1ST; |
243 | | |
244 | | #ifdef TC_M68K |
245 | | if (m68k_mri) |
246 | | { |
247 | | lex['\''] = LEX_IS_STRINGQUOTE; |
248 | | lex[';'] = LEX_IS_COMMENT_START; |
249 | | lex['*'] = LEX_IS_LINE_COMMENT_START; |
250 | | /* The MRI documentation says '!' is LEX_IS_COMMENT_START, but |
251 | | then it can't be used in an expression. */ |
252 | | lex['!'] = LEX_IS_LINE_COMMENT_START; |
253 | | } |
254 | | #endif |
255 | | |
256 | | #ifdef TC_V850 |
257 | | lex['-'] = LEX_IS_DOUBLEDASH_1ST; |
258 | | #endif |
259 | | #ifdef DOUBLEBAR_PARALLEL |
260 | | lex['|'] = LEX_IS_DOUBLEBAR_1ST; |
261 | | #endif |
262 | | |
263 | | #ifdef H_TICK_HEX |
264 | | if (enable_h_tick_hex) |
265 | | { |
266 | | lex['h'] = LEX_IS_H; |
267 | | lex['H'] = LEX_IS_H; |
268 | | } |
269 | | #endif |
270 | 207 | } |
271 | | |
/* Saved state of the scrubber.  */

/* Current state of the do_scrub_chars state machine.  */
static int state;
/* State to resume once a string (state 5) or an out_string (state -1)
   has been fully emitted.  */
static int old_state;
/* Next character to emit while in state -1.  */
static const char *out_string;
/* NOTE(review): saved and restored together with out_string; its users
   lie outside the part of the file examined here.  Presumably backing
   storage for short out_string texts -- confirm.  */
static char out_buf[20];
/* Incremented each time a backslash-newline inside a string is rewritten
   as the two characters `\' `n'.  NOTE(review): presumably the number of
   newlines still owed to the output -- confirm where it is consumed.  */
static int add_newlines;
/* Input not yet consumed when do_scrub_chars last returned, or NULL if
   there is none; saved_input_len is its length.  */
static char *saved_input;
static size_t saved_input_len;
/* Buffer handed to the GET callback of do_scrub_chars.  app_pop also
   copies restored pending input into it.  */
static char input_buffer[32 * 1024];
281 | | |
282 | | /* Data structure for saving the state of app across #include's. Note that |
283 | | app is called asynchronously to the parsing of the .include's, so our |
284 | | state at the time .include is interpreted is completely unrelated. |
285 | | That's why we have to save it all. */ |
286 | | |
/* Snapshot of the scrubber's file-scope state, created by app_push and
   consumed by app_pop.  Each member holds the value of the file-scope
   variable of the same name.  */
struct app_save
{
  int state;
  int old_state;
  const char *out_string;
  char out_buf[sizeof (out_buf)];
  int add_newlines;
  /* Heap copy of the pending input, or NULL if there was none.  Owned
     by the snapshot and freed by app_pop.  */
  char *saved_input;
  /* Length of saved_input; only assigned when saved_input is non-NULL.  */
  size_t saved_input_len;
  const char *end_state;
#ifdef TC_M68K
  int scrub_m68k_mri;
  const char *mri_state;
  char mri_last_ch;
#endif
#if defined TC_ARM && defined OBJ_ELF
  const char *symver_state;
#endif
  char last_char;
};
307 | | |
308 | | char * |
309 | | app_push (void) |
310 | 113 | { |
311 | 113 | struct app_save *saved; |
312 | | |
313 | 113 | saved = XNEW (struct app_save); |
314 | 113 | saved->state = state; |
315 | 113 | saved->old_state = old_state; |
316 | 113 | saved->out_string = out_string; |
317 | 113 | memcpy (saved->out_buf, out_buf, sizeof (out_buf)); |
318 | 113 | saved->add_newlines = add_newlines; |
319 | 113 | if (saved_input == NULL) |
320 | 113 | saved->saved_input = NULL; |
321 | 0 | else |
322 | 0 | { |
323 | 0 | saved->saved_input = XNEWVEC (char, saved_input_len); |
324 | 0 | memcpy (saved->saved_input, saved_input, saved_input_len); |
325 | 0 | saved->saved_input_len = saved_input_len; |
326 | 0 | } |
327 | 113 | saved->end_state = end_state; |
328 | | #ifdef TC_M68K |
329 | | saved->scrub_m68k_mri = scrub_m68k_mri; |
330 | | saved->mri_state = mri_state; |
331 | | saved->mri_last_ch = mri_last_ch; |
332 | | #endif |
333 | | #if defined TC_ARM && defined OBJ_ELF |
334 | | saved->symver_state = symver_state; |
335 | | #endif |
336 | 113 | saved->last_char = last_char; |
337 | | |
338 | | /* do_scrub_begin() is not useful, just wastes time. */ |
339 | | |
340 | 113 | state = 0; |
341 | 113 | saved_input = NULL; |
342 | 113 | add_newlines = 0; |
343 | | |
344 | 113 | return (char *) saved; |
345 | 113 | } |
346 | | |
347 | | void |
348 | | app_pop (char *arg) |
349 | 113 | { |
350 | 113 | struct app_save *saved = (struct app_save *) arg; |
351 | | |
352 | | /* There is no do_scrub_end (). */ |
353 | 113 | state = saved->state; |
354 | 113 | old_state = saved->old_state; |
355 | 113 | out_string = saved->out_string; |
356 | 113 | memcpy (out_buf, saved->out_buf, sizeof (out_buf)); |
357 | 113 | add_newlines = saved->add_newlines; |
358 | 113 | if (saved->saved_input == NULL) |
359 | 113 | saved_input = NULL; |
360 | 0 | else |
361 | 0 | { |
362 | 0 | gas_assert (saved->saved_input_len <= sizeof (input_buffer)); |
363 | 0 | memcpy (input_buffer, saved->saved_input, saved->saved_input_len); |
364 | 0 | saved_input = input_buffer; |
365 | 0 | saved_input_len = saved->saved_input_len; |
366 | 0 | free (saved->saved_input); |
367 | 0 | } |
368 | 113 | end_state = saved->end_state; |
369 | | #ifdef TC_M68K |
370 | | scrub_m68k_mri = saved->scrub_m68k_mri; |
371 | | mri_state = saved->mri_state; |
372 | | mri_last_ch = saved->mri_last_ch; |
373 | | #endif |
374 | | #if defined TC_ARM && defined OBJ_ELF |
375 | | symver_state = saved->symver_state; |
376 | | #endif |
377 | 113 | last_char = saved->last_char; |
378 | | |
379 | 113 | free (arg); |
380 | 113 | } |
381 | | |
382 | | /* @@ This assumes that \n &c are the same on host and target. This is not |
383 | | necessarily true. */ |
384 | | |
/* Return the character denoted by the escape letter CH, i.e. by the
   character following a backslash.  The letters b, f, n, r and t map
   to the matching control characters; anything else, the two quote
   characters included, is returned unchanged.  */

static int
process_escape (int ch)
{
  switch (ch)
    {
    case 'b':
      ch = '\b';
      break;
    case 'f':
      ch = '\f';
      break;
    case 'n':
      ch = '\n';
      break;
    case 'r':
      ch = '\r';
      break;
    case 't':
      ch = '\t';
      break;
    default:
      /* '\'' and '"' stand for themselves, as does any other
         character.  */
      break;
    }

  return ch;
}
408 | | |
/* Maximum number of multibyte-character warnings issued over the whole
   run; the warning that reaches the limit is followed by a notice that
   further ones are suppressed.  */
#define MULTIBYTE_WARN_COUNT_LIMIT 10
static unsigned int multibyte_warn_count = 0;

/* Look for bytes above 0x7f in the range [START, END).

   With WARN false, return true as soon as such a byte is found and
   false if there is none; nothing is reported.

   With WARN true, issue one warning per such byte until
   MULTIBYTE_WARN_COUNT_LIMIT warnings have been issued in total, and
   return whether any such byte was seen during this call.  Once the
   limit has been reached, later calls with WARN true return false
   without scanning.

   An empty or inverted range yields false.  */

bool
scan_for_multibyte_characters (const unsigned char *start,
                               const unsigned char *end,
                               bool warn)
{
  if (end <= start)
    return false;

  /* This has to be >= rather than >.  The counter stops at exactly the
     limit when the "suppressed" notice is printed.  A > test would let
     the next call through, and because the counter would then step
     past the limit, the == test below would never fire again and every
     remaining multibyte byte in that buffer would be reported.  */
  if (warn && multibyte_warn_count >= MULTIBYTE_WARN_COUNT_LIMIT)
    return false;

  bool found = false;

  while (start < end)
    {
      unsigned char c;

      if ((c = *start++) <= 0x7f)
        continue;

      if (!warn)
        return true;

      found = true;

      const char *filename;
      unsigned int lineno;

      filename = as_where (&lineno);
      if (filename == NULL)
        as_warn (_("multibyte character (%#x) encountered in input"), c);
      else if (lineno == 0)
        as_warn (_("multibyte character (%#x) encountered in %s"), c, filename);
      else
        as_warn (_("multibyte character (%#x) encountered in %s at or near line %u"), c, filename, lineno);

      if (++multibyte_warn_count == MULTIBYTE_WARN_COUNT_LIMIT)
        {
          as_warn (_("further multibyte character warnings suppressed"));
          break;
        }
    }

  return found;
}
457 | | |
458 | | /* This function is called to process input characters. The GET |
459 | | parameter is used to retrieve more input characters. GET is passed |
460 | | a buffer and its size; it should fill the buffer and return the number |
461 | | of characters stored, or 0 at end of file. The scrubbed output |
462 | | characters are put into the buffer starting at TOSTART; the TOSTART |
463 | | buffer is TOLEN bytes in length. The function returns the number |
464 | | of scrubbed characters put into TOSTART. This will be TOLEN unless |
465 | | end of file was seen. This function is arranged as a state |
466 | | machine, and saves its state so that it may return at any point. |
467 | | This is the way the old code used to work. */ |
468 | | |
469 | | size_t |
470 | | do_scrub_chars (size_t (*get) (char *, size_t), char *tostart, size_t tolen, |
471 | | bool check_multibyte) |
472 | 525 | { |
473 | 525 | char *to = tostart; |
474 | 525 | char *toend = tostart + tolen; |
475 | 525 | char *from; |
476 | 525 | char *fromend; |
477 | 525 | size_t fromlen; |
478 | 525 | int ch, ch2 = 0; |
479 | | /* Character that started the string we're working on. */ |
480 | 525 | static char quotechar; |
481 | | |
482 | | /*State 0: beginning of normal line |
483 | | 1: After first whitespace on line (flush more white) |
484 | | 2: After first non-white (opcode) on line (keep 1white) |
485 | | 3: after second white on line (into operands) (flush white) |
486 | | 4: after putting out a .linefile, put out digits |
487 | | 5: parsing a string, then go to old-state |
488 | | 6: putting out \ escape in a "d string. |
489 | | 7: no longer used |
490 | | 8: no longer used |
491 | | 9: After seeing symbol char in state 3 (keep 1white after symchar) |
492 | | 10: After seeing whitespace in state 9 (keep white before symchar) |
493 | | 11: After seeing a symbol character in state 0 (eg a label definition) |
494 | | -1: output string in out_string and go to the state in old_state |
495 | | 12: no longer used |
496 | | #ifdef DOUBLEBAR_PARALLEL |
497 | | 13: After seeing a vertical bar, looking for a second |
498 | | vertical bar as a parallel expression separator. |
499 | | #endif |
500 | | #ifdef TC_PREDICATE_START_CHAR |
501 | | 14: After seeing a predicate start character at state 0, looking |
502 | | for a predicate end character as predicate. |
503 | | 15: After seeing a predicate start character at state 1, looking |
504 | | for a predicate end character as predicate. |
505 | | #endif |
506 | | #ifdef TC_Z80 |
507 | | 16: After seeing an 'a' or an 'A' at the start of a symbol |
508 | | 17: After seeing an 'f' or an 'F' in state 16 |
509 | | #endif |
510 | | */ |
511 | | |
512 | | /* I added states 9 and 10 because the MIPS ECOFF assembler uses |
513 | | constructs like ``.loc 1 20''. This was turning into ``.loc |
514 | | 120''. States 9 and 10 ensure that a space is never dropped in |
515 | | between characters which could appear in an identifier. Ian |
516 | | Taylor, ian@cygnus.com. |
517 | | |
518 | | I added state 11 so that something like "Lfoo add %r25,%r26,%r27" works |
519 | | correctly on the PA (and any other target where colons are optional). |
520 | | Jeff Law, law@cs.utah.edu. |
521 | | |
522 | | I added state 13 so that something like "cmp r1, r2 || trap #1" does not |
523 | | get squashed into "cmp r1,r2||trap#1", with the all important space |
524 | | between the 'trap' and the '#1' being eliminated. nickc@cygnus.com */ |
525 | | |
526 | | /* This macro gets the next input character. */ |
527 | | |
528 | 525 | #define GET() \ |
529 | 597k | (from < fromend \ |
530 | 597k | ? * (unsigned char *) (from++) \ |
531 | 597k | : (saved_input = NULL, \ |
532 | 436 | fromlen = (*get) (input_buffer, sizeof input_buffer), \ |
533 | 436 | from = input_buffer, \ |
534 | 436 | fromend = from + fromlen, \ |
535 | 436 | (fromlen == 0 \ |
536 | 436 | ? EOF \ |
537 | 436 | : * (unsigned char *) (from++)))) |
538 | | |
539 | | /* This macro pushes a character back on the input stream. */ |
540 | | |
541 | 65.1k | #define UNGET(uch) (*--from = (uch)) |
542 | | |
543 | | /* This macro puts a character into the output buffer. If this |
544 | | character fills the output buffer, this macro jumps to the label |
545 | | TOFULL. We use this rather ugly approach because we need to |
546 | | handle two different termination conditions: EOF on the input |
547 | | stream, and a full output buffer. It would be simpler if we |
548 | | always read in the entire input stream before processing it, but |
549 | | I don't want to make such a significant change to the assembler's |
550 | | memory usage. */ |
551 | | |
552 | 525 | #define PUT(pch) \ |
553 | 506k | do \ |
554 | 506k | { \ |
555 | 506k | *to++ = (pch); \ |
556 | 506k | if (to >= toend) \ |
557 | 506k | goto tofull; \ |
558 | 506k | } \ |
559 | 506k | while (0) |
560 | | |
561 | 525 | if (saved_input != NULL) |
562 | 33 | { |
563 | 33 | from = saved_input; |
564 | 33 | fromend = from + saved_input_len; |
565 | 33 | } |
566 | 492 | else |
567 | 492 | { |
568 | 492 | fromlen = (*get) (input_buffer, sizeof input_buffer); |
569 | 492 | if (fromlen == 0) |
570 | 207 | return 0; |
571 | 285 | from = input_buffer; |
572 | 285 | fromend = from + fromlen; |
573 | | |
574 | 285 | if (check_multibyte) |
575 | 0 | (void) scan_for_multibyte_characters ((const unsigned char *) from, |
576 | 0 | (const unsigned char *) fromend, |
577 | 0 | true /* Generate warnings. */); |
578 | 285 | } |
579 | | |
580 | 394k | while (1) |
581 | 394k | { |
582 | | /* The cases in this switch end with continue, in order to |
583 | | branch back to the top of this while loop and generate the |
584 | | next output character in the appropriate state. */ |
585 | 394k | switch (state) |
586 | 394k | { |
587 | 7.35k | case -1: |
588 | 7.35k | ch = *out_string++; |
589 | 7.35k | if (*out_string == '\0') |
590 | 1.45k | { |
591 | 1.45k | state = old_state; |
592 | 1.45k | old_state = 3; |
593 | 1.45k | } |
594 | 7.35k | PUT (ch); |
595 | 7.35k | continue; |
596 | | |
597 | 7.35k | case 4: |
598 | 1.49k | ch = GET (); |
599 | 1.49k | if (ch == EOF) |
600 | 0 | goto fromeof; |
601 | 1.49k | else if (ch >= '0' && ch <= '9') |
602 | 868 | PUT (ch); |
603 | 624 | else |
604 | 624 | { |
605 | 624 | while (ch != EOF && IS_WHITESPACE (ch)) |
606 | 0 | ch = GET (); |
607 | 624 | if (ch == '"') |
608 | 314 | { |
609 | 314 | quotechar = ch; |
610 | 314 | state = 5; |
611 | 314 | old_state = 3; |
612 | 314 | PUT (ch); |
613 | 314 | } |
614 | 310 | else |
615 | 310 | { |
616 | 1.09k | while (ch != EOF && ch != '\n') |
617 | 780 | ch = GET (); |
618 | 310 | state = 0; |
619 | 310 | PUT (ch); |
620 | 310 | } |
621 | 624 | } |
622 | 1.49k | continue; |
623 | | |
624 | 51.3k | case 5: |
625 | | /* We are going to copy everything up to a quote character, |
626 | | with special handling for a backslash. We try to |
627 | | optimize the copying in the simple case without using the |
628 | | GET and PUT macros. */ |
629 | 51.3k | { |
630 | 51.3k | char *s; |
631 | 51.3k | ptrdiff_t len; |
632 | | |
633 | 1.40M | for (s = from; s < fromend; s++) |
634 | 1.40M | { |
635 | 1.40M | ch = *s; |
636 | 1.40M | if (ch == '\\' |
637 | 1.40M | || ch == quotechar |
638 | 1.38M | || ch == '\n') |
639 | 51.2k | break; |
640 | 1.40M | } |
641 | 51.3k | len = s - from; |
642 | 51.3k | if (len > toend - to) |
643 | 2 | len = toend - to; |
644 | 51.3k | if (len > 0) |
645 | 42.9k | { |
646 | 42.9k | memcpy (to, from, len); |
647 | 42.9k | to += len; |
648 | 42.9k | from += len; |
649 | 42.9k | if (to >= toend) |
650 | 2 | goto tofull; |
651 | 42.9k | } |
652 | 51.3k | } |
653 | | |
654 | 51.3k | ch = GET (); |
655 | 51.3k | if (ch == EOF) |
656 | 84 | { |
657 | | /* This buffer is here specifically so |
658 | | that the UNGET below will work. */ |
659 | 84 | static char one_char_buf[1]; |
660 | | |
661 | 84 | as_warn (_("end of file in string; '%c' inserted"), quotechar); |
662 | 84 | state = old_state; |
663 | 84 | from = fromend = one_char_buf + 1; |
664 | 84 | fromlen = 1; |
665 | 84 | UNGET ('\n'); |
666 | 84 | PUT (quotechar); |
667 | 84 | } |
668 | 51.3k | else if (ch == quotechar) |
669 | 23.4k | { |
670 | 23.4k | state = old_state; |
671 | 23.4k | PUT (ch); |
672 | 23.4k | } |
673 | 27.8k | else if (TC_STRING_ESCAPES && ch == '\\') |
674 | 694 | { |
675 | 694 | state = 6; |
676 | 694 | PUT (ch); |
677 | 694 | } |
678 | 27.1k | else if (scrub_m68k_mri && ch == '\n') |
679 | 0 | { |
680 | | /* Just quietly terminate the string. This permits lines like |
681 | | bne label loop if we haven't reach end yet. */ |
682 | 0 | state = old_state; |
683 | 0 | UNGET (ch); |
684 | 0 | PUT ('\''); |
685 | 0 | } |
686 | 27.1k | else |
687 | 27.1k | { |
688 | 27.1k | PUT (ch); |
689 | 27.1k | } |
690 | 51.3k | continue; |
691 | | |
692 | 51.3k | case 6: |
693 | 694 | state = 5; |
694 | 694 | ch = GET (); |
695 | 694 | switch (ch) |
696 | 694 | { |
697 | | /* Handle strings broken across lines, by turning '\n' into |
698 | | '\\' and 'n'. */ |
699 | 1 | case '\n': |
700 | 1 | UNGET ('n'); |
701 | 1 | add_newlines++; |
702 | 1 | PUT ('\\'); |
703 | 1 | continue; |
704 | | |
705 | 1 | case EOF: |
706 | 0 | as_warn (_("end of file in string; '%c' inserted"), quotechar); |
707 | 0 | PUT (quotechar); |
708 | 0 | continue; |
709 | | |
710 | | /* These two are used inside macros. */ |
711 | 0 | case '@': |
712 | 0 | case '+': |
713 | 0 | break; |
714 | | |
715 | 1 | case '"': |
716 | 33 | case '\\': |
717 | 45 | case 'b': |
718 | 48 | case 'f': |
719 | 50 | case 'n': |
720 | 123 | case 'r': |
721 | 126 | case 't': |
722 | 126 | case 'v': |
723 | 126 | case 'x': |
724 | 126 | case 'X': |
725 | 181 | case '0': |
726 | 183 | case '1': |
727 | 184 | case '2': |
728 | 187 | case '3': |
729 | 189 | case '4': |
730 | 191 | case '5': |
731 | 195 | case '6': |
732 | 195 | case '7': |
733 | 195 | break; |
734 | | |
735 | 498 | default: |
736 | | #ifdef ONLY_STANDARD_ESCAPES |
737 | | as_warn (_("unknown escape '\\%c' in string; ignored"), ch); |
738 | | #endif |
739 | 498 | break; |
740 | 694 | } |
741 | 693 | PUT (ch); |
742 | 693 | continue; |
743 | | |
744 | | #ifdef DOUBLEBAR_PARALLEL |
745 | | case 13: |
746 | | ch = GET (); |
747 | | if (ch != '|') |
748 | | abort (); |
749 | | |
750 | | /* Reset back to state 1 and pretend that we are parsing a |
751 | | line from just after the first white space. */ |
752 | | state = 1; |
753 | | PUT ('|'); |
754 | | continue; |
755 | | #endif |
756 | | #ifdef TC_Z80 |
757 | | case 16: |
758 | | /* We have seen an 'a' at the start of a symbol, look for an 'f'. */ |
759 | | ch = GET (); |
760 | | if (ch == 'f' || ch == 'F') |
761 | | { |
762 | | state = 17; |
763 | | PUT (ch); |
764 | | } |
765 | | else |
766 | | { |
767 | | if (ch != EOF) |
768 | | UNGET (ch); |
769 | | state = 9; |
770 | | break; |
771 | | } |
772 | | /* Fall through. */ |
773 | | case 17: |
774 | | /* We have seen "af" at the start of a symbol, |
775 | | a ' here is a part of that symbol. */ |
776 | | ch = GET (); |
777 | | state = 9; |
778 | | if (ch == '\'') |
779 | | /* Change to avoid warning about unclosed string. */ |
780 | | PUT ('`'); |
781 | | else if (ch != EOF) |
782 | | UNGET (ch); |
783 | | break; |
784 | | #endif |
785 | 394k | } |
786 | | |
787 | | /* OK, we are somewhere in states 0 through 4 or 9 through 11. */ |
788 | | |
789 | | /* flushchar: */ |
790 | 333k | ch = GET (); |
791 | | |
792 | | #ifdef TC_PREDICATE_START_CHAR |
793 | | if (ch == TC_PREDICATE_START_CHAR && (state == 0 || state == 1)) |
794 | | { |
795 | | state += 14; |
796 | | PUT (ch); |
797 | | continue; |
798 | | } |
799 | | else if (state == 14 || state == 15) |
800 | | { |
801 | | if (ch == TC_PREDICATE_END_CHAR) |
802 | | { |
803 | | state -= 14; |
804 | | PUT (ch); |
805 | | ch = GET (); |
806 | | } |
807 | | else |
808 | | { |
809 | | PUT (ch); |
810 | | continue; |
811 | | } |
812 | | } |
813 | | #endif |
814 | | |
815 | 371k | recycle: |
816 | | |
817 | | /* We need to watch out for .end directives: We should in particular not |
818 | | issue diagnostics for anything after an active one. */ |
819 | 371k | if (ch == EOF) |
820 | 272 | end_state = NULL; |
821 | 370k | else if (end_state == NULL) |
822 | 338k | { |
823 | 338k | if ((state == 0 || state == 1) |
824 | 82.9k | && (ch == '.' |
825 | 51.4k | || (no_pseudo_dot && ch == end_pseudo[0]))) |
826 | 31.5k | end_state = end_pseudo + (ch != '.'); |
827 | 338k | } |
828 | 32.7k | else if (ch != '\0' |
829 | 32.7k | && (*end_state == ch |
830 | | /* Avoid triggering on directives like .endif or .endr. */ |
831 | 31.6k | || (*end_state == ' ' && !IS_SYMBOL_COMPONENT (ch)))) |
832 | 1.05k | { |
833 | 1.05k | if (IS_NEWLINE (ch) || IS_LINE_SEPARATOR (ch)) |
834 | 1 | goto end_end; |
835 | 1.05k | ++end_state; |
836 | 1.05k | } |
837 | 31.6k | else if (*end_state != '\0') |
838 | | /* We did not get the expected character, or we didn't |
839 | | get a valid terminating character after seeing the |
840 | | entire pseudo-op, so we must go back to the beginning. */ |
841 | 31.5k | end_state = NULL; |
842 | 144 | else if (IS_NEWLINE (ch) || IS_LINE_SEPARATOR (ch)) |
843 | 18 | { |
844 | 19 | end_end: |
845 | | /* We've read the entire pseudo-op. If this is the end of the line, |
846 | | bail out now by (ab)using the output-full path. This allows the |
847 | | caller to process input up to here and terminate processing if this |
848 | | directive is actually active (not on the false branch of a |
849 | | conditional and not in a macro definition). */ |
850 | 19 | end_state = NULL; |
851 | 19 | state = 0; |
852 | 19 | PUT (ch); |
853 | 19 | goto tofull; |
854 | 19 | } |
855 | | |
856 | | #if defined TC_ARM && defined OBJ_ELF |
857 | | /* We need to watch out for .symver directives. See the comment later |
858 | | in this function. */ |
859 | | if (ch == EOF) |
860 | | symver_state = NULL; |
861 | | else if (symver_state == NULL) |
862 | | { |
863 | | if ((state == 0 || state == 1) |
864 | | && strchr (tc_comment_chars, '@') != NULL |
865 | | && ch == symver_pseudo[0]) |
866 | | symver_state = symver_pseudo + 1; |
867 | | } |
868 | | else |
869 | | { |
870 | | /* We advance to the next state if we find the right |
871 | | character. */ |
872 | | if (ch != '\0' && (*symver_state == ch)) |
873 | | ++symver_state; |
874 | | else if (*symver_state != '\0') |
875 | | /* We did not get the expected character, or we didn't |
876 | | get a valid terminating character after seeing the |
877 | | entire pseudo-op, so we must go back to the beginning. */ |
878 | | symver_state = NULL; |
879 | | else |
880 | | { |
881 | | /* We've read the entire pseudo-op. If this is the end |
882 | | of the line, go back to the beginning. */ |
883 | | if (IS_NEWLINE (ch) || IS_LINE_SEPARATOR (ch)) |
884 | | symver_state = NULL; |
885 | | } |
886 | | } |
887 | | #endif /* TC_ARM && OBJ_ELF */ |
888 | | |
889 | | #ifdef TC_M68K |
890 | | /* We want to have pseudo-ops which control whether we are in |
891 | | MRI mode or not. Unfortunately, since m68k MRI mode affects |
892 | | the scrubber, that means that we need a special purpose |
893 | | recognizer here. */ |
894 | | if (ch == EOF) |
895 | | mri_state = NULL; |
896 | | else if (mri_state == NULL) |
897 | | { |
898 | | if ((state == 0 || state == 1) |
899 | | && ch == mri_pseudo[0]) |
900 | | mri_state = mri_pseudo + 1; |
901 | | } |
902 | | else |
903 | | { |
904 | | /* We advance to the next state if we find the right |
905 | | character, or if we need a space character and we get any |
906 | | whitespace character, or if we need a '0' and we get a |
907 | | '1' (this is so that we only need one state to handle |
908 | | ``.mri 0'' and ``.mri 1''). */ |
909 | | if (ch != '\0' |
910 | | && (*mri_state == ch |
911 | | || (*mri_state == ' ' |
912 | | && IS_WHITESPACE (ch)) |
913 | | || (*mri_state == '0' |
914 | | && ch == '1'))) |
915 | | { |
916 | | mri_last_ch = ch; |
917 | | ++mri_state; |
918 | | } |
919 | | else if (*mri_state != '\0' |
920 | | || (!IS_WHITESPACE (ch) |
921 | | && !IS_LINE_SEPARATOR (ch) |
922 | | && !IS_NEWLINE (ch))) |
923 | | { |
924 | | /* We did not get the expected character, or we didn't |
925 | | get a valid terminating character after seeing the |
926 | | entire pseudo-op, so we must go back to the |
927 | | beginning. */ |
928 | | mri_state = NULL; |
929 | | } |
930 | | else |
931 | | { |
932 | | /* We've read the entire pseudo-op. mri_last_ch is |
933 | | either '0' or '1' indicating whether to enter or |
934 | | leave MRI mode. */ |
935 | | do_scrub_begin (mri_last_ch == '1'); |
936 | | mri_state = NULL; |
937 | | |
938 | | /* We continue handling the character as usual. The |
939 | | main gas reader must also handle the .mri pseudo-op |
940 | | to control expression parsing and the like. */ |
941 | | } |
942 | | } |
943 | | #endif |
944 | | |
945 | 371k | if (ch == EOF) |
946 | 272 | { |
947 | 272 | if (state != 0) |
948 | 113 | { |
949 | 113 | as_warn (_("end of file not at end of a line; newline inserted")); |
950 | 113 | state = 0; |
951 | 113 | PUT ('\n'); |
952 | 113 | } |
953 | 272 | goto fromeof; |
954 | 272 | } |
955 | | |
956 | 370k | switch (lex[ch]) |
957 | 370k | { |
958 | 75.9k | case LEX_IS_WHITESPACE: |
959 | 75.9k | do |
960 | 79.8k | { |
961 | 79.8k | ch = GET (); |
962 | 79.8k | } |
963 | 79.8k | while (ch != EOF && IS_WHITESPACE (ch)); |
964 | 75.9k | if (ch == EOF) |
965 | 12 | goto fromeof; |
966 | | |
967 | 75.9k | if (state == 0) |
968 | 15.8k | { |
969 | | /* Preserve a single whitespace character at the |
970 | | beginning of a line. */ |
971 | 15.8k | state = 1; |
972 | 15.8k | UNGET (ch); |
973 | 15.8k | PUT (' '); |
974 | 15.8k | break; |
975 | 15.8k | } |
976 | | |
977 | | #ifdef KEEP_WHITE_AROUND_COLON |
978 | | if (lex[ch] == LEX_IS_COLON) |
979 | | { |
980 | | /* Only keep this white if there's no white *after* the |
981 | | colon. */ |
982 | | ch2 = GET (); |
983 | | if (ch2 != EOF) |
984 | | UNGET (ch2); |
985 | | if (!IS_WHITESPACE (ch2)) |
986 | | { |
987 | | state = 9; |
988 | | UNGET (ch); |
989 | | PUT (' '); |
990 | | break; |
991 | | } |
992 | | } |
993 | | #endif |
994 | | |
995 | | /* Prune trailing whitespace. */ |
996 | 60.0k | if (IS_COMMENT (ch) |
997 | 60.0k | || (IS_LINE_COMMENT (ch) |
998 | 237 | && (state < 1 || strchr (tc_comment_chars, ch))) |
999 | 59.8k | || IS_NEWLINE (ch) |
1000 | 59.2k | || IS_LINE_SEPARATOR (ch) |
1001 | 58.9k | || IS_PARALLEL_SEPARATOR (ch)) |
1002 | 1.16k | { |
1003 | 1.16k | if (scrub_m68k_mri) |
1004 | 0 | { |
1005 | | /* In MRI mode, we keep these spaces. */ |
1006 | 0 | UNGET (ch); |
1007 | 0 | PUT (' '); |
1008 | 0 | break; |
1009 | 0 | } |
1010 | 1.16k | goto recycle; |
1011 | 1.16k | } |
1012 | | #ifdef DOUBLESLASH_LINE_COMMENTS |
1013 | | if (IS_TWOCHAR_COMMENT_1ST (ch)) |
1014 | | { |
1015 | | ch2 = GET (); |
1016 | | if (ch2 != EOF) |
1017 | | UNGET (ch2); |
1018 | | if (ch2 == '/') |
1019 | | goto recycle; |
1020 | | } |
1021 | | #endif |
1022 | | |
1023 | | /* If we're in state 2 or 11, we've seen a non-white |
1024 | | character followed by whitespace. If the next character |
1025 | | is ':', this is whitespace after a label name which we |
1026 | | normally must ignore. In MRI mode, though, spaces are |
1027 | | not permitted between the label and the colon. */ |
1028 | 58.9k | if ((state == 2 || state == 11) |
1029 | 36.0k | && lex[ch] == LEX_IS_COLON |
1030 | 0 | && ! scrub_m68k_mri) |
1031 | 12 | { |
1032 | 12 | state = 1; |
1033 | 12 | PUT (ch); |
1034 | 12 | break; |
1035 | 12 | } |
1036 | | |
1037 | 58.9k | switch (state) |
1038 | 58.9k | { |
1039 | 8 | case 1: |
1040 | | /* We can arrive here if we leave a leading whitespace |
1041 | | character at the beginning of a line. */ |
1042 | 8 | goto recycle; |
1043 | 13.6k | case 2: |
1044 | 13.6k | state = 3; |
1045 | 13.6k | if (to + 1 < toend) |
1046 | 13.6k | { |
1047 | | /* Optimize common case by skipping UNGET/GET. */ |
1048 | 13.6k | PUT (' '); /* Sp after opco */ |
1049 | 13.6k | goto recycle; |
1050 | 13.6k | } |
1051 | 0 | UNGET (ch); |
1052 | 0 | PUT (' '); |
1053 | 0 | break; |
1054 | 427 | case 3: |
1055 | 427 | #ifndef TC_KEEP_OPERAND_SPACES |
1056 | | /* For TI C6X, we keep these spaces as they may separate |
1057 | | functional unit specifiers from operands. */ |
1058 | 427 | if (scrub_m68k_mri) |
1059 | 0 | #endif |
1060 | 0 | { |
1061 | | /* In MRI mode, we keep these spaces. */ |
1062 | 0 | UNGET (ch); |
1063 | 0 | PUT (' '); |
1064 | 0 | break; |
1065 | 0 | } |
1066 | 427 | goto recycle; /* Sp in operands */ |
1067 | 22.4k | case 9: |
1068 | 22.4k | case 10: |
1069 | 22.4k | #ifndef TC_KEEP_OPERAND_SPACES |
1070 | 22.4k | if (scrub_m68k_mri) |
1071 | 0 | #endif |
1072 | 0 | { |
1073 | | /* In MRI mode, we keep these spaces. */ |
1074 | 0 | state = 3; |
1075 | 0 | UNGET (ch); |
1076 | 0 | PUT (' '); |
1077 | 0 | break; |
1078 | 0 | } |
1079 | 22.4k | state = 10; /* Sp after symbol char */ |
1080 | 22.4k | goto recycle; |
1081 | 22.3k | case 11: |
1082 | 22.3k | if (LABELS_WITHOUT_COLONS || flag_m68k_mri) |
1083 | 0 | state = 1; |
1084 | 22.3k | else |
1085 | 22.3k | { |
1086 | | /* We know that ch is not ':', since we tested that |
1087 | | case above. Therefore this is not a label, so it |
1088 | | must be the opcode, and we've just seen the |
1089 | | whitespace after it. */ |
1090 | 22.3k | state = 3; |
1091 | 22.3k | } |
1092 | 22.3k | UNGET (ch); |
1093 | 22.3k | PUT (' '); /* Sp after label definition. */ |
1094 | 22.3k | break; |
1095 | 22.3k | default: |
1096 | 0 | BAD_CASE (state); |
1097 | 58.9k | } |
1098 | 22.3k | break; |
1099 | | |
1100 | 22.3k | case LEX_IS_TWOCHAR_COMMENT_1ST: |
1101 | 0 | ch2 = GET (); |
1102 | 0 | if (ch2 == '*') |
1103 | 0 | { |
1104 | 0 | twochar_comment: |
1105 | 0 | for (;;) |
1106 | 0 | { |
1107 | 0 | do |
1108 | 0 | { |
1109 | 0 | ch2 = GET (); |
1110 | 0 | if (ch2 != EOF && IS_NEWLINE (ch2)) |
1111 | 0 | add_newlines++; |
1112 | 0 | } |
1113 | 0 | while (ch2 != EOF && ch2 != '*'); |
1114 | |
|
1115 | 0 | while (ch2 == '*') |
1116 | 0 | ch2 = GET (); |
1117 | |
|
1118 | 0 | if (ch2 == EOF || ch2 == '/') |
1119 | 0 | break; |
1120 | | |
1121 | | /* This UNGET will ensure that we count newlines |
1122 | | correctly. */ |
1123 | 0 | UNGET (ch2); |
1124 | 0 | } |
1125 | |
|
1126 | 0 | if (ch2 == EOF) |
1127 | 0 | as_warn (_("end of file in multiline comment")); |
1128 | |
|
1129 | 0 | ch = ' '; |
1130 | 0 | goto recycle; |
1131 | 0 | } |
1132 | | #ifdef DOUBLESLASH_LINE_COMMENTS |
1133 | | else if (ch2 == '/') |
1134 | | { |
1135 | | do |
1136 | | { |
1137 | | ch = GET (); |
1138 | | } |
1139 | | while (ch != EOF && !IS_NEWLINE (ch)); |
1140 | | if (ch == EOF) |
1141 | | as_warn ("end of file in comment; newline inserted"); |
1142 | | state = 0; |
1143 | | PUT ('\n'); |
1144 | | break; |
1145 | | } |
1146 | | #endif |
1147 | 0 | else |
1148 | 0 | { |
1149 | 0 | if (ch2 != EOF) |
1150 | 0 | UNGET (ch2); |
1151 | 0 | if (state == 9 || state == 10) |
1152 | 0 | state = 3; |
1153 | 0 | PUT (ch); |
1154 | 0 | } |
1155 | 0 | break; |
1156 | | |
1157 | 23.2k | case LEX_IS_STRINGQUOTE: |
1158 | 23.2k | quotechar = ch; |
1159 | 23.2k | if (state == 10) |
1160 | 7.40k | { |
1161 | | /* Preserve the whitespace in foo "bar". */ |
1162 | 7.40k | UNGET (ch); |
1163 | 7.40k | state = 3; |
1164 | 7.40k | PUT (' '); |
1165 | | |
1166 | | /* PUT didn't jump out. We could just break, but we |
1167 | | know what will happen, so optimize a bit. */ |
1168 | 7.40k | ch = GET (); |
1169 | 7.40k | old_state = 9; |
1170 | 7.40k | } |
1171 | 15.8k | else if (state == 3) |
1172 | 408 | old_state = 9; |
1173 | 15.4k | else if (state == 0) |
1174 | 28 | old_state = 11; /* Now seeing label definition. */ |
1175 | 15.4k | else |
1176 | 15.4k | old_state = state; |
1177 | 23.2k | state = 5; |
1178 | 23.2k | PUT (ch); |
1179 | 23.2k | break; |
1180 | | |
1181 | 23.2k | case LEX_IS_ONECHAR_QUOTE: |
1182 | | #ifdef H_TICK_HEX |
1183 | | if (state == 9 && enable_h_tick_hex) |
1184 | | { |
1185 | | char c; |
1186 | | |
1187 | | c = GET (); |
1188 | | as_warn ("'%c found after symbol", c); |
1189 | | UNGET (c); |
1190 | | } |
1191 | | #endif |
1192 | 850 | if (state == 10) |
1193 | 3 | { |
1194 | | /* Preserve the whitespace in foo 'b'. */ |
1195 | 3 | UNGET (ch); |
1196 | 3 | state = 3; |
1197 | 3 | PUT (' '); |
1198 | 3 | break; |
1199 | 3 | } |
1200 | 847 | ch = GET (); |
1201 | 847 | if (ch == EOF) |
1202 | 1 | { |
1203 | 1 | as_warn (_("end of file after a one-character quote; \\0 inserted")); |
1204 | 1 | ch = 0; |
1205 | 1 | } |
1206 | 847 | if (ch == '\\') |
1207 | 2 | { |
1208 | 2 | ch = GET (); |
1209 | 2 | if (ch == EOF) |
1210 | 0 | { |
1211 | 0 | as_warn (_("end of file in escape character")); |
1212 | 0 | ch = '\\'; |
1213 | 0 | } |
1214 | 2 | else |
1215 | 2 | ch = process_escape (ch); |
1216 | 2 | } |
1217 | 847 | sprintf (out_buf, "%d", ch & 0xff); |
1218 | | |
1219 | | /* None of these 'x constants for us. We want 'x'. */ |
1220 | 847 | if ((ch = GET ()) != '\'') |
1221 | 791 | { |
1222 | | #ifdef REQUIRE_CHAR_CLOSE_QUOTE |
1223 | | as_warn (_("missing close quote; (assumed)")); |
1224 | | #else |
1225 | 791 | if (ch != EOF) |
1226 | 790 | UNGET (ch); |
1227 | 791 | #endif |
1228 | 791 | } |
1229 | 847 | if (strlen (out_buf) == 1) |
1230 | 14 | { |
1231 | 14 | PUT (out_buf[0]); |
1232 | 14 | break; |
1233 | 14 | } |
1234 | 833 | if (state == 9) |
1235 | 548 | old_state = 3; |
1236 | 285 | else |
1237 | 285 | old_state = state; |
1238 | 833 | state = -1; |
1239 | 833 | out_string = out_buf; |
1240 | 833 | PUT (*out_string++); |
1241 | 833 | break; |
1242 | | |
1243 | 3.37k | case LEX_IS_COLON: |
1244 | | #ifdef KEEP_WHITE_AROUND_COLON |
1245 | | state = 9; |
1246 | | #else |
1247 | 3.37k | if (state == 9 || state == 10) |
1248 | 927 | state = 3; |
1249 | 2.44k | else if (state != 3) |
1250 | 2.34k | state = 1; |
1251 | 3.37k | #endif |
1252 | 3.37k | PUT (ch); |
1253 | 3.37k | break; |
1254 | | |
1255 | 50.1k | case LEX_IS_NEWLINE: |
1256 | | /* Roll out a bunch of newlines from inside comments, etc. */ |
1257 | 50.1k | if (add_newlines) |
1258 | 1 | { |
1259 | 1 | --add_newlines; |
1260 | 1 | UNGET (ch); |
1261 | 1 | } |
1262 | | /* Fall through. */ |
1263 | | |
1264 | 62.6k | case LEX_IS_LINE_SEPARATOR: |
1265 | 62.6k | state = 0; |
1266 | 62.6k | PUT (ch); |
1267 | 62.6k | break; |
1268 | | |
1269 | 62.6k | case LEX_IS_PARALLEL_SEPARATOR: |
1270 | 0 | state = 1; |
1271 | 0 | PUT (ch); |
1272 | 0 | break; |
1273 | | |
1274 | | #ifdef TC_V850 |
1275 | | case LEX_IS_DOUBLEDASH_1ST: |
1276 | | ch2 = GET (); |
1277 | | if (ch2 != '-') |
1278 | | { |
1279 | | if (ch2 != EOF) |
1280 | | UNGET (ch2); |
1281 | | goto de_fault; |
1282 | | } |
1283 | | /* Read and skip to end of line. */ |
1284 | | do |
1285 | | { |
1286 | | ch = GET (); |
1287 | | } |
1288 | | while (ch != EOF && ch != '\n'); |
1289 | | |
1290 | | if (ch == EOF) |
1291 | | as_warn (_("end of file in comment; newline inserted")); |
1292 | | |
1293 | | state = 0; |
1294 | | PUT ('\n'); |
1295 | | break; |
1296 | | #endif |
1297 | | #ifdef DOUBLEBAR_PARALLEL |
1298 | | case LEX_IS_DOUBLEBAR_1ST: |
1299 | | ch2 = GET (); |
1300 | | if (ch2 != EOF) |
1301 | | UNGET (ch2); |
1302 | | if (ch2 != '|') |
1303 | | goto de_fault; |
1304 | | |
1305 | | /* Handle '||' in two states as invoking PUT twice might |
1306 | | result in the first one jumping out of this loop. We'd |
1307 | | then lose track of the state and one '|' char. */ |
1308 | | state = 13; |
1309 | | PUT ('|'); |
1310 | | break; |
1311 | | #endif |
1312 | 4.72k | case LEX_IS_LINE_COMMENT_START: |
1313 | | /* FIXME-someday: The two character comment stuff was badly |
1314 | | thought out. On i386, we want '/' as line comment start |
1315 | | AND we want C style comments. hence this hack. The |
1316 | | whole lexical process should be reworked. xoxorich. */ |
1317 | 4.72k | if (ch == '/') |
1318 | 2.57k | { |
1319 | 2.57k | ch2 = GET (); |
1320 | 2.57k | if (ch2 == '*') |
1321 | 0 | goto twochar_comment; |
1322 | 2.57k | if (ch2 != EOF) |
1323 | 2.57k | UNGET (ch2); |
1324 | 2.57k | } |
1325 | | |
1326 | 4.72k | if (state == 0 || state == 1) /* Only comment at start of line. */ |
1327 | 1.38k | { |
1328 | 1.38k | int startch; |
1329 | | |
1330 | 1.38k | startch = ch; |
1331 | | |
1332 | 1.38k | do |
1333 | 1.39k | { |
1334 | 1.39k | ch = GET (); |
1335 | 1.39k | } |
1336 | 1.39k | while (ch != EOF && IS_WHITESPACE (ch)); |
1337 | | |
1338 | 1.38k | if (ch == EOF) |
1339 | 0 | { |
1340 | 0 | as_warn (_("end of file in comment; newline inserted")); |
1341 | 0 | PUT ('\n'); |
1342 | 0 | break; |
1343 | 0 | } |
1344 | | |
1345 | 1.38k | if (ch < '0' || ch > '9' || state != 0 || startch != '#') |
1346 | 764 | { |
1347 | | /* Not a cpp line. */ |
1348 | 2.43k | while (ch != EOF && !IS_NEWLINE (ch)) |
1349 | 1.66k | ch = GET (); |
1350 | 764 | if (ch == EOF) |
1351 | 2 | { |
1352 | 2 | as_warn (_("end of file in comment; newline inserted")); |
1353 | 2 | PUT ('\n'); |
1354 | 2 | } |
1355 | 762 | else /* IS_NEWLINE (ch) */ |
1356 | 762 | { |
1357 | | /* To process non-zero add_newlines. */ |
1358 | 762 | UNGET (ch); |
1359 | 762 | } |
1360 | 764 | state = 0; |
1361 | 764 | break; |
1362 | 764 | } |
1363 | | /* Looks like `# 123 "filename"' from cpp. */ |
1364 | 624 | UNGET (ch); |
1365 | 624 | old_state = 4; |
1366 | 624 | state = -1; |
1367 | 624 | if (scrub_m68k_mri) |
1368 | 0 | out_string = "\tlinefile "; |
1369 | 624 | else |
1370 | 624 | out_string = "\t.linefile "; |
1371 | 624 | PUT (*out_string++); |
1372 | 624 | break; |
1373 | 624 | } |
1374 | | |
1375 | | #ifdef TC_D10V |
1376 | | /* All insns end in a char for which LEX_IS_SYMBOL_COMPONENT is true. |
1377 | | Trap is the only short insn that has a first operand that is |
1378 | | neither register nor label. |
1379 | | We must prevent exef0f ||trap #1 to degenerate to exef0f ||trap#1 . |
1380 | | We can't make '#' LEX_IS_SYMBOL_COMPONENT because it is |
1381 | | already LEX_IS_LINE_COMMENT_START. However, it is the |
1382 | | only character in line_comment_chars for d10v, hence we |
1383 | | can recognize it as such. */ |
1384 | | /* An alternative approach would be to reset the state to 1 when |
1385 | | we see '||', '<'- or '->', but that seems to be overkill. */ |
1386 | | if (state == 10) |
1387 | | PUT (' '); |
1388 | | #endif |
1389 | | /* We have a line comment character which is not at the |
1390 | | start of a line. If this is also a normal comment |
1391 | | character, fall through. Otherwise treat it as a default |
1392 | | character. */ |
1393 | 3.33k | if (strchr (tc_comment_chars, ch) == NULL) |
1394 | 2.57k | goto de_fault; |
1395 | 764 | if (scrub_m68k_mri |
1396 | 0 | && (ch == '!' || ch == '*' || ch == '#')) |
1397 | 0 | goto de_fault; |
1398 | | /* Fall through. */ |
1399 | 764 | case LEX_IS_COMMENT_START: |
1400 | | #if defined TC_ARM && defined OBJ_ELF |
1401 | | /* On the ARM, `@' is the comment character. |
1402 | | Unfortunately this is also a special character in ELF .symver |
1403 | | directives (and .type, though we deal with those another way). |
1404 | | So we check if this line is such a directive, and treat |
1405 | | the character as default if so. This is a hack. */ |
1406 | | if ((symver_state != NULL) && (*symver_state == 0)) |
1407 | | goto de_fault; |
1408 | | #endif |
1409 | | |
1410 | | /* Care is needed not to damage occurrences of \<comment-char> |
1411 | | by stripping the <comment-char> onwards. Yuck. */ |
1412 | 764 | if ((to > tostart ? to[-1] : last_char) == '\\') |
1413 | | /* Do not treat the <comment-char> as a start-of-comment. */ |
1414 | 0 | goto de_fault; |
1415 | | |
1416 | | #ifdef WARN_COMMENTS |
1417 | | if (!found_comment) |
1418 | | found_comment_file = as_where (&found_comment); |
1419 | | #endif |
1420 | 764 | do |
1421 | 22.7k | { |
1422 | 22.7k | ch = GET (); |
1423 | 22.7k | } |
1424 | 22.7k | while (ch != EOF && !IS_NEWLINE (ch)); |
1425 | 764 | if (ch == EOF) |
1426 | 6 | as_warn (_("end of file in comment; newline inserted")); |
1427 | 764 | state = 0; |
1428 | 764 | PUT ('\n'); |
1429 | 764 | break; |
1430 | | |
1431 | | #ifdef H_TICK_HEX |
1432 | | case LEX_IS_H: |
1433 | | /* Look for strings like H'[0-9A-Fa-f] and if found, replace |
1434 | | the H' with 0x to make them gas-style hex characters. */ |
1435 | | if (enable_h_tick_hex) |
1436 | | { |
1437 | | char quot; |
1438 | | |
1439 | | quot = GET (); |
1440 | | if (quot == '\'') |
1441 | | { |
1442 | | UNGET ('x'); |
1443 | | ch = '0'; |
1444 | | } |
1445 | | else |
1446 | | UNGET (quot); |
1447 | | } |
1448 | | #endif |
1449 | | /* Fall through. */ |
1450 | | |
1451 | 176k | case LEX_IS_SYMBOL_COMPONENT: |
1452 | 176k | if (state == 10) |
1453 | 14.7k | { |
1454 | | /* This is a symbol character following another symbol |
1455 | | character, with whitespace in between. We skipped |
1456 | | the whitespace earlier, so output it now. */ |
1457 | 14.7k | UNGET (ch); |
1458 | 14.7k | state = 3; |
1459 | 14.7k | PUT (' '); |
1460 | 14.7k | break; |
1461 | 14.7k | } |
1462 | | |
1463 | | #ifdef TC_Z80 |
1464 | | /* "af'" is a symbol containing '\''. */ |
1465 | | if (state == 3 && (ch == 'a' || ch == 'A')) |
1466 | | { |
1467 | | state = 16; |
1468 | | PUT (ch); |
1469 | | ch = GET (); |
1470 | | if (ch == 'f' || ch == 'F') |
1471 | | { |
1472 | | state = 17; |
1473 | | PUT (ch); |
1474 | | break; |
1475 | | } |
1476 | | else |
1477 | | { |
1478 | | state = 9; |
1479 | | if (ch == EOF || !IS_SYMBOL_COMPONENT (ch)) |
1480 | | { |
1481 | | if (ch != EOF) |
1482 | | UNGET (ch); |
1483 | | break; |
1484 | | } |
1485 | | } |
1486 | | } |
1487 | | #endif |
1488 | 161k | if (state == 3) |
1489 | 56.8k | state = 9; |
1490 | | |
1491 | | /* This is a common case. Quickly copy CH and all the |
1492 | | following symbol component or normal characters. */ |
1493 | 161k | if (to + 1 < toend |
1494 | | #ifdef TC_M68K |
1495 | | && mri_state == NULL |
1496 | | #endif |
1497 | | #if defined TC_ARM && defined OBJ_ELF |
1498 | | && symver_state == NULL |
1499 | | #endif |
1500 | 161k | && end_state == NULL) |
1501 | 129k | { |
1502 | 129k | char *s; |
1503 | 129k | ptrdiff_t len; |
1504 | | |
1505 | 1.13M | for (s = from; s < fromend; s++) |
1506 | 1.13M | { |
1507 | 1.13M | int type; |
1508 | | |
1509 | 1.13M | ch2 = *(unsigned char *) s; |
1510 | 1.13M | type = lex[ch2]; |
1511 | 1.13M | if (type != 0 |
1512 | 1.04M | && type != LEX_IS_SYMBOL_COMPONENT) |
1513 | 129k | break; |
1514 | 1.13M | } |
1515 | | |
1516 | 129k | if (s > from) |
1517 | | /* Handle the last character normally, for |
1518 | | simplicity. */ |
1519 | 102k | --s; |
1520 | | |
1521 | 129k | len = s - from; |
1522 | | |
1523 | 129k | if (len > (toend - to) - 1) |
1524 | 0 | len = (toend - to) - 1; |
1525 | | |
1526 | 129k | if (len > 0) |
1527 | 92.5k | { |
1528 | 92.5k | PUT (ch); |
1529 | 92.5k | memcpy (to, from, len); |
1530 | 92.5k | to += len; |
1531 | 92.5k | from += len; |
1532 | 92.5k | if (to >= toend) |
1533 | 1 | goto tofull; |
1534 | 92.5k | ch = GET (); |
1535 | 92.5k | } |
1536 | 129k | } |
1537 | | |
1538 | | /* Fall through. */ |
1539 | 185k | default: |
1540 | 187k | de_fault: |
1541 | | /* Some relatively `normal' character. */ |
1542 | 187k | if (state == 0) |
1543 | 40.9k | { |
1544 | 40.9k | state = 11; /* Now seeing label definition. */ |
1545 | 40.9k | } |
1546 | 146k | else if (state == 1) |
1547 | 18.0k | { |
1548 | 18.0k | state = 2; /* Ditto. */ |
1549 | 18.0k | } |
1550 | 128k | else if (state == 9) |
1551 | 69.6k | { |
1552 | 69.6k | if (!IS_SYMBOL_COMPONENT (ch)) |
1553 | 9.48k | state = 3; |
1554 | 69.6k | } |
1555 | 59.0k | else if (state == 10) |
1556 | 189 | { |
1557 | 189 | if (ch == '\\') |
1558 | 1 | { |
1559 | | /* Special handling for backslash: a backslash may |
1560 | | be the beginning of a formal parameter (of a |
1561 | | macro) following another symbol character, with |
1562 | | whitespace in between. If that is the case, we |
1563 | | output a space before the parameter. Strictly |
1564 | | speaking, correct handling depends upon what the |
1565 | | macro parameter expands into; if the parameter |
1566 | | expands into something which does not start with |
1567 | | an operand character, then we don't want to keep |
1568 | | the space. We don't have enough information to |
1569 | | make the right choice, so here we are making the |
1570 | | choice which is more likely to be correct. */ |
1571 | 1 | if (to + 1 >= toend) |
1572 | 0 | { |
1573 | | /* If we're near the end of the buffer, save the |
1574 | | character for the next time round. Otherwise |
1575 | | we'll lose our state. */ |
1576 | 0 | UNGET (ch); |
1577 | 0 | goto tofull; |
1578 | 0 | } |
1579 | 1 | *to++ = ' '; |
1580 | 1 | } |
1581 | | |
1582 | 189 | state = 3; |
1583 | 189 | } |
1584 | 187k | PUT (ch); |
1585 | 187k | break; |
1586 | 370k | } |
1587 | 370k | } |
1588 | | |
1589 | | /*NOTREACHED*/ |
1590 | | |
1591 | 284 | fromeof: |
1592 | | /* We have reached the end of the input. */ |
1593 | 284 | if (to > tostart) |
1594 | 284 | last_char = to[-1]; |
1595 | 284 | return to - tostart; |
1596 | | |
1597 | 34 | tofull: |
1598 | | /* The output buffer is full. Save any input we have not yet |
1599 | | processed. */ |
1600 | 34 | if (fromend > from) |
1601 | 33 | { |
1602 | 33 | saved_input = from; |
1603 | 33 | saved_input_len = fromend - from; |
1604 | 33 | } |
1605 | 1 | else |
1606 | 1 | saved_input = NULL; |
1607 | | |
1608 | 34 | if (to > tostart) |
1609 | 34 | last_char = to[-1]; |
1610 | 34 | return to - tostart; |
1611 | 318 | } |
1612 | | |
1613 | | /* Return amount of pending input. */ |
1614 | | |
1615 | | size_t |
1616 | | do_scrub_pending (void) |
1617 | 209 | { |
1618 | 209 | size_t len = 0; |
1619 | 209 | if (saved_input) |
1620 | 30 | len += saved_input_len; |
1621 | 209 | if (state == -1) |
1622 | 0 | len += strlen (out_string); |
1623 | 209 | return len; |
1624 | 209 | } |