1"""
2 pygments.lexers.asm
3 ~~~~~~~~~~~~~~~~~~~
4
5 Lexers for assembly languages.
6
7 :copyright: Copyright 2006-2025 by the Pygments team, see AUTHORS.
8 :license: BSD, see LICENSE for details.
9"""
10
11import re
12
13from pygments.lexer import RegexLexer, include, bygroups, using, words, \
14 DelegatingLexer, default
15from pygments.lexers.c_cpp import CppLexer, CLexer
16from pygments.lexers.d import DLexer
17from pygments.token import Text, Name, Number, String, Comment, Punctuation, \
18 Other, Keyword, Operator, Whitespace
19
# Names exported by ``from pygments.lexers.asm import *``: the lexer classes
# defined in this module.
__all__ = ['GasLexer', 'ObjdumpLexer', 'DObjdumpLexer', 'CppObjdumpLexer',
           'CObjdumpLexer', 'HsailLexer', 'LlvmLexer', 'LlvmMirBodyLexer',
           'LlvmMirLexer', 'NasmLexer', 'NasmObjdumpLexer', 'TasmLexer',
           'Ca65Lexer', 'Dasm16Lexer']
24
25
class GasLexer(RegexLexer):
    """
    For Gas (AT&T) assembly code.

    A line is lexed as an optional label followed by either a directive
    (``.name args``) or an instruction (``mnemonic args``); the argument
    states are left again at the end of the line or at a line comment.
    """
    name = 'GAS'
    aliases = ['gas', 'asm']
    filenames = ['*.s', '*.S']
    mimetypes = ['text/x-gas']
    url = 'https://www.gnu.org/software/binutils'
    version_added = ''

    # Regex building blocks shared by the token rules below.
    string = r'"(\\"|[^"])*"'
    char = r'[\w$.@-]'
    identifier = r'(?:[a-zA-Z$_]' + char + r'*|\.' + char + '+)'
    number = r'(?:0[xX][a-fA-F0-9]+|#?-?\d+)'
    register = '%' + identifier + r'\b'

    tokens = {
        'root': [
            include('whitespace'),
            (identifier + ':', Name.Label),
            (r'\.' + identifier, Name.Attribute, 'directive-args'),
            # Instruction prefixes stay in 'root' so that the mnemonic that
            # follows is still lexed as an instruction.
            (r'lock|rep(n?z)?|data\d+', Name.Attribute),
            (identifier, Name.Function, 'instruction-args'),
            (r'[\r\n]+', Text)
        ],
        'directive-args': [
            (identifier, Name.Constant),
            (string, String),
            ('@' + identifier, Name.Attribute),
            (number, Number.Integer),
            (register, Name.Variable),
            (r'[\r\n]+', Whitespace, '#pop'),
            (r'([;#]|//).*?\n', Comment.Single, '#pop'),
            # A block comment closed on the same line keeps the state; one
            # that spans a newline also ends the argument list.
            (r'/[*].*?[*]/', Comment.Multiline),
            (r'/[*].*?\n[\w\W]*?[*]/', Comment.Multiline, '#pop'),

            include('punctuation'),
            include('whitespace')
        ],
        'instruction-args': [
            # For objdump-disassembled code, shouldn't occur in
            # actual assembler input
            ('([a-z0-9]+)( )(<)('+identifier+')(>)',
             bygroups(Number.Hex, Text, Punctuation, Name.Constant,
                      Punctuation)),
            ('([a-z0-9]+)( )(<)('+identifier+')([-+])('+number+')(>)',
             bygroups(Number.Hex, Text, Punctuation, Name.Constant,
                      Punctuation, Number.Integer, Punctuation)),

            # Numeric constants (immediate operands).  The ``$`` must be
            # escaped: unescaped it is the end-of-line anchor and the rule
            # can never match.  These rules also have to precede
            # ``identifier``, which itself accepts a leading ``$`` and would
            # otherwise swallow ``$42`` as a name.
            (r'\$' + number, Number.Integer),
            (r"\$'(.|\\')'", String.Char),
            # Address constants
            (identifier, Name.Constant),
            (number, Number.Integer),
            # Registers
            (register, Name.Variable),
            (r'[\r\n]+', Whitespace, '#pop'),
            (r'([;#]|//).*?\n', Comment.Single, '#pop'),
            (r'/[*].*?[*]/', Comment.Multiline),
            (r'/[*].*?\n[\w\W]*?[*]/', Comment.Multiline, '#pop'),

            include('punctuation'),
            include('whitespace')
        ],
        'whitespace': [
            (r'\n', Whitespace),
            (r'\s+', Whitespace),
            (r'([;#]|//).*?\n', Comment.Single),
            (r'/[*][\w\W]*?[*]/', Comment.Multiline)
        ],
        'punctuation': [
            (r'[-*,.()\[\]!:{}]+', Punctuation)
        ]
    }

    def analyse_text(text):
        """
        Score how likely *text* is GAS source.

        Returns ``True`` when a line starts with ``.text``, ``.data`` or
        ``.section``, ``0.1`` when a line starts with any other
        ``.directive``, and ``None`` otherwise.
        """
        if re.search(r'^\.(text|data|section)', text, re.M):
            return True
        elif re.search(r'^\.\w+', text, re.M):
            return 0.1
109
110
def _objdump_lexer_tokens(asm_lexer):
    """
    Build the token table shared by the objdump lexers.

    *asm_lexer* is the lexer class the disassembled-instruction part of a
    code line is delegated to; everything else (headers, addresses, raw
    bytes, relocations) is tokenised here.
    """
    digit = r'[0-9A-Za-z]'

    # Prefixes shared by several rules, assembled once.
    func_head = '(' + digit + '+)( )(<)(.*?)'
    insn_addr = '( *)(' + digit + r'+:)'
    raw_bytes = r'(\t)((?:' + digit + digit + ' )+)'
    reloc_head = r'(\t\t\t)(' + digit + r'+:)( )([^\t]+)(\t)(.*?)'

    root_rules = [
        # "<file>:     file format <format>"
        ('(.*?)(:)( +file format )(.*?)$',
         bygroups(Name.Label, Punctuation, Text, String)),
        # "Disassembly of section <name>:"
        ('(Disassembly of section )(.*?)(:)$',
         bygroups(Text, Name.Label, Punctuation)),
        # Function label carrying an offset
        (func_head + '([-+])(0[xX][A-Za-z0-9]+)(>:)$',
         bygroups(Number.Hex, Whitespace, Punctuation, Name.Function,
                  Punctuation, Number.Hex, Punctuation)),
        # Function label without an offset
        (func_head + '(>:)$',
         bygroups(Number.Hex, Whitespace, Punctuation, Name.Function,
                  Punctuation)),
        # Address, raw bytes and a disassembled instruction.  The tail is a
        # non-raw literal, so the pattern holds an actual tab character.
        (insn_addr + raw_bytes + '( *\t)([a-zA-Z].*?)$',
         bygroups(Whitespace, Name.Label, Whitespace, Number.Hex, Whitespace,
                  using(asm_lexer))),
        # Address and instruction only (objdump --no-show-raw-insn)
        (insn_addr + r'( *\t)([a-zA-Z].*?)$',
         bygroups(Whitespace, Name.Label, Whitespace,
                  using(asm_lexer))),
        # Address, raw bytes and trailing ascii
        (insn_addr + raw_bytes + '( *)(.*?)$',
         bygroups(Whitespace, Name.Label, Whitespace, Number.Hex, Whitespace, String)),
        # Continuation line: raw opcode bytes with no instruction text
        (insn_addr + raw_bytes + '$',
         bygroups(Whitespace, Name.Label, Whitespace, Number.Hex)),
        # "..." marker for skipped bytes
        (r'\t\.\.\.$', Text),
        # Relocation line with an offset
        (reloc_head + '([-+])(0x' + digit + '+)$',
         bygroups(Whitespace, Name.Label, Whitespace, Name.Property, Whitespace,
                  Name.Constant, Punctuation, Number.Hex)),
        # Relocation line without an offset
        (reloc_head + '$',
         bygroups(Whitespace, Name.Label, Whitespace, Name.Property, Whitespace,
                  Name.Constant)),
        # Any other line is handed on untouched as Other
        (r'[^\n]+\n', Other)
    ]
    return {'root': root_rules}
162
163
class ObjdumpLexer(RegexLexer):
    """
    For the output of ``objdump -dr``.

    Instruction text is delegated to `GasLexer`.
    """
    name = 'objdump'
    url = 'https://www.gnu.org/software/binutils'
    version_added = ''

    aliases = ['objdump']
    filenames = ['*.objdump']
    mimetypes = ['text/x-objdump']

    # Shared objdump rules, with instructions lexed as GAS.
    tokens = _objdump_lexer_tokens(GasLexer)
176
177
class DObjdumpLexer(DelegatingLexer):
    """
    For the output of ``objdump -Sr`` on compiled D files.

    Combines `DLexer` with `ObjdumpLexer` through `DelegatingLexer`.
    """
    name = 'd-objdump'
    url = 'https://www.gnu.org/software/binutils'
    version_added = ''

    aliases = ['d-objdump']
    filenames = ['*.d-objdump']
    mimetypes = ['text/x-d-objdump']

    def __init__(self, **options):
        source_lexer = DLexer
        dump_lexer = ObjdumpLexer
        super().__init__(source_lexer, dump_lexer, **options)
191
192
class CppObjdumpLexer(DelegatingLexer):
    """
    For the output of ``objdump -Sr`` on compiled C++ files.

    Combines `CppLexer` with `ObjdumpLexer` through `DelegatingLexer`.
    """
    name = 'cpp-objdump'
    # 'c++-objdumb' is a long-standing misspelling; it is kept so existing
    # users of that alias keep working, next to the correctly spelled
    # 'c++-objdump' that matches the '*.c++-objdump' filename pattern.
    # NOTE(review): any generated alias table has to be regenerated for the
    # new alias to be found by name lookups.
    aliases = ['cpp-objdump', 'c++-objdump', 'c++-objdumb', 'cxx-objdump']
    filenames = ['*.cpp-objdump', '*.c++-objdump', '*.cxx-objdump']
    mimetypes = ['text/x-cpp-objdump']
    url = 'https://www.gnu.org/software/binutils'
    version_added = ''

    def __init__(self, **options):
        super().__init__(CppLexer, ObjdumpLexer, **options)
206
207
class CObjdumpLexer(DelegatingLexer):
    """
    For the output of ``objdump -Sr`` on compiled C files.

    Combines `CLexer` with `ObjdumpLexer` through `DelegatingLexer`.
    """
    name = 'c-objdump'
    url = 'https://www.gnu.org/software/binutils'
    version_added = ''

    aliases = ['c-objdump']
    filenames = ['*.c-objdump']
    mimetypes = ['text/x-c-objdump']

    def __init__(self, **options):
        source_lexer = CLexer
        dump_lexer = ObjdumpLexer
        super().__init__(source_lexer, dump_lexer, **options)
222
223
class HsailLexer(RegexLexer):
    """
    For HSAIL assembly code.

    Instruction names and their ``_suffix`` modifiers are emitted as separate
    keyword tokens; ``@name`` is a label, ``&name`` a global variable and
    ``%name`` a local one.
    """
    name = 'HSAIL'
    aliases = ['hsail', 'hsa']
    filenames = ['*.hsail']
    mimetypes = ['text/x-hsail']
    url = 'https://en.wikipedia.org/wiki/Heterogeneous_System_Architecture#HSA_Intermediate_Layer'
    version_added = '2.2'

    string = r'"[^"]*?"'
    identifier = r'[a-zA-Z_][\w.]*'
    # Registers
    register_number = r'[0-9]+'
    register = r'(\$(c|s|d|q)' + register_number + r')\b'
    # Qualifiers
    alignQual = r'(align\(\d+\))'
    widthQual = r'(width\((\d+|all)\))'
    allocQual = r'(alloc\(agent\))'
    # Instruction Modifiers
    roundingMod = (r'((_ftz)?(_up|_down|_zero|_near))')
    datatypeMod = (r'_('
                   # packedTypes
                   r'u8x4|s8x4|u16x2|s16x2|u8x8|s8x8|u16x4|s16x4|u32x2|s32x2|'
                   r'u8x16|s8x16|u16x8|s16x8|u32x4|s32x4|u64x2|s64x2|'
                   r'f16x2|f16x4|f16x8|f32x2|f32x4|f64x2|'
                   # baseTypes
                   r'u8|s8|u16|s16|u32|s32|u64|s64|'
                   r'b128|b8|b16|b32|b64|b1|'
                   r'f16|f32|f64|'
                   # opaqueType
                   r'roimg|woimg|rwimg|samp|sig32|sig64)')

    # Numeric Constant
    float = r'((\d+\.)|(\d*\.\d+))[eE][+-]?\d+'
    hexfloat = r'0[xX](([0-9a-fA-F]+\.[0-9a-fA-F]*)|([0-9a-fA-F]*\.[0-9a-fA-F]+))[pP][+-]?\d+'
    ieeefloat = r'0((h|H)[0-9a-fA-F]{4}|(f|F)[0-9a-fA-F]{8}|(d|D)[0-9a-fA-F]{16})'

    tokens = {
        'root': [
            include('whitespace'),
            include('comments'),

            (string, String),

            (r'@' + identifier + ':?', Name.Label),

            (register, Name.Variable.Anonymous),

            include('keyword'),

            (r'&' + identifier, Name.Variable.Global),
            (r'%' + identifier, Name.Variable),

            # Most specific numeric forms first, so that a plain integer
            # rule does not split a longer literal.
            (hexfloat, Number.Hex),
            (r'0[xX][a-fA-F0-9]+', Number.Hex),
            (ieeefloat, Number.Float),
            (float, Number.Float),
            (r'\d+', Number.Integer),

            (r'[=<>{}\[\]()*.,:;!]|x\b', Punctuation)
        ],
        'whitespace': [
            (r'(\n|\s)+', Whitespace),
        ],
        'comments': [
            # ``[\w\W]`` rather than ``.``: this lexer does not set DOTALL,
            # so ``.`` would stop at a newline and a block comment spanning
            # several lines would not be recognised.
            (r'/\*[\w\W]*?\*/', Comment.Multiline),
            (r'//.*?\n', Comment.Single),
        ],
        'keyword': [
            # Types
            (r'kernarg' + datatypeMod, Keyword.Type),

            # Regular keywords
            (r'\$(full|base|small|large|default|zero|near)', Keyword),
            (words((
                'module', 'extension', 'pragma', 'prog', 'indirect', 'signature',
                'decl', 'kernel', 'function', 'enablebreakexceptions',
                'enabledetectexceptions', 'maxdynamicgroupsize', 'maxflatgridsize',
                'maxflatworkgroupsize', 'requireddim', 'requiredgridsize',
                'requiredworkgroupsize', 'requirenopartialworkgroups'),
                suffix=r'\b'), Keyword),

            # instructions
            (roundingMod, Keyword),
            (datatypeMod, Keyword),
            (r'_(' + alignQual + '|' + widthQual + ')', Keyword),
            (r'_kernarg', Keyword),
            (r'(nop|imagefence)\b', Keyword),
            (words((
                'cleardetectexcept', 'clock', 'cuid', 'debugtrap', 'dim',
                'getdetectexcept', 'groupbaseptr', 'kernargbaseptr', 'laneid',
                'maxcuid', 'maxwaveid', 'packetid', 'setdetectexcept', 'waveid',
                'workitemflatabsid', 'workitemflatid', 'nullptr', 'abs', 'bitrev',
                'currentworkgroupsize', 'currentworkitemflatid', 'fract', 'ncos',
                'neg', 'nexp2', 'nlog2', 'nrcp', 'nrsqrt', 'nsin', 'nsqrt',
                'gridgroups', 'gridsize', 'not', 'sqrt', 'workgroupid',
                'workgroupsize', 'workitemabsid', 'workitemid', 'ceil', 'floor',
                'rint', 'trunc', 'add', 'bitmask', 'borrow', 'carry', 'copysign',
                'div', 'rem', 'sub', 'shl', 'shr', 'and', 'or', 'xor', 'unpackhi',
                'unpacklo', 'max', 'min', 'fma', 'mad', 'bitextract', 'bitselect',
                'shuffle', 'cmov', 'bitalign', 'bytealign', 'lerp', 'nfma', 'mul',
                'mulhi', 'mul24hi', 'mul24', 'mad24', 'mad24hi', 'bitinsert',
                'combine', 'expand', 'lda', 'mov', 'pack', 'unpack', 'packcvt',
                'unpackcvt', 'sad', 'sementp', 'ftos', 'stof', 'cmp', 'ld', 'st',
                '_eq', '_ne', '_lt', '_le', '_gt', '_ge', '_equ', '_neu', '_ltu',
                '_leu', '_gtu', '_geu', '_num', '_nan', '_seq', '_sne', '_slt',
                '_sle', '_sgt', '_sge', '_snum', '_snan', '_sequ', '_sneu', '_sltu',
                '_sleu', '_sgtu', '_sgeu', 'atomic', '_ld', '_st', '_cas', '_add',
                '_and', '_exch', '_max', '_min', '_or', '_sub', '_wrapdec',
                '_wrapinc', '_xor', 'ret', 'cvt', '_readonly', '_kernarg', '_global',
                'br', 'cbr', 'sbr', '_scacq', '_screl', '_scar', '_rlx', '_wave',
                '_wg', '_agent', '_system', 'ldimage', 'stimage', '_v2', '_v3', '_v4',
                '_1d', '_2d', '_3d', '_1da', '_2da', '_1db', '_2ddepth', '_2dadepth',
                '_width', '_height', '_depth', '_array', '_channelorder',
                '_channeltype', 'querysampler', '_coord', '_filter', '_addressing',
                'barrier', 'wavebarrier', 'initfbar', 'joinfbar', 'waitfbar',
                'arrivefbar', 'leavefbar', 'releasefbar', 'ldf', 'activelaneid',
                'activelanecount', 'activelanemask', 'activelanepermute', 'call',
                'scall', 'icall', 'alloca', 'packetcompletionsig',
                'addqueuewriteindex', 'casqueuewriteindex', 'ldqueuereadindex',
                'stqueuereadindex', 'readonly', 'global', 'private', 'group',
                'spill', 'arg', '_upi', '_downi', '_zeroi', '_neari', '_upi_sat',
                '_downi_sat', '_zeroi_sat', '_neari_sat', '_supi', '_sdowni',
                '_szeroi', '_sneari', '_supi_sat', '_sdowni_sat', '_szeroi_sat',
                '_sneari_sat', '_pp', '_ps', '_sp', '_ss', '_s', '_p', '_pp_sat',
                '_ps_sat', '_sp_sat', '_ss_sat', '_s_sat', '_p_sat')), Keyword),

            # Integer types
            (r'i[1-9]\d*', Keyword)
        ]
    }
357
358
class LlvmLexer(RegexLexer):
    """
    For LLVM assembly code.

    Block labels are tried before keywords, and keywords before the
    ``%``/``@``/``#``/``!`` prefixed names, string and number literals.
    """
    name = 'LLVM'
    url = 'https://llvm.org/docs/LangRef.html'
    aliases = ['llvm']
    filenames = ['*.ll']
    mimetypes = ['text/x-llvm']
    version_added = ''

    # Regex building blocks: a double-quoted string, an identifier (bare or
    # quoted) and a block label (an identifier or a plain number).
    string = r'"[^"]*?"'
    identifier = r'([-a-zA-Z$._][\w\-$.]*|' + string + ')'
    block_label = r'(' + identifier + r'|(\d+))'

    tokens = {
        'root': [
            include('whitespace'),

            # Before keywords, because keywords are valid label names :(...
            (block_label + r'\s*:', Name.Label),

            include('keyword'),

            # Named values come before the purely numeric (anonymous) forms.
            (r'%' + identifier, Name.Variable),
            (r'@' + identifier, Name.Variable.Global),
            (r'%\d+', Name.Variable.Anonymous),
            (r'@\d+', Name.Variable.Global),
            (r'#\d+', Name.Variable.Global),
            (r'!' + identifier, Name.Variable),
            (r'!\d+', Name.Variable.Anonymous),
            (r'c?' + string, String),

            (r'0[xX][KLMHR]?[a-fA-F0-9]+', Number),
            (r'-?\d+(?:[.]\d+)?(?:[eE][-+]?\d+(?:[.]\d+)?)?', Number),

            (r'[=<>{}\[\]()*.,!]|x\b', Punctuation)
        ],
        'whitespace': [
            (r'(\n|\s+)+', Whitespace),
            # ``;`` starts a comment that runs through the end of the line.
            (r';.*?\n', Comment)
        ],
        'keyword': [
            # Regular keywords
            (words((
                'aarch64_sve_vector_pcs', 'aarch64_vector_pcs', 'acq_rel',
                'acquire', 'add', 'addrspace', 'addrspacecast', 'afn', 'alias',
                'aliasee', 'align', 'alignLog2', 'alignstack', 'alloca',
                'allocsize', 'allOnes', 'alwaysinline', 'alwaysInline',
                'amdgpu_cs', 'amdgpu_es', 'amdgpu_gfx', 'amdgpu_gs',
                'amdgpu_hs', 'amdgpu_kernel', 'amdgpu_ls', 'amdgpu_ps',
                'amdgpu_vs', 'and', 'any', 'anyregcc', 'appending', 'arcp',
                'argmemonly', 'args', 'arm_aapcs_vfpcc', 'arm_aapcscc',
                'arm_apcscc', 'ashr', 'asm', 'atomic', 'atomicrmw',
                'attributes', 'available_externally', 'avr_intrcc',
                'avr_signalcc', 'bit', 'bitcast', 'bitMask', 'blockaddress',
                'blockcount', 'br', 'branchFunnel', 'builtin', 'byArg',
                'byref', 'byte', 'byteArray', 'byval', 'c', 'call', 'callbr',
                'callee', 'caller', 'calls', 'canAutoHide', 'catch',
                'catchpad', 'catchret', 'catchswitch', 'cc', 'ccc',
                'cfguard_checkcc', 'cleanup', 'cleanuppad', 'cleanupret',
                'cmpxchg', 'cold', 'coldcc', 'comdat', 'common', 'constant',
                'contract', 'convergent', 'critical', 'cxx_fast_tlscc',
                'datalayout', 'declare', 'default', 'define', 'deplibs',
                'dereferenceable', 'dereferenceable_or_null', 'distinct',
                'dllexport', 'dllimport', 'dso_local', 'dso_local_equivalent',
                'dso_preemptable', 'dsoLocal', 'eq', 'exact', 'exactmatch',
                'extern_weak', 'external', 'externally_initialized',
                'extractelement', 'extractvalue', 'fadd', 'false', 'fast',
                'fastcc', 'fcmp', 'fdiv', 'fence', 'filter', 'flags', 'fmul',
                'fneg', 'fpext', 'fptosi', 'fptoui', 'fptrunc', 'freeze',
                'frem', 'from', 'fsub', 'funcFlags', 'function', 'gc',
                'getelementptr', 'ghccc', 'global', 'guid', 'gv', 'hash',
                'hhvm_ccc', 'hhvmcc', 'hidden', 'hot', 'hotness', 'icmp',
                'ifunc', 'inaccessiblemem_or_argmemonly',
                'inaccessiblememonly', 'inalloca', 'inbounds', 'indir',
                'indirectbr', 'info', 'initialexec', 'inline', 'inlineBits',
                'inlinehint', 'inrange', 'inreg', 'insertelement',
                'insertvalue', 'insts', 'intel_ocl_bicc', 'inteldialect',
                'internal', 'inttoptr', 'invoke', 'jumptable', 'kind',
                'landingpad', 'largest', 'linkage', 'linkonce', 'linkonce_odr',
                'live', 'load', 'local_unnamed_addr', 'localdynamic',
                'localexec', 'lshr', 'max', 'metadata', 'min', 'minsize',
                'module', 'monotonic', 'msp430_intrcc', 'mul', 'mustprogress',
                'musttail', 'naked', 'name', 'nand', 'ne', 'nest', 'ninf',
                'nnan', 'noalias', 'nobuiltin', 'nocallback', 'nocapture',
                'nocf_check', 'noduplicate', 'noduplicates', 'nofree',
                'noimplicitfloat', 'noinline', 'noInline', 'nomerge', 'none',
                'nonlazybind', 'nonnull', 'noprofile', 'norecurse',
                'noRecurse', 'noredzone', 'noreturn', 'nosync', 'notail',
                'notEligibleToImport', 'noundef', 'nounwind', 'nsw',
                'nsz', 'null', 'null_pointer_is_valid', 'nuw', 'oeq', 'offset',
                'oge', 'ogt', 'ole', 'olt', 'one', 'opaque', 'optforfuzzing',
                'optnone', 'optsize', 'or', 'ord', 'param', 'params',
                'partition', 'path', 'personality', 'phi', 'poison',
                'preallocated', 'prefix', 'preserve_allcc', 'preserve_mostcc',
                'private', 'prologue', 'protected', 'ptrtoint', 'ptx_device',
                'ptx_kernel', 'readnone', 'readNone', 'readonly', 'readOnly',
                'reassoc', 'refs', 'relbf', 'release', 'resByArg', 'resume',
                'ret', 'returnDoesNotAlias', 'returned', 'returns_twice',
                'safestack', 'samesize', 'sanitize_address',
                'sanitize_hwaddress', 'sanitize_memory', 'sanitize_memtag',
                'sanitize_thread', 'sdiv', 'section', 'select', 'seq_cst',
                'sext', 'sge', 'sgt', 'shadowcallstack', 'shl',
                'shufflevector', 'sideeffect', 'signext', 'single',
                'singleImpl', 'singleImplName', 'sitofp', 'sizeM1',
                'sizeM1BitWidth', 'sle', 'slt', 'source_filename',
                'speculatable', 'speculative_load_hardening', 'spir_func',
                'spir_kernel', 'splat', 'srem', 'sret', 'ssp', 'sspreq',
                'sspstrong', 'store', 'strictfp', 'sub', 'summaries',
                'summary', 'swiftcc', 'swifterror', 'swiftself', 'switch',
                'syncscope', 'tail', 'tailcc', 'target', 'thread_local', 'to',
                'token', 'triple', 'true', 'trunc', 'type',
                'typeCheckedLoadConstVCalls', 'typeCheckedLoadVCalls',
                'typeid', 'typeidCompatibleVTable', 'typeIdInfo',
                'typeTestAssumeConstVCalls', 'typeTestAssumeVCalls',
                'typeTestRes', 'typeTests', 'udiv', 'ueq', 'uge', 'ugt',
                'uitofp', 'ule', 'ult', 'umax', 'umin', 'undef', 'une',
                'uniformRetVal', 'uniqueRetVal', 'unknown', 'unnamed_addr',
                'uno', 'unordered', 'unreachable', 'unsat', 'unwind', 'urem',
                'uselistorder', 'uselistorder_bb', 'uwtable', 'va_arg',
                'varFlags', 'variable', 'vcall_visibility', 'vFuncId',
                'virtFunc', 'virtualConstProp', 'void', 'volatile', 'vscale',
                'vTableFuncs', 'weak', 'weak_odr', 'webkit_jscc', 'win64cc',
                'within', 'wpdRes', 'wpdResolutions', 'writeonly', 'x',
                'x86_64_sysvcc', 'x86_fastcallcc', 'x86_intrcc', 'x86_mmx',
                'x86_regcallcc', 'x86_stdcallcc', 'x86_thiscallcc',
                'x86_vectorcallcc', 'xchg', 'xor', 'zeroext',
                'zeroinitializer', 'zext', 'immarg', 'willreturn'),
                suffix=r'\b'), Keyword),

            # Types
            # NOTE(review): 'void', 'metadata', 'token' and 'x86_mmx' are
            # also in the keyword list above, which is tried first, so they
            # come out as Keyword rather than Keyword.Type.  This rule also
            # has no ``\b`` suffix, so it matches a type name that is merely
            # a prefix of a longer word.
            (words(('void', 'half', 'bfloat', 'float', 'double', 'fp128',
                    'x86_fp80', 'ppc_fp128', 'label', 'metadata', 'x86_mmx',
                    'x86_amx', 'token', 'ptr')),
             Keyword.Type),

            # Integer types
            (r'i[1-9]\d*', Keyword.Type)
        ]
    }
501
502
class LlvmMirBodyLexer(RegexLexer):
    """
    For LLVM MIR examples without the YAML wrapper.

    Has no filename patterns or MIME types of its own; `LlvmMirLexer`
    delegates the ``body: |`` block of a MIR document to it.
    """
    name = 'LLVM-MIR Body'
    url = 'https://llvm.org/docs/MIRLangRef.html'
    aliases = ['llvm-mir-body']
    filenames = []
    mimetypes = []
    version_added = '2.6'

    tokens = {
        'root': [
            # Attributes on basic blocks
            (words(('liveins', 'successors'), suffix=':'), Keyword),
            # Basic Block Labels
            (r'bb\.[0-9]+(\.[a-zA-Z0-9_.-]+)?( \(address-taken\))?:', Name.Label),
            (r'bb\.[0-9]+ \(%[a-zA-Z0-9_.-]+\)( \(address-taken\))?:', Name.Label),
            (r'%bb\.[0-9]+(\.\w+)?', Name.Label),
            # Stack references
            (r'%stack\.[0-9]+(\.\w+\.addr)?', Name),
            # Subreg indices
            (r'%subreg\.\w+', Name),
            # Virtual registers
            # (placed after the more specific %bb/%stack/%subreg rules; the
            # 'vreg' state then looks for an optional bank/class and type)
            (r'%[a-zA-Z0-9_]+ *', Name.Variable, 'vreg'),
            # Reference to LLVM-IR global
            include('global'),
            # Reference to Intrinsic
            (r'intrinsic\(\@[a-zA-Z0-9_.]+\)', Name.Variable.Global),
            # Comparison predicates
            (words(('eq', 'ne', 'sgt', 'sge', 'slt', 'sle', 'ugt', 'uge', 'ult',
                    'ule'), prefix=r'intpred\(', suffix=r'\)'), Name.Builtin),
            (words(('oeq', 'one', 'ogt', 'oge', 'olt', 'ole', 'ugt', 'uge',
                    'ult', 'ule'), prefix=r'floatpred\(', suffix=r'\)'),
             Name.Builtin),
            # Physical registers
            (r'\$\w+', String.Single),
            # Assignment operator
            (r'=', Operator),
            # gMIR Opcodes
            (r'(G_ANYEXT|G_[SZ]EXT|G_SEXT_INREG|G_TRUNC|G_IMPLICIT_DEF|G_PHI|'
             r'G_FRAME_INDEX|G_GLOBAL_VALUE|G_INTTOPTR|G_PTRTOINT|G_BITCAST|'
             r'G_CONSTANT|G_FCONSTANT|G_VASTART|G_VAARG|G_CTLZ|G_CTLZ_ZERO_UNDEF|'
             r'G_CTTZ|G_CTTZ_ZERO_UNDEF|G_CTPOP|G_BSWAP|G_BITREVERSE|'
             r'G_ADDRSPACE_CAST|G_BLOCK_ADDR|G_JUMP_TABLE|G_DYN_STACKALLOC|'
             r'G_ADD|G_SUB|G_MUL|G_[SU]DIV|G_[SU]REM|G_AND|G_OR|G_XOR|G_SHL|'
             r'G_[LA]SHR|G_[IF]CMP|G_SELECT|G_GEP|G_PTR_MASK|G_SMIN|G_SMAX|'
             r'G_UMIN|G_UMAX|G_[US]ADDO|G_[US]ADDE|G_[US]SUBO|G_[US]SUBE|'
             r'G_[US]MULO|G_[US]MULH|G_FNEG|G_FPEXT|G_FPTRUNC|G_FPTO[US]I|'
             r'G_[US]ITOFP|G_FABS|G_FCOPYSIGN|G_FCANONICALIZE|G_FMINNUM|'
             r'G_FMAXNUM|G_FMINNUM_IEEE|G_FMAXNUM_IEEE|G_FMINIMUM|G_FMAXIMUM|'
             r'G_FADD|G_FSUB|G_FMUL|G_FMA|G_FMAD|G_FDIV|G_FREM|G_FPOW|G_FEXP|'
             r'G_FEXP2|G_FLOG|G_FLOG2|G_FLOG10|G_FCEIL|G_FCOS|G_FSIN|G_FSQRT|'
             r'G_FFLOOR|G_FRINT|G_FNEARBYINT|G_INTRINSIC_TRUNC|'
             r'G_INTRINSIC_ROUND|G_LOAD|G_[ZS]EXTLOAD|G_INDEXED_LOAD|'
             r'G_INDEXED_[ZS]EXTLOAD|G_STORE|G_INDEXED_STORE|'
             r'G_ATOMIC_CMPXCHG_WITH_SUCCESS|G_ATOMIC_CMPXCHG|'
             r'G_ATOMICRMW_(XCHG|ADD|SUB|AND|NAND|OR|XOR|MAX|MIN|UMAX|UMIN|FADD|'
             r'FSUB)'
             r'|G_FENCE|G_EXTRACT|G_UNMERGE_VALUES|G_INSERT|G_MERGE_VALUES|'
             r'G_BUILD_VECTOR|G_BUILD_VECTOR_TRUNC|G_CONCAT_VECTORS|'
             r'G_INTRINSIC|G_INTRINSIC_W_SIDE_EFFECTS|G_BR|G_BRCOND|'
             r'G_BRINDIRECT|G_BRJT|G_INSERT_VECTOR_ELT|G_EXTRACT_VECTOR_ELT|'
             r'G_SHUFFLE_VECTOR)\b',
             Name.Builtin),
            # Target independent opcodes
            (r'(COPY|PHI|INSERT_SUBREG|EXTRACT_SUBREG|REG_SEQUENCE)\b',
             Name.Builtin),
            # Flags
            (words(('killed', 'implicit')), Keyword),
            # ConstantInt values
            (r'(i[0-9]+)( +)', bygroups(Keyword.Type, Whitespace), 'constantint'),
            # ConstantFloat values
            (r'(half|float|double) +', Keyword.Type, 'constantfloat'),
            # Bare immediates
            include('integer'),
            # MMO's
            (r'(::)( *)', bygroups(Operator, Whitespace), 'mmo'),
            # MIR Comments
            (r';.*', Comment),
            # If we get here, assume it's a target instruction
            (r'[a-zA-Z0-9_]+', Name),
            # Everything else that isn't highlighted
            (r'[(), \n]+', Text),
        ],
        # The integer constant from a ConstantInt value
        'constantint': [
            include('integer'),
            # Zero-width match: leave the state as soon as anything other
            # than an integer follows on the same line.
            (r'(?=.)', Text, '#pop'),
        ],
        # The floating point constant from a ConstantFloat value
        'constantfloat': [
            include('float'),
            (r'(?=.)', Text, '#pop'),
        ],
        # What may follow a virtual register: ``:bank_or_class`` and/or a
        # parenthesised type
        'vreg': [
            # The bank or class if there is one
            # (a single ':' only -- '::' introduces a memory operand)
            (r'( *)(:(?!:))', bygroups(Whitespace, Keyword), ('#pop', 'vreg_bank_or_class')),
            # The LLT if there is one
            (r'( *)(\()', bygroups(Whitespace, Text), 'vreg_type'),
            (r'(?=.)', Text, '#pop'),
        ],
        'vreg_bank_or_class': [
            # The unassigned bank/class
            (r'( *)(_)', bygroups(Whitespace, Name.Variable.Magic)),
            (r'( *)([a-zA-Z0-9_]+)', bygroups(Whitespace, Name.Variable)),
            # The LLT if there is one
            (r'( *)(\()', bygroups(Whitespace, Text), 'vreg_type'),
            (r'(?=.)', Text, '#pop'),
        ],
        'vreg_type': [
            # Scalar and pointer types
            (r'( *)([sp][0-9]+)', bygroups(Whitespace, Keyword.Type)),
            # Vector types, e.g. <4 x s32>
            (r'( *)(<[0-9]+ *x *[sp][0-9]+>)', bygroups(Whitespace, Keyword.Type)),
            (r'\)', Text, '#pop'),
            (r'(?=.)', Text, '#pop'),
        ],
        # Memory operand list, entered after '::' and left at the final ')'
        'mmo': [
            (r'\(', Text),
            (r' +', Whitespace),
            (words(('load', 'store', 'on', 'into', 'from', 'align', 'monotonic',
                    'acquire', 'release', 'acq_rel', 'seq_cst')),
             Keyword),
            # IR references
            (r'%ir\.[a-zA-Z0-9_.-]+', Name),
            (r'%ir-block\.[a-zA-Z0-9_.-]+', Name),
            (r'[-+]', Operator),
            include('integer'),
            include('global'),
            (r',', Punctuation),
            # '), (' separates two operands without leaving the state
            (r'\), \(', Text),
            (r'\)', Text, '#pop'),
        ],
        # Small shared states, pulled in through include()
        'integer': [(r'-?[0-9]+', Number.Integer),],
        'float': [(r'-?[0-9]+\.[0-9]+(e[+-][0-9]+)?', Number.Float)],
        'global': [(r'\@[a-zA-Z0-9_.]+', Name.Variable.Global)],
    }
640
641
class LlvmMirLexer(RegexLexer):
    """
    Lexer for the overall LLVM MIR document format.

    MIR is a human readable serialization format that's used to represent LLVM's
    machine specific intermediate representation. It allows LLVM's developers to
    see the state of the compilation process at various points, as well as test
    individual pieces of the compiler.

    Documents introduced by ``--- |`` are delegated to `LlvmLexer`; in other
    documents the ``body: |`` block is delegated to `LlvmMirBodyLexer`.
    """
    name = 'LLVM-MIR'
    url = 'https://llvm.org/docs/MIRLangRef.html'
    aliases = ['llvm-mir']
    filenames = ['*.mir']
    version_added = '2.6'

    tokens = {
        'root': [
            # Comments are hashes at the YAML level
            (r'#.*', Comment),
            # Documents starting with | are LLVM-IR
            (r'--- \|$', Keyword, 'llvm_ir'),
            # Other documents are MIR
            (r'---', Keyword, 'llvm_mir'),
            # Consume everything else in one token for efficiency
            (r'[^-#]+|.', Text),
        ],
        'llvm_ir': [
            # Documents end with '...' or '---'
            # ('---' is only looked at, not consumed, so 'root' can use it
            # to open the next document)
            (r'(\.\.\.|(?=---))', Keyword, '#pop'),
            # Delegate to the LlvmLexer
            (r'((?:.|\n)+?)(?=(\.\.\.|---))', bygroups(using(LlvmLexer))),
        ],
        'llvm_mir': [
            # Comments are hashes at the YAML level
            (r'#.*', Comment),
            # Documents end with '...' or '---'
            (r'(\.\.\.|(?=---))', Keyword, '#pop'),
            # Handle the simple attributes
            (r'name:', Keyword, 'name'),
            (words(('alignment', ),
                   suffix=':'), Keyword, 'number'),
            (words(('legalized', 'regBankSelected', 'tracksRegLiveness',
                    'selected', 'exposesReturnsTwice'),
                   suffix=':'), Keyword, 'boolean'),
            # Handle the attributes don't highlight inside
            (words(('registers', 'stack', 'fixedStack', 'liveins', 'frameInfo',
                    'machineFunctionInfo'),
                   suffix=':'), Keyword),
            # Delegate the body block to the LlvmMirBodyLexer
            (r'body: *\|', Keyword, 'llvm_mir_body'),
            # Consume everything else
            (r'.+', Text),
            (r'\n', Whitespace),
        ],
        # Value states for the simple attributes: take the value if there is
        # one, otherwise fall straight back to 'llvm_mir'.
        'name': [
            (r'[^\n]+', Name),
            default('#pop'),
        ],
        'boolean': [
            (r' *(true|false)', Name.Builtin),
            default('#pop'),
        ],
        'number': [
            (r' *[0-9]+', Number),
            default('#pop'),
        ],
        'llvm_mir_body': [
            # Documents end with '...' or '---'.
            # We have to pop llvm_mir_body and llvm_mir
            (r'(\.\.\.|(?=---))', Keyword, '#pop:2'),
            # Delegate the body block to the LlvmMirBodyLexer
            (r'((?:.|\n)+?)(?=\.\.\.|---)', bygroups(using(LlvmMirBodyLexer))),
            # The '...' is optional. If we didn't already find it then it isn't
            # there. There might be a '---' instead though.
            (r'(?!\.\.\.|---)((?:.|\n)+)', bygroups(using(LlvmMirBodyLexer))),
        ],
    }
719
720
class NasmLexer(RegexLexer):
    """
    For Nasm (Intel) assembly code.

    All patterns are matched case-insensitively (see ``flags``).
    """
    name = 'NASM'
    aliases = ['nasm']
    filenames = ['*.asm', '*.ASM', '*.nasm']
    mimetypes = ['text/x-nasm']
    url = 'https://nasm.us'
    version_added = ''

    # Tasm uses the same file endings, but TASM is not as common as NASM, so
    # we prioritize NASM higher by default
    priority = 1.0

    identifier = r'[a-z$._?][\w$.?#@~]*'
    # Hex literals: ``0xc8``, ``$0c8`` and ``0c8h``.  The ``$`` is escaped so
    # that it is a literal dollar sign; unescaped it is the end-of-line
    # anchor and the ``$0...`` form could never match.
    hexn = r'(?:0x[0-9a-f]+|\$0[0-9a-f]*|[0-9]+[0-9a-f]*h)'
    octn = r'[0-7]+q'
    binn = r'[01]+b'
    decn = r'[0-9]+'
    floatn = decn + r'\.e?' + decn
    string = r'"(\\"|[^"\n])*"|' + r"'(\\'|[^'\n])*'|" + r"`(\\`|[^`\n])*`"
    declkw = r'(?:res|d)[bwdqt]|times'
    register = (r'(r[0-9][0-5]?[bwd]?|'
                r'[a-d][lh]|[er]?[a-d]x|[er]?[sb]p|[er]?[sd]i|[c-gs]s|st[0-7]|'
                r'mm[0-7]|cr[0-4]|dr[0-367]|tr[3-7]|k[0-7]|'
                r'[xyz]mm(?:[12][0-9]?|3[01]?|[04-9]))\b')
    wordop = r'seg|wrt|strict|rel|abs'
    type = r'byte|[dq]?word'
    # Directives must be followed by whitespace, otherwise CPU will match
    # cpuid for instance.
    directives = (r'(?:BITS|USE16|USE32|SECTION|SEGMENT|ABSOLUTE|EXTERN|GLOBAL|'
                  r'ORG|ALIGN|STRUC|ENDSTRUC|COMMON|CPU|GROUP|UPPERCASE|IMPORT|'
                  r'EXPORT|LIBRARY|MODULE)(?=\s)')

    flags = re.IGNORECASE | re.MULTILINE
    tokens = {
        'root': [
            # A line whose first non-blank character is '%' is a
            # preprocessor line
            (r'^\s*%', Comment.Preproc, 'preproc'),
            include('whitespace'),
            (identifier + ':', Name.Label),
            (rf'({identifier})(\s+)(equ)',
                bygroups(Name.Constant, Whitespace, Keyword.Declaration),
                'instruction-args'),
            (directives, Keyword, 'instruction-args'),
            (declkw, Keyword.Declaration, 'instruction-args'),
            (identifier, Name.Function, 'instruction-args'),
            (r'[\r\n]+', Whitespace)
        ],
        'instruction-args': [
            (string, String),
            # Radix-marked and float literals before the plain decimal rule,
            # which would otherwise take their leading digits.
            (hexn, Number.Hex),
            (octn, Number.Oct),
            (binn, Number.Bin),
            (floatn, Number.Float),
            (decn, Number.Integer),
            include('punctuation'),
            (register, Name.Builtin),
            (identifier, Name.Variable),
            (r'[\r\n]+', Whitespace, '#pop'),
            include('whitespace')
        ],
        'preproc': [
            (r'[^;\n]+', Comment.Preproc),
            (r';.*?\n', Comment.Single, '#pop'),
            (r'\n', Comment.Preproc, '#pop'),
        ],
        'whitespace': [
            (r'\n', Whitespace),
            (r'[ \t]+', Whitespace),
            (r';.*', Comment.Single),
            (r'#.*', Comment.Single)
        ],
        'punctuation': [
            (r'[,{}():\[\]]+', Punctuation),
            (r'[&|^<>+*/%~-]+', Operator),
            (r'[$]+', Keyword.Constant),
            (wordop, Operator.Word),
            (type, Keyword.Type)
        ],
    }

    def analyse_text(text):
        """
        Return ``False`` for text that starts with ``PROC`` (taken to be
        TASM), ``None`` otherwise.
        """
        # Probably TASM
        # NOTE(review): re.match only looks at the very start of the text;
        # confirm whether a search over the whole text was intended.
        if re.match(r'PROC', text, re.IGNORECASE):
            return False
807
808
class NasmObjdumpLexer(ObjdumpLexer):
    """
    For the output of ``objdump -d -M intel``.

    Instruction text is delegated to `NasmLexer`.
    """
    name = 'objdump-nasm'
    url = 'https://www.gnu.org/software/binutils'
    version_added = '2.0'

    aliases = ['objdump-nasm']
    filenames = ['*.objdump-intel']
    mimetypes = ['text/x-nasm-objdump']

    # Shared objdump rules, with instructions lexed as NASM.
    tokens = _objdump_lexer_tokens(NasmLexer)
821
822
class TasmLexer(RegexLexer):
    """
    For Tasm (Turbo Assembler) assembly code.

    All patterns are matched case-insensitively (see ``flags``).
    """
    name = 'TASM'
    aliases = ['tasm']
    filenames = ['*.asm', '*.ASM', '*.tasm']
    mimetypes = ['text/x-tasm']
    url = 'https://en.wikipedia.org/wiki/Turbo_Assembler'
    version_added = ''

    identifier = r'[@a-z$._?][\w$.?#@~]*'
    # The ``$`` is escaped so that it is a literal dollar sign; unescaped it
    # is the end-of-line anchor and the ``$0...`` alternative could never
    # match.
    hexn = r'(?:0x[0-9a-f]+|\$0[0-9a-f]*|[0-9]+[0-9a-f]*h)'
    octn = r'[0-7]+q'
    binn = r'[01]+b'
    decn = r'[0-9]+'
    floatn = decn + r'\.e?' + decn
    string = r'"(\\"|[^"\n])*"|' + r"'(\\'|[^'\n])*'|" + r"`(\\`|[^`\n])*`"
    declkw = r'(?:res|d)[bwdqt]|times'
    register = (r'(r[0-9][0-5]?[bwd]|'
                r'[a-d][lh]|[er]?[a-d]x|[er]?[sb]p|[er]?[sd]i|[c-gs]s|st[0-7]|'
                r'mm[0-7]|cr[0-4]|dr[0-367]|tr[3-7])\b')
    wordop = r'seg|wrt|strict'
    type = r'byte|[dq]?word'
    directives = (r'BITS|USE16|USE32|SECTION|SEGMENT|ABSOLUTE|EXTERN|GLOBAL|'
                  r'ORG|ALIGN|STRUC|ENDSTRUC|ENDS|COMMON|CPU|GROUP|UPPERCASE|INCLUDE|'
                  r'EXPORT|LIBRARY|MODULE|PROC|ENDP|USES|ARG|DATASEG|UDATASEG|END|IDEAL|'
                  r'P386|MODEL|ASSUME|CODESEG|SIZE')
    # T[A-Z][a-z] is more of a convention. Lexer should filter out STRUC definitions
    # and then 'add' them to datatype somehow.
    datatype = (r'db|dd|dw|T[A-Z][a-z]+')

    flags = re.IGNORECASE | re.MULTILINE
    tokens = {
        'root': [
            # A line whose first non-blank character is '%' is a
            # preprocessor line
            (r'^\s*%', Comment.Preproc, 'preproc'),
            include('whitespace'),
            (identifier + ':', Name.Label),
            (directives, Keyword, 'instruction-args'),
            (rf'({identifier})(\s+)({datatype})',
                bygroups(Name.Constant, Whitespace, Keyword.Declaration),
                'instruction-args'),
            (declkw, Keyword.Declaration, 'instruction-args'),
            (identifier, Name.Function, 'instruction-args'),
            (r'[\r\n]+', Whitespace)
        ],
        'instruction-args': [
            (string, String),
            # Radix-marked and float literals before the plain decimal rule,
            # which would otherwise take their leading digits.
            (hexn, Number.Hex),
            (octn, Number.Oct),
            (binn, Number.Bin),
            (floatn, Number.Float),
            (decn, Number.Integer),
            include('punctuation'),
            (register, Name.Builtin),
            (identifier, Name.Variable),
            # Do not match newline when it's preceded by a backslash
            (r'(\\)(\s*)(;.*)([\r\n])',
             bygroups(Text, Whitespace, Comment.Single, Whitespace)),
            (r'[\r\n]+', Whitespace, '#pop'),
            include('whitespace')
        ],
        'preproc': [
            (r'[^;\n]+', Comment.Preproc),
            (r';.*?\n', Comment.Single, '#pop'),
            (r'\n', Comment.Preproc, '#pop'),
        ],
        'whitespace': [
            (r'[\n\r]', Whitespace),
            # Backslash immediately followed by a line break
            (r'(\\)([\n\r])', bygroups(Text, Whitespace)),
            (r'[ \t]+', Whitespace),
            (r';.*', Comment.Single)
        ],
        'punctuation': [
            (r'[,():\[\]]+', Punctuation),
            (r'[&|^<>+*=/%~-]+', Operator),
            (r'[$]+', Keyword.Constant),
            (wordop, Operator.Word),
            (type, Keyword.Type)
        ],
    }

    def analyse_text(text):
        """
        Return ``True`` for text that starts with ``PROC``, ``None``
        otherwise.
        """
        # NOTE(review): re.match only looks at the very start of the text;
        # confirm whether a search over the whole text was intended.
        if re.match(r'PROC', text, re.I):
            return True
909
910
class Ca65Lexer(RegexLexer):
    """
    For ca65 assembler sources.
    """
    name = 'ca65 assembler'
    aliases = ['ca65']
    filenames = ['*.s']
    url = 'https://cc65.github.io'
    version_added = '1.6'

    # Only case-insensitivity is requested; none of the rules below rely on
    # ``^``/``$`` anchors.
    flags = re.IGNORECASE

    tokens = {
        'root': [
            (r';.*', Comment.Single),
            (r'\s+', Whitespace),
            (r'[a-z_.@$][\w.@$]*:', Name.Label),
            # Instruction mnemonics; the trailing \b keeps them from matching
            # as a prefix of a longer name.
            (r'((ld|st)[axy]|(in|de)[cxy]|asl|lsr|ro[lr]|adc|sbc|cmp|cp[xy]'
             r'|cl[cvdi]|se[cdi]|jmp|jsr|bne|beq|bpl|bmi|bvc|bvs|bcc|bcs'
             r'|p[lh][ap]|rt[is]|brk|nop|ta[xy]|t[xy]a|txs|tsx|and|ora|eor'
             r'|bit)\b', Keyword),
            (r'\.\w+', Keyword.Pseudo),
            (r'[-+~*/^&|!<>=]', Operator),
            (r'"[^"\n]*.', String),
            (r"'[^'\n]*.", String.Char),
            # Hex is written either ``$ff`` or ``ffh``.
            (r'\$[0-9a-f]+|[0-9a-f]+h\b', Number.Hex),
            (r'\d+', Number.Integer),
            (r'%[01]+', Number.Bin),
            (r'[#,.:()=\[\]]', Punctuation),
            (r'[a-z_.@$][\w.@$]*', Name),
        ]
    }

    def analyse_text(text):
        """Score *text* as ca65 source when a line starts with ``;``.

        Pygments calls ``analyse_text`` with the text as its only argument
        (it is turned into a static method), so it must not declare ``self``;
        with an extra parameter the call fails and the score is always 0.

        Returns 0.9 when some line begins (after optional whitespace) with a
        ``;`` comment, and ``None`` otherwise.
        """
        # comments in GAS start with "#", so a line-leading ";" points away
        # from GAS, which shares the ``*.s`` filename pattern.
        if re.search(r'^\s*;', text, re.MULTILINE):
            return 0.9
948
949
class Dasm16Lexer(RegexLexer):
    """
    For DCPU-16 Assembly.
    """
    name = 'DASM16'
    url = 'http://0x10c.com/doc/dcpu-16.txt'
    aliases = ['dasm16']
    filenames = ['*.dasm16', '*.dasm']
    mimetypes = ['text/x-dasm16']
    version_added = '2.4'

    # Mnemonics recognised at the start of a statement (matched without
    # regard to case, see ``instruction`` below).
    INSTRUCTIONS = [
        'SET', 'ADD', 'SUB', 'MUL', 'MLI', 'DIV', 'DVI', 'MOD', 'MDI',
        'AND', 'BOR', 'XOR', 'SHR', 'ASR', 'SHL',
        'IFB', 'IFC', 'IFE', 'IFN', 'IFG', 'IFA', 'IFL', 'IFU',
        'ADX', 'SBX', 'STI', 'STD', 'JSR',
        'INT', 'IAG', 'IAS', 'RFI', 'IAQ', 'HWN', 'HWQ', 'HWI',
    ]

    # Operand names that ``guess_identifier`` highlights as variables
    # rather than labels.
    REGISTERS = [
        'A', 'B', 'C', 'X', 'Y', 'Z', 'I', 'J',
        'SP', 'PC', 'EX',
        'POP', 'PEEK', 'PUSH'
    ]

    # Building blocks for the token patterns.
    char = r'[a-zA-Z0-9_$@.]'
    identifier = rf'(?:[a-zA-Z$_]{char}*|\.{char}+)'
    number = r'[+-]?(?:0[xX][a-zA-Z0-9]+|\d+)'
    binary_number = r'0b[01_]+'
    instruction = rf"(?i)({'|'.join(INSTRUCTIONS)})"
    single_char = rf"'\\?{char}'"
    string = r'"(\\"|[^"])*"'

    def guess_identifier(lexer, match):
        """Lexer callback: emit the matched identifier as a register or a label.

        The identifier is upper-cased and looked up in ``REGISTERS``; a hit
        yields ``Name.Variable``, anything else ``Name.Label``.
        """
        word = match.group(0)
        if word.upper() in lexer.REGISTERS:
            token_type = Name.Variable
        else:
            token_type = Name.Label
        yield match.start(), token_type, word

    tokens = {
        # Start of a statement: label, instruction or dot-directive.
        'root': [
            include('whitespace'),
            (rf':{identifier}', Name.Label),
            (rf'{identifier}:', Name.Label),
            (instruction, Name.Function, 'instruction-args'),
            (rf'\.{identifier}', Name.Function, 'data-args'),
            (r'[\r\n]+', Whitespace)
        ],

        # Literal values; the binary form is tried before the generic number.
        'numeric': [
            (binary_number, Number.Integer),
            (number, Number.Integer),
            (single_char, String),
        ],

        # A single operand: a name or a literal.
        'arg': [
            (identifier, guess_identifier),
            include('numeric')
        ],

        # Inside ``[ ... ]``; left again at the closing bracket.
        'deref': [
            (r'\+', Punctuation),
            (r'\]', Punctuation, '#pop'),
            include('arg'),
            include('whitespace')
        ],

        # Shared tail of an argument list: a line break or a comment pops
        # back to the previous state.
        'instruction-line': [
            (r'[\r\n]+', Whitespace, '#pop'),
            (r';.*?$', Comment, '#pop'),
            include('whitespace')
        ],

        # Operands following an instruction mnemonic.
        'instruction-args': [
            (r',', Punctuation),
            (r'\[', Punctuation, 'deref'),
            include('arg'),
            include('instruction-line')
        ],

        # Operands following a dot-directive: literals and strings only.
        'data-args': [
            (r',', Punctuation),
            include('numeric'),
            (string, String),
            include('instruction-line')
        ],

        'whitespace': [
            (r'\n', Whitespace),
            (r'\s+', Whitespace),
            (r';.*?\n', Comment)
        ],
    }