1"""
2 pygments.lexers.modula2
3 ~~~~~~~~~~~~~~~~~~~~~~~
4
5 Multi-Dialect Lexer for Modula-2.
6
7 :copyright: Copyright 2006-2025 by the Pygments team, see AUTHORS.
8 :license: BSD, see LICENSE for details.
9"""
10
11import re
12
13from pygments.lexer import RegexLexer, include
14from pygments.util import get_bool_opt, get_list_opt
15from pygments.token import Text, Comment, Operator, Keyword, Name, \
16 String, Number, Punctuation, Error
17
18__all__ = ['Modula2Lexer']
19
20
21# Multi-Dialect Modula-2 Lexer
22class Modula2Lexer(RegexLexer):
23 """
24 For Modula-2 source code.
25
26 The Modula-2 lexer supports several dialects. By default, it operates in
27 fallback mode, recognising the *combined* literals, punctuation symbols
28 and operators of all supported dialects, and the *combined* reserved words
29 and builtins of PIM Modula-2, ISO Modula-2 and Modula-2 R10, while not
30 differentiating between library defined identifiers.
31
32 To select a specific dialect, a dialect option may be passed
33 or a dialect tag may be embedded into a source file.
34
35 Dialect Options:
36
37 `m2pim`
38 Select PIM Modula-2 dialect.
39 `m2iso`
40 Select ISO Modula-2 dialect.
41 `m2r10`
42 Select Modula-2 R10 dialect.
43 `objm2`
44 Select Objective Modula-2 dialect.
45
46 The PIM and ISO dialect options may be qualified with a language extension.
47
48 Language Extensions:
49
50 `+aglet`
51 Select Aglet Modula-2 extensions, available with m2iso.
52 `+gm2`
53 Select GNU Modula-2 extensions, available with m2pim.
54 `+p1`
55 Select p1 Modula-2 extensions, available with m2iso.
56 `+xds`
57 Select XDS Modula-2 extensions, available with m2iso.
58
59
60 Passing a Dialect Option via Unix Commandline Interface
61
62 Dialect options may be passed to the lexer using the `dialect` key.
63 Only one such option should be passed. If multiple dialect options are
64 passed, the first valid option is used, any subsequent options are ignored.
65
66 Examples:
67
68 `$ pygmentize -O full,dialect=m2iso -f html -o /path/to/output /path/to/input`
69 Use ISO dialect to render input to HTML output
70 `$ pygmentize -O full,dialect=m2iso+p1 -f rtf -o /path/to/output /path/to/input`
71 Use ISO dialect with p1 extensions to render input to RTF output
72
73
74 Embedding a Dialect Option within a source file
75
76 A dialect option may be embedded in a source file in form of a dialect
77 tag, a specially formatted comment that specifies a dialect option.
78
79 Dialect Tag EBNF::
80
81 dialectTag :
82 OpeningCommentDelim Prefix dialectOption ClosingCommentDelim ;
83
84 dialectOption :
85 'm2pim' | 'm2iso' | 'm2r10' | 'objm2' |
86 'm2iso+aglet' | 'm2pim+gm2' | 'm2iso+p1' | 'm2iso+xds' ;
87
88 Prefix : '!' ;
89
90 OpeningCommentDelim : '(*' ;
91
92 ClosingCommentDelim : '*)' ;
93
94 No whitespace is permitted between the tokens of a dialect tag.
95
96 In the event that a source file contains multiple dialect tags, the first
97 tag that contains a valid dialect option will be used and any subsequent
98 dialect tags will be ignored. Ideally, a dialect tag should be placed
99 at the beginning of a source file.
100
101 An embedded dialect tag overrides a dialect option set via command line.
102
103 Examples:
104
105 ``(*!m2r10*) DEFINITION MODULE Foobar; ...``
106 Use Modula2 R10 dialect to render this source file.
107 ``(*!m2pim+gm2*) DEFINITION MODULE Bazbam; ...``
108 Use PIM dialect with GNU extensions to render this source file.
109
110
111 Algol Publication Mode:
112
113 In Algol publication mode, source text is rendered for publication of
114 algorithms in scientific papers and academic texts, following the format
115 of the Revised Algol-60 Language Report. It is activated by passing
116 one of two corresponding styles as an option:
117
118 `algol`
119 render reserved words lowercase underline boldface
120 and builtins lowercase boldface italic
121 `algol_nu`
122 render reserved words lowercase boldface (no underlining)
123 and builtins lowercase boldface italic
124
125 The lexer automatically performs the required lowercase conversion when
126 this mode is activated.
127
128 Example:
129
130 ``$ pygmentize -O full,style=algol -f latex -o /path/to/output /path/to/input``
131 Render input file in Algol publication mode to LaTeX output.
132
133
134 Rendering Mode of First Class ADT Identifiers:
135
136 The rendering of standard library first class ADT identifiers is controlled
137 by option flag "treat_stdlib_adts_as_builtins".
138
139 When this option is turned on, standard library ADT identifiers are rendered
140 as builtins. When it is turned off, they are rendered as ordinary library
141 identifiers.
142
143 `treat_stdlib_adts_as_builtins` (default: On)
144
145 The option is useful for dialects that support ADTs as first class objects
146 and provide ADTs in the standard library that would otherwise be built-in.
147
148 At present, only Modula-2 R10 supports library ADTs as first class objects
149 and therefore, no ADT identifiers are defined for any other dialects.
150
151 Example:
152
153 ``$ pygmentize -O full,dialect=m2r10,treat_stdlib_adts_as_builtins=Off ...``
154 Render standard library ADTs as ordinary library types.
155
156 .. versionchanged:: 2.1
157 Added multi-dialect support.
158 """
159 name = 'Modula-2'
160 url = 'http://www.modula2.org/'
161 aliases = ['modula2', 'm2']
162 filenames = ['*.def', '*.mod']
163 mimetypes = ['text/x-modula2']
164 version_added = '1.3'
165
166 flags = re.MULTILINE | re.DOTALL
167
# Token rules for the combined (dialect-agnostic) Modula-2 lexis.
#
# Each named state below is a list of (regex, token type) rules. The 'root'
# state pulls the groups together with include(); the order of the
# include() calls there is significant because digraphs must be tried
# before the unigraphs they start with, and pragmas/dialect tags before
# ordinary block comments.
tokens = {
    'whitespace': [
        (r'\n+', Text),  # blank lines
        (r'\s+', Text),  # whitespace
    ],
    'dialecttags': [
        # PIM Dialect Tag
        (r'\(\*!m2pim\*\)', Comment.Special),
        # ISO Dialect Tag
        (r'\(\*!m2iso\*\)', Comment.Special),
        # M2R10 Dialect Tag
        (r'\(\*!m2r10\*\)', Comment.Special),
        # ObjM2 Dialect Tag
        (r'\(\*!objm2\*\)', Comment.Special),
        # Aglet Extensions Dialect Tag
        (r'\(\*!m2iso\+aglet\*\)', Comment.Special),
        # GNU Extensions Dialect Tag
        (r'\(\*!m2pim\+gm2\*\)', Comment.Special),
        # p1 Extensions Dialect Tag
        (r'\(\*!m2iso\+p1\*\)', Comment.Special),
        # XDS Extensions Dialect Tag
        (r'\(\*!m2iso\+xds\*\)', Comment.Special),
    ],
    'identifiers': [
        (r'([a-zA-Z_$][\w$]*)', Name),
    ],
    'prefixed_number_literals': [
        #
        # Base-2, whole number
        (r'0b[01]+(\'[01]+)*', Number.Bin),
        #
        # Base-16, whole number
        (r'0[ux][0-9A-F]+(\'[0-9A-F]+)*', Number.Hex),
    ],
    'plain_number_literals': [
        #
        # Base-10, real number with exponent
        (r'[0-9]+(\'[0-9]+)*'  # integral part
         r'\.[0-9]+(\'[0-9]+)*'  # fractional part
         r'[eE][+-]?[0-9]+(\'[0-9]+)*',  # exponent
         Number.Float),
        #
        # Base-10, real number without exponent
        (r'[0-9]+(\'[0-9]+)*'  # integral part
         r'\.[0-9]+(\'[0-9]+)*',  # fractional part
         Number.Float),
        #
        # Base-10, whole number
        (r'[0-9]+(\'[0-9]+)*', Number.Integer),
    ],
    'suffixed_number_literals': [
        #
        # Base-8, whole number
        (r'[0-7]+B', Number.Oct),
        #
        # Base-8, character code
        (r'[0-7]+C', Number.Oct),
        #
        # Base-16, number
        (r'[0-9A-F]+H', Number.Hex),
    ],
    'string_literals': [
        (r'"(\\\\|\\[^\\]|[^"\\])*"', String.Double),
        (r"'(\\\\|\\[^\\]|[^'\\])*'", String.Single),
    ],
    'digraph_operators': [
        # Dot Product Operator
        (r'\*\.', Operator),
        # Array Concatenation Operator
        (r'\+>', Operator),  # M2R10 + ObjM2
        # Inequality Operator
        (r'<>', Operator),  # ISO + PIM
        # Less-Or-Equal, Subset
        (r'<=', Operator),
        # Greater-Or-Equal, Superset
        (r'>=', Operator),
        # Identity Operator
        (r'==', Operator),  # M2R10 + ObjM2
        # Type Conversion Operator
        (r'::', Operator),  # M2R10 + ObjM2
        # Assignment Symbol
        (r':=', Operator),
        # Postfix Increment Mutator
        (r'\+\+', Operator),  # M2R10 + ObjM2
        # Postfix Decrement Mutator
        (r'--', Operator),  # M2R10 + ObjM2
    ],
    'unigraph_operators': [
        # Arithmetic Operators
        (r'[+-]', Operator),
        (r'[*/]', Operator),
        # ISO 80000-2 compliant Set Difference Operator
        (r'\\', Operator),  # M2R10 + ObjM2
        # Relational Operators
        (r'[=#<>]', Operator),
        # Dereferencing Operator
        (r'\^', Operator),
        # Dereferencing Operator Synonym
        (r'@', Operator),  # ISO
        # Logical AND Operator Synonym
        (r'&', Operator),  # PIM + ISO
        # Logical NOT Operator Synonym
        (r'~', Operator),  # PIM + ISO
        # Smalltalk Message Prefix
        (r'`', Operator),  # ObjM2
    ],
    'digraph_punctuation': [
        # Range Constructor
        (r'\.\.', Punctuation),
        # Opening Chevron Bracket
        (r'<<', Punctuation),  # M2R10 + ISO
        # Closing Chevron Bracket
        (r'>>', Punctuation),  # M2R10 + ISO
        # Blueprint Punctuation
        (r'->', Punctuation),  # M2R10 + ISO
        # Distinguish |# and # in M2 R10
        (r'\|#', Punctuation),
        # Distinguish ## and # in M2 R10
        (r'##', Punctuation),
        # Distinguish |* and * in M2 R10
        (r'\|\*', Punctuation),
    ],
    'unigraph_punctuation': [
        # Common Punctuation
        (r'[()\[\]{},.:;|]', Punctuation),
        # Case Label Separator Synonym
        (r'!', Punctuation),  # ISO
        # Blueprint Punctuation
        (r'\?', Punctuation),  # M2R10 + ObjM2
    ],
    'comments': [
        # Single Line Comment (only recognised at the start of a line)
        (r'^//.*?\n', Comment.Single),  # M2R10 + ObjM2
        # Block Comment
        #
        # The lookahead keeps Pascal style pragmas "(*$ ... *)" out of this
        # rule without consuming a character, so that an empty comment
        # "(**)" is closed by its own delimiter. Consuming one mandatory
        # character here instead would swallow the "*" of the closing
        # delimiter and run on to the next "*)" in the file.
        (r'\(\*(?!\$).*?\*\)', Comment.Multiline),
        # Template Block Comment
        (r'/\*(.*?)\*/', Comment.Multiline),  # M2R10 + ObjM2
    ],
    'pragmas': [
        # ISO Style Pragmas
        (r'<\*.*?\*>', Comment.Preproc),  # ISO, M2R10 + ObjM2
        # Pascal Style Pragmas
        (r'\(\*\$.*?\*\)', Comment.Preproc),  # PIM
    ],
    'root': [
        include('whitespace'),
        include('dialecttags'),
        include('pragmas'),
        include('comments'),
        include('identifiers'),
        include('suffixed_number_literals'),  # PIM + ISO
        include('prefixed_number_literals'),  # M2R10 + ObjM2
        include('plain_number_literals'),
        include('string_literals'),
        include('digraph_punctuation'),
        include('digraph_operators'),
        include('unigraph_punctuation'),
        include('unigraph_operators'),
    ]
}
328
329# C o m m o n D a t a s e t s
330
331 # Common Reserved Words Dataset
common_reserved_words = (
    # 37 common reserved words, in alphabetical order,
    # laid out with one initial letter per row
    'AND', 'ARRAY',
    'BEGIN', 'BY',
    'CASE', 'CONST',
    'DEFINITION', 'DIV', 'DO',
    'ELSE', 'ELSIF', 'END', 'EXIT',
    'FOR', 'FROM',
    'IF', 'IMPLEMENTATION', 'IMPORT', 'IN',
    'LOOP',
    'MOD', 'MODULE',
    'NOT',
    'OF', 'OR',
    'POINTER', 'PROCEDURE',
    'RECORD', 'REPEAT', 'RETURN',
    'SET',
    'THEN', 'TO', 'TYPE',
    'UNTIL',
    'VAR',
    'WHILE',
)
340
341 # Common Builtins Dataset
common_builtins = (
    # 16 common builtins, four per row
    'ABS', 'BOOLEAN', 'CARDINAL', 'CHAR',
    'CHR', 'FALSE', 'INTEGER', 'LONGINT',
    'LONGREAL', 'MAX', 'MIN', 'NIL',
    'ODD', 'ORD', 'REAL', 'TRUE',
)
348
349 # Common Pseudo-Module Builtins Dataset
common_pseudo_builtins = (
    # 4 common pseudo builtins, one per row
    'ADDRESS',
    'BYTE',
    'WORD',
    'ADR',
)
354
355# P I M M o d u l a - 2 D a t a s e t s
356
357 # Lexemes to Mark as Error Tokens for PIM Modula-2
pim_lexemes_to_reject = (
    # single-character lexemes
    '!', '`', '@', '$', '%', '?', '\\',
    # two-character lexemes
    '==', '++', '--', '::', '*.', '+>', '->', '<<', '>>', '|#', '##',
)
362
363 # PIM Modula-2 Additional Reserved Words Dataset
364 pim_additional_reserved_words = (
365 # 3 additional reserved words
366 'EXPORT', 'QUALIFIED', 'WITH',
367 )
368
369 # PIM Modula-2 Additional Builtins Dataset
370 pim_additional_builtins = (
371 # 16 additional builtins
372 'BITSET', 'CAP', 'DEC', 'DISPOSE', 'EXCL', 'FLOAT', 'HALT', 'HIGH',
373 'INC', 'INCL', 'NEW', 'NIL', 'PROC', 'SIZE', 'TRUNC', 'VAL',
374 )
375
376 # PIM Modula-2 Additional Pseudo-Module Builtins Dataset
377 pim_additional_pseudo_builtins = (
378 # 5 additional pseudo builtins
379 'SYSTEM', 'PROCESS', 'TSIZE', 'NEWPROCESS', 'TRANSFER',
380 )
381
382# I S O M o d u l a - 2 D a t a s e t s
383
384 # Lexemes to Mark as Error Tokens for ISO Modula-2
iso_lexemes_to_reject = (
    # single-character lexemes
    '`', '$', '%', '?', '\\',
    # two-character lexemes
    '==', '++', '--', '::', '*.', '+>', '->', '<<', '>>', '|#', '##',
)
389
390 # ISO Modula-2 Additional Reserved Words Dataset
391 iso_additional_reserved_words = (
392 # 9 additional reserved words (ISO 10514-1)
393 'EXCEPT', 'EXPORT', 'FINALLY', 'FORWARD', 'PACKEDSET', 'QUALIFIED',
394 'REM', 'RETRY', 'WITH',
395 # 10 additional reserved words (ISO 10514-2 & ISO 10514-3)
396 'ABSTRACT', 'AS', 'CLASS', 'GUARD', 'INHERIT', 'OVERRIDE', 'READONLY',
397 'REVEAL', 'TRACED', 'UNSAFEGUARDED',
398 )
399
400 # ISO Modula-2 Additional Builtins Dataset
iso_additional_builtins = (
    # 26 additional builtins (ISO 10514-1)
    #
    # 'UNINTERRUPTIBLE' was previously misspelled in this list, so the
    # correctly spelled identifier was never matched as a builtin.
    'BITSET', 'CAP', 'CMPLX', 'COMPLEX', 'DEC', 'DISPOSE', 'EXCL', 'FLOAT',
    'HALT', 'HIGH', 'IM', 'INC', 'INCL', 'INT', 'INTERRUPTIBLE', 'LENGTH',
    'LFLOAT', 'LONGCOMPLEX', 'NEW', 'PROC', 'PROTECTION', 'RE', 'SIZE',
    'TRUNC', 'UNINTERRUPTIBLE', 'VAL',
    # 5 additional builtins (ISO 10514-2 & ISO 10514-3)
    'CREATE', 'DESTROY', 'EMPTY', 'ISMEMBER', 'SELF',
)
410
411 # ISO Modula-2 Additional Pseudo-Module Builtins Dataset
iso_additional_pseudo_builtins = (
    # Identifiers are grouped by the pseudo-module named in each heading;
    # the module name itself is the first entry of every group.
    #
    # 14 additional builtins (SYSTEM)
    'SYSTEM', 'BITSPERLOC', 'LOCSPERBYTE', 'LOCSPERWORD', 'LOC',
    'ADDADR', 'SUBADR', 'DIFADR', 'MAKEADR', 'ADR',
    'ROTATE', 'SHIFT', 'CAST', 'TSIZE',
    # 13 additional builtins (COROUTINES)
    'COROUTINES', 'ATTACH', 'COROUTINE', 'CURRENT', 'DETACH', 'HANDLER',
    'INTERRUPTSOURCE', 'IOTRANSFER', 'IsATTACHED', 'LISTEN',
    'NEWCOROUTINE', 'PROT', 'TRANSFER',
    # 9 additional builtins (EXCEPTIONS)
    'EXCEPTIONS', 'AllocateSource', 'CurrentNumber', 'ExceptionNumber',
    'ExceptionSource', 'GetMessage', 'IsCurrentSource',
    'IsExceptionalExecution', 'RAISE',
    # 3 additional builtins (TERMINATION)
    'TERMINATION', 'IsTerminating', 'HasHalted',
    # 19 additional builtins (M2EXCEPTION):
    # 4 module-level identifiers followed by 15 "...Exception" /
    # "invalidLocation" identifiers
    'M2EXCEPTION', 'M2Exceptions', 'M2Exception', 'IsM2Exception',
    'indexException', 'rangeException', 'caseSelectException',
    'invalidLocation', 'functionException', 'wholeValueException',
    'wholeDivException', 'realValueException', 'realDivException',
    'complexValueException', 'complexDivException', 'protException',
    'sysException', 'coException', 'exException',
)
435
436# M o d u l a - 2 R 1 0 D a t a s e t s
437
438 # Lexemes to Mark as Error Tokens for Modula-2 R10
m2r10_lexemes_to_reject = (
    # single-character lexemes
    '!', '`', '@', '$', '%', '&',
    # two-character lexemes
    '<>',
)
442
443 # Modula-2 R10 reserved words in addition to the common set
444 m2r10_additional_reserved_words = (
445 # 12 additional reserved words
446 'ALIAS', 'ARGLIST', 'BLUEPRINT', 'COPY', 'GENLIB', 'INDETERMINATE',
447 'NEW', 'NONE', 'OPAQUE', 'REFERENTIAL', 'RELEASE', 'RETAIN',
448 # 2 additional reserved words with symbolic assembly option
449 'ASM', 'REG',
450 )
451
452 # Modula-2 R10 builtins in addition to the common set
453 m2r10_additional_builtins = (
454 # 26 additional builtins
455 'CARDINAL', 'COUNT', 'EMPTY', 'EXISTS', 'INSERT', 'LENGTH', 'LONGCARD',
456 'OCTET', 'PTR', 'PRED', 'READ', 'READNEW', 'REMOVE', 'RETRIEVE', 'SORT',
457 'STORE', 'SUBSET', 'SUCC', 'TLIMIT', 'TMAX', 'TMIN', 'TRUE', 'TSIZE',
458 'UNICHAR', 'WRITE', 'WRITEF',
459 )
460
461 # Modula-2 R10 Additional Pseudo-Module Builtins Dataset
462 m2r10_additional_pseudo_builtins = (
463 # 13 additional builtins (TPROPERTIES)
464 'TPROPERTIES', 'PROPERTY', 'LITERAL', 'TPROPERTY', 'TLITERAL',
465 'TBUILTIN', 'TDYN', 'TREFC', 'TNIL', 'TBASE', 'TPRECISION',
466 'TMAXEXP', 'TMINEXP',
467 # 4 additional builtins (CONVERSION)
468 'CONVERSION', 'TSXFSIZE', 'SXF', 'VAL',
469 # 35 additional builtins (UNSAFE)
470 'UNSAFE', 'CAST', 'INTRINSIC', 'AVAIL', 'ADD', 'SUB', 'ADDC', 'SUBC',
471 'FETCHADD', 'FETCHSUB', 'SHL', 'SHR', 'ASHR', 'ROTL', 'ROTR', 'ROTLC',
472 'ROTRC', 'BWNOT', 'BWAND', 'BWOR', 'BWXOR', 'BWNAND', 'BWNOR',
473 'SETBIT', 'TESTBIT', 'LSBIT', 'MSBIT', 'CSBITS', 'BAIL', 'HALT',
474 'TODO', 'FFI', 'ADDR', 'VARGLIST', 'VARGC',
475 # 11 additional builtins (ATOMIC)
476 'ATOMIC', 'INTRINSIC', 'AVAIL', 'SWAP', 'CAS', 'INC', 'DEC', 'BWAND',
477 'BWNAND', 'BWOR', 'BWXOR',
478 # 7 additional builtins (COMPILER)
479 'COMPILER', 'DEBUG', 'MODNAME', 'PROCNAME', 'LINENUM', 'DEFAULT',
480 'HASH',
481 # 5 additional builtins (ASSEMBLER)
482 'ASSEMBLER', 'REGISTER', 'SETREG', 'GETREG', 'CODE',
483 )
484
485# O b j e c t i v e M o d u l a - 2 D a t a s e t s
486
487 # Lexemes to Mark as Error Tokens for Objective Modula-2
objm2_lexemes_to_reject = (
    # single-character lexemes
    '!', '$', '%', '&',
    # two-character lexemes
    '<>',
)
491
492 # Objective Modula-2 Extensions
493 # reserved words in addition to Modula-2 R10
494 objm2_additional_reserved_words = (
495 # 16 additional reserved words
496 'BYCOPY', 'BYREF', 'CLASS', 'CONTINUE', 'CRITICAL', 'INOUT', 'METHOD',
497 'ON', 'OPTIONAL', 'OUT', 'PRIVATE', 'PROTECTED', 'PROTOCOL', 'PUBLIC',
498 'SUPER', 'TRY',
499 )
500
501 # Objective Modula-2 Extensions
502 # builtins in addition to Modula-2 R10
503 objm2_additional_builtins = (
504 # 3 additional builtins
505 'OBJECT', 'NO', 'YES',
506 )
507
508 # Objective Modula-2 Extensions
509 # pseudo-module builtins in addition to Modula-2 R10
510 objm2_additional_pseudo_builtins = (
511 # None
512 )
513
514# A g l e t M o d u l a - 2 D a t a s e t s
515
516 # Aglet Extensions
517 # reserved words in addition to ISO Modula-2
518 aglet_additional_reserved_words = (
519 # None
520 )
521
522 # Aglet Extensions
523 # builtins in addition to ISO Modula-2
524 aglet_additional_builtins = (
525 # 9 additional builtins
526 'BITSET8', 'BITSET16', 'BITSET32', 'CARDINAL8', 'CARDINAL16',
527 'CARDINAL32', 'INTEGER8', 'INTEGER16', 'INTEGER32',
528 )
529
530 # Aglet Modula-2 Extensions
531 # pseudo-module builtins in addition to ISO Modula-2
532 aglet_additional_pseudo_builtins = (
533 # None
534 )
535
536# G N U M o d u l a - 2 D a t a s e t s
537
538 # GNU Extensions
539 # reserved words in addition to PIM Modula-2
540 gm2_additional_reserved_words = (
541 # 10 additional reserved words
542 'ASM', '__ATTRIBUTE__', '__BUILTIN__', '__COLUMN__', '__DATE__',
543 '__FILE__', '__FUNCTION__', '__LINE__', '__MODULE__', 'VOLATILE',
544 )
545
546 # GNU Extensions
547 # builtins in addition to PIM Modula-2
548 gm2_additional_builtins = (
549 # 21 additional builtins
550 'BITSET8', 'BITSET16', 'BITSET32', 'CARDINAL8', 'CARDINAL16',
551 'CARDINAL32', 'CARDINAL64', 'COMPLEX32', 'COMPLEX64', 'COMPLEX96',
552 'COMPLEX128', 'INTEGER8', 'INTEGER16', 'INTEGER32', 'INTEGER64',
553 'REAL8', 'REAL16', 'REAL32', 'REAL96', 'REAL128', 'THROW',
554 )
555
556 # GNU Extensions
557 # pseudo-module builtins in addition to PIM Modula-2
558 gm2_additional_pseudo_builtins = (
559 # None
560 )
561
562# p 1 M o d u l a - 2 D a t a s e t s
563
564 # p1 Extensions
565 # reserved words in addition to ISO Modula-2
566 p1_additional_reserved_words = (
567 # None
568 )
569
570 # p1 Extensions
571 # builtins in addition to ISO Modula-2
572 p1_additional_builtins = (
573 # None
574 )
575
576 # p1 Modula-2 Extensions
577 # pseudo-module builtins in addition to ISO Modula-2
578 p1_additional_pseudo_builtins = (
579 # 1 additional builtin
580 'BCD',
581 )
582
583# X D S M o d u l a - 2 D a t a s e t s
584
585 # XDS Extensions
586 # reserved words in addition to ISO Modula-2
587 xds_additional_reserved_words = (
588 # 1 additional reserved word
589 'SEQ',
590 )
591
592 # XDS Extensions
593 # builtins in addition to ISO Modula-2
594 xds_additional_builtins = (
595 # 9 additional builtins
596 'ASH', 'ASSERT', 'DIFFADR_TYPE', 'ENTIER', 'INDEX', 'LEN',
597 'LONGCARD', 'SHORTCARD', 'SHORTINT',
598 )
599
600 # XDS Modula-2 Extensions
601 # pseudo-module builtins in addition to ISO Modula-2
xds_additional_pseudo_builtins = (
    # 21 additional builtins (SYSTEM)
    'PROCESS', 'NEWPROCESS', 'BOOL8', 'BOOL16', 'BOOL32', 'CARD8',
    'CARD16', 'CARD32', 'INT8', 'INT16', 'INT32', 'REF', 'MOVE',
    'FILL', 'GET', 'PUT', 'CC', 'int', 'unsigned', 'size_t', 'void',
    # 3 additional builtins (COMPILER)
    #
    # Every entry must end with a comma: Python concatenates adjacent
    # string literals, so a missing comma between 'void' and 'COMPILER'
    # yields the single bogus entry 'voidCOMPILER' and loses both names.
    'COMPILER', 'OPTION', 'EQUATION',
)
610
611# P I M S t a n d a r d L i b r a r y D a t a s e t s
612
613 # PIM Modula-2 Standard Library Modules Dataset
614 pim_stdlib_module_identifiers = (
615 'Terminal', 'FileSystem', 'InOut', 'RealInOut', 'MathLib0', 'Storage',
616 )
617
618 # PIM Modula-2 Standard Library Types Dataset
619 pim_stdlib_type_identifiers = (
620 'Flag', 'FlagSet', 'Response', 'Command', 'Lock', 'Permission',
621 'MediumType', 'File', 'FileProc', 'DirectoryProc', 'FileCommand',
622 'DirectoryCommand',
623 )
624
625 # PIM Modula-2 Standard Library Procedures Dataset
626 pim_stdlib_proc_identifiers = (
627 'Read', 'BusyRead', 'ReadAgain', 'Write', 'WriteString', 'WriteLn',
628 'Create', 'Lookup', 'Close', 'Delete', 'Rename', 'SetRead', 'SetWrite',
629 'SetModify', 'SetOpen', 'Doio', 'SetPos', 'GetPos', 'Length', 'Reset',
630 'Again', 'ReadWord', 'WriteWord', 'ReadChar', 'WriteChar',
631 'CreateMedium', 'DeleteMedium', 'AssignName', 'DeassignName',
632 'ReadMedium', 'LookupMedium', 'OpenInput', 'OpenOutput', 'CloseInput',
633 'CloseOutput', 'ReadString', 'ReadInt', 'ReadCard', 'ReadWrd',
634 'WriteInt', 'WriteCard', 'WriteOct', 'WriteHex', 'WriteWrd',
635 'ReadReal', 'WriteReal', 'WriteFixPt', 'WriteRealOct', 'sqrt', 'exp',
636 'ln', 'sin', 'cos', 'arctan', 'entier', 'ALLOCATE', 'DEALLOCATE',
637 )
638
639 # PIM Modula-2 Standard Library Variables Dataset
640 pim_stdlib_var_identifiers = (
641 'Done', 'termCH', 'in', 'out'
642 )
643
644 # PIM Modula-2 Standard Library Constants Dataset
645 pim_stdlib_const_identifiers = (
646 'EOL',
647 )
648
649# I S O S t a n d a r d L i b r a r y D a t a s e t s
650
651 # ISO Modula-2 Standard Library Modules Dataset
652 iso_stdlib_module_identifiers = (
653 # TO DO
654 )
655
656 # ISO Modula-2 Standard Library Types Dataset
657 iso_stdlib_type_identifiers = (
658 # TO DO
659 )
660
661 # ISO Modula-2 Standard Library Procedures Dataset
662 iso_stdlib_proc_identifiers = (
663 # TO DO
664 )
665
666 # ISO Modula-2 Standard Library Variables Dataset
667 iso_stdlib_var_identifiers = (
668 # TO DO
669 )
670
671 # ISO Modula-2 Standard Library Constants Dataset
672 iso_stdlib_const_identifiers = (
673 # TO DO
674 )
675
676# M 2 R 1 0 S t a n d a r d L i b r a r y D a t a s e t s
677
678 # Modula-2 R10 Standard Library ADTs Dataset
679 m2r10_stdlib_adt_identifiers = (
680 'BCD', 'LONGBCD', 'BITSET', 'SHORTBITSET', 'LONGBITSET',
681 'LONGLONGBITSET', 'COMPLEX', 'LONGCOMPLEX', 'SHORTCARD', 'LONGLONGCARD',
682 'SHORTINT', 'LONGLONGINT', 'POSINT', 'SHORTPOSINT', 'LONGPOSINT',
683 'LONGLONGPOSINT', 'BITSET8', 'BITSET16', 'BITSET32', 'BITSET64',
684 'BITSET128', 'BS8', 'BS16', 'BS32', 'BS64', 'BS128', 'CARDINAL8',
685 'CARDINAL16', 'CARDINAL32', 'CARDINAL64', 'CARDINAL128', 'CARD8',
686 'CARD16', 'CARD32', 'CARD64', 'CARD128', 'INTEGER8', 'INTEGER16',
687 'INTEGER32', 'INTEGER64', 'INTEGER128', 'INT8', 'INT16', 'INT32',
688 'INT64', 'INT128', 'STRING', 'UNISTRING',
689 )
690
691 # Modula-2 R10 Standard Library Blueprints Dataset
692 m2r10_stdlib_blueprint_identifiers = (
693 'ProtoRoot', 'ProtoComputational', 'ProtoNumeric', 'ProtoScalar',
694 'ProtoNonScalar', 'ProtoCardinal', 'ProtoInteger', 'ProtoReal',
695 'ProtoComplex', 'ProtoVector', 'ProtoTuple', 'ProtoCompArray',
696 'ProtoCollection', 'ProtoStaticArray', 'ProtoStaticSet',
697 'ProtoStaticString', 'ProtoArray', 'ProtoString', 'ProtoSet',
698 'ProtoMultiSet', 'ProtoDictionary', 'ProtoMultiDict', 'ProtoExtension',
699 'ProtoIO', 'ProtoCardMath', 'ProtoIntMath', 'ProtoRealMath',
700 )
701
702 # Modula-2 R10 Standard Library Modules Dataset
703 m2r10_stdlib_module_identifiers = (
704 'ASCII', 'BooleanIO', 'CharIO', 'UnicharIO', 'OctetIO',
705 'CardinalIO', 'LongCardIO', 'IntegerIO', 'LongIntIO', 'RealIO',
706 'LongRealIO', 'BCDIO', 'LongBCDIO', 'CardMath', 'LongCardMath',
707 'IntMath', 'LongIntMath', 'RealMath', 'LongRealMath', 'BCDMath',
708 'LongBCDMath', 'FileIO', 'FileSystem', 'Storage', 'IOSupport',
709 )
710
711 # Modula-2 R10 Standard Library Types Dataset
712 m2r10_stdlib_type_identifiers = (
713 'File', 'Status',
714 # TO BE COMPLETED
715 )
716
717 # Modula-2 R10 Standard Library Procedures Dataset
718 m2r10_stdlib_proc_identifiers = (
719 'ALLOCATE', 'DEALLOCATE', 'SIZE',
720 # TO BE COMPLETED
721 )
722
723 # Modula-2 R10 Standard Library Variables Dataset
724 m2r10_stdlib_var_identifiers = (
725 'stdIn', 'stdOut', 'stdErr',
726 )
727
728 # Modula-2 R10 Standard Library Constants Dataset
729 m2r10_stdlib_const_identifiers = (
730 'pi', 'tau',
731 )
732
733# D i a l e c t s
734
735 # Dialect modes
dialects = (
    # placeholder used when no dialect has been selected
    'unknown',
    # base dialects
    'm2pim',
    'm2iso',
    'm2r10',
    'objm2',
    # base dialects qualified with a language extension
    'm2iso+aglet',
    'm2pim+gm2',
    'm2iso+p1',
    'm2iso+xds',
)
741
742# D a t a b a s e s
743
744 # Lexemes to Mark as Errors Database
745 lexemes_to_reject_db = {
746 # Lexemes to reject for unknown dialect
747 'unknown': (
748 # LEAVE THIS EMPTY
749 ),
750 # Lexemes to reject for PIM Modula-2
751 'm2pim': (
752 pim_lexemes_to_reject,
753 ),
754 # Lexemes to reject for ISO Modula-2
755 'm2iso': (
756 iso_lexemes_to_reject,
757 ),
758 # Lexemes to reject for Modula-2 R10
759 'm2r10': (
760 m2r10_lexemes_to_reject,
761 ),
762 # Lexemes to reject for Objective Modula-2
763 'objm2': (
764 objm2_lexemes_to_reject,
765 ),
766 # Lexemes to reject for Aglet Modula-2
767 'm2iso+aglet': (
768 iso_lexemes_to_reject,
769 ),
770 # Lexemes to reject for GNU Modula-2
771 'm2pim+gm2': (
772 pim_lexemes_to_reject,
773 ),
774 # Lexemes to reject for p1 Modula-2
775 'm2iso+p1': (
776 iso_lexemes_to_reject,
777 ),
778 # Lexemes to reject for XDS Modula-2
779 'm2iso+xds': (
780 iso_lexemes_to_reject,
781 ),
782 }
783
784 # Reserved Words Database
# Maps each dialect name to a tuple of reserved-word datasets. Every
# dataset is itself a tuple of strings, so each value is a tuple of tuples.
reserved_words_db = {
    # Reserved words for unknown dialect:
    # the combined PIM, ISO and Modula-2 R10 sets
    'unknown': (
        common_reserved_words,
        pim_additional_reserved_words,
        iso_additional_reserved_words,
        m2r10_additional_reserved_words,
    ),

    # Reserved words for PIM Modula-2
    'm2pim': (
        common_reserved_words,
        pim_additional_reserved_words,
    ),

    # Reserved words for ISO Modula-2
    'm2iso': (
        common_reserved_words,
        iso_additional_reserved_words,
    ),

    # Reserved words for Modula-2 R10
    'm2r10': (
        common_reserved_words,
        m2r10_additional_reserved_words,
    ),

    # Reserved words for Objective Modula-2
    # (the Modula-2 R10 set plus the Objective Modula-2 additions)
    'objm2': (
        common_reserved_words,
        m2r10_additional_reserved_words,
        objm2_additional_reserved_words,
    ),

    # Reserved words for Aglet Modula-2 Extensions
    'm2iso+aglet': (
        common_reserved_words,
        iso_additional_reserved_words,
        aglet_additional_reserved_words,
    ),

    # Reserved words for GNU Modula-2 Extensions
    'm2pim+gm2': (
        common_reserved_words,
        pim_additional_reserved_words,
        gm2_additional_reserved_words,
    ),

    # Reserved words for p1 Modula-2 Extensions
    'm2iso+p1': (
        common_reserved_words,
        iso_additional_reserved_words,
        p1_additional_reserved_words,
    ),

    # Reserved words for XDS Modula-2 Extensions
    'm2iso+xds': (
        common_reserved_words,
        iso_additional_reserved_words,
        xds_additional_reserved_words,
    ),
}
847
848 # Builtins Database
# Maps each dialect name to a tuple of builtin datasets. Every dataset is
# itself a tuple of strings, so each value is a tuple of tuples.
builtins_db = {
    # Builtins for unknown dialect:
    # the combined PIM, ISO and Modula-2 R10 sets
    'unknown': (
        common_builtins,
        pim_additional_builtins,
        iso_additional_builtins,
        m2r10_additional_builtins,
    ),

    # Builtins for PIM Modula-2
    'm2pim': (
        common_builtins,
        pim_additional_builtins,
    ),

    # Builtins for ISO Modula-2
    'm2iso': (
        common_builtins,
        iso_additional_builtins,
    ),

    # Builtins for Modula-2 R10
    'm2r10': (
        common_builtins,
        m2r10_additional_builtins,
    ),

    # Builtins for Objective Modula-2
    # (the Modula-2 R10 set plus the Objective Modula-2 additions)
    'objm2': (
        common_builtins,
        m2r10_additional_builtins,
        objm2_additional_builtins,
    ),

    # Builtins for Aglet Modula-2 Extensions
    'm2iso+aglet': (
        common_builtins,
        iso_additional_builtins,
        aglet_additional_builtins,
    ),

    # Builtins for GNU Modula-2 Extensions
    'm2pim+gm2': (
        common_builtins,
        pim_additional_builtins,
        gm2_additional_builtins,
    ),

    # Builtins for p1 Modula-2 Extensions
    'm2iso+p1': (
        common_builtins,
        iso_additional_builtins,
        p1_additional_builtins,
    ),

    # Builtins for XDS Modula-2 Extensions
    'm2iso+xds': (
        common_builtins,
        iso_additional_builtins,
        xds_additional_builtins,
    ),
}
911
912 # Pseudo-Module Builtins Database
# Maps each dialect name to a tuple of pseudo-module builtin datasets.
# Every dataset is itself a tuple of strings, so each value is a tuple of
# tuples.
pseudo_builtins_db = {
    # Pseudo-module builtins for unknown dialect:
    # the combined PIM, ISO and Modula-2 R10 sets
    'unknown': (
        common_pseudo_builtins,
        pim_additional_pseudo_builtins,
        iso_additional_pseudo_builtins,
        m2r10_additional_pseudo_builtins,
    ),

    # Pseudo-module builtins for PIM Modula-2
    'm2pim': (
        common_pseudo_builtins,
        pim_additional_pseudo_builtins,
    ),

    # Pseudo-module builtins for ISO Modula-2
    'm2iso': (
        common_pseudo_builtins,
        iso_additional_pseudo_builtins,
    ),

    # Pseudo-module builtins for Modula-2 R10
    'm2r10': (
        common_pseudo_builtins,
        m2r10_additional_pseudo_builtins,
    ),

    # Pseudo-module builtins for Objective Modula-2
    # (the Modula-2 R10 set plus the Objective Modula-2 additions)
    'objm2': (
        common_pseudo_builtins,
        m2r10_additional_pseudo_builtins,
        objm2_additional_pseudo_builtins,
    ),

    # Pseudo-module builtins for Aglet Modula-2 Extensions
    'm2iso+aglet': (
        common_pseudo_builtins,
        iso_additional_pseudo_builtins,
        aglet_additional_pseudo_builtins,
    ),

    # Pseudo-module builtins for GNU Modula-2 Extensions
    'm2pim+gm2': (
        common_pseudo_builtins,
        pim_additional_pseudo_builtins,
        gm2_additional_pseudo_builtins,
    ),

    # Pseudo-module builtins for p1 Modula-2 Extensions
    'm2iso+p1': (
        common_pseudo_builtins,
        iso_additional_pseudo_builtins,
        p1_additional_pseudo_builtins,
    ),

    # Pseudo-module builtins for XDS Modula-2 Extensions
    'm2iso+xds': (
        common_pseudo_builtins,
        iso_additional_pseudo_builtins,
        xds_additional_pseudo_builtins,
    ),
}
975
976 # Standard Library ADTs Database
977 stdlib_adts_db = {
978 # Empty entry for unknown dialect
979 'unknown': (
980 # LEAVE THIS EMPTY
981 ),
982 # Standard Library ADTs for PIM Modula-2
983 'm2pim': (
984 # No first class library types
985 ),
986
987 # Standard Library ADTs for ISO Modula-2
988 'm2iso': (
989 # No first class library types
990 ),
991
992 # Standard Library ADTs for Modula-2 R10
993 'm2r10': (
994 m2r10_stdlib_adt_identifiers,
995 ),
996
997 # Standard Library ADTs for Objective Modula-2
998 'objm2': (
999 m2r10_stdlib_adt_identifiers,
1000 ),
1001
1002 # Standard Library ADTs for Aglet Modula-2
1003 'm2iso+aglet': (
1004 # No first class library types
1005 ),
1006
1007 # Standard Library ADTs for GNU Modula-2
1008 'm2pim+gm2': (
1009 # No first class library types
1010 ),
1011
1012 # Standard Library ADTs for p1 Modula-2
1013 'm2iso+p1': (
1014 # No first class library types
1015 ),
1016
1017 # Standard Library ADTs for XDS Modula-2
1018 'm2iso+xds': (
1019 # No first class library types
1020 ),
1021 }
1022
1023 # Standard Library Modules Database
# Maps each dialect name to a tuple of standard library module identifier
# datasets. Every dataset is itself a tuple of strings, so each value is a
# tuple of tuples (an empty tuple where no dataset applies).
stdlib_modules_db = {
    # Empty entry for unknown dialect
    'unknown': (
        # LEAVE THIS EMPTY
    ),
    # Standard Library Modules for PIM Modula-2
    'm2pim': (
        pim_stdlib_module_identifiers,
    ),

    # Standard Library Modules for ISO Modula-2
    'm2iso': (
        iso_stdlib_module_identifiers,
    ),

    # Standard Library Modules for Modula-2 R10
    # (blueprints and ADT identifiers are listed here alongside modules)
    # NOTE(review): the 'objm2' entry below omits the ADT identifiers
    # although it otherwise mirrors this one -- confirm whether the
    # asymmetry is intended.
    'm2r10': (
        m2r10_stdlib_blueprint_identifiers,
        m2r10_stdlib_module_identifiers,
        m2r10_stdlib_adt_identifiers,
    ),

    # Standard Library Modules for Objective Modula-2
    'objm2': (
        m2r10_stdlib_blueprint_identifiers,
        m2r10_stdlib_module_identifiers,
    ),

    # Standard Library Modules for Aglet Modula-2
    'm2iso+aglet': (
        iso_stdlib_module_identifiers,
    ),

    # Standard Library Modules for GNU Modula-2
    'm2pim+gm2': (
        pim_stdlib_module_identifiers,
    ),

    # Standard Library Modules for p1 Modula-2
    'm2iso+p1': (
        iso_stdlib_module_identifiers,
    ),

    # Standard Library Modules for XDS Modula-2
    'm2iso+xds': (
        iso_stdlib_module_identifiers,
    ),
}
1072
1073 # Standard Library Types Database
1074 stdlib_types_db = {
1075 # Empty entry for unknown dialect
1076 'unknown': (
1077 # LEAVE THIS EMPTY
1078 ),
1079 # Standard Library Types for PIM Modula-2
1080 'm2pim': (
1081 pim_stdlib_type_identifiers,
1082 ),
1083
1084 # Standard Library Types for ISO Modula-2
1085 'm2iso': (
1086 iso_stdlib_type_identifiers,
1087 ),
1088
1089 # Standard Library Types for Modula-2 R10
1090 'm2r10': (
1091 m2r10_stdlib_type_identifiers,
1092 ),
1093
1094 # Standard Library Types for Objective Modula-2
1095 'objm2': (
1096 m2r10_stdlib_type_identifiers,
1097 ),
1098
1099 # Standard Library Types for Aglet Modula-2
1100 'm2iso+aglet': (
1101 iso_stdlib_type_identifiers,
1102 ),
1103
1104 # Standard Library Types for GNU Modula-2
1105 'm2pim+gm2': (
1106 pim_stdlib_type_identifiers,
1107 ),
1108
1109 # Standard Library Types for p1 Modula-2
1110 'm2iso+p1': (
1111 iso_stdlib_type_identifiers,
1112 ),
1113
1114 # Standard Library Types for XDS Modula-2
1115 'm2iso+xds': (
1116 iso_stdlib_type_identifiers,
1117 ),
1118 }
1119
1120 # Standard Library Procedures Database
1121 stdlib_procedures_db = {
1122 # Empty entry for unknown dialect
1123 'unknown': (
1124 # LEAVE THIS EMPTY
1125 ),
1126 # Standard Library Procedures for PIM Modula-2
1127 'm2pim': (
1128 pim_stdlib_proc_identifiers,
1129 ),
1130
1131 # Standard Library Procedures for ISO Modula-2
1132 'm2iso': (
1133 iso_stdlib_proc_identifiers,
1134 ),
1135
1136 # Standard Library Procedures for Modula-2 R10
1137 'm2r10': (
1138 m2r10_stdlib_proc_identifiers,
1139 ),
1140
1141 # Standard Library Procedures for Objective Modula-2
1142 'objm2': (
1143 m2r10_stdlib_proc_identifiers,
1144 ),
1145
1146 # Standard Library Procedures for Aglet Modula-2
1147 'm2iso+aglet': (
1148 iso_stdlib_proc_identifiers,
1149 ),
1150
1151 # Standard Library Procedures for GNU Modula-2
1152 'm2pim+gm2': (
1153 pim_stdlib_proc_identifiers,
1154 ),
1155
1156 # Standard Library Procedures for p1 Modula-2
1157 'm2iso+p1': (
1158 iso_stdlib_proc_identifiers,
1159 ),
1160
1161 # Standard Library Procedures for XDS Modula-2
1162 'm2iso+xds': (
1163 iso_stdlib_proc_identifiers,
1164 ),
1165 }
1166
1167 # Standard Library Variables Database
1168 stdlib_variables_db = {
1169 # Empty entry for unknown dialect
1170 'unknown': (
1171 # LEAVE THIS EMPTY
1172 ),
1173 # Standard Library Variables for PIM Modula-2
1174 'm2pim': (
1175 pim_stdlib_var_identifiers,
1176 ),
1177
1178 # Standard Library Variables for ISO Modula-2
1179 'm2iso': (
1180 iso_stdlib_var_identifiers,
1181 ),
1182
1183 # Standard Library Variables for Modula-2 R10
1184 'm2r10': (
1185 m2r10_stdlib_var_identifiers,
1186 ),
1187
1188 # Standard Library Variables for Objective Modula-2
1189 'objm2': (
1190 m2r10_stdlib_var_identifiers,
1191 ),
1192
1193 # Standard Library Variables for Aglet Modula-2
1194 'm2iso+aglet': (
1195 iso_stdlib_var_identifiers,
1196 ),
1197
1198 # Standard Library Variables for GNU Modula-2
1199 'm2pim+gm2': (
1200 pim_stdlib_var_identifiers,
1201 ),
1202
1203 # Standard Library Variables for p1 Modula-2
1204 'm2iso+p1': (
1205 iso_stdlib_var_identifiers,
1206 ),
1207
1208 # Standard Library Variables for XDS Modula-2
1209 'm2iso+xds': (
1210 iso_stdlib_var_identifiers,
1211 ),
1212 }
1213
1214 # Standard Library Constants Database
1215 stdlib_constants_db = {
1216 # Empty entry for unknown dialect
1217 'unknown': (
1218 # LEAVE THIS EMPTY
1219 ),
1220 # Standard Library Constants for PIM Modula-2
1221 'm2pim': (
1222 pim_stdlib_const_identifiers,
1223 ),
1224
1225 # Standard Library Constants for ISO Modula-2
1226 'm2iso': (
1227 iso_stdlib_const_identifiers,
1228 ),
1229
1230 # Standard Library Constants for Modula-2 R10
1231 'm2r10': (
1232 m2r10_stdlib_const_identifiers,
1233 ),
1234
1235 # Standard Library Constants for Objective Modula-2
1236 'objm2': (
1237 m2r10_stdlib_const_identifiers,
1238 ),
1239
1240 # Standard Library Constants for Aglet Modula-2
1241 'm2iso+aglet': (
1242 iso_stdlib_const_identifiers,
1243 ),
1244
1245 # Standard Library Constants for GNU Modula-2
1246 'm2pim+gm2': (
1247 pim_stdlib_const_identifiers,
1248 ),
1249
1250 # Standard Library Constants for p1 Modula-2
1251 'm2iso+p1': (
1252 iso_stdlib_const_identifiers,
1253 ),
1254
1255 # Standard Library Constants for XDS Modula-2
1256 'm2iso+xds': (
1257 iso_stdlib_const_identifiers,
1258 ),
1259 }
1260
1261# M e t h o d s
1262
1263 # initialise a lexer instance
1264 def __init__(self, **options):
1265 #
1266 # check dialect options
1267 #
1268 dialects = get_list_opt(options, 'dialect', [])
1269 #
1270 for dialect_option in dialects:
1271 if dialect_option in self.dialects[1:-1]:
1272 # valid dialect option found
1273 self.set_dialect(dialect_option)
1274 break
1275 #
1276 # Fallback Mode (DEFAULT)
1277 else:
1278 # no valid dialect option
1279 self.set_dialect('unknown')
1280 #
1281 self.dialect_set_by_tag = False
1282 #
1283 # check style options
1284 #
1285 styles = get_list_opt(options, 'style', [])
1286 #
1287 # use lowercase mode for Algol style
1288 if 'algol' in styles or 'algol_nu' in styles:
1289 self.algol_publication_mode = True
1290 else:
1291 self.algol_publication_mode = False
1292 #
1293 # Check option flags
1294 #
1295 self.treat_stdlib_adts_as_builtins = get_bool_opt(
1296 options, 'treat_stdlib_adts_as_builtins', True)
1297 #
1298 # call superclass initialiser
1299 RegexLexer.__init__(self, **options)
1300
1301 # Set lexer to a specified dialect
1302 def set_dialect(self, dialect_id):
1303 #
1304 # if __debug__:
1305 # print 'entered set_dialect with arg: ', dialect_id
1306 #
1307 # check dialect name against known dialects
1308 if dialect_id not in self.dialects:
1309 dialect = 'unknown' # default
1310 else:
1311 dialect = dialect_id
1312 #
1313 # compose lexemes to reject set
1314 lexemes_to_reject_set = set()
1315 # add each list of reject lexemes for this dialect
1316 for list in self.lexemes_to_reject_db[dialect]:
1317 lexemes_to_reject_set.update(set(list))
1318 #
1319 # compose reserved words set
1320 reswords_set = set()
1321 # add each list of reserved words for this dialect
1322 for list in self.reserved_words_db[dialect]:
1323 reswords_set.update(set(list))
1324 #
1325 # compose builtins set
1326 builtins_set = set()
1327 # add each list of builtins for this dialect excluding reserved words
1328 for list in self.builtins_db[dialect]:
1329 builtins_set.update(set(list).difference(reswords_set))
1330 #
1331 # compose pseudo-builtins set
1332 pseudo_builtins_set = set()
1333 # add each list of builtins for this dialect excluding reserved words
1334 for list in self.pseudo_builtins_db[dialect]:
1335 pseudo_builtins_set.update(set(list).difference(reswords_set))
1336 #
1337 # compose ADTs set
1338 adts_set = set()
1339 # add each list of ADTs for this dialect excluding reserved words
1340 for list in self.stdlib_adts_db[dialect]:
1341 adts_set.update(set(list).difference(reswords_set))
1342 #
1343 # compose modules set
1344 modules_set = set()
1345 # add each list of builtins for this dialect excluding builtins
1346 for list in self.stdlib_modules_db[dialect]:
1347 modules_set.update(set(list).difference(builtins_set))
1348 #
1349 # compose types set
1350 types_set = set()
1351 # add each list of types for this dialect excluding builtins
1352 for list in self.stdlib_types_db[dialect]:
1353 types_set.update(set(list).difference(builtins_set))
1354 #
1355 # compose procedures set
1356 procedures_set = set()
1357 # add each list of procedures for this dialect excluding builtins
1358 for list in self.stdlib_procedures_db[dialect]:
1359 procedures_set.update(set(list).difference(builtins_set))
1360 #
1361 # compose variables set
1362 variables_set = set()
1363 # add each list of variables for this dialect excluding builtins
1364 for list in self.stdlib_variables_db[dialect]:
1365 variables_set.update(set(list).difference(builtins_set))
1366 #
1367 # compose constants set
1368 constants_set = set()
1369 # add each list of constants for this dialect excluding builtins
1370 for list in self.stdlib_constants_db[dialect]:
1371 constants_set.update(set(list).difference(builtins_set))
1372 #
1373 # update lexer state
1374 self.dialect = dialect
1375 self.lexemes_to_reject = lexemes_to_reject_set
1376 self.reserved_words = reswords_set
1377 self.builtins = builtins_set
1378 self.pseudo_builtins = pseudo_builtins_set
1379 self.adts = adts_set
1380 self.modules = modules_set
1381 self.types = types_set
1382 self.procedures = procedures_set
1383 self.variables = variables_set
1384 self.constants = constants_set
1385 #
1386 # if __debug__:
1387 # print 'exiting set_dialect'
1388 # print ' self.dialect: ', self.dialect
1389 # print ' self.lexemes_to_reject: ', self.lexemes_to_reject
1390 # print ' self.reserved_words: ', self.reserved_words
1391 # print ' self.builtins: ', self.builtins
1392 # print ' self.pseudo_builtins: ', self.pseudo_builtins
1393 # print ' self.adts: ', self.adts
1394 # print ' self.modules: ', self.modules
1395 # print ' self.types: ', self.types
1396 # print ' self.procedures: ', self.procedures
1397 # print ' self.variables: ', self.variables
1398 # print ' self.types: ', self.types
1399 # print ' self.constants: ', self.constants
1400
1401 # Extracts a dialect name from a dialect tag comment string and checks
1402 # the extracted name against known dialects. If a match is found, the
1403 # matching name is returned, otherwise dialect id 'unknown' is returned
1404 def get_dialect_from_dialect_tag(self, dialect_tag):
1405 #
1406 # if __debug__:
1407 # print 'entered get_dialect_from_dialect_tag with arg: ', dialect_tag
1408 #
1409 # constants
1410 left_tag_delim = '(*!'
1411 right_tag_delim = '*)'
1412 left_tag_delim_len = len(left_tag_delim)
1413 right_tag_delim_len = len(right_tag_delim)
1414 indicator_start = left_tag_delim_len
1415 indicator_end = -(right_tag_delim_len)
1416 #
1417 # check comment string for dialect indicator
1418 if len(dialect_tag) > (left_tag_delim_len + right_tag_delim_len) \
1419 and dialect_tag.startswith(left_tag_delim) \
1420 and dialect_tag.endswith(right_tag_delim):
1421 #
1422 # if __debug__:
1423 # print 'dialect tag found'
1424 #
1425 # extract dialect indicator
1426 indicator = dialect_tag[indicator_start:indicator_end]
1427 #
1428 # if __debug__:
1429 # print 'extracted: ', indicator
1430 #
1431 # check against known dialects
1432 for index in range(1, len(self.dialects)):
1433 #
1434 # if __debug__:
1435 # print 'dialects[', index, ']: ', self.dialects[index]
1436 #
1437 if indicator == self.dialects[index]:
1438 #
1439 # if __debug__:
1440 # print 'matching dialect found'
1441 #
1442 # indicator matches known dialect
1443 return indicator
1444 else:
1445 # indicator does not match any dialect
1446 return 'unknown' # default
1447 else:
1448 # invalid indicator string
1449 return 'unknown' # default
1450
1451 # intercept the token stream, modify token attributes and return them
1452 def get_tokens_unprocessed(self, text):
1453 for index, token, value in RegexLexer.get_tokens_unprocessed(self, text):
1454 #
1455 # check for dialect tag if dialect has not been set by tag
1456 if not self.dialect_set_by_tag and token == Comment.Special:
1457 indicated_dialect = self.get_dialect_from_dialect_tag(value)
1458 if indicated_dialect != 'unknown':
1459 # token is a dialect indicator
1460 # reset reserved words and builtins
1461 self.set_dialect(indicated_dialect)
1462 self.dialect_set_by_tag = True
1463 #
1464 # check for reserved words, predefined and stdlib identifiers
1465 if token is Name:
1466 if value in self.reserved_words:
1467 token = Keyword.Reserved
1468 if self.algol_publication_mode:
1469 value = value.lower()
1470 #
1471 elif value in self.builtins:
1472 token = Name.Builtin
1473 if self.algol_publication_mode:
1474 value = value.lower()
1475 #
1476 elif value in self.pseudo_builtins:
1477 token = Name.Builtin.Pseudo
1478 if self.algol_publication_mode:
1479 value = value.lower()
1480 #
1481 elif value in self.adts:
1482 if not self.treat_stdlib_adts_as_builtins:
1483 token = Name.Namespace
1484 else:
1485 token = Name.Builtin.Pseudo
1486 if self.algol_publication_mode:
1487 value = value.lower()
1488 #
1489 elif value in self.modules:
1490 token = Name.Namespace
1491 #
1492 elif value in self.types:
1493 token = Name.Class
1494 #
1495 elif value in self.procedures:
1496 token = Name.Function
1497 #
1498 elif value in self.variables:
1499 token = Name.Variable
1500 #
1501 elif value in self.constants:
1502 token = Name.Constant
1503 #
1504 elif token in Number:
1505 #
1506 # mark prefix number literals as error for PIM and ISO dialects
1507 if self.dialect not in ('unknown', 'm2r10', 'objm2'):
1508 if "'" in value or value[0:2] in ('0b', '0x', '0u'):
1509 token = Error
1510 #
1511 elif self.dialect in ('m2r10', 'objm2'):
1512 # mark base-8 number literals as errors for M2 R10 and ObjM2
1513 if token is Number.Oct:
1514 token = Error
1515 # mark suffix base-16 literals as errors for M2 R10 and ObjM2
1516 elif token is Number.Hex and 'H' in value:
1517 token = Error
1518 # mark real numbers with E as errors for M2 R10 and ObjM2
1519 elif token is Number.Float and 'E' in value:
1520 token = Error
1521 #
1522 elif token in Comment:
1523 #
1524 # mark single line comment as error for PIM and ISO dialects
1525 if token is Comment.Single:
1526 if self.dialect not in ('unknown', 'm2r10', 'objm2'):
1527 token = Error
1528 #
1529 if token is Comment.Preproc:
1530 # mark ISO pragma as error for PIM dialects
1531 if value.startswith('<*') and \
1532 self.dialect.startswith('m2pim'):
1533 token = Error
1534 # mark PIM pragma as comment for other dialects
1535 elif value.startswith('(*$') and \
1536 self.dialect != 'unknown' and \
1537 not self.dialect.startswith('m2pim'):
1538 token = Comment.Multiline
1539 #
1540 else: # token is neither Name nor Comment
1541 #
1542 # mark lexemes matching the dialect's error token set as errors
1543 if value in self.lexemes_to_reject:
1544 token = Error
1545 #
1546 # substitute lexemes when in Algol mode
1547 if self.algol_publication_mode:
1548 if value == '#':
1549 value = '≠'
1550 elif value == '<=':
1551 value = '≤'
1552 elif value == '>=':
1553 value = '≥'
1554 elif value == '==':
1555 value = '≡'
1556 elif value == '*.':
1557 value = '•'
1558
1559 # return result
1560 yield index, token, value
1561
1562 def analyse_text(text):
1563 """It's Pascal-like, but does not use FUNCTION -- uses PROCEDURE
1564 instead."""
1565
1566 # Check if this looks like Pascal, if not, bail out early
1567 if not ('(*' in text and '*)' in text and ':=' in text):
1568 return
1569
1570 result = 0
1571 # Procedure is in Modula2
1572 if re.search(r'\bPROCEDURE\b', text):
1573 result += 0.6
1574
1575 # FUNCTION is only valid in Pascal, but not in Modula2
1576 if re.search(r'\bFUNCTION\b', text):
1577 result = 0.0
1578
1579 return result