1"""
2 pygments.lexers.textfmts
3 ~~~~~~~~~~~~~~~~~~~~~~~~
4
5 Lexers for various text formats.
6
7 :copyright: Copyright 2006-2025 by the Pygments team, see AUTHORS.
8 :license: BSD, see LICENSE for details.
9"""
10
11import re
12
13from pygments.lexers import guess_lexer, get_lexer_by_name
14from pygments.lexer import RegexLexer, bygroups, default, include
15from pygments.token import Text, Comment, Operator, Keyword, Name, String, \
16 Number, Generic, Literal, Punctuation
17from pygments.util import ClassNotFound
18
19__all__ = ['IrcLogsLexer', 'TodotxtLexer', 'HttpLexer', 'GettextLexer',
20 'NotmuchLexer', 'KernelLogLexer']


class IrcLogsLexer(RegexLexer):
    """
    Lexer for IRC logs in *irssi*, *xchat* or *weechat* style.
    """

    name = 'IRC logs'
    aliases = ['irc']
    filenames = ['*.weechatlog']
    mimetypes = ['text/x-irclog']
    url = 'https://en.wikipedia.org/wiki/Internet_Relay_Chat'
    version_added = ''

    flags = re.VERBOSE | re.MULTILINE
    timestamp = r"""
        (
          # irssi / xchat and others
          (?: \[|\()?                  # Opening bracket or paren for the timestamp
            (?:                        # Timestamp
                (?: (?:\d{1,4} [-/])*  # Date as - or /-separated groups of digits
                    (?:\d{1,4})
                 [T ])?                # Date/time separator: T or space
                (?: \d?\d [:.])*       # Time as :/.-separated groups of 1 or 2 digits
                    (?: \d?\d)
            )
          (?: \]|\))?\s+               # Closing bracket or paren for the timestamp
        |
          # weechat
          \d{4}\s\w{3}\s\d{2}\s        # Date
          \d{2}:\d{2}:\d{2}\s+         # Time + Whitespace
        |
          # xchat
          \w{3}\s\d{2}\s               # Date
          \d{2}:\d{2}:\d{2}\s+         # Time + Whitespace
        )?
    """
    tokens = {
        'root': [
            # log start/end
            (r'^\*\*\*\*(.*)\*\*\*\*$', Comment),
            # hack
            ("^" + timestamp + r'(\s*<[^>]*>\s*)$', bygroups(Comment.Preproc, Name.Tag)),
            # normal msgs
            ("^" + timestamp + r"""
                (\s*<.*?>\s*)          # Nick """,
             bygroups(Comment.Preproc, Name.Tag), 'msg'),
            # /me msgs
            ("^" + timestamp + r"""
                (\s*[*]\s+)            # Star
                (\S+\s+.*?\n)          # Nick + rest of message """,
             bygroups(Comment.Preproc, Keyword, Generic.Inserted)),
            # join/part msgs
            ("^" + timestamp + r"""
                (\s*(?:\*{3}|<?-[!@=P]?->?)\s*)  # Star(s) or symbols
                (\S+\s+)                         # Nick + Space
                (.*?\n)                          # Rest of message """,
             bygroups(Comment.Preproc, Keyword, String, Comment)),
            (r"^.*?\n", Text),
        ],
        'msg': [
            (r"\S+:(?!//)", Name.Attribute),  # Prefix
            (r".*\n", Text, '#pop'),
        ],
    }
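
# Example use of IrcLogsLexer (an illustrative sketch, not part of the module;
# the log line is invented and NullFormatter is just one possible formatter):
#
#     from pygments import highlight
#     from pygments.formatters import NullFormatter
#
#     log = "12:34 <alice> hello, world\n"
#     print(highlight(log, IrcLogsLexer(), NullFormatter()))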


class GettextLexer(RegexLexer):
    """
    Lexer for Gettext catalog files.
    """
    name = 'Gettext Catalog'
    aliases = ['pot', 'po']
    filenames = ['*.pot', '*.po']
    mimetypes = ['application/x-gettext', 'text/x-gettext', 'text/gettext']
    url = 'https://www.gnu.org/software/gettext'
    version_added = '0.9'

    tokens = {
        'root': [
            (r'^#,\s.*?$', Keyword.Type),
            (r'^#:\s.*?$', Keyword.Declaration),
            # (r'^#$', Comment),
            (r'^(#|#\.\s|#\|\s|#~\s|#\s).*$', Comment.Single),
            (r'^(")([A-Za-z-]+:)(.*")$',
             bygroups(String, Name.Property, String)),
            (r'^".*"$', String),
            (r'^(msgid|msgid_plural|msgstr|msgctxt)(\s+)(".*")$',
             bygroups(Name.Variable, Text, String)),
            (r'^(msgstr\[)(\d)(\])(\s+)(".*")$',
             bygroups(Name.Variable, Number.Integer, Name.Variable, Text, String)),
        ]
    }
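
# Example use of GettextLexer (an illustrative sketch; the catalog snippet is
# invented):
#
#     from pygments import highlight
#     from pygments.formatters import NullFormatter
#
#     catalog = 'msgid "Hello"\nmsgstr "Bonjour"\n'
#     print(highlight(catalog, GettextLexer(), NullFormatter()))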


class HttpLexer(RegexLexer):
    """
    Lexer for HTTP sessions.
    """

    name = 'HTTP'
    aliases = ['http']
    url = 'https://httpwg.org/specs'
    version_added = '1.5'

    flags = re.DOTALL

    def get_tokens_unprocessed(self, text, stack=('root',)):
        """Reset the content-type state."""
        self.content_type = None
        return RegexLexer.get_tokens_unprocessed(self, text, stack)

    def header_callback(self, match):
        if match.group(1).lower() == 'content-type':
            content_type = match.group(5).strip()
            if ';' in content_type:
                content_type = content_type[:content_type.find(';')].strip()
            self.content_type = content_type
        yield match.start(1), Name.Attribute, match.group(1)
        yield match.start(2), Text, match.group(2)
        yield match.start(3), Operator, match.group(3)
        yield match.start(4), Text, match.group(4)
        yield match.start(5), Literal, match.group(5)
        yield match.start(6), Text, match.group(6)

    def continuous_header_callback(self, match):
        yield match.start(1), Text, match.group(1)
        yield match.start(2), Literal, match.group(2)
        yield match.start(3), Text, match.group(3)

    def content_callback(self, match):
        content_type = getattr(self, 'content_type', None)
        content = match.group()
        offset = match.start()
        if content_type:
            from pygments.lexers import get_lexer_for_mimetype
            possible_lexer_mimetypes = [content_type]
            if '+' in content_type:
                # application/calendar+xml can be treated as application/xml
                # if there's not a better match.
                general_type = re.sub(r'^(.*)/.*\+(.*)$', r'\1/\2',
                                      content_type)
                possible_lexer_mimetypes.append(general_type)

            for i in possible_lexer_mimetypes:
                try:
                    lexer = get_lexer_for_mimetype(i)
                except ClassNotFound:
                    pass
                else:
                    for idx, token, value in lexer.get_tokens_unprocessed(content):
                        yield offset + idx, token, value
                    return
        yield offset, Text, content

    tokens = {
        'root': [
            (r'([a-zA-Z][-_a-zA-Z]+)( +)([^ ]+)( +)'
             r'(HTTP)(/)(1\.[01]|2(?:\.0)?|3)(\r?\n|\Z)',
             bygroups(Name.Function, Text, Name.Namespace, Text,
                      Keyword.Reserved, Operator, Number, Text),
             'headers'),
            (r'(HTTP)(/)(1\.[01]|2(?:\.0)?|3)( +)(\d{3})(?:( +)([^\r\n]*))?(\r?\n|\Z)',
             bygroups(Keyword.Reserved, Operator, Number, Text, Number, Text,
                      Name.Exception, Text),
             'headers'),
        ],
        'headers': [
            (r'([^\s:]+)( *)(:)( *)([^\r\n]*)(\r?\n|\Z)', header_callback),
            (r'([\t ]+)([^\r\n]+)(\r?\n|\Z)', continuous_header_callback),
            (r'\r?\n', Text, 'content')
        ],
        'content': [
            (r'.+', content_callback)
        ]
    }

    def analyse_text(text):
        return any(
            re.search(pattern, text) is not None
            for pattern in (
                r'^([a-zA-Z][-_a-zA-Z]+)( +)([^ ]+)( +)(HTTP)(/)(1\.[01]|2(?:\.0)?|3)(\r?\n|\Z)',
                r'^(HTTP)(/)(1\.[01]|2(?:\.0)?|3)( +)(\d{3})(?:( +)([^\r\n]*))?(\r?\n|\Z)',
            )
        )
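
# Example use of HttpLexer (an illustrative sketch; the request below is
# invented, and a message body, if present, would be re-lexed according to the
# Content-Type header picked up in header_callback):
#
#     from pygments import highlight
#     from pygments.formatters import NullFormatter
#
#     request = ("GET /index.html HTTP/1.1\r\n"
#                "Host: example.org\r\n"
#                "\r\n")
#     print(highlight(request, HttpLexer(), NullFormatter()))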


class TodotxtLexer(RegexLexer):
    """
    Lexer for the Todo.txt todo list format.
    """

    name = 'Todotxt'
    url = 'http://todotxt.com/'
    aliases = ['todotxt']
    version_added = '2.0'
    # *.todotxt is not a standard extension for Todo.txt files; including it
    # makes testing easier, and also makes autodetecting file type easier.
    filenames = ['todo.txt', '*.todotxt']
    mimetypes = ['text/x-todo']

    # Aliases mapping Todo.txt format concepts onto standard token types
    CompleteTaskText = Operator  # Chosen to de-emphasize complete tasks
    IncompleteTaskText = Text  # Incomplete tasks should look like plain text

    # Priority should have most emphasis to indicate importance of tasks
    Priority = Generic.Heading
    # Dates should have next most emphasis because time is important
    Date = Generic.Subheading

    # Project and context should have equal weight, and be in different colors
    Project = Generic.Error
    Context = String

    # If tag functionality is added, it should have the same weight as Project
    # and Context, and a different color. Generic.Traceback would work well.

    # Regex patterns for building up rules; dates, priorities, projects, and
    # contexts are all atomic
    # TODO: Make date regex more ISO 8601 compliant
    date_regex = r'\d{4,}-\d{2}-\d{2}'
    priority_regex = r'\([A-Z]\)'
    project_regex = r'\+\S+'
    context_regex = r'@\S+'

    # Compound regex expressions
    complete_one_date_regex = r'(x )(' + date_regex + r')'
    complete_two_date_regex = (complete_one_date_regex + r'( )(' +
                               date_regex + r')')
    priority_date_regex = r'(' + priority_regex + r')( )(' + date_regex + r')'
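    # For illustration, a completed entry such as "x 2024-01-02 2024-01-01 pay
    # rent" starts with a complete_two_date_regex match, while "(A) 2024-01-01
    # pay rent" starts with a priority_date_regex match (dates are made-up
    # samples).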

    tokens = {
        # Should parse starting at beginning of line; each line is a task
        'root': [
            # Complete task entry points: two total:
            # 1. Complete task with two dates
            (complete_two_date_regex, bygroups(CompleteTaskText, Date,
                                               CompleteTaskText, Date),
             'complete'),
            # 2. Complete task with one date
            (complete_one_date_regex, bygroups(CompleteTaskText, Date),
             'complete'),

            # Incomplete task entry points: six total:
            # 1. Priority plus date
            (priority_date_regex, bygroups(Priority, IncompleteTaskText, Date),
             'incomplete'),
            # 2. Priority only
            (priority_regex, Priority, 'incomplete'),
            # 3. Leading date
            (date_regex, Date, 'incomplete'),
            # 4. Leading context
            (context_regex, Context, 'incomplete'),
            # 5. Leading project
            (project_regex, Project, 'incomplete'),
            # 6. Non-whitespace catch-all
            (r'\S+', IncompleteTaskText, 'incomplete'),
        ],

        # Parse a complete task
        'complete': [
            # Newline indicates end of task, should return to root
            (r'\s*\n', CompleteTaskText, '#pop'),
            # Tokenize contexts and projects
            (context_regex, Context),
            (project_regex, Project),
            # Tokenize non-whitespace text
            (r'\S+', CompleteTaskText),
            # Tokenize whitespace not containing a newline
            (r'\s+', CompleteTaskText),
        ],

        # Parse an incomplete task
        'incomplete': [
            # Newline indicates end of task, should return to root
            (r'\s*\n', IncompleteTaskText, '#pop'),
            # Tokenize contexts and projects
            (context_regex, Context),
            (project_regex, Project),
            # Tokenize non-whitespace text
            (r'\S+', IncompleteTaskText),
            # Tokenize whitespace not containing a newline
            (r'\s+', IncompleteTaskText),
        ],
    }
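
# Example use of TodotxtLexer (an illustrative sketch; the task lines are
# invented):
#
#     from pygments import highlight
#     from pygments.formatters import NullFormatter
#
#     tasks = ("(A) 2024-01-01 Call Mom +family @phone\n"
#              "x 2024-01-02 2024-01-01 Pay rent @home\n")
#     print(highlight(tasks, TodotxtLexer(), NullFormatter()))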


class NotmuchLexer(RegexLexer):
    """
    For the Notmuch email text format.

    Additional options accepted:

    `body_lexer`
        If given, highlight the contents of the message body with the specified
        lexer, else guess it according to the body content (default: ``None``).
    """

    name = 'Notmuch'
    url = 'https://notmuchmail.org/'
    aliases = ['notmuch']
    version_added = '2.5'

    def _highlight_code(self, match):
        code = match.group(1)

        try:
            if self.body_lexer:
                lexer = get_lexer_by_name(self.body_lexer)
            else:
                lexer = guess_lexer(code.strip())
        except ClassNotFound:
            lexer = get_lexer_by_name('text')

        yield from lexer.get_tokens_unprocessed(code)

    tokens = {
        'root': [
            (r'\fmessage\{\s*', Keyword, ('message', 'message-attr')),
        ],
        'message-attr': [
            (r'(\s*id:\s*)(\S+)', bygroups(Name.Attribute, String)),
            (r'(\s*(?:depth|match|excluded):\s*)(\d+)',
             bygroups(Name.Attribute, Number.Integer)),
            (r'(\s*filename:\s*)(.+\n)',
             bygroups(Name.Attribute, String)),
            default('#pop'),
        ],
        'message': [
            (r'\fmessage\}\n', Keyword, '#pop'),
            (r'\fheader\{\n', Keyword, 'header'),
            (r'\fbody\{\n', Keyword, 'body'),
        ],
        'header': [
            (r'\fheader\}\n', Keyword, '#pop'),
            (r'((?:Subject|From|To|Cc|Date):\s*)(.*\n)',
             bygroups(Name.Attribute, String)),
            (r'(.*)(\s*\(.*\))(\s*\(.*\)\n)',
             bygroups(Generic.Strong, Literal, Name.Tag)),
        ],
        'body': [
            (r'\fpart\{\n', Keyword, 'part'),
            (r'\f(part|attachment)\{\s*', Keyword, ('part', 'part-attr')),
            (r'\fbody\}\n', Keyword, '#pop'),
        ],
        'part-attr': [
            (r'(ID:\s*)(\d+)', bygroups(Name.Attribute, Number.Integer)),
            (r'(,\s*)((?:Filename|Content-id):\s*)([^,]+)',
             bygroups(Punctuation, Name.Attribute, String)),
            (r'(,\s*)(Content-type:\s*)(.+\n)',
             bygroups(Punctuation, Name.Attribute, String)),
            default('#pop'),
        ],
        'part': [
            (r'\f(?:part|attachment)\}\n', Keyword, '#pop'),
            (r'\f(?:part|attachment)\{\s*', Keyword, ('#push', 'part-attr')),
            (r'^Non-text part: .*\n', Comment),
            (r'(?s)(.*?(?=\f(?:part|attachment)\}\n))', _highlight_code),
        ],
    }

    def analyse_text(text):
        return 1.0 if text.startswith('\fmessage{') else 0.0

    def __init__(self, **options):
        self.body_lexer = options.get('body_lexer', None)
        RegexLexer.__init__(self, **options)
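
# Example use of NotmuchLexer (an illustrative sketch; it assumes "mail.txt"
# holds output in the Notmuch text format, e.g. from "notmuch show", and it
# forces plain-text highlighting of message bodies via the body_lexer option):
#
#     from pygments import highlight
#     from pygments.formatters import NullFormatter
#
#     with open('mail.txt') as f:
#         lexer = NotmuchLexer(body_lexer='text')
#         print(highlight(f.read(), lexer, NullFormatter()))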


class KernelLogLexer(RegexLexer):
    """
    For Linux Kernel log ("dmesg") output.
    """
    name = 'Kernel log'
    aliases = ['kmsg', 'dmesg']
    filenames = ['*.kmsg', '*.dmesg']
    url = 'https://fr.wikipedia.org/wiki/Dmesg'
    version_added = '2.6'

    tokens = {
        'root': [
            (r'^[^:]+:debug : (?=\[)', Text, 'debug'),
            (r'^[^:]+:info  : (?=\[)', Text, 'info'),
            (r'^[^:]+:warn  : (?=\[)', Text, 'warn'),
            (r'^[^:]+:notice: (?=\[)', Text, 'warn'),
            (r'^[^:]+:err   : (?=\[)', Text, 'error'),
            (r'^[^:]+:crit  : (?=\[)', Text, 'error'),
            (r'^(?=\[)', Text, 'unknown'),
        ],
        'unknown': [
            (r'^(?=.+(warning|notice|audit|deprecated))', Text, 'warn'),
            (r'^(?=.+(error|critical|fail|Bug))', Text, 'error'),
            default('info'),
        ],
        'base': [
            (r'\[[0-9. ]+\] ', Number),
            (r'(?<=\] ).+?:', Keyword),
            (r'\n', Text, '#pop'),
        ],
        'debug': [
            include('base'),
            (r'.+\n', Comment, '#pop')
        ],
        'info': [
            include('base'),
            (r'.+\n', Text, '#pop')
        ],
        'warn': [
            include('base'),
            (r'.+\n', Generic.Strong, '#pop')
        ],
        'error': [
            include('base'),
            (r'.+\n', Generic.Error, '#pop')
        ]
    }
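
# Example use of KernelLogLexer (an illustrative sketch; the log line is
# invented):
#
#     from pygments import highlight
#     from pygments.formatters import NullFormatter
#
#     line = "[    0.000000] Booting Linux on physical CPU 0x0\n"
#     print(highlight(line, KernelLogLexer(), NullFormatter()))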