1""" 
    2    pygments.lexers.textfmts 
    3    ~~~~~~~~~~~~~~~~~~~~~~~~ 
    4 
    5    Lexers for various text formats. 
    6 
    7    :copyright: Copyright 2006-2025 by the Pygments team, see AUTHORS. 
    8    :license: BSD, see LICENSE for details. 
    9""" 
    10 
    11import re 
    12 
    13from pygments.lexers import guess_lexer, get_lexer_by_name 
    14from pygments.lexer import RegexLexer, bygroups, default, include 
    15from pygments.token import Text, Comment, Operator, Keyword, Name, String, \ 
    16    Number, Generic, Literal, Punctuation 
    17from pygments.util import ClassNotFound 
    18 
    19__all__ = ['IrcLogsLexer', 'TodotxtLexer', 'HttpLexer', 'GettextLexer', 
    20           'NotmuchLexer', 'KernelLogLexer'] 
    21 
    22 
    23class IrcLogsLexer(RegexLexer): 
    24    """ 
    25    Lexer for IRC logs in *irssi*, *xchat* or *weechat* style. 
    26    """ 
    27 
    28    name = 'IRC logs' 
    29    aliases = ['irc'] 
    30    filenames = ['*.weechatlog'] 
    31    mimetypes = ['text/x-irclog'] 
    32    url = 'https://en.wikipedia.org/wiki/Internet_Relay_Chat' 
    33    version_added = '' 
    34 
    35    flags = re.VERBOSE | re.MULTILINE 
    36    timestamp = r""" 
    37        ( 
    38          # irssi / xchat and others 
    39          (?: \[|\()?                  # Opening bracket or paren for the timestamp 
    40            (?:                        # Timestamp 
    41                (?: (?:\d{1,4} [-/])*  # Date as - or /-separated groups of digits 
    42                    (?:\d{1,4}) 
    43                 [T ])?                # Date/time separator: T or space 
    44                (?: \d?\d [:.])*       # Time as :/.-separated groups of 1 or 2 digits 
    45                    (?: \d?\d) 
    46            ) 
    47          (?: \]|\))?\s+               # Closing bracket or paren for the timestamp 
    48        | 
    49          # weechat 
    50          \d{4}\s\w{3}\s\d{2}\s        # Date 
    51          \d{2}:\d{2}:\d{2}\s+         # Time + Whitespace 
    52        | 
    53          # xchat 
    54          \w{3}\s\d{2}\s               # Date 
    55          \d{2}:\d{2}:\d{2}\s+         # Time + Whitespace 
    56        )? 
    57    """ 
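    # Illustrative timestamps the alternatives above are meant to match
    # (hypothetical samples, not taken from real logs):
    #   [12:34]               irssi / xchat and others
    #   2024 Jan 01 12:34:56  weechat
    #   Jan 01 12:34:56       xchat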
    tokens = {
        'root': [
            # log start/end
            (r'^\*\*\*\*(.*)\*\*\*\*$', Comment),
            # hack: lines that carry only a timestamp and a nick, with no
            # message after them
            ("^" + timestamp + r'(\s*<[^>]*>\s*)$', bygroups(Comment.Preproc, Name.Tag)),
            # normal msgs
            ("^" + timestamp + r"""
                (\s*<.*?>\s*)          # Nick """,
             bygroups(Comment.Preproc, Name.Tag), 'msg'),
            # /me msgs
            ("^" + timestamp + r"""
                (\s*[*]\s+)            # Star
                (\S+\s+.*?\n)          # Nick + rest of message """,
             bygroups(Comment.Preproc, Keyword, Generic.Inserted)),
            # join/part msgs
            ("^" + timestamp + r"""
                (\s*(?:\*{3}|<?-[!@=P]?->?)\s*)  # Star(s) or symbols
                (\S+\s+)                     # Nick + Space
                (.*?\n)                         # Rest of message """,
             bygroups(Comment.Preproc, Keyword, String, Comment)),
            (r"^.*?\n", Text),
        ],
        'msg': [
            (r"\S+:(?!//)", Name.Attribute),  # Prefix
            (r".*\n", Text, '#pop'),
        ],
    }


class GettextLexer(RegexLexer):
    """
    Lexer for Gettext catalog files.
    """
    name = 'Gettext Catalog'
    aliases = ['pot', 'po']
    filenames = ['*.pot', '*.po']
    mimetypes = ['application/x-gettext', 'text/x-gettext', 'text/gettext']
    url = 'https://www.gnu.org/software/gettext'
    version_added = '0.9'

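    # A typical catalog entry the rules below are meant to match
    # (illustrative sample, not from a real catalog):
    #   #: src/main.c:42
    #   #, fuzzy
    #   msgid "Hello"
    #   msgstr "Bonjour"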
    tokens = {
        'root': [
            (r'^#,\s.*?$', Keyword.Type),
            (r'^#:\s.*?$', Keyword.Declaration),
            # (r'^#$', Comment),
            (r'^(#|#\.\s|#\|\s|#~\s|#\s).*$', Comment.Single),
            (r'^(")([A-Za-z-]+:)(.*")$',
             bygroups(String, Name.Property, String)),
            (r'^".*"$', String),
            (r'^(msgid|msgid_plural|msgstr|msgctxt)(\s+)(".*")$',
             bygroups(Name.Variable, Text, String)),
            (r'^(msgstr\[)(\d)(\])(\s+)(".*")$',
             bygroups(Name.Variable, Number.Integer, Name.Variable, Text, String)),
        ]
    }


class HttpLexer(RegexLexer):
    """
    Lexer for HTTP sessions.
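
    A minimal usage sketch (``session.http`` is a hypothetical file holding
    a captured exchange; the rest are standard Pygments entry points)::

        from pygments import highlight
        from pygments.formatters import TerminalFormatter
        from pygments.lexers.textfmts import HttpLexer

        with open('session.http') as f:
            print(highlight(f.read(), HttpLexer(), TerminalFormatter()))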
    """

    name = 'HTTP'
    aliases = ['http']
    url = 'https://httpwg.org/specs'
    version_added = '1.5'

    flags = re.DOTALL

    def get_tokens_unprocessed(self, text, stack=('root',)):
        """Reset the content-type state."""
        self.content_type = None
        return RegexLexer.get_tokens_unprocessed(self, text, stack)

    def header_callback(self, match):
        if match.group(1).lower() == 'content-type':
            content_type = match.group(5).strip()
            if ';' in content_type:
                content_type = content_type[:content_type.find(';')].strip()
            self.content_type = content_type
        yield match.start(1), Name.Attribute, match.group(1)
        yield match.start(2), Text, match.group(2)
        yield match.start(3), Operator, match.group(3)
        yield match.start(4), Text, match.group(4)
        yield match.start(5), Literal, match.group(5)
        yield match.start(6), Text, match.group(6)

    def continuous_header_callback(self, match):
        yield match.start(1), Text, match.group(1)
        yield match.start(2), Literal, match.group(2)
        yield match.start(3), Text, match.group(3)

    def content_callback(self, match):
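        # Re-lex the message body with a lexer looked up from the
        # Content-Type header captured by header_callback, falling back
        # to plain text when no suitable lexer exists.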
        content_type = getattr(self, 'content_type', None)
        content = match.group()
        offset = match.start()
        if content_type:
            from pygments.lexers import get_lexer_for_mimetype
            possible_lexer_mimetypes = [content_type]
            if '+' in content_type:
                # application/calendar+xml can be treated as application/xml
                # if there's not a better match.
                general_type = re.sub(r'^(.*)/.*\+(.*)$', r'\1/\2',
                                      content_type)
                possible_lexer_mimetypes.append(general_type)

            for i in possible_lexer_mimetypes:
                try:
                    lexer = get_lexer_for_mimetype(i)
                except ClassNotFound:
                    pass
                else:
                    for idx, token, value in lexer.get_tokens_unprocessed(content):
                        yield offset + idx, token, value
                    return
        yield offset, Text, content

    tokens = {
        'root': [
            (r'([a-zA-Z][-_a-zA-Z]+)( +)([^ ]+)( +)'
             r'(HTTP)(/)(1\.[01]|2(?:\.0)?|3)(\r?\n|\Z)',
             bygroups(Name.Function, Text, Name.Namespace, Text,
                      Keyword.Reserved, Operator, Number, Text),
             'headers'),
            (r'(HTTP)(/)(1\.[01]|2(?:\.0)?|3)( +)(\d{3})(?:( +)([^\r\n]*))?(\r?\n|\Z)',
             bygroups(Keyword.Reserved, Operator, Number, Text, Number, Text,
                      Name.Exception, Text),
             'headers'),
        ],
        'headers': [
            (r'([^\s:]+)( *)(:)( *)([^\r\n]*)(\r?\n|\Z)', header_callback),
            (r'([\t ]+)([^\r\n]+)(\r?\n|\Z)', continuous_header_callback),
            (r'\r?\n', Text, 'content')
        ],
        'content': [
            (r'.+', content_callback)
        ]
    }

    def analyse_text(text):
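        # Without re.MULTILINE, '^' only anchors at the start of the text,
        # so this looks for a leading request line or status line.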
        return any(
            re.search(pattern, text) is not None
            for pattern in (
                r'^([a-zA-Z][-_a-zA-Z]+)( +)([^ ]+)( +)(HTTP)(/)(1\.[01]|2(?:\.0)?|3)(\r?\n|\Z)',
                r'^(HTTP)(/)(1\.[01]|2(?:\.0)?|3)( +)(\d{3})(?:( +)([^\r\n]*))?(\r?\n|\Z)',
            )
        )


class TodotxtLexer(RegexLexer):
    """
    Lexer for Todo.txt todo list format.
    """

    name = 'Todotxt'
    url = 'http://todotxt.com/'
    aliases = ['todotxt']
    version_added = '2.0'
    # *.todotxt is not a standard extension for Todo.txt files; including it
    # makes testing easier, and also makes autodetecting the file type easier.
    filenames = ['todo.txt', '*.todotxt']
    mimetypes = ['text/x-todo']

    # Aliases mapping standard token types to Todo.txt format concepts
    CompleteTaskText = Operator  # Chosen to de-emphasize complete tasks
    IncompleteTaskText = Text    # Incomplete tasks should look like plain text

    # Priority should have most emphasis to indicate importance of tasks
    Priority = Generic.Heading
    # Dates should have next most emphasis because time is important
    Date = Generic.Subheading

    # Project and context should have equal weight, and be in different colors
    Project = Generic.Error
    Context = String

    # If tag functionality is added, it should have the same weight as Project
    # and Context, and a different color. Generic.Traceback would work well.

    # Regex patterns for building up rules; dates, priorities, projects, and
    # contexts are all atomic
    # TODO: Make date regex more ISO 8601 compliant
    date_regex = r'\d{4,}-\d{2}-\d{2}'
    priority_regex = r'\([A-Z]\)'
    project_regex = r'\+\S+'
    context_regex = r'@\S+'

    # Compound regex expressions
    complete_one_date_regex = r'(x )(' + date_regex + r')'
    complete_two_date_regex = (complete_one_date_regex + r'( )(' +
                               date_regex + r')')
    priority_date_regex = r'(' + priority_regex + r')( )(' + date_regex + r')'

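    # Illustrative task lines the rules below are meant to match
    # (hypothetical samples):
    #   x 2024-05-01 2024-04-30 Pay bills +finance @home
    #   (A) 2024-05-02 Call Mom @phone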
    tokens = {
        # Should parse starting at beginning of line; each line is a task
        'root': [
            # Complete task entry points: two total:
            # 1. Complete task with two dates
            (complete_two_date_regex, bygroups(CompleteTaskText, Date,
                                               CompleteTaskText, Date),
             'complete'),
            # 2. Complete task with one date
            (complete_one_date_regex, bygroups(CompleteTaskText, Date),
             'complete'),

            # Incomplete task entry points: six total:
            # 1. Priority plus date
            (priority_date_regex, bygroups(Priority, IncompleteTaskText, Date),
             'incomplete'),
            # 2. Priority only
            (priority_regex, Priority, 'incomplete'),
            # 3. Leading date
            (date_regex, Date, 'incomplete'),
            # 4. Leading context
            (context_regex, Context, 'incomplete'),
            # 5. Leading project
            (project_regex, Project, 'incomplete'),
            # 6. Non-whitespace catch-all
            (r'\S+', IncompleteTaskText, 'incomplete'),
        ],

        # Parse a complete task
        'complete': [
            # Newline indicates end of task, should return to root
            (r'\s*\n', CompleteTaskText, '#pop'),
            # Tokenize contexts and projects
            (context_regex, Context),
            (project_regex, Project),
            # Tokenize non-whitespace text
            (r'\S+', CompleteTaskText),
            # Tokenize whitespace not containing a newline
            (r'\s+', CompleteTaskText),
        ],

        # Parse an incomplete task
        'incomplete': [
            # Newline indicates end of task, should return to root
            (r'\s*\n', IncompleteTaskText, '#pop'),
            # Tokenize contexts and projects
            (context_regex, Context),
            (project_regex, Project),
            # Tokenize non-whitespace text
            (r'\S+', IncompleteTaskText),
            # Tokenize whitespace not containing a newline
            (r'\s+', IncompleteTaskText),
        ],
    }


class NotmuchLexer(RegexLexer):
    """
    For Notmuch email text format.

    Additional options accepted:

    `body_lexer`
        If given, highlight the contents of the message body with the specified
        lexer, else guess it according to the body content (default: ``None``).
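
    A minimal usage sketch (``mail.dump`` is a hypothetical file holding
    ``notmuch show`` output)::

        from pygments import highlight
        from pygments.formatters import TerminalFormatter
        from pygments.lexers.textfmts import NotmuchLexer

        lexer = NotmuchLexer(body_lexer='text')  # skip body guessing
        with open('mail.dump') as f:
            print(highlight(f.read(), lexer, TerminalFormatter()))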
    """

    name = 'Notmuch'
    url = 'https://notmuchmail.org/'
    aliases = ['notmuch']
    version_added = '2.5'

    def _highlight_code(self, match):
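        # Lex the message body with the user-supplied body_lexer if given,
        # otherwise guess a lexer from the content; fall back to plain text.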
        code = match.group(1)

        try:
            if self.body_lexer:
                lexer = get_lexer_by_name(self.body_lexer)
            else:
                lexer = guess_lexer(code.strip())
        except ClassNotFound:
            lexer = get_lexer_by_name('text')

        yield from lexer.get_tokens_unprocessed(code)

    tokens = {
        'root': [
            (r'\fmessage\{\s*', Keyword, ('message', 'message-attr')),
        ],
        'message-attr': [
            (r'(\s*id:\s*)(\S+)', bygroups(Name.Attribute, String)),
            (r'(\s*(?:depth|match|excluded):\s*)(\d+)',
             bygroups(Name.Attribute, Number.Integer)),
            (r'(\s*filename:\s*)(.+\n)',
             bygroups(Name.Attribute, String)),
            default('#pop'),
        ],
        'message': [
            (r'\fmessage\}\n', Keyword, '#pop'),
            (r'\fheader\{\n', Keyword, 'header'),
            (r'\fbody\{\n', Keyword, 'body'),
        ],
        'header': [
            (r'\fheader\}\n', Keyword, '#pop'),
            (r'((?:Subject|From|To|Cc|Date):\s*)(.*\n)',
             bygroups(Name.Attribute, String)),
            (r'(.*)(\s*\(.*\))(\s*\(.*\)\n)',
             bygroups(Generic.Strong, Literal, Name.Tag)),
        ],
        'body': [
            (r'\fpart\{\n', Keyword, 'part'),
            (r'\f(part|attachment)\{\s*', Keyword, ('part', 'part-attr')),
            (r'\fbody\}\n', Keyword, '#pop'),
        ],
        'part-attr': [
            (r'(ID:\s*)(\d+)', bygroups(Name.Attribute, Number.Integer)),
            (r'(,\s*)((?:Filename|Content-id):\s*)([^,]+)',
             bygroups(Punctuation, Name.Attribute, String)),
            (r'(,\s*)(Content-type:\s*)(.+\n)',
             bygroups(Punctuation, Name.Attribute, String)),
            default('#pop'),
        ],
        'part': [
            (r'\f(?:part|attachment)\}\n', Keyword, '#pop'),
            (r'\f(?:part|attachment)\{\s*', Keyword, ('#push', 'part-attr')),
            (r'^Non-text part: .*\n', Comment),
            (r'(?s)(.*?(?=\f(?:part|attachment)\}\n))', _highlight_code),
        ],
    }

    def analyse_text(text):
        return 1.0 if text.startswith('\fmessage{') else 0.0

    def __init__(self, **options):
        self.body_lexer = options.get('body_lexer', None)
        RegexLexer.__init__(self, **options)


class KernelLogLexer(RegexLexer):
    """
    For Linux Kernel log ("dmesg") output.
    """
    name = 'Kernel log'
    aliases = ['kmsg', 'dmesg']
    filenames = ['*.kmsg', '*.dmesg']
    url = 'https://fr.wikipedia.org/wiki/Dmesg'
    version_added = '2.6'

    tokens = {
        'root': [
            (r'^[^:]+:debug : (?=\[)', Text, 'debug'),
            (r'^[^:]+:info  : (?=\[)', Text, 'info'),
            (r'^[^:]+:warn  : (?=\[)', Text, 'warn'),
            (r'^[^:]+:notice: (?=\[)', Text, 'warn'),
            (r'^[^:]+:err   : (?=\[)', Text, 'error'),
            (r'^[^:]+:crit  : (?=\[)', Text, 'error'),
            (r'^(?=\[)', Text, 'unknown'),
        ],
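        # No explicit log level in the prefix: guess one from keywords in
        # the message before handing off to the matching state.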
        'unknown': [
            (r'^(?=.+(warning|notice|audit|deprecated))', Text, 'warn'),
            (r'^(?=.+(error|critical|fail|Bug))', Text, 'error'),
            default('info'),
        ],
        'base': [
            (r'\[[0-9. ]+\] ', Number),
            (r'(?<=\] ).+?:', Keyword),
            (r'\n', Text, '#pop'),
        ],
        'debug': [
            include('base'),
            (r'.+\n', Comment, '#pop')
        ],
        'info': [
            include('base'),
            (r'.+\n', Text, '#pop')
        ],
        'warn': [
            include('base'),
            (r'.+\n', Generic.Strong, '#pop')
        ],
        'error': [
            include('base'),
            (r'.+\n', Generic.Error, '#pop')
        ]
    }