1"""
2 pygments.lexers.mime
3 ~~~~~~~~~~~~~~~~~~~~
5 Lexer for Multipurpose Internet Mail Extensions (MIME) data.
7 :copyright: Copyright 2006-2023 by the Pygments team, see AUTHORS.
8 :license: BSD, see LICENSE for details.
9"""

import re

from pygments.lexer import RegexLexer, include
from pygments.lexers import get_lexer_for_mimetype
from pygments.token import Text, Name, String, Operator, Comment, Other
from pygments.util import get_int_opt, ClassNotFound

__all__ = ["MIMELexer"]


class MIMELexer(RegexLexer):
22 """
23 Lexer for Multipurpose Internet Mail Extensions (MIME) data. This lexer is
24 designed to process nested multipart data.
26 It assumes that the given data contains both header and body (and is
27 split at an empty line). If no valid header is found, then the entire data
28 will be treated as body.
30 Additional options accepted:
32 `MIME-max-level`
33 Max recursion level for nested MIME structure. Any negative number
34 would treated as unlimited. (default: -1)
36 `Content-Type`
37 Treat the data as a specific content type. Useful when header is
38 missing, or this lexer would try to parse from header. (default:
39 `text/plain`)
41 `Multipart-Boundary`
42 Set the default multipart boundary delimiter. This option is only used
43 when `Content-Type` is `multipart` and header is missing. This lexer
44 would try to parse from header by default. (default: None)
46 `Content-Transfer-Encoding`
47 Treat the data as a specific encoding. Or this lexer would try to parse
48 from header by default. (default: None)
50 .. versionadded:: 2.5
51 """

    name = "MIME"
    aliases = ["mime"]
    mimetypes = ["multipart/mixed",
                 "multipart/related",
                 "multipart/alternative"]

    def __init__(self, **options):
        super().__init__(**options)
        self.boundary = options.get("Multipart-Boundary")
        self.content_transfer_encoding = options.get("Content_Transfer_Encoding")
        self.content_type = options.get("Content_Type", "text/plain")
        self.max_nested_level = get_int_opt(options, "MIME-max-level", -1)
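
    # Callback for the header rule in "root": fields listed in
    # attention_headers have their bodies re-lexed in a dedicated state of
    # the same name; every other header is emitted whole as a Comment token.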
    def get_header_tokens(self, match):
        field = match.group(1)

        if field.lower() in self.attention_headers:
            yield match.start(1), Name.Tag, field + ":"
            yield match.start(2), Text.Whitespace, match.group(2)

            pos = match.end(2)
            body = match.group(3)
            for i, t, v in self.get_tokens_unprocessed(body, ("root", field.lower())):
                yield pos + i, t, v

        else:
            yield match.start(), Comment, match.group()

    def get_body_tokens(self, match):
        pos_body_start = match.start()
        entire_body = match.group()

        # skip first newline
        if entire_body[0] == '\n':
            yield pos_body_start, Text.Whitespace, '\n'
            pos_body_start = pos_body_start + 1
            entire_body = entire_body[1:]

        # if it is not a multipart message
        if not self.content_type.startswith("multipart") or not self.boundary:
            for i, t, v in self.get_bodypart_tokens(entire_body):
                yield pos_body_start + i, t, v
            return

        # find boundary
        bdry_pattern = r"^--%s(--)?\n" % re.escape(self.boundary)
        bdry_matcher = re.compile(bdry_pattern, re.MULTILINE)

        # some data has prefix text before the first boundary
        m = bdry_matcher.search(entire_body)
        if m:
            pos_part_start = pos_body_start + m.end()
            pos_iter_start = lpos_end = m.end()
            yield pos_body_start, Text, entire_body[:m.start()]
            yield pos_body_start + lpos_end, String.Delimiter, m.group()
        else:
            pos_part_start = pos_body_start
            pos_iter_start = 0

        # process tokens of each body part
        for m in bdry_matcher.finditer(entire_body, pos_iter_start):
            # bodypart
            lpos_start = pos_part_start - pos_body_start
            lpos_end = m.start()
            part = entire_body[lpos_start:lpos_end]
            for i, t, v in self.get_bodypart_tokens(part):
                yield pos_part_start + i, t, v

            # boundary
            yield pos_body_start + lpos_end, String.Delimiter, m.group()
            pos_part_start = pos_body_start + m.end()

        # some data has suffix text after the last boundary
        lpos_start = pos_part_start - pos_body_start
        if lpos_start != len(entire_body):
            yield pos_part_start, Text, entire_body[lpos_start:]

    def get_bodypart_tokens(self, text):
        # return the text unlexed if:
        # * there is no content
        # * no content type is specified
        # * the content encoding is not readable
        # * the max recursion level is exceeded
        if not text.strip() or not self.content_type:
            return [(0, Other, text)]

        cte = self.content_transfer_encoding
        if cte and cte not in {"8bit", "7bit", "quoted-printable"}:
            return [(0, Other, text)]

        if self.max_nested_level == 0:
            return [(0, Other, text)]

        # get lexer
        try:
            lexer = get_lexer_for_mimetype(self.content_type)
        except ClassNotFound:
            return [(0, Other, text)]

        if isinstance(lexer, type(self)):
            lexer.max_nested_level = self.max_nested_level - 1

        return lexer.get_tokens_unprocessed(text)
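
    # Callback for the content-type rule: records the MIME type so
    # get_bodypart_tokens can pick a lexer for the body, and highlights the
    # "type/subtype" pair.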
    def store_content_type(self, match):
        self.content_type = match.group(1)

        prefix_len = match.start(1) - match.start(0)
        yield match.start(0), Text.Whitespace, match.group(0)[:prefix_len]
        yield match.start(1), Name.Label, match.group(2)
        yield match.end(2), String.Delimiter, '/'
        yield match.start(3), Name.Label, match.group(3)

    def get_content_type_subtokens(self, match):
        yield match.start(1), Text, match.group(1)
        yield match.start(2), Text.Whitespace, match.group(2)
        yield match.start(3), Name.Attribute, match.group(3)
        yield match.start(4), Operator, match.group(4)
        yield match.start(5), String, match.group(5)

        # remember the boundary parameter (stripped of surrounding quotes)
        # so get_body_tokens can split the multipart body on it
        if match.group(3).lower() == "boundary":
            boundary = match.group(5).strip()
            if boundary[0] == '"' and boundary[-1] == '"':
                boundary = boundary[1:-1]
            self.boundary = boundary
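
    # Callback for the content-transfer-encoding rule: records the encoding
    # so get_bodypart_tokens can skip bodies that are not readable as text.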
    def store_content_transfer_encoding(self, match):
        self.content_transfer_encoding = match.group(0).lower()
        yield match.start(0), Name.Constant, match.group(0)
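
    # Header fields whose bodies get the dedicated lexing states defined
    # below; all other headers are emitted verbatim as Comment tokens.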
    attention_headers = {"content-type", "content-transfer-encoding"}

    tokens = {
        "root": [
            (r"^([\w-]+):( *)([\s\S]*?\n)(?![ \t])", get_header_tokens),
            (r"^$[\s\S]+", get_body_tokens),
        ],
        "header": [
            # folding
            (r"\n[ \t]", Text.Whitespace),
            (r"\n(?![ \t])", Text.Whitespace, "#pop"),
        ],
        "content-type": [
            include("header"),
            (
                r"^\s*((multipart|application|audio|font|image|model|text|video"
                r"|message)/([\w-]+))",
                store_content_type,
            ),
            (r'(;)((?:[ \t]|\n[ \t])*)([\w:-]+)(=)([\s\S]*?)(?=;|\n(?![ \t]))',
             get_content_type_subtokens),
            (r';[ \t]*\n(?![ \t])', Text, '#pop'),
        ],
        "content-transfer-encoding": [
            include("header"),
            (r"([\w-]+)", store_content_transfer_encoding),
        ],
    }
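

# A minimal usage sketch; the sample message and the "ILLUSTRATIVE" boundary
# below are made-up assumptions for demonstration, not part of the lexer.
if __name__ == "__main__":
    from pygments import highlight
    from pygments.formatters import TerminalFormatter

    sample = (
        'Content-Type: multipart/mixed; boundary="ILLUSTRATIVE"\n'
        "\n"
        "--ILLUSTRATIVE\n"
        "Content-Type: text/plain\n"
        "\n"
        "Hello, MIME!\n"
        "--ILLUSTRATIVE--\n"
    )
    # Cap nesting at two levels via the MIME-max-level option described above.
    print(highlight(sample, MIMELexer(**{"MIME-max-level": 2}),
                    TerminalFormatter()))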