1"""
2 pygments.lexers.mime
3 ~~~~~~~~~~~~~~~~~~~~
5 Lexer for Multipurpose Internet Mail Extensions (MIME) data.
7 :copyright: Copyright 2006-2023 by the Pygments team, see AUTHORS.
8 :license: BSD, see LICENSE for details.
9"""

import re

from pygments.lexer import RegexLexer, include
from pygments.lexers import get_lexer_for_mimetype
from pygments.token import Text, Name, String, Operator, Comment, Other
from pygments.util import get_int_opt, ClassNotFound

__all__ = ["MIMELexer"]


class MIMELexer(RegexLexer):
22 """
23 Lexer for Multipurpose Internet Mail Extensions (MIME) data. This lexer is
24 designed to process nested multipart data.
26 It assumes that the given data contains both header and body (and is
27 split at an empty line). If no valid header is found, then the entire data
28 will be treated as body.
30 Additional options accepted:
32 `MIME-max-level`
33 Max recursion level for nested MIME structure. Any negative number
34 would treated as unlimited. (default: -1)
36 `Content-Type`
37 Treat the data as a specific content type. Useful when header is
38 missing, or this lexer would try to parse from header. (default:
39 `text/plain`)
41 `Multipart-Boundary`
42 Set the default multipart boundary delimiter. This option is only used
43 when `Content-Type` is `multipart` and header is missing. This lexer
44 would try to parse from header by default. (default: None)
46 `Content-Transfer-Encoding`
47 Treat the data as a specific encoding. Or this lexer would try to parse
48 from header by default. (default: None)
50 .. versionadded:: 2.5
51 """

    name = "MIME"
    aliases = ["mime"]
    mimetypes = ["multipart/mixed",
                 "multipart/related",
                 "multipart/alternative"]

    def __init__(self, **options):
        super().__init__(**options)
        self.boundary = options.get("Multipart-Boundary")
        self.content_transfer_encoding = options.get("Content_Transfer_Encoding")
        self.content_type = options.get("Content_Type", "text/plain")
        self.max_nested_level = get_int_opt(options, "MIME-max-level", -1)
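
    # Callback for the header rule in "root": fields listed in
    # attention_headers have their bodies re-lexed in a dedicated state of
    # the same name; every other header is emitted whole as a Comment token.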
    def get_header_tokens(self, match):
        field = match.group(1)

        if field.lower() in self.attention_headers:
            yield match.start(1), Name.Tag, field + ":"
            yield match.start(2), Text.Whitespace, match.group(2)

            pos = match.end(2)
            body = match.group(3)
            for i, t, v in self.get_tokens_unprocessed(body, ("root", field.lower())):
                yield pos + i, t, v

        else:
            yield match.start(), Comment, match.group()

    def get_body_tokens(self, match):
        pos_body_start = match.start()
        entire_body = match.group()

        # skip first newline
        if entire_body[0] == '\n':
            yield pos_body_start, Text.Whitespace, '\n'
            pos_body_start = pos_body_start + 1
            entire_body = entire_body[1:]

        # if it is not a multipart message
        if not self.content_type.startswith("multipart") or not self.boundary:
            for i, t, v in self.get_bodypart_tokens(entire_body):
                yield pos_body_start + i, t, v
            return

        # find boundary
        bdry_pattern = r"^--%s(--)?\n" % re.escape(self.boundary)
        bdry_matcher = re.compile(bdry_pattern, re.MULTILINE)

        # some data has prefix text before the first boundary
        m = bdry_matcher.search(entire_body)
        if m:
            pos_part_start = pos_body_start + m.end()
            pos_iter_start = lpos_end = m.end()
            yield pos_body_start, Text, entire_body[:m.start()]
            yield pos_body_start + lpos_end, String.Delimiter, m.group()
        else:
            pos_part_start = pos_body_start
            pos_iter_start = 0

        # process tokens of each body part
        for m in bdry_matcher.finditer(entire_body, pos_iter_start):
            # bodypart
            lpos_start = pos_part_start - pos_body_start
            lpos_end = m.start()
            part = entire_body[lpos_start:lpos_end]
            for i, t, v in self.get_bodypart_tokens(part):
                yield pos_part_start + i, t, v

            # boundary
            yield pos_body_start + lpos_end, String.Delimiter, m.group()
            pos_part_start = pos_body_start + m.end()

        # some data has suffix text after the last boundary
        lpos_start = pos_part_start - pos_body_start
        if lpos_start != len(entire_body):
            yield pos_part_start, Text, entire_body[lpos_start:]

    def get_bodypart_tokens(self, text):
        # return the text unlexed if:
        # * there is no content
        # * no content type is specified
        # * the content encoding is not readable
        # * the max recursion level is exceeded
        if not text.strip() or not self.content_type:
            return [(0, Other, text)]

        cte = self.content_transfer_encoding
        if cte and cte not in {"8bit", "7bit", "quoted-printable"}:
            return [(0, Other, text)]

        if self.max_nested_level == 0:
            return [(0, Other, text)]

        # get lexer
        try:
            lexer = get_lexer_for_mimetype(self.content_type)
        except ClassNotFound:
            return [(0, Other, text)]

        if isinstance(lexer, type(self)):
            lexer.max_nested_level = self.max_nested_level - 1

        return lexer.get_tokens_unprocessed(text)
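
    # Callback for the content-type rule: records the MIME type so
    # get_bodypart_tokens can pick a lexer for the body, and highlights the
    # "type/subtype" pair.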
    def store_content_type(self, match):
        self.content_type = match.group(1)

        prefix_len = match.start(1) - match.start(0)
        yield match.start(0), Text.Whitespace, match.group(0)[:prefix_len]
        yield match.start(1), Name.Label, match.group(2)
        yield match.end(2), String.Delimiter, '/'
        yield match.start(3), Name.Label, match.group(3)

    def get_content_type_subtokens(self, match):
        yield match.start(1), Text, match.group(1)
        yield match.start(2), Text.Whitespace, match.group(2)
        yield match.start(3), Name.Attribute, match.group(3)
        yield match.start(4), Operator, match.group(4)
        yield match.start(5), String, match.group(5)

        # remember the boundary parameter (stripped of surrounding quotes)
        # so get_body_tokens can split the multipart body on it
        if match.group(3).lower() == "boundary":
            boundary = match.group(5).strip()
            if boundary[0] == '"' and boundary[-1] == '"':
                boundary = boundary[1:-1]
            self.boundary = boundary
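
    # Callback for the content-transfer-encoding rule: records the encoding
    # so get_bodypart_tokens can skip bodies that are not readable as text.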
    def store_content_transfer_encoding(self, match):
        self.content_transfer_encoding = match.group(0).lower()
        yield match.start(0), Name.Constant, match.group(0)
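
    # Header fields whose bodies get the dedicated lexing states defined
    # below; all other headers are emitted verbatim as Comment tokens.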
    attention_headers = {"content-type", "content-transfer-encoding"}

    tokens = {
        "root": [
            (r"^([\w-]+):( *)([\s\S]*?\n)(?![ \t])", get_header_tokens),
            (r"^$[\s\S]+", get_body_tokens),
        ],
        "header": [
            # folding
            (r"\n[ \t]", Text.Whitespace),
            (r"\n(?![ \t])", Text.Whitespace, "#pop"),
        ],
        "content-type": [
            include("header"),
            (
                r"^\s*((multipart|application|audio|font|image|model|text|video"
                r"|message)/([\w-]+))",
                store_content_type,
            ),
            (r'(;)((?:[ \t]|\n[ \t])*)([\w:-]+)(=)([\s\S]*?)(?=;|\n(?![ \t]))',
             get_content_type_subtokens),
            (r';[ \t]*\n(?![ \t])', Text, '#pop'),
        ],
        "content-transfer-encoding": [
            include("header"),
            (r"([\w-]+)", store_content_transfer_encoding),
        ],
    }
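

# A minimal usage sketch; the sample message and the "ILLUSTRATIVE" boundary
# below are made-up assumptions for demonstration, not part of the lexer.
if __name__ == "__main__":
    from pygments import highlight
    from pygments.formatters import TerminalFormatter

    sample = (
        'Content-Type: multipart/mixed; boundary="ILLUSTRATIVE"\n'
        "\n"
        "--ILLUSTRATIVE\n"
        "Content-Type: text/plain\n"
        "\n"
        "Hello, MIME!\n"
        "--ILLUSTRATIVE--\n"
    )
    # Cap nesting at two levels via the MIME-max-level option described above.
    print(highlight(sample, MIMELexer(**{"MIME-max-level": 2}),
                    TerminalFormatter()))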