1""" 

2 pygments.lexers.mime 

3 ~~~~~~~~~~~~~~~~~~~~ 

4 

5 Lexer for Multipurpose Internet Mail Extensions (MIME) data. 

6 

7 :copyright: Copyright 2006-2023 by the Pygments team, see AUTHORS. 

8 :license: BSD, see LICENSE for details. 

9""" 

10 

11import re 

12 

13from pygments.lexer import RegexLexer, include 

14from pygments.lexers import get_lexer_for_mimetype 

15from pygments.token import Text, Name, String, Operator, Comment, Other 

16from pygments.util import get_int_opt, ClassNotFound 

17 

18__all__ = ["MIMELexer"] 

19 

20 

class MIMELexer(RegexLexer):
    """
    Lexer for Multipurpose Internet Mail Extensions (MIME) data. This lexer
    is designed to process nested multipart data.

    It assumes that the given data contains both a header and a body, split
    at the first empty line. If no valid header is found, the entire data is
    treated as body.

    Additional options accepted:

    `MIME-max-level`
        Max recursion level for nested MIME structure. Any negative number
        is treated as unlimited. (default: -1)

    `Content-Type`
        Treat the data as a specific content type. Useful when the header is
        missing; otherwise this lexer tries to parse the content type from
        the header. (default: `text/plain`)

    `Multipart-Boundary`
        Set the default multipart boundary delimiter. This option is only
        used when `Content-Type` is `multipart` and the header is missing;
        otherwise this lexer tries to parse the boundary from the header.
        (default: None)

    `Content-Transfer-Encoding`
        Treat the data as having a specific transfer encoding; otherwise
        this lexer tries to parse the encoding from the header.
        (default: None)

    A small usage sketch appears at the bottom of this module.

    .. versionadded:: 2.5
    """

    name = "MIME"
    aliases = ["mime"]
    mimetypes = ["multipart/mixed",
                 "multipart/related",
                 "multipart/alternative"]

    def __init__(self, **options):
        super().__init__(**options)

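        # NB: "Content_Transfer_Encoding" and "Content_Type" below are read
        # with underscores, although the class docstring advertises the
        # hyphenated names; callers must pass the exact keys used here.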
        self.boundary = options.get("Multipart-Boundary")
        self.content_transfer_encoding = options.get("Content_Transfer_Encoding")
        self.content_type = options.get("Content_Type", "text/plain")
        self.max_nested_level = get_int_opt(options, "MIME-max-level", -1)

    def get_header_tokens(self, match):
        field = match.group(1)

        if field.lower() in self.attention_headers:
            yield match.start(1), Name.Tag, field + ":"
            yield match.start(2), Text.Whitespace, match.group(2)

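            # re-lex the field body in the state named after the header, so
            # content-type and content-transfer-encoding values get their
            # own token rules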
            pos = match.end(2)
            body = match.group(3)
            for i, t, v in self.get_tokens_unprocessed(body, ("root", field.lower())):
                yield pos + i, t, v

        else:
            yield match.start(), Comment, match.group()

    def get_body_tokens(self, match):
        pos_body_start = match.start()
        entire_body = match.group()

        # skip first newline
        if entire_body[0] == '\n':
            yield pos_body_start, Text.Whitespace, '\n'
            pos_body_start = pos_body_start + 1
            entire_body = entire_body[1:]

        # if it is not a multipart
        if not self.content_type.startswith("multipart") or not self.boundary:
            for i, t, v in self.get_bodypart_tokens(entire_body):
                yield pos_body_start + i, t, v
            return

        # find boundary
        bdry_pattern = r"^--%s(--)?\n" % re.escape(self.boundary)
        bdry_matcher = re.compile(bdry_pattern, re.MULTILINE)

        # some data has prefix text before first boundary
        m = bdry_matcher.search(entire_body)
        if m:
            pos_part_start = pos_body_start + m.end()
            pos_iter_start = lpos_end = m.end()
            yield pos_body_start, Text, entire_body[:m.start()]
            yield pos_body_start + lpos_end, String.Delimiter, m.group()
        else:
            pos_part_start = pos_body_start
            pos_iter_start = 0

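        # below, "lpos_*" values are offsets local to entire_body, while
        # "pos_*" values are absolute positions in the overall input; tokens
        # must be yielded at absolute positions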
        # process tokens of each body part
        for m in bdry_matcher.finditer(entire_body, pos_iter_start):
            # bodypart
            lpos_start = pos_part_start - pos_body_start
            lpos_end = m.start()
            part = entire_body[lpos_start:lpos_end]
            for i, t, v in self.get_bodypart_tokens(part):
                yield pos_part_start + i, t, v

            # boundary
            yield pos_body_start + lpos_end, String.Delimiter, m.group()
            pos_part_start = pos_body_start + m.end()

        # some data has suffix text after last boundary
        lpos_start = pos_part_start - pos_body_start
        if lpos_start != len(entire_body):
            yield pos_part_start, Text, entire_body[lpos_start:]

    def get_bodypart_tokens(self, text):
        # return if:
        #  * no content
        #  * no content type specified
        #  * content encoding is not readable
        #  * max recursion exceeded
        if not text.strip() or not self.content_type:
            return [(0, Other, text)]

        cte = self.content_transfer_encoding
        if cte and cte not in {"8bit", "7bit", "quoted-printable"}:
            return [(0, Other, text)]

        if self.max_nested_level == 0:
            return [(0, Other, text)]

        # get lexer
        try:
            lexer = get_lexer_for_mimetype(self.content_type)
        except ClassNotFound:
            return [(0, Other, text)]

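        # a nested multipart part comes back as another MIMELexer; spend one
        # recursion level on it so that MIME-max-level is honoured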
        if isinstance(lexer, type(self)):
            lexer.max_nested_level = self.max_nested_level - 1

        return lexer.get_tokens_unprocessed(text)

    def store_content_type(self, match):
        self.content_type = match.group(1)

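        # group(0) may begin with whitespace matched by ^\s*; emit that
        # prefix separately, then the type and subtype labels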
        prefix_len = match.start(1) - match.start(0)
        yield match.start(0), Text.Whitespace, match.group(0)[:prefix_len]
        yield match.start(1), Name.Label, match.group(2)
        yield match.end(2), String.Delimiter, '/'
        yield match.start(3), Name.Label, match.group(3)

    def get_content_type_subtokens(self, match):
        yield match.start(1), Text, match.group(1)
        yield match.start(2), Text.Whitespace, match.group(2)
        yield match.start(3), Name.Attribute, match.group(3)
        yield match.start(4), Operator, match.group(4)
        yield match.start(5), String, match.group(5)

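        # remember the boundary parameter (unquoted) so that get_body_tokens
        # can split the multipart body into its parts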
        if match.group(3).lower() == "boundary":
            boundary = match.group(5).strip()
            if boundary[0] == '"' and boundary[-1] == '"':
                boundary = boundary[1:-1]
            self.boundary = boundary

    def store_content_transfer_encoding(self, match):
        self.content_transfer_encoding = match.group(0).lower()
        yield match.start(0), Name.Constant, match.group(0)

    attention_headers = {"content-type", "content-transfer-encoding"}

    tokens = {
        "root": [
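            # a message is header lines (first rule) followed by a body that
            # starts at the first empty line (second rule; "^$" matches there
            # because RegexLexer compiles patterns with re.MULTILINE)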
187 (r"^([\w-]+):( *)([\s\S]*?\n)(?![ \t])", get_header_tokens), 

188 (r"^$[\s\S]+", get_body_tokens), 

189 ], 

190 "header": [ 

191 # folding 

192 (r"\n[ \t]", Text.Whitespace), 

193 (r"\n(?![ \t])", Text.Whitespace, "#pop"), 

194 ], 

195 "content-type": [ 

196 include("header"), 

197 ( 

198 r"^\s*((multipart|application|audio|font|image|model|text|video" 

199 r"|message)/([\w-]+))", 

200 store_content_type, 

201 ), 

202 (r'(;)((?:[ \t]|\n[ \t])*)([\w:-]+)(=)([\s\S]*?)(?=;|\n(?![ \t]))', 

203 get_content_type_subtokens), 

204 (r';[ \t]*\n(?![ \t])', Text, '#pop'), 

205 ], 

206 "content-transfer-encoding": [ 

207 include("header"), 

208 (r"([\w-]+)", store_content_transfer_encoding), 

209 ], 

210 }
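

if __name__ == "__main__":
    # A minimal usage sketch (illustration only, not part of the original
    # module): lex a small multipart message and print the raw tokens. The
    # sample message, the boundary string "SEP", and the option value below
    # are made up for demonstration.
    sample = (
        "Content-Type: multipart/mixed; boundary=SEP\n"
        "\n"
        "--SEP\n"
        "Content-Type: text/plain\n"
        "\n"
        "Hello, world.\n"
        "--SEP--\n"
    )
    # option keys contain hyphens/underscores, so pass them via dict unpacking
    lexer = MIMELexer(**{"MIME-max-level": 2})
    for index, token, value in lexer.get_tokens_unprocessed(sample):
        print(index, token, repr(value))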