1""" 

2 pygments.lexers.textfmts 

3 ~~~~~~~~~~~~~~~~~~~~~~~~ 

4 

5 Lexers for various text formats. 

6 

7 :copyright: Copyright 2006-2023 by the Pygments team, see AUTHORS. 

8 :license: BSD, see LICENSE for details. 

9""" 

10 

11import re 

12 

13from pygments.lexers import guess_lexer, get_lexer_by_name 

14from pygments.lexer import RegexLexer, bygroups, default, include 

15from pygments.token import Text, Comment, Operator, Keyword, Name, String, \ 

16 Number, Generic, Literal, Punctuation 

17from pygments.util import ClassNotFound 

18 

19__all__ = ['IrcLogsLexer', 'TodotxtLexer', 'HttpLexer', 'GettextLexer', 

20 'NotmuchLexer', 'KernelLogLexer'] 

21 

22 

class IrcLogsLexer(RegexLexer):
    """
    Lexer for IRC logs in *irssi*, *xchat* or *weechat* style.
    """

    name = 'IRC logs'
    aliases = ['irc']
    filenames = ['*.weechatlog']
    mimetypes = ['text/x-irclog']

    flags = re.VERBOSE | re.MULTILINE
    timestamp = r"""
        (
          # irssi / xchat and others
          (?: \[|\()?                  # Opening bracket or paren for the timestamp
            (?:                        # Timestamp
                (?: (?:\d{1,4} [-/])*  # Date as - or /-separated groups of digits
                    (?:\d{1,4})
                 [T ])?                # Date/time separator: T or space
                (?: \d?\d [:.])*       # Time as :/.-separated groups of 1 or 2 digits
                    (?: \d?\d)
            )
          (?: \]|\))?\s+               # Closing bracket or paren for the timestamp
        |
          # weechat
          \d{4}\s\w{3}\s\d{2}\s        # Date
          \d{2}:\d{2}:\d{2}\s+         # Time + Whitespace
        |
          # xchat
          \w{3}\s\d{2}\s               # Date
          \d{2}:\d{2}:\d{2}\s+         # Time + Whitespace
        )?
    """
    tokens = {
        'root': [
            # log start/end
            (r'^\*\*\*\*(.*)\*\*\*\*$', Comment),
            # hack
            ("^" + timestamp + r'(\s*<[^>]*>\s*)$', bygroups(Comment.Preproc, Name.Tag)),
            # normal msgs
            ("^" + timestamp + r"""
                (\s*<.*?>\s*)          # Nick """,
             bygroups(Comment.Preproc, Name.Tag), 'msg'),
            # /me msgs
            ("^" + timestamp + r"""
                (\s*[*]\s+)            # Star
                (\S+\s+.*?\n)          # Nick + rest of message """,
             bygroups(Comment.Preproc, Keyword, Generic.Inserted)),
            # join/part msgs
            ("^" + timestamp + r"""
                (\s*(?:\*{3}|<?-[!@=P]?->?)\s*)  # Star(s) or symbols
                (\S+\s+)                         # Nick + Space
                (.*?\n)                          # Rest of message """,
             bygroups(Comment.Preproc, Keyword, String, Comment)),
            (r"^.*?\n", Text),
        ],
        'msg': [
            (r"\S+:(?!//)", Name.Attribute),  # Prefix
            (r".*\n", Text, '#pop'),
        ],
    }
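
# Illustrative note (added by the editor, not part of the original module): a
# plain irssi-style line such as
#
#     12:34 <alice> hello, world
#
# is claimed by the "normal msgs" rule above -- "12:34 " through the
# ``timestamp`` fragment (Comment.Preproc), "<alice> " as the nick (Name.Tag),
# and the remainder of the line as Text inside the 'msg' state.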



class GettextLexer(RegexLexer):
    """
    Lexer for Gettext catalog files.

    .. versionadded:: 0.9
    """
    name = 'Gettext Catalog'
    aliases = ['pot', 'po']
    filenames = ['*.pot', '*.po']
    mimetypes = ['application/x-gettext', 'text/x-gettext', 'text/gettext']

    tokens = {
        'root': [
            (r'^#,\s.*?$', Keyword.Type),
            (r'^#:\s.*?$', Keyword.Declaration),
            # (r'^#$', Comment),
            (r'^(#|#\.\s|#\|\s|#~\s|#\s).*$', Comment.Single),
            (r'^(")([A-Za-z-]+:)(.*")$',
             bygroups(String, Name.Property, String)),
            (r'^".*"$', String),
            (r'^(msgid|msgid_plural|msgstr|msgctxt)(\s+)(".*")$',
             bygroups(Name.Variable, Text, String)),
            (r'^(msgstr\[)(\d)(\])(\s+)(".*")$',
             bygroups(Name.Variable, Number.Integer, Name.Variable, Text, String)),
        ]
    }
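
# Illustrative sketch (added, not part of the original module): for a typical
# catalog entry the rules above apply roughly as follows --
#
#     #: src/main.c:42      -> Keyword.Declaration    (the '^#:' rule)
#     #, fuzzy              -> Keyword.Type           (the '^#,' rule)
#     msgid "Hello"         -> Name.Variable + String (msgid/msgstr rule)
#     msgstr "Bonjour"      -> Name.Variable + String (msgid/msgstr rule)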



class HttpLexer(RegexLexer):
    """
    Lexer for HTTP sessions.

    .. versionadded:: 1.5
    """

    name = 'HTTP'
    aliases = ['http']

    flags = re.DOTALL

    def get_tokens_unprocessed(self, text, stack=('root',)):
        """Reset the content-type state."""
        self.content_type = None
        return RegexLexer.get_tokens_unprocessed(self, text, stack)

    def header_callback(self, match):
        if match.group(1).lower() == 'content-type':
            content_type = match.group(5).strip()
            if ';' in content_type:
                content_type = content_type[:content_type.find(';')].strip()
            self.content_type = content_type
        yield match.start(1), Name.Attribute, match.group(1)
        yield match.start(2), Text, match.group(2)
        yield match.start(3), Operator, match.group(3)
        yield match.start(4), Text, match.group(4)
        yield match.start(5), Literal, match.group(5)
        yield match.start(6), Text, match.group(6)

    def continuous_header_callback(self, match):
        yield match.start(1), Text, match.group(1)
        yield match.start(2), Literal, match.group(2)
        yield match.start(3), Text, match.group(3)

    def content_callback(self, match):
        content_type = getattr(self, 'content_type', None)
        content = match.group()
        offset = match.start()
        if content_type:
            from pygments.lexers import get_lexer_for_mimetype
            possible_lexer_mimetypes = [content_type]
            if '+' in content_type:
                # application/calendar+xml can be treated as application/xml
                # if there's not a better match.
                general_type = re.sub(r'^(.*)/.*\+(.*)$', r'\1/\2',
                                      content_type)
                possible_lexer_mimetypes.append(general_type)

            for i in possible_lexer_mimetypes:
                try:
                    lexer = get_lexer_for_mimetype(i)
                except ClassNotFound:
                    pass
                else:
                    for idx, token, value in lexer.get_tokens_unprocessed(content):
                        yield offset + idx, token, value
                    return
        yield offset, Text, content

    tokens = {
        'root': [
            (r'([a-zA-Z][-_a-zA-Z]+)( +)([^ ]+)( +)'
             r'(HTTP)(/)(1\.[01]|2(?:\.0)?|3)(\r?\n|\Z)',
             bygroups(Name.Function, Text, Name.Namespace, Text,
                      Keyword.Reserved, Operator, Number, Text),
             'headers'),
            (r'(HTTP)(/)(1\.[01]|2(?:\.0)?|3)( +)(\d{3})(?:( +)([^\r\n]*))?(\r?\n|\Z)',
             bygroups(Keyword.Reserved, Operator, Number, Text, Number, Text,
                      Name.Exception, Text),
             'headers'),
        ],
        'headers': [
            (r'([^\s:]+)( *)(:)( *)([^\r\n]*)(\r?\n|\Z)', header_callback),
            (r'([\t ]+)([^\r\n]+)(\r?\n|\Z)', continuous_header_callback),
            (r'\r?\n', Text, 'content')
        ],
        'content': [
            (r'.+', content_callback)
        ]
    }

    def analyse_text(text):
        return any(
            re.search(pattern, text) is not None
            for pattern in (
                r'^([a-zA-Z][-_a-zA-Z]+)( +)([^ ]+)( +)(HTTP)(/)(1\.[01]|2(?:\.0)?|3)(\r?\n|\Z)',
                r'^(HTTP)(/)(1\.[01]|2(?:\.0)?|3)( +)(\d{3})(?:( +)([^\r\n]*))?(\r?\n|\Z)',
            )
        )
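
# Usage sketch (added, not part of the original module): the message body is
# re-lexed according to the Content-Type header captured by header_callback,
# so e.g.
#
#     from pygments import highlight
#     from pygments.formatters import NullFormatter
#     session = "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\n\r\n{}"
#     highlight(session, HttpLexer(), NullFormatter())
#
# tokenizes the trailing "{}" with the JSON lexer rather than as plain Text.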



class TodotxtLexer(RegexLexer):
    """
    Lexer for Todo.txt todo list format.

    .. versionadded:: 2.0
    """

    name = 'Todotxt'
    url = 'http://todotxt.com/'
    aliases = ['todotxt']
    # *.todotxt is not a standard extension for Todo.txt files; including it
    # makes testing easier, and also makes autodetecting file type easier.
    filenames = ['todo.txt', '*.todotxt']
    mimetypes = ['text/x-todo']

    # Aliases mapping standard token types of Todo.txt format concepts
    CompleteTaskText = Operator  # Chosen to de-emphasize complete tasks
    IncompleteTaskText = Text    # Incomplete tasks should look like plain text

    # Priority should have most emphasis to indicate importance of tasks
    Priority = Generic.Heading
    # Dates should have next most emphasis because time is important
    Date = Generic.Subheading

    # Project and context should have equal weight, and be in different colors
    Project = Generic.Error
    Context = String

    # If tag functionality is added, it should have the same weight as Project
    # and Context, and a different color. Generic.Traceback would work well.

    # Regex patterns for building up rules; dates, priorities, projects, and
    # contexts are all atomic
    # TODO: Make date regex more ISO 8601 compliant
    date_regex = r'\d{4,}-\d{2}-\d{2}'
    priority_regex = r'\([A-Z]\)'
    project_regex = r'\+\S+'
    context_regex = r'@\S+'

    # Compound regex expressions
    complete_one_date_regex = r'(x )(' + date_regex + r')'
    complete_two_date_regex = (complete_one_date_regex + r'( )(' +
                               date_regex + r')')
    priority_date_regex = r'(' + priority_regex + r')( )(' + date_regex + r')'

    tokens = {
        # Should parse starting at beginning of line; each line is a task
        'root': [
            # Complete task entry points: two total:
            # 1. Complete task with two dates
            (complete_two_date_regex, bygroups(CompleteTaskText, Date,
                                               CompleteTaskText, Date),
             'complete'),
            # 2. Complete task with one date
            (complete_one_date_regex, bygroups(CompleteTaskText, Date),
             'complete'),

            # Incomplete task entry points: six total:
            # 1. Priority plus date
            (priority_date_regex, bygroups(Priority, IncompleteTaskText, Date),
             'incomplete'),
            # 2. Priority only
            (priority_regex, Priority, 'incomplete'),
            # 3. Leading date
            (date_regex, Date, 'incomplete'),
            # 4. Leading context
            (context_regex, Context, 'incomplete'),
            # 5. Leading project
            (project_regex, Project, 'incomplete'),
            # 6. Non-whitespace catch-all
            (r'\S+', IncompleteTaskText, 'incomplete'),
        ],

        # Parse a complete task
        'complete': [
            # Newline indicates end of task, should return to root
            (r'\s*\n', CompleteTaskText, '#pop'),
            # Tokenize contexts and projects
            (context_regex, Context),
            (project_regex, Project),
            # Tokenize non-whitespace text
            (r'\S+', CompleteTaskText),
            # Tokenize whitespace not containing a newline
            (r'\s+', CompleteTaskText),
        ],

        # Parse an incomplete task
        'incomplete': [
            # Newline indicates end of task, should return to root
            (r'\s*\n', IncompleteTaskText, '#pop'),
            # Tokenize contexts and projects
            (context_regex, Context),
            (project_regex, Project),
            # Tokenize non-whitespace text
            (r'\S+', IncompleteTaskText),
            # Tokenize whitespace not containing a newline
            (r'\s+', IncompleteTaskText),
        ],
    }
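
# Illustrative sketch (added, not part of the original module): two sample
# tasks and the entry points above that claim them --
#
#     x 2023-09-01 2023-08-15 Pay rent +home @post-office
#         -> complete_two_date_regex, then the 'complete' state
#            (+home as Project, @post-office as Context)
#     (A) 2023-09-18 Write report +work @laptop
#         -> priority_date_regex, then the 'incomplete' state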



class NotmuchLexer(RegexLexer):
    """
    For Notmuch email text format.

    .. versionadded:: 2.5

    Additional options accepted:

    `body_lexer`
        If given, highlight the contents of the message body with the specified
        lexer, else guess it according to the body content (default: ``None``).
    """

    name = 'Notmuch'
    url = 'https://notmuchmail.org/'
    aliases = ['notmuch']

    def _highlight_code(self, match):
        code = match.group(1)

        try:
            if self.body_lexer:
                lexer = get_lexer_by_name(self.body_lexer)
            else:
                lexer = guess_lexer(code.strip())
        except ClassNotFound:
            lexer = get_lexer_by_name('text')

        yield from lexer.get_tokens_unprocessed(code)

    tokens = {
        'root': [
            (r'\fmessage\{\s*', Keyword, ('message', 'message-attr')),
        ],
        'message-attr': [
            (r'(\s*id:\s*)(\S+)', bygroups(Name.Attribute, String)),
            (r'(\s*(?:depth|match|excluded):\s*)(\d+)',
             bygroups(Name.Attribute, Number.Integer)),
            (r'(\s*filename:\s*)(.+\n)',
             bygroups(Name.Attribute, String)),
            default('#pop'),
        ],
        'message': [
            (r'\fmessage\}\n', Keyword, '#pop'),
            (r'\fheader\{\n', Keyword, 'header'),
            (r'\fbody\{\n', Keyword, 'body'),
        ],
        'header': [
            (r'\fheader\}\n', Keyword, '#pop'),
            (r'((?:Subject|From|To|Cc|Date):\s*)(.*\n)',
             bygroups(Name.Attribute, String)),
            (r'(.*)(\s*\(.*\))(\s*\(.*\)\n)',
             bygroups(Generic.Strong, Literal, Name.Tag)),
        ],
        'body': [
            (r'\fpart\{\n', Keyword, 'part'),
            (r'\f(part|attachment)\{\s*', Keyword, ('part', 'part-attr')),
            (r'\fbody\}\n', Keyword, '#pop'),
        ],
        'part-attr': [
            (r'(ID:\s*)(\d+)', bygroups(Name.Attribute, Number.Integer)),
            (r'(,\s*)((?:Filename|Content-id):\s*)([^,]+)',
             bygroups(Punctuation, Name.Attribute, String)),
            (r'(,\s*)(Content-type:\s*)(.+\n)',
             bygroups(Punctuation, Name.Attribute, String)),
            default('#pop'),
        ],
        'part': [
            (r'\f(?:part|attachment)\}\n', Keyword, '#pop'),
            (r'\f(?:part|attachment)\{\s*', Keyword, ('#push', 'part-attr')),
            (r'^Non-text part: .*\n', Comment),
            (r'(?s)(.*?(?=\f(?:part|attachment)\}\n))', _highlight_code),
        ],
    }

    def analyse_text(text):
        return 1.0 if text.startswith('\fmessage{') else 0.0

    def __init__(self, **options):
        self.body_lexer = options.get('body_lexer', None)
        RegexLexer.__init__(self, **options)
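
# Usage sketch (added, not part of the original module): the ``body_lexer``
# option travels through the usual lexer-options mechanism, e.g.
#
#     from pygments.lexers import get_lexer_by_name
#     lexer = get_lexer_by_name('notmuch', body_lexer='diff')
#
# which forces message bodies to be highlighted with the diff lexer instead of
# relying on guess_lexer().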



class KernelLogLexer(RegexLexer):
    """
    For Linux Kernel log ("dmesg") output.

    .. versionadded:: 2.6
    """
    name = 'Kernel log'
    aliases = ['kmsg', 'dmesg']
    filenames = ['*.kmsg', '*.dmesg']

    tokens = {
        'root': [
            (r'^[^:]+:debug : (?=\[)', Text, 'debug'),
            (r'^[^:]+:info  : (?=\[)', Text, 'info'),
            (r'^[^:]+:warn  : (?=\[)', Text, 'warn'),
            (r'^[^:]+:notice: (?=\[)', Text, 'warn'),
            (r'^[^:]+:err   : (?=\[)', Text, 'error'),
            (r'^[^:]+:crit  : (?=\[)', Text, 'error'),
            (r'^(?=\[)', Text, 'unknown'),
        ],
        'unknown': [
            (r'^(?=.+(warning|notice|audit|deprecated))', Text, 'warn'),
            (r'^(?=.+(error|critical|fail|Bug))', Text, 'error'),
            default('info'),
        ],
        'base': [
            (r'\[[0-9. ]+\] ', Number),
            (r'(?<=\] ).+?:', Keyword),
            (r'\n', Text, '#pop'),
        ],
        'debug': [
            include('base'),
            (r'.+\n', Comment, '#pop')
        ],
        'info': [
            include('base'),
            (r'.+\n', Text, '#pop')
        ],
        'warn': [
            include('base'),
            (r'.+\n', Generic.Strong, '#pop')
        ],
        'error': [
            include('base'),
            (r'.+\n', Generic.Error, '#pop')
        ]
    }
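

# ---------------------------------------------------------------------------
# Minimal demo sketch (added by the editor, not part of the original module):
# running this file directly highlights a small HTTP exchange and a dmesg-style
# line with the lexers defined above. Only standard Pygments APIs
# (``highlight`` and ``TerminalFormatter``) are used.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    from pygments import highlight
    from pygments.formatters import TerminalFormatter

    http_sample = (
        "GET /index.html HTTP/1.1\r\n"
        "Host: example.com\r\n"
        "Content-Type: application/json\r\n"
        "\r\n"
        '{"hello": "world"}\n'
    )
    dmesg_sample = "[    0.123456] usb 1-1: new high-speed USB device number 2\n"

    print(highlight(http_sample, HttpLexer(), TerminalFormatter()))
    print(highlight(dmesg_sample, KernelLogLexer(), TerminalFormatter()))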