Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/email/feedparser.py: 31%

333 statements  

« prev     ^ index     » next       coverage.py v7.0.1, created at 2022-12-25 06:11 +0000

1# Copyright (C) 2004-2006 Python Software Foundation 

2# Authors: Baxter, Wouters and Warsaw 

3# Contact: email-sig@python.org 

4 

5"""FeedParser - An email feed parser. 

6 

7The feed parser implements an interface for incrementally parsing an email 

8message, line by line. This has advantages for certain applications, such as 

9those reading email messages off a socket. 

10 

11FeedParser.feed() is the primary interface for pushing new data into the 

12parser. It returns when there's nothing more it can do with the available 

13data. When you have no more data to push into the parser, call .close(). 

14This completes the parsing and returns the root message object. 

15 

16The other advantage of this parser is that it will never raise a parsing 

17exception. Instead, when it finds something unexpected, it adds a 'defect' to 

18the current message. Defects are just instances that live on the message 

19object's .defects attribute. 

20""" 

21 

22__all__ = ['FeedParser', 'BytesFeedParser'] 

23 

24import re 

25 

26from email import errors 

27from email._policybase import compat32 

28from collections import deque 

29from io import StringIO 

30 

# Line-ending patterns.  Each one recognises CRLF, a lone CR or a lone LF.
# The parenthesised variants capture the line ending as group 1; NLCRE_eol
# additionally anchors it to the very end of the string.
NLCRE = re.compile(r'\r\n|\r|\n')
NLCRE_bol = re.compile(r'(\r\n|\r|\n)')
NLCRE_eol = re.compile(r'(\r\n|\r|\n)\Z')
NLCRE_crack = re.compile(r'(\r\n|\r|\n)')

# Recognises a line that belongs to the header section: a unix-from line
# ("From "), a field name followed by a colon, or a continuation line that
# starts with a tab or space.
# RFC 2822 $3.6.8 Optional fields.  ftext is %d33-57 / %d59-126, Any character
# except controls, SP, and ":".
headerRE = re.compile(r'^(From |[\041-\071\073-\176]*:|[\t ])')

EMPTYSTRING = ''
NL = '\n'

# Unique sentinel meaning "no complete line is buffered yet"; it is compared
# by identity and can never be confused with a real line of text.
NeedMoreData = object()

42 

43 

44 

class BufferedSubFile(object):
    """Line buffer that is filled incrementally and read like a file.

    Text is pushed in arbitrary chunks and handed back one complete line at
    a time.  A stack of "false EOF" predicates can be installed: whenever a
    predicate matches the next line, readline() answers with the empty
    string (the EOF marker) and leaves that line in the buffer, so a parser
    can simply read "until EOF" to consume one nested message.
    """

    def __init__(self):
        # Holds the trailing, not yet terminated line.  See issue 22233 for
        # why this is a text stream and not a list.
        self._partial = StringIO(newline='')
        # Complete lines waiting to be read, oldest first.
        self._lines = deque()
        # Stack of false-EOF predicates, innermost last.
        self._eofstack = []
        # Becomes True once close() has been called.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Install *pred* as the innermost false-EOF predicate."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the innermost false-EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Flush any unterminated trailing text as lines and mark closed."""
        pending = self._partial
        pending.seek(0)
        self.pushlines(pending.readlines())
        pending.seek(0)
        pending.truncate()
        self._closed = True

    def readline(self):
        """Return the next line, '' at (false) EOF, or NeedMoreData."""
        if self._lines:
            candidate = self._lines.popleft()
            # RFC 2046, section 5.1.2 requires outer level boundaries to be
            # recognised at any depth of nesting, so every predicate is
            # consulted, starting with the most deeply nested one.
            for matches_eof in reversed(self._eofstack):
                if matches_eof(candidate):
                    # False EOF: keep the line available for the caller
                    # that installed the predicate.
                    self._lines.appendleft(candidate)
                    return ''
            return candidate
        # Nothing buffered: real EOF only if the input has been closed.
        if self._closed:
            return ''
        return NeedMoreData

    def unreadline(self, line):
        """Put *line* back so that it is the next line read."""
        assert line is not NeedMoreData
        self._lines.appendleft(line)

    def push(self, data):
        """Push some new data into this object."""
        buffered = self._partial
        buffered.write(data)
        if not ('\n' in data or '\r' in data):
            # Still no line terminator in sight; keep accumulating.
            return

        # Re-read everything buffered so far as lines, line endings kept.
        buffered.seek(0)
        complete = buffered.readlines()
        buffered.seek(0)
        buffered.truncate()

        # A final piece that does not end in '\n' stays in the partial
        # buffer.  Only '\n' is tested because a piece ending in '\r' might
        # be the first half of a '\r\n' split across two pushes (see bugs
        # 1555570 and 1721862).
        if not complete[-1].endswith('\n'):
            buffered.write(complete.pop())
        self.pushlines(complete)

    def pushlines(self, lines):
        """Append already split *lines* to the queue of complete lines."""
        self._lines.extend(lines)

    def __iter__(self):
        return self

    def __next__(self):
        upcoming = self.readline()
        if upcoming == '':
            raise StopIteration
        return upcoming

133 

134 

135 

class FeedParser:
    """A feed-style parser of email.

    Text is supplied piecemeal through feed(); close() finishes the parse
    and returns the root message.  Problems found while parsing are reported
    through the policy's handle_defect() rather than through parser-specific
    exceptions.
    """

    def __init__(self, _factory=None, *, policy=compat32):
        """Set up the parser.

        _factory is the callable used to create each new message object.  It
        is called as _factory(policy=policy); if a probe call made here
        raises TypeError it is treated as an old-style factory and is called
        with no arguments instead.  When _factory is None the policy's
        message_factory is used, falling back to email.message.Message.

        The policy keyword specifies a policy object that controls a number
        of aspects of the parser's operation.  The default policy maintains
        backward compatibility.
        """
        self.policy = policy
        self._old_style_factory = False
        if _factory is None:
            if policy.message_factory is None:
                from email.message import Message
                self._factory = Message
            else:
                self._factory = policy.message_factory
        else:
            self._factory = _factory
            try:
                # Probe call: the resulting object is discarded.
                _factory(policy=self.policy)
            except TypeError:
                # Assume this is an old-style factory
                self._old_style_factory = True
        self._input = BufferedSubFile()
        # Stack of messages currently being built, root first.
        self._msgstack = []
        # Bound __next__ of the parsing generator; each call resumes parsing.
        self._parse = self._parsegen().__next__
        # Message on top of the stack / most recently created message.
        self._cur = None
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        """Resume the parsing generator, ignoring its normal completion."""
        try:
            self._parse()
        except StopIteration:
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        # Look for final set of defects
        if root.get_content_maintype() == 'multipart' \
               and not root.is_multipart():
            defect = errors.MultipartInvariantViolationDefect()
            self.policy.handle_defect(root, defect)
        return root

    def _new_message(self):
        """Create a message, attach it to its parent and make it current."""
        if self._old_style_factory:
            msg = self._factory()
        else:
            msg = self._factory(policy=self.policy)
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._last = msg

    def _pop_message(self):
        """Pop and return the current message; its parent becomes current."""
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        """Generator that parses one message (recursing for subparts).

        Yields NeedMoreData whenever the input buffer has no complete line
        available; it is resumed after more data has been fed or the input
        has been closed.
        """
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away. Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    defect = errors.MissingHeaderBodySeparatorDefect()
                    self.policy.handle_defect(self._cur, defect)
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could raise errors.  All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line.  We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages.  A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF.  We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary.  That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                defect = errors.NoBoundaryInMultipartDefect()
                self.policy.handle_defect(self._cur, defect)
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Make sure a valid content type was specified per RFC 2045:6.4.
            if (str(self._cur.get('content-transfer-encoding', '8bit')).lower()
                    not in ('7bit', '8bit', 'binary')):
                defect = errors.InvalidMultipartContentTransferEncodingDefect()
                self.policy.handle_defect(self._cur, defect)
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary.  Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            linesep = False
            close_boundary_seen = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart.  If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        close_boundary_seen = True
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary.  Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts.  Consume any
                    # multiple boundary lines that may be following.  Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last._payload
                        if isinstance(payload, str):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last._payload = payload
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary.  If we're still
            # capturing the preamble, we never saw the start boundary.  Note
            # that as a defect and store the captured text as the payload.
            if capturing_preamble:
                defect = errors.StartBoundaryNotFoundDefect()
                self.policy.handle_defect(self._cur, defect)
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                # NOTE(review): the remaining input is consumed here but
                # never appended, so the epilogue is always set to ''.
                # Text following a close boundary is therefore dropped in
                # this branch; confirm that this is intended before
                # changing it.
                epilogue = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                self._cur.epilogue = EMPTYSTRING.join(epilogue)
                return
            # If we're not processing the preamble, then we might have seen
            # EOF without seeing that end boundary...that is also a defect.
            if not close_boundary_seen:
                defect = errors.CloseBoundaryNotFoundDefect()
                self.policy.handle_defect(self._cur, defect)
                return
            # Everything from here to the EOF is epilogue.  If the end boundary
            # ended in a newline, we'll need to make sure the epilogue isn't
            # None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue.  Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        """Store the header lines in *lines* on the current message.

        lines is the list of raw header lines (line endings included) that
        _parsegen() collected.  Continuation lines are folded into the
        preceding header, a leading unix-from line is recorded with
        set_unixfrom(), and malformed lines are reported through the
        policy's handle_defect().
        """
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation.  This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = errors.FirstHeaderLineIsContinuationDefect(line)
                    self.policy.handle_defect(self._cur, defect)
                    continue
                lastvalue.append(line)
                continue
            if lastheader:
                self._cur.set_raw(*self.policy.header_source_parse(lastvalue))
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Note this as a defect
                    # and ignore it.  Reported through the policy, like
                    # every other defect, so that raise_on_defect and
                    # register_defect overrides are honoured.
                    defect = errors.MisplacedEnvelopeHeaderDefect(line)
                    self.policy.handle_defect(self._cur, defect)
                    continue
            # Split the line on the colon separating field name from value.
            # There will always be a colon, because if there wasn't the part of
            # the parser that calls us would have started parsing the body.
            i = line.find(':')

            # If the colon is on the start of the line the header is clearly
            # malformed, but we might be able to salvage the rest of the
            # message. Track the error (through the policy) but keep going.
            if i == 0:
                defect = errors.InvalidHeaderDefect("Missing header name.")
                self.policy.handle_defect(self._cur, defect)
                continue

            assert i>0, "_parse_headers fed line with no : and no leading WS"
            lastheader = line[:i]
            lastvalue = [line]
        # Done with all the lines, so handle the last header.
        if lastheader:
            self._cur.set_raw(*self.policy.header_source_parse(lastvalue))

530 

531 

class BytesFeedParser(FeedParser):
    """Like FeedParser, but feed accepts bytes."""

    def feed(self, data):
        """Decode *data* and push the resulting text into the parser.

        The bytes are decoded as ASCII with the 'surrogateescape' error
        handler, so non-ASCII bytes are carried through as lone surrogates
        instead of raising.
        """
        text = data.decode('ascii', 'surrogateescape')
        super().feed(text)