Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/email/_header_value_parser.py: 20%

1851 statements  

coverage.py v7.0.1, created at 2022-12-25 06:11 +0000

1"""Header value parser implementing various email-related RFC parsing rules. 

2 

3The parsing methods defined in this module implement various email related 

4parsing rules. Principal among them is RFC 5322, which is the follow-on

5to RFC 2822 and primarily a clarification of it. It also implements

6RFC 2047 encoded word decoding. 

7 

8RFC 5322 goes to considerable trouble to maintain backward compatibility with 

9RFC 822 in the parse phase, while cleaning up the structure on the generation 

10phase. This parser supports correct RFC 5322 generation by tagging white space 

11as folding white space only when folding is allowed in the non-obsolete rule 

12sets. Actually, the parser is even more generous when accepting input than RFC 

135322 mandates, following the spirit of Postel's Law, which RFC 5322 encourages. 

14Where possible, deviations from the standard are annotated on the 'defects'

15attribute of tokens that deviate. 

16 

17The general structure of the parser follows RFC 5322, and uses its terminology 

18where there is a direct correspondence. Where the implementation requires a 

19somewhat different structure than that used by the formal grammar, new terms 

20that mimic the closest existing terms are used. Thus, it really helps to have 

21a copy of RFC 5322 handy when studying this code. 

22 

23Input to the parser is a string that has already been unfolded according to 

24RFC 5322 rules. According to the RFC this unfolding is the very first step, and 

25this parser leaves the unfolding step to a higher level message parser, which 

26will have already detected the line breaks that need unfolding while 

27determining the beginning and end of each header. 

28 

29The output of the parser is a TokenList object, which is a list subclass. A 

30TokenList is a recursive data structure. The terminal nodes of the structure 

31are Terminal objects, which are subclasses of str. These do not correspond 

32directly to terminal objects in the formal grammar, but are instead more 

33practical higher level combinations of true terminals. 

34 

35All TokenList and Terminal objects have a 'value' attribute, which produces the 

36semantically meaningful value of that part of the parse subtree. The value of 

37all whitespace tokens (no matter how many sub-tokens they may contain) is a 

38single space, as per the RFC rules. This includes 'CFWS', which is herein 

39included in the general class of whitespace tokens. There is one exception to 

40the rule that whitespace tokens are collapsed into single spaces in values: in 

41the value of a 'bare-quoted-string' (a quoted-string with no leading or 

42trailing whitespace), any whitespace that appeared between the quotation marks 

43is preserved in the returned value. Note that in all Terminal strings quoted 

44pairs are turned into their unquoted values. 

45 

46All TokenList and Terminal objects also have a string value, which attempts to 

47be a "canonical" representation of the RFC-compliant form of the substring that 

48produced the parsed subtree, including minimal use of quoted pair quoting. 

49Whitespace runs are not collapsed. 

50 

51Comment tokens also have a 'content' attribute providing the string found 

52between the parens (including any nested comments) with whitespace preserved. 

53 

54All TokenList and Terminal objects have a 'defects' attribute which is a 

55possibly empty list of all the defects found while creating the token. Defects

56may appear on any token in the tree, and a composite list of all defects in the 

57subtree is available through the 'all_defects' attribute of any node. (For 

58Terminal nodes, x.defects == x.all_defects.)

59 

60Each object in a parse tree is called a 'token', and each has a 'token_type' 

61attribute that gives the name from the RFC 5322 grammar that it represents. 

62Not all RFC 5322 nodes are produced, and there is one non-RFC 5322 node that 

63may be produced: 'ptext'. A 'ptext' is a string of printable ascii characters. 

64It is returned in place of lists of (ctext/quoted-pair) and 

65(qtext/quoted-pair). 

66 

67XXX: provide complete list of token types. 

68""" 
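# A quick illustrative sketch of the parsing interface described above (the
# sample header text is made up; see get_unstructured later in this module):
#
#   >>> from email._header_value_parser import get_unstructured
#   >>> tokens = get_unstructured('Hello =?utf-8?q?world?=')
#   >>> tokens.token_type
#   'unstructured'
#   >>> tokens.value
#   'Hello world'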

69 

70import re 

71import sys 

72import urllib # For urllib.parse.unquote 

73from string import hexdigits 

74from operator import itemgetter 

75from email import _encoded_words as _ew 

76from email import errors 

77from email import utils 

78 

79# 

80# Useful constants and functions 

81# 

82 

83WSP = set(' \t') 

84CFWS_LEADER = WSP | set('(') 

85SPECIALS = set(r'()<>@,:;.\"[]') 

86ATOM_ENDS = SPECIALS | WSP 

87DOT_ATOM_ENDS = ATOM_ENDS - set('.') 

88# '.', '"', and '(' do not end phrases in order to support obs-phrase 

89PHRASE_ENDS = SPECIALS - set('."(') 

90TSPECIALS = (SPECIALS | set('/?=')) - set('.') 

91TOKEN_ENDS = TSPECIALS | WSP 

92ASPECIALS = TSPECIALS | set("*'%") 

93ATTRIBUTE_ENDS = ASPECIALS | WSP 

94EXTENDED_ATTRIBUTE_ENDS = ATTRIBUTE_ENDS - set('%') 

95 

96def quote_string(value): 

97 return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"' 
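# For example, backslashes are doubled and embedded quotes become quoted
# pairs (illustrative input):
#
#   >>> quote_string('say "hi"')
#   '"say \\"hi\\""'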

98 

99# Match an RFC 2047 word; it looks like =?utf-8?q?someword?=

100rfc2047_matcher = re.compile(r''' 

101 =\? # literal =? 

102 [^?]* # charset 

103 \? # literal ? 

104 [qQbB] # literal 'q' or 'b', case insensitive 

105 \? # literal ? 

106 .*? # encoded word 

107 \?= # literal ?= 

108''', re.VERBOSE | re.MULTILINE) 
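# For example (illustrative strings):
#
#   >>> bool(rfc2047_matcher.search('=?utf-8?q?hello?='))
#   True
#   >>> bool(rfc2047_matcher.search('plain text'))
#   False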

109 

110 

111# 

112# TokenList and its subclasses 

113# 

114 

115class TokenList(list): 

116 

117 token_type = None 

118 syntactic_break = True 

119 ew_combine_allowed = True 

120 

121 def __init__(self, *args, **kw): 

122 super().__init__(*args, **kw) 

123 self.defects = [] 

124 

125 def __str__(self): 

126 return ''.join(str(x) for x in self) 

127 

128 def __repr__(self): 

129 return '{}({})'.format(self.__class__.__name__, 

130 super().__repr__()) 

131 

132 @property 

133 def value(self): 

134 return ''.join(x.value for x in self if x.value) 

135 

136 @property 

137 def all_defects(self): 

138 return sum((x.all_defects for x in self), self.defects) 

139 

140 def startswith_fws(self): 

141 return self[0].startswith_fws() 

142 

143 @property 

144 def as_ew_allowed(self): 

145 """True if all top level tokens of this part may be RFC2047 encoded.""" 

146 return all(part.as_ew_allowed for part in self) 

147 

148 @property 

149 def comments(self): 

150 comments = [] 

151 for token in self: 

152 comments.extend(token.comments) 

153 return comments 

154 

155 def fold(self, *, policy): 

156 return _refold_parse_tree(self, policy=policy) 

157 

158 def pprint(self, indent=''): 

159 print(self.ppstr(indent=indent)) 

160 

161 def ppstr(self, indent=''): 

162 return '\n'.join(self._pp(indent=indent)) 

163 

164 def _pp(self, indent=''): 

165 yield '{}{}/{}('.format( 

166 indent, 

167 self.__class__.__name__, 

168 self.token_type) 

169 for token in self: 

170 if not hasattr(token, '_pp'): 

171 yield (indent + ' !! invalid element in token ' 

172 'list: {!r}'.format(token)) 

173 else: 

174 yield from token._pp(indent+' ') 

175 if self.defects: 

176 extra = ' Defects: {}'.format(self.defects) 

177 else: 

178 extra = '' 

179 yield '{}){}'.format(indent, extra) 

180 

181 

182class WhiteSpaceTokenList(TokenList): 

183 

184 @property 

185 def value(self): 

186 return ' ' 

187 

188 @property 

189 def comments(self): 

190 return [x.content for x in self if x.token_type=='comment'] 

191 

192 

193class UnstructuredTokenList(TokenList): 

194 token_type = 'unstructured' 

195 

196 

197class Phrase(TokenList): 

198 token_type = 'phrase' 

199 

200class Word(TokenList): 

201 token_type = 'word' 

202 

203 

204class CFWSList(WhiteSpaceTokenList): 

205 token_type = 'cfws' 

206 

207 

208class Atom(TokenList): 

209 token_type = 'atom' 

210 

211 

212class Token(TokenList): 

213 token_type = 'token' 

214 encode_as_ew = False 

215 

216 

217class EncodedWord(TokenList): 

218 token_type = 'encoded-word' 

219 cte = None 

220 charset = None 

221 lang = None 

222 

223 

224class QuotedString(TokenList): 

225 

226 token_type = 'quoted-string' 

227 

228 @property 

229 def content(self): 

230 for x in self: 

231 if x.token_type == 'bare-quoted-string': 

232 return x.value 

233 

234 @property 

235 def quoted_value(self): 

236 res = [] 

237 for x in self: 

238 if x.token_type == 'bare-quoted-string': 

239 res.append(str(x)) 

240 else: 

241 res.append(x.value) 

242 return ''.join(res) 

243 

244 @property 

245 def stripped_value(self): 

246 for token in self: 

247 if token.token_type == 'bare-quoted-string': 

248 return token.value 

249 

250 

251class BareQuotedString(QuotedString): 

252 

253 token_type = 'bare-quoted-string' 

254 

255 def __str__(self): 

256 return quote_string(''.join(str(x) for x in self)) 

257 

258 @property 

259 def value(self): 

260 return ''.join(str(x) for x in self) 

261 

262 

263class Comment(WhiteSpaceTokenList): 

264 

265 token_type = 'comment' 

266 

267 def __str__(self): 

268 return ''.join(sum([ 

269 ["("], 

270 [self.quote(x) for x in self], 

271 [")"], 

272 ], [])) 

273 

274 def quote(self, value): 

275 if value.token_type == 'comment': 

276 return str(value) 

277 return str(value).replace('\\', '\\\\').replace( 

278 '(', r'\(').replace( 

279 ')', r'\)') 

280 

281 @property 

282 def content(self): 

283 return ''.join(str(x) for x in self) 

284 

285 @property 

286 def comments(self): 

287 return [self.content] 

288 

289class AddressList(TokenList): 

290 

291 token_type = 'address-list' 

292 

293 @property 

294 def addresses(self): 

295 return [x for x in self if x.token_type=='address'] 

296 

297 @property 

298 def mailboxes(self): 

299 return sum((x.mailboxes 

300 for x in self if x.token_type=='address'), []) 

301 

302 @property 

303 def all_mailboxes(self): 

304 return sum((x.all_mailboxes 

305 for x in self if x.token_type=='address'), []) 

306 

307 

308class Address(TokenList): 

309 

310 token_type = 'address' 

311 

312 @property 

313 def display_name(self): 

314 if self[0].token_type == 'group': 

315 return self[0].display_name 

316 

317 @property 

318 def mailboxes(self): 

319 if self[0].token_type == 'mailbox': 

320 return [self[0]] 

321 elif self[0].token_type == 'invalid-mailbox': 

322 return [] 

323 return self[0].mailboxes 

324 

325 @property 

326 def all_mailboxes(self): 

327 if self[0].token_type == 'mailbox': 

328 return [self[0]] 

329 elif self[0].token_type == 'invalid-mailbox': 

330 return [self[0]] 

331 return self[0].all_mailboxes 

332 

333class MailboxList(TokenList): 

334 

335 token_type = 'mailbox-list' 

336 

337 @property 

338 def mailboxes(self): 

339 return [x for x in self if x.token_type=='mailbox'] 

340 

341 @property 

342 def all_mailboxes(self): 

343 return [x for x in self 

344 if x.token_type in ('mailbox', 'invalid-mailbox')] 

345 

346 

347class GroupList(TokenList): 

348 

349 token_type = 'group-list' 

350 

351 @property 

352 def mailboxes(self): 

353 if not self or self[0].token_type != 'mailbox-list': 

354 return [] 

355 return self[0].mailboxes 

356 

357 @property 

358 def all_mailboxes(self): 

359 if not self or self[0].token_type != 'mailbox-list': 

360 return [] 

361 return self[0].all_mailboxes 

362 

363 

364class Group(TokenList): 

365 

366 token_type = "group" 

367 

368 @property 

369 def mailboxes(self): 

370 if self[2].token_type != 'group-list': 

371 return [] 

372 return self[2].mailboxes 

373 

374 @property 

375 def all_mailboxes(self): 

376 if self[2].token_type != 'group-list': 

377 return [] 

378 return self[2].all_mailboxes 

379 

380 @property 

381 def display_name(self): 

382 return self[0].display_name 

383 

384 

385class NameAddr(TokenList): 

386 

387 token_type = 'name-addr' 

388 

389 @property 

390 def display_name(self): 

391 if len(self) == 1: 

392 return None 

393 return self[0].display_name 

394 

395 @property 

396 def local_part(self): 

397 return self[-1].local_part 

398 

399 @property 

400 def domain(self): 

401 return self[-1].domain 

402 

403 @property 

404 def route(self): 

405 return self[-1].route 

406 

407 @property 

408 def addr_spec(self): 

409 return self[-1].addr_spec 

410 

411 

412class AngleAddr(TokenList): 

413 

414 token_type = 'angle-addr' 

415 

416 @property 

417 def local_part(self): 

418 for x in self: 

419 if x.token_type == 'addr-spec': 

420 return x.local_part 

421 

422 @property 

423 def domain(self): 

424 for x in self: 

425 if x.token_type == 'addr-spec': 

426 return x.domain 

427 

428 @property 

429 def route(self): 

430 for x in self: 

431 if x.token_type == 'obs-route': 

432 return x.domains 

433 

434 @property 

435 def addr_spec(self): 

436 for x in self: 

437 if x.token_type == 'addr-spec': 

438 if x.local_part: 

439 return x.addr_spec 

440 else: 

441 return quote_string(x.local_part) + x.addr_spec 

442 else: 

443 return '<>' 

444 

445 

446class ObsRoute(TokenList): 

447 

448 token_type = 'obs-route' 

449 

450 @property 

451 def domains(self): 

452 return [x.domain for x in self if x.token_type == 'domain'] 

453 

454 

455class Mailbox(TokenList): 

456 

457 token_type = 'mailbox' 

458 

459 @property 

460 def display_name(self): 

461 if self[0].token_type == 'name-addr': 

462 return self[0].display_name 

463 

464 @property 

465 def local_part(self): 

466 return self[0].local_part 

467 

468 @property 

469 def domain(self): 

470 return self[0].domain 

471 

472 @property 

473 def route(self): 

474 if self[0].token_type == 'name-addr': 

475 return self[0].route 

476 

477 @property 

478 def addr_spec(self): 

479 return self[0].addr_spec 

480 

481 

482class InvalidMailbox(TokenList): 

483 

484 token_type = 'invalid-mailbox' 

485 

486 @property 

487 def display_name(self): 

488 return None 

489 

490 local_part = domain = route = addr_spec = display_name 

491 

492 

493class Domain(TokenList): 

494 

495 token_type = 'domain' 

496 as_ew_allowed = False 

497 

498 @property 

499 def domain(self): 

500 return ''.join(super().value.split()) 

501 

502 

503class DotAtom(TokenList): 

504 token_type = 'dot-atom' 

505 

506 

507class DotAtomText(TokenList): 

508 token_type = 'dot-atom-text' 

509 as_ew_allowed = True 

510 

511 

512class NoFoldLiteral(TokenList): 

513 token_type = 'no-fold-literal' 

514 as_ew_allowed = False 

515 

516 

517class AddrSpec(TokenList): 

518 

519 token_type = 'addr-spec' 

520 as_ew_allowed = False 

521 

522 @property 

523 def local_part(self): 

524 return self[0].local_part 

525 

526 @property 

527 def domain(self): 

528 if len(self) < 3: 

529 return None 

530 return self[-1].domain 

531 

532 @property 

533 def value(self): 

534 if len(self) < 3: 

535 return self[0].value 

536 return self[0].value.rstrip()+self[1].value+self[2].value.lstrip() 

537 

538 @property 

539 def addr_spec(self): 

540 nameset = set(self.local_part) 

541 if len(nameset) > len(nameset-DOT_ATOM_ENDS): 

542 lp = quote_string(self.local_part) 

543 else: 

544 lp = self.local_part 

545 if self.domain is not None: 

546 return lp + '@' + self.domain 

547 return lp 

548 

549 

550class ObsLocalPart(TokenList): 

551 

552 token_type = 'obs-local-part' 

553 as_ew_allowed = False 

554 

555 

556class DisplayName(Phrase): 

557 

558 token_type = 'display-name' 

559 ew_combine_allowed = False 

560 

561 @property 

562 def display_name(self): 

563 res = TokenList(self) 

564 if len(res) == 0: 

565 return res.value 

566 if res[0].token_type == 'cfws': 

567 res.pop(0) 

568 else: 

569 if res[0][0].token_type == 'cfws': 

570 res[0] = TokenList(res[0][1:]) 

571 if res[-1].token_type == 'cfws': 

572 res.pop() 

573 else: 

574 if res[-1][-1].token_type == 'cfws': 

575 res[-1] = TokenList(res[-1][:-1]) 

576 return res.value 

577 

578 @property 

579 def value(self): 

580 quote = False 

581 if self.defects: 

582 quote = True 

583 else: 

584 for x in self: 

585 if x.token_type == 'quoted-string': 

586 quote = True 

587 if len(self) != 0 and quote: 

588 pre = post = '' 

589 if self[0].token_type=='cfws' or self[0][0].token_type=='cfws': 

590 pre = ' ' 

591 if self[-1].token_type=='cfws' or self[-1][-1].token_type=='cfws': 

592 post = ' ' 

593 return pre+quote_string(self.display_name)+post 

594 else: 

595 return super().value 

596 

597 

598class LocalPart(TokenList): 

599 

600 token_type = 'local-part' 

601 as_ew_allowed = False 

602 

603 @property 

604 def value(self): 

605 if self[0].token_type == "quoted-string": 

606 return self[0].quoted_value 

607 else: 

608 return self[0].value 

609 

610 @property 

611 def local_part(self): 

612 # Strip whitespace from front, back, and around dots. 

613 res = [DOT] 

614 last = DOT 

615 last_is_tl = False 

616 for tok in self[0] + [DOT]: 

617 if tok.token_type == 'cfws': 

618 continue 

619 if (last_is_tl and tok.token_type == 'dot' and 

620 last[-1].token_type == 'cfws'): 

621 res[-1] = TokenList(last[:-1]) 

622 is_tl = isinstance(tok, TokenList) 

623 if (is_tl and last.token_type == 'dot' and 

624 tok[0].token_type == 'cfws'): 

625 res.append(TokenList(tok[1:])) 

626 else: 

627 res.append(tok) 

628 last = res[-1] 

629 last_is_tl = is_tl 

630 res = TokenList(res[1:-1]) 

631 return res.value 

632 

633 

634class DomainLiteral(TokenList): 

635 

636 token_type = 'domain-literal' 

637 as_ew_allowed = False 

638 

639 @property 

640 def domain(self): 

641 return ''.join(super().value.split()) 

642 

643 @property 

644 def ip(self): 

645 for x in self: 

646 if x.token_type == 'ptext': 

647 return x.value 

648 

649 

650class MIMEVersion(TokenList): 

651 

652 token_type = 'mime-version' 

653 major = None 

654 minor = None 

655 

656 

657class Parameter(TokenList): 

658 

659 token_type = 'parameter' 

660 sectioned = False 

661 extended = False 

662 charset = 'us-ascii' 

663 

664 @property 

665 def section_number(self): 

666 # Because the first token, the attribute (name), eats any CFWS, the second

667 # token is always the section if there is one. 

668 return self[1].number if self.sectioned else 0 

669 

670 @property 

671 def param_value(self): 

672 # This is part of the "handle quoted extended parameters" hack. 

673 for token in self: 

674 if token.token_type == 'value': 

675 return token.stripped_value 

676 if token.token_type == 'quoted-string': 

677 for token in token: 

678 if token.token_type == 'bare-quoted-string': 

679 for token in token: 

680 if token.token_type == 'value': 

681 return token.stripped_value 

682 return '' 

683 

684 

685class InvalidParameter(Parameter): 

686 

687 token_type = 'invalid-parameter' 

688 

689 

690class Attribute(TokenList): 

691 

692 token_type = 'attribute' 

693 

694 @property 

695 def stripped_value(self): 

696 for token in self: 

697 if token.token_type.endswith('attrtext'): 

698 return token.value 

699 

700class Section(TokenList): 

701 

702 token_type = 'section' 

703 number = None 

704 

705 

706class Value(TokenList): 

707 

708 token_type = 'value' 

709 

710 @property 

711 def stripped_value(self): 

712 token = self[0] 

713 if token.token_type == 'cfws': 

714 token = self[1] 

715 if token.token_type.endswith( 

716 ('quoted-string', 'attribute', 'extended-attribute')): 

717 return token.stripped_value 

718 return self.value 

719 

720 

721class MimeParameters(TokenList): 

722 

723 token_type = 'mime-parameters' 

724 syntactic_break = False 

725 

726 @property 

727 def params(self): 

728 # The RFC specifically states that the ordering of parameters is not 

729 # guaranteed and may be reordered by the transport layer. So we have 

730 # to assume the RFC 2231 pieces can come in any order. However, we 

731 # output them in the order that we first see a given name, which gives 

732 # us a stable __str__. 

733 params = {} # Using order preserving dict from Python 3.7+ 

734 for token in self: 

735 if not token.token_type.endswith('parameter'): 

736 continue 

737 if token[0].token_type != 'attribute': 

738 continue 

739 name = token[0].value.strip() 

740 if name not in params: 

741 params[name] = [] 

742 params[name].append((token.section_number, token)) 

743 for name, parts in params.items(): 

744 parts = sorted(parts, key=itemgetter(0)) 

745 first_param = parts[0][1] 

746 charset = first_param.charset 

747 # Our arbitrary error recovery is to ignore duplicate parameters, 

748 # to use appearance order if there are duplicate rfc 2231 parts, 

749 # and to ignore gaps. This mimics the error recovery of get_param. 

750 if not first_param.extended and len(parts) > 1: 

751 if parts[1][0] == 0: 

752 parts[1][1].defects.append(errors.InvalidHeaderDefect( 

753 'duplicate parameter name; duplicate(s) ignored')) 

754 parts = parts[:1] 

755 # Else assume the *0* was missing...note that this is different 

756 # from get_param, but we registered a defect for this earlier. 

757 value_parts = [] 

758 i = 0 

759 for section_number, param in parts: 

760 if section_number != i: 

761 # We could get fancier here and look for a complete 

762 # duplicate extended parameter and ignore the second one 

763 # seen. But we're not doing that. The old code didn't. 

764 if not param.extended: 

765 param.defects.append(errors.InvalidHeaderDefect( 

766 'duplicate parameter name; duplicate ignored')) 

767 continue 

768 else: 

769 param.defects.append(errors.InvalidHeaderDefect( 

770 "inconsistent RFC2231 parameter numbering")) 

771 i += 1 

772 value = param.param_value 

773 if param.extended: 

774 try: 

775 value = urllib.parse.unquote_to_bytes(value) 

776 except UnicodeEncodeError: 

777 # source had surrogate escaped bytes. What we do now 

778 # is a bit of an open question. I'm not sure this is 

779 # the best choice, but it is what the old algorithm did 

780 value = urllib.parse.unquote(value, encoding='latin-1') 

781 else: 

782 try: 

783 value = value.decode(charset, 'surrogateescape') 

784 except LookupError: 

785 # XXX: there should really be a custom defect for 

786 # unknown character set to make it easy to find, 

787 # because otherwise unknown charset is a silent 

788 # failure. 

789 value = value.decode('us-ascii', 'surrogateescape') 

790 if utils._has_surrogates(value): 

791 param.defects.append(errors.UndecodableBytesDefect()) 

792 value_parts.append(value) 

793 value = ''.join(value_parts) 

794 yield name, value 

795 

796 def __str__(self): 

797 params = [] 

798 for name, value in self.params: 

799 if value: 

800 params.append('{}={}'.format(name, quote_string(value))) 

801 else: 

802 params.append(name) 

803 params = '; '.join(params) 

804 return ' ' + params if params else '' 

805 

806 

807class ParameterizedHeaderValue(TokenList): 

808 

809 # Set this false so that the value doesn't wind up on a new line even 

810 # if it and the parameters would fit there but not on the first line. 

811 syntactic_break = False 

812 

813 @property 

814 def params(self): 

815 for token in reversed(self): 

816 if token.token_type == 'mime-parameters': 

817 return token.params 

818 return {} 

819 

820 

821class ContentType(ParameterizedHeaderValue): 

822 token_type = 'content-type' 

823 as_ew_allowed = False 

824 maintype = 'text' 

825 subtype = 'plain' 

826 

827 

828class ContentDisposition(ParameterizedHeaderValue): 

829 token_type = 'content-disposition' 

830 as_ew_allowed = False 

831 content_disposition = None 

832 

833 

834class ContentTransferEncoding(TokenList): 

835 token_type = 'content-transfer-encoding' 

836 as_ew_allowed = False 

837 cte = '7bit' 

838 

839 

840class HeaderLabel(TokenList): 

841 token_type = 'header-label' 

842 as_ew_allowed = False 

843 

844 

845class MsgID(TokenList): 

846 token_type = 'msg-id' 

847 as_ew_allowed = False 

848 

849 def fold(self, policy): 

850 # message-id tokens may not be folded. 

851 return str(self) + policy.linesep 

852 

853 

854class MessageID(MsgID): 

855 token_type = 'message-id' 

856 

857 

858class InvalidMessageID(MessageID): 

859 token_type = 'invalid-message-id' 

860 

861 

862class Header(TokenList): 

863 token_type = 'header' 

864 

865 

866# 

867# Terminal classes and instances 

868# 

869 

870class Terminal(str): 

871 

872 as_ew_allowed = True 

873 ew_combine_allowed = True 

874 syntactic_break = True 

875 

876 def __new__(cls, value, token_type): 

877 self = super().__new__(cls, value) 

878 self.token_type = token_type 

879 self.defects = [] 

880 return self 

881 

882 def __repr__(self): 

883 return "{}({})".format(self.__class__.__name__, super().__repr__()) 

884 

885 def pprint(self): 

886 print(self.__class__.__name__ + '/' + self.token_type) 

887 

888 @property 

889 def all_defects(self): 

890 return list(self.defects) 

891 

892 def _pp(self, indent=''): 

893 return ["{}{}/{}({}){}".format( 

894 indent, 

895 self.__class__.__name__, 

896 self.token_type, 

897 super().__repr__(), 

898 '' if not self.defects else ' {}'.format(self.defects), 

899 )] 

900 

901 def pop_trailing_ws(self): 

902 # This terminates the recursion. 

903 return None 

904 

905 @property 

906 def comments(self): 

907 return [] 

908 

909 def __getnewargs__(self): 

910 return (str(self), self.token_type)

911 

912 

913class WhiteSpaceTerminal(Terminal): 

914 

915 @property 

916 def value(self): 

917 return ' ' 

918 

919 def startswith_fws(self): 

920 return True 

921 

922 

923class ValueTerminal(Terminal): 

924 

925 @property 

926 def value(self): 

927 return self 

928 

929 def startswith_fws(self): 

930 return False 

931 

932 

933class EWWhiteSpaceTerminal(WhiteSpaceTerminal): 

934 

935 @property 

936 def value(self): 

937 return '' 

938 

939 def __str__(self): 

940 return '' 

941 

942 

943class _InvalidEwError(errors.HeaderParseError): 

944 """Invalid encoded word found while parsing headers.""" 

945 

946 

947# XXX these need to become classes and be used as instances so

948# that a program can't change them in a parse tree and screw 

949# up other parse trees. Maybe should have tests for that, too. 

950DOT = ValueTerminal('.', 'dot') 

951ListSeparator = ValueTerminal(',', 'list-separator') 

952RouteComponentMarker = ValueTerminal('@', 'route-component-marker') 

953 

954# 

955# Parser 

956# 

957 

958# Parse strings according to RFC822/2047/2822/5322 rules. 

959# 

960# This is a stateless parser. Each get_XXX function accepts a string and 

961# returns either a Terminal or a TokenList representing the RFC object named 

962# by the method and a string containing the remaining unparsed characters 

963# from the input. Thus a parser method consumes the next syntactic construct 

964# of a given type and returns a token representing the construct plus the 

965# unparsed remainder of the input string. 

966# 

967# For example, if the first element of a structured header is a 'phrase', 

968# then: 

969# 

970# phrase, value = get_phrase(value) 

971# 

972# returns the complete phrase from the start of the string value, plus any 

973# characters left in the string after the phrase is removed. 

974 

975_wsp_splitter = re.compile(r'([{}]+)'.format(''.join(WSP))).split 

976_non_atom_end_matcher = re.compile(r"[^{}]+".format( 

977 re.escape(''.join(ATOM_ENDS)))).match 

978_non_printable_finder = re.compile(r"[\x00-\x20\x7F]").findall 

979_non_token_end_matcher = re.compile(r"[^{}]+".format( 

980 re.escape(''.join(TOKEN_ENDS)))).match 

981_non_attribute_end_matcher = re.compile(r"[^{}]+".format( 

982 re.escape(''.join(ATTRIBUTE_ENDS)))).match 

983_non_extended_attribute_end_matcher = re.compile(r"[^{}]+".format( 

984 re.escape(''.join(EXTENDED_ATTRIBUTE_ENDS)))).match 
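# For example, atext runs stop at the first character in ATOM_ENDS
# (illustrative input):
#
#   >>> _non_atom_end_matcher('fred.smith@example').group()
#   'fred'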

985 

986def _validate_xtext(xtext): 

987 """If input token contains ASCII non-printables, register a defect.""" 

988 

989 non_printables = _non_printable_finder(xtext) 

990 if non_printables: 

991 xtext.defects.append(errors.NonPrintableDefect(non_printables)) 

992 if utils._has_surrogates(xtext): 

993 xtext.defects.append(errors.UndecodableBytesDefect( 

994 "Non-ASCII characters found in header token")) 
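# A sketch of the defect registration (the control character is illustrative):
#
#   >>> v = ValueTerminal('abc\x01', 'vtext')
#   >>> _validate_xtext(v)
#   >>> [d.__class__.__name__ for d in v.defects]
#   ['NonPrintableDefect']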

995 

996def _get_ptext_to_endchars(value, endchars): 

997 """Scan printables/quoted-pairs until endchars and return unquoted ptext. 

998 

999 This function turns a run of qcontent, ccontent-without-comments, or 

1000 dtext-with-quoted-printables into a single string by unquoting any 

1001 quoted printables. It returns the string, the remaining value, and 

1002 a flag that is True iff there were any quoted printables decoded. 

1003 

1004 """ 

1005 fragment, *remainder = _wsp_splitter(value, 1) 

1006 vchars = [] 

1007 escape = False 

1008 had_qp = False 

1009 for pos in range(len(fragment)): 

1010 if fragment[pos] == '\\': 

1011 if escape: 

1012 escape = False 

1013 had_qp = True 

1014 else: 

1015 escape = True 

1016 continue 

1017 if escape: 

1018 escape = False 

1019 elif fragment[pos] in endchars: 

1020 break 

1021 vchars.append(fragment[pos]) 

1022 else: 

1023 pos = pos + 1 

1024 return ''.join(vchars), ''.join([fragment[pos:]] + remainder), had_qp 

1025 

1026def get_fws(value): 

1027 """FWS = 1*WSP 

1028 

1029 This isn't the RFC definition. We're using fws to represent tokens where 

1030 folding can be done, but when we are parsing the *un*folding has already 

1031 been done so we don't need to watch out for CRLF. 

1032 

1033 """ 

1034 newvalue = value.lstrip() 

1035 fws = WhiteSpaceTerminal(value[:len(value)-len(newvalue)], 'fws') 

1036 return fws, newvalue 
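# For example (illustrative input):
#
#   >>> fws, rest = get_fws('  \t value')
#   >>> fws.token_type, rest
#   ('fws', 'value')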

1037 

1038def get_encoded_word(value): 

1039 """ encoded-word = "=?" charset "?" encoding "?" encoded-text "?=" 

1040 

1041 """ 

1042 ew = EncodedWord() 

1043 if not value.startswith('=?'): 

1044 raise errors.HeaderParseError( 

1045 "expected encoded word but found {}".format(value)) 

1046 tok, *remainder = value[2:].split('?=', 1) 

1047 if tok == value[2:]: 

1048 raise errors.HeaderParseError( 

1049 "expected encoded word but found {}".format(value)) 

1050 remstr = ''.join(remainder) 

1051 if (len(remstr) > 1 and 

1052 remstr[0] in hexdigits and 

1053 remstr[1] in hexdigits and 

1054 tok.count('?') < 2): 

1055 # The ? after the CTE was followed by an encoded word escape (=XX). 

1056 rest, *remainder = remstr.split('?=', 1) 

1057 tok = tok + '?=' + rest 

1058 if len(tok.split()) > 1: 

1059 ew.defects.append(errors.InvalidHeaderDefect( 

1060 "whitespace inside encoded word")) 

1061 ew.cte = value 

1062 value = ''.join(remainder) 

1063 try: 

1064 text, charset, lang, defects = _ew.decode('=?' + tok + '?=') 

1065 except (ValueError, KeyError): 

1066 raise _InvalidEwError( 

1067 "encoded word format invalid: '{}'".format(ew.cte)) 

1068 ew.charset = charset 

1069 ew.lang = lang 

1070 ew.defects.extend(defects) 

1071 while text: 

1072 if text[0] in WSP: 

1073 token, text = get_fws(text) 

1074 ew.append(token) 

1075 continue 

1076 chars, *remainder = _wsp_splitter(text, 1) 

1077 vtext = ValueTerminal(chars, 'vtext') 

1078 _validate_xtext(vtext) 

1079 ew.append(vtext) 

1080 text = ''.join(remainder) 

1081 # Encoded words should be followed by whitespace

1082 if value and value[0] not in WSP: 

1083 ew.defects.append(errors.InvalidHeaderDefect( 

1084 "missing trailing whitespace after encoded-word")) 

1085 return ew, value 
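# For example, a q-encoded UTF-8 word decodes like this (illustrative input):
#
#   >>> ew, rest = get_encoded_word('=?utf-8?q?caf=C3=A9?= tail')
#   >>> ew.value, ew.charset, rest
#   ('café', 'utf-8', ' tail')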

1086 

1087def get_unstructured(value): 

1088 """unstructured = (*([FWS] vchar) *WSP) / obs-unstruct 

1089 obs-unstruct = *((*LF *CR *(obs-utext *LF *CR)) / FWS)

1090 obs-utext = %d0 / obs-NO-WS-CTL / LF / CR 

1091 

1092 obs-NO-WS-CTL is control characters except WSP/CR/LF. 

1093 

1094 So, basically, we have printable runs, plus control characters or nulls in 

1095 the obsolete syntax, separated by whitespace. Since RFC 2047 uses the 

1096 obsolete syntax in its specification, but requires whitespace on either 

1097 side of the encoded words, I can see no reason to need to separate the 

1098 non-printable-non-whitespace from the printable runs if they occur, so we 

1099 parse this into xtext tokens separated by WSP tokens. 

1100 

1101 Because an 'unstructured' value must by definition constitute the entire 

1102 value, this 'get' routine does not return a remaining value, only the 

1103 parsed TokenList. 

1104 

1105 """ 

1106 # XXX: but what about bare CR and LF? They might signal the start or 

1107 # end of an encoded word. YAGNI for now, since our current parsers 

1108 # will never send us strings with bare CR or LF. 

1109 

1110 unstructured = UnstructuredTokenList() 

1111 while value: 

1112 if value[0] in WSP: 

1113 token, value = get_fws(value) 

1114 unstructured.append(token) 

1115 continue 

1116 valid_ew = True 

1117 if value.startswith('=?'): 

1118 try: 

1119 token, value = get_encoded_word(value) 

1120 except _InvalidEwError: 

1121 valid_ew = False 

1122 except errors.HeaderParseError: 

1123 # XXX: Need to figure out how to register defects when 

1124 # appropriate here. 

1125 pass 

1126 else: 

1127 have_ws = True 

1128 if len(unstructured) > 0: 

1129 if unstructured[-1].token_type != 'fws': 

1130 unstructured.defects.append(errors.InvalidHeaderDefect( 

1131 "missing whitespace before encoded word")) 

1132 have_ws = False 

1133 if have_ws and len(unstructured) > 1: 

1134 if unstructured[-2].token_type == 'encoded-word': 

1135 unstructured[-1] = EWWhiteSpaceTerminal( 

1136 unstructured[-1], 'fws') 

1137 unstructured.append(token) 

1138 continue 

1139 tok, *remainder = _wsp_splitter(value, 1) 

1140 # Split in the middle of an atom if there is an rfc2047 encoded word

1141 # which does not have WSP on both sides. The defect will be registered 

1142 # the next time through the loop. 

1143 # This needs to only be performed when the encoded word is valid; 

1144 # otherwise, performing it on an invalid encoded word can cause 

1145 # the parser to go in an infinite loop. 

1146 if valid_ew and rfc2047_matcher.search(tok): 

1147 tok, *remainder = value.partition('=?') 

1148 vtext = ValueTerminal(tok, 'vtext') 

1149 _validate_xtext(vtext) 

1150 unstructured.append(vtext) 

1151 value = ''.join(remainder) 

1152 return unstructured 
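# For example, an encoded word jammed against an atom is still decoded, but a
# defect is registered (illustrative input):
#
#   >>> u = get_unstructured('foo=?utf-8?q?bar?=')
#   >>> u.value
#   'foobar'
#   >>> [d.__class__.__name__ for d in u.all_defects]
#   ['InvalidHeaderDefect']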

1153 

1154def get_qp_ctext(value): 

1155 r"""ctext = <printable ascii except \ ( )> 

1156 

1157 This is not the RFC ctext, since we are handling nested comments in the

1158 comment parser and unquoting quoted-pairs here. We allow anything except the '()'

1159 characters, but if we find any ASCII other than the RFC defined printable 

1160 ASCII, a NonPrintableDefect is added to the token's defects list. Since 

1161 quoted pairs are converted to their unquoted values, what is returned is 

1162 a 'ptext' token. In this case it is a WhiteSpaceTerminal, so its value

1163 is ' '. 

1164 

1165 """ 

1166 ptext, value, _ = _get_ptext_to_endchars(value, '()') 

1167 ptext = WhiteSpaceTerminal(ptext, 'ptext') 

1168 _validate_xtext(ptext) 

1169 return ptext, value 

1170 

1171def get_qcontent(value): 

1172 """qcontent = qtext / quoted-pair 

1173 

1174 We allow anything except the DQUOTE character, but if we find any ASCII 

1175 other than the RFC defined printable ASCII, a NonPrintableDefect is 

1176 added to the token's defects list. Any quoted pairs are converted to their 

1177 unquoted values, so what is returned is a 'ptext' token. In this case it 

1178 is a ValueTerminal. 

1179 

1180 """ 

1181 ptext, value, _ = _get_ptext_to_endchars(value, '"') 

1182 ptext = ValueTerminal(ptext, 'ptext') 

1183 _validate_xtext(ptext) 

1184 return ptext, value 

1185 

1186def get_atext(value): 

1187 """atext = <matches _atext_matcher> 

1188 

1189 We allow any non-ATOM_ENDS in atext, but add an InvalidATextDefect to 

1190 the token's defects list if we find non-atext characters. 

1191 """ 

1192 m = _non_atom_end_matcher(value) 

1193 if not m: 

1194 raise errors.HeaderParseError( 

1195 "expected atext but found '{}'".format(value)) 

1196 atext = m.group() 

1197 value = value[len(atext):] 

1198 atext = ValueTerminal(atext, 'atext') 

1199 _validate_xtext(atext) 

1200 return atext, value 

1201 

1202def get_bare_quoted_string(value): 

1203 """bare-quoted-string = DQUOTE *([FWS] qcontent) [FWS] DQUOTE 

1204 

1205 A quoted-string without the leading or trailing white space. Its 

1206 value is the text between the quote marks, with whitespace 

1207 preserved and quoted pairs decoded. 

1208 """ 

1209 if value[0] != '"': 

1210 raise errors.HeaderParseError( 

1211 "expected '\"' but found '{}'".format(value)) 

1212 bare_quoted_string = BareQuotedString() 

1213 value = value[1:] 

1214 if value and value[0] == '"': 

1215 token, value = get_qcontent(value) 

1216 bare_quoted_string.append(token) 

1217 while value and value[0] != '"': 

1218 if value[0] in WSP: 

1219 token, value = get_fws(value) 

1220 elif value[:2] == '=?': 

1221 try: 

1222 token, value = get_encoded_word(value) 

1223 bare_quoted_string.defects.append(errors.InvalidHeaderDefect( 

1224 "encoded word inside quoted string")) 

1225 except errors.HeaderParseError: 

1226 token, value = get_qcontent(value) 

1227 else: 

1228 token, value = get_qcontent(value) 

1229 bare_quoted_string.append(token) 

1230 if not value: 

1231 bare_quoted_string.defects.append(errors.InvalidHeaderDefect( 

1232 "end of header inside quoted string")) 

1233 return bare_quoted_string, value 

1234 return bare_quoted_string, value[1:] 

1235 

1236def get_comment(value): 

1237 """comment = "(" *([FWS] ccontent) [FWS] ")" 

1238 ccontent = ctext / quoted-pair / comment 

1239 

1240 We handle nested comments here, and quoted-pair in our qp-ctext routine. 

1241 """ 

1242 if value and value[0] != '(': 

1243 raise errors.HeaderParseError( 

1244 "expected '(' but found '{}'".format(value)) 

1245 comment = Comment() 

1246 value = value[1:] 

1247 while value and value[0] != ")": 

1248 if value[0] in WSP: 

1249 token, value = get_fws(value) 

1250 elif value[0] == '(': 

1251 token, value = get_comment(value) 

1252 else: 

1253 token, value = get_qp_ctext(value) 

1254 comment.append(token) 

1255 if not value: 

1256 comment.defects.append(errors.InvalidHeaderDefect( 

1257 "end of header inside comment")) 

1258 return comment, value 

1259 return comment, value[1:] 
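# For example, nested comments are parsed recursively and the trailing
# remainder is returned (illustrative input):
#
#   >>> comment, rest = get_comment('(a (nested) comment) tail')
#   >>> comment.content, rest
#   ('a (nested) comment', ' tail')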

1260 

1261def get_cfws(value): 

1262 """CFWS = (1*([FWS] comment) [FWS]) / FWS 

1263 

1264 """ 

1265 cfws = CFWSList() 

1266 while value and value[0] in CFWS_LEADER: 

1267 if value[0] in WSP: 

1268 token, value = get_fws(value) 

1269 else: 

1270 token, value = get_comment(value) 

1271 cfws.append(token) 

1272 return cfws, value 
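# For example (illustrative input):
#
#   >>> cfws, rest = get_cfws(' (a comment) value')
#   >>> cfws.comments, rest
#   (['a comment'], 'value')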

1273 

1274def get_quoted_string(value): 

1275 """quoted-string = [CFWS] <bare-quoted-string> [CFWS] 

1276 

1277 'bare-quoted-string' is an intermediate class defined by this 

1278 parser and not by the RFC grammar. It is the quoted string 

1279 without any attached CFWS. 

1280 """ 

1281 quoted_string = QuotedString() 

1282 if value and value[0] in CFWS_LEADER: 

1283 token, value = get_cfws(value) 

1284 quoted_string.append(token) 

1285 token, value = get_bare_quoted_string(value) 

1286 quoted_string.append(token) 

1287 if value and value[0] in CFWS_LEADER: 

1288 token, value = get_cfws(value) 

1289 quoted_string.append(token) 

1290 return quoted_string, value 
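# A sketch showing how the value, content, and quoted_value properties of the
# resulting QuotedString differ (illustrative input):
#
#   >>> qs, rest = get_quoted_string(' "foo bar"  tail')
#   >>> qs.content
#   'foo bar'
#   >>> qs.quoted_value
#   ' "foo bar" '
#   >>> qs.value, rest
#   (' foo bar ', 'tail')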

1291 

1292def get_atom(value): 

1293 """atom = [CFWS] 1*atext [CFWS] 

1294 

1295 An atom could be an rfc2047 encoded word. 

1296 """ 

1297 atom = Atom() 

1298 if value and value[0] in CFWS_LEADER: 

1299 token, value = get_cfws(value) 

1300 atom.append(token) 

1301 if value and value[0] in ATOM_ENDS: 

1302 raise errors.HeaderParseError( 

1303 "expected atom but found '{}'".format(value)) 

1304 if value.startswith('=?'): 

1305 try: 

1306 token, value = get_encoded_word(value) 

1307 except errors.HeaderParseError: 

1308 # XXX: need to figure out how to register defects when 

1309 # appropriate here. 

1310 token, value = get_atext(value) 

1311 else: 

1312 token, value = get_atext(value) 

1313 atom.append(token) 

1314 if value and value[0] in CFWS_LEADER: 

1315 token, value = get_cfws(value) 

1316 atom.append(token) 

1317 return atom, value 

1318 

1319def get_dot_atom_text(value): 

1320 """ dot-text = 1*atext *("." 1*atext) 

1321 

1322 """ 

1323 dot_atom_text = DotAtomText() 

1324 if not value or value[0] in ATOM_ENDS: 

1325 raise errors.HeaderParseError("expected atom at start of "

1326 "dot-atom-text but found '{}'".format(value)) 

1327 while value and value[0] not in ATOM_ENDS: 

1328 token, value = get_atext(value) 

1329 dot_atom_text.append(token) 

1330 if value and value[0] == '.': 

1331 dot_atom_text.append(DOT) 

1332 value = value[1:] 

1333 if dot_atom_text[-1] is DOT: 

1334 raise errors.HeaderParseError("expected atom at end of dot-atom-text " 

1335 "but found '{}'".format('.'+value)) 

1336 return dot_atom_text, value 
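# For example (illustrative input):
#
#   >>> dot_atom_text, rest = get_dot_atom_text('example.com>')
#   >>> dot_atom_text.value, rest
#   ('example.com', '>')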

1337 

1338def get_dot_atom(value): 

1339 """ dot-atom = [CFWS] dot-atom-text [CFWS] 

1340 

1341 Any place we can have a dot atom, we could instead have an rfc2047 encoded 

1342 word. 

1343 """ 

1344 dot_atom = DotAtom() 

1345 if value[0] in CFWS_LEADER: 

1346 token, value = get_cfws(value) 

1347 dot_atom.append(token) 

1348 if value.startswith('=?'): 

1349 try: 

1350 token, value = get_encoded_word(value) 

1351 except errors.HeaderParseError: 

1352 # XXX: need to figure out how to register defects when 

1353 # appropriate here. 

1354 token, value = get_dot_atom_text(value) 

1355 else: 

1356 token, value = get_dot_atom_text(value) 

1357 dot_atom.append(token) 

1358 if value and value[0] in CFWS_LEADER: 

1359 token, value = get_cfws(value) 

1360 dot_atom.append(token) 

1361 return dot_atom, value 

1362 

1363def get_word(value): 

1364 """word = atom / quoted-string 

1365 

1366 Either atom or quoted-string may start with CFWS. We have to peel off this 

1367 CFWS first to determine which type of word to parse. Afterward we splice 

1368 the leading CFWS, if any, into the parsed sub-token. 

1369 

1370 If neither an atom nor a quoted-string is found before the next special, a

1371 HeaderParseError is raised. 

1372 

1373 The token returned is either an Atom or a QuotedString, as appropriate. 

1374 This means the 'word' level of the formal grammar is not represented in the 

1375 parse tree; this is because having that extra layer when manipulating the 

1376 parse tree is more confusing than it is helpful. 

1377 

1378 """ 

1379 if value[0] in CFWS_LEADER: 

1380 leader, value = get_cfws(value) 

1381 else: 

1382 leader = None 

1383 if not value: 

1384 raise errors.HeaderParseError( 

1385 "Expected 'atom' or 'quoted-string' but found nothing.") 

1386 if value[0]=='"': 

1387 token, value = get_quoted_string(value) 

1388 elif value[0] in SPECIALS: 

1389 raise errors.HeaderParseError("Expected 'atom' or 'quoted-string' " 

1390 "but found '{}'".format(value)) 

1391 else: 

1392 token, value = get_atom(value) 

1393 if leader is not None: 

1394 token[:0] = [leader] 

1395 return token, value 
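# For example, leading CFWS (including a comment) is spliced into the returned
# atom (illustrative input):
#
#   >>> word, rest = get_word(' (note) fred rest')
#   >>> word.token_type, word.value, rest
#   ('atom', ' fred ', 'rest')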

1396 

1397def get_phrase(value): 

1398 """ phrase = 1*word / obs-phrase 

1399 obs-phrase = word *(word / "." / CFWS) 

1400 

1401 This means a phrase can be a sequence of words, periods, and CFWS in any 

1402 order as long as it starts with at least one word. If anything other than 

1403 words is detected, an ObsoleteHeaderDefect is added to the token's defect 

1404 list. We also accept a phrase that starts with CFWS followed by a dot; 

1405 this is registered as an InvalidHeaderDefect, since it is not supported by 

1406 even the obsolete grammar. 

1407 

1408 """ 

1409 phrase = Phrase() 

1410 try: 

1411 token, value = get_word(value) 

1412 phrase.append(token) 

1413 except errors.HeaderParseError: 

1414 phrase.defects.append(errors.InvalidHeaderDefect( 

1415 "phrase does not start with word")) 

1416 while value and value[0] not in PHRASE_ENDS: 

1417 if value[0]=='.': 

1418 phrase.append(DOT) 

1419 phrase.defects.append(errors.ObsoleteHeaderDefect( 

1420 "period in 'phrase'")) 

1421 value = value[1:] 

1422 else: 

1423 try: 

1424 token, value = get_word(value) 

1425 except errors.HeaderParseError: 

1426 if value[0] in CFWS_LEADER: 

1427 token, value = get_cfws(value) 

1428 phrase.defects.append(errors.ObsoleteHeaderDefect( 

1429 "comment found without atom")) 

1430 else: 

1431 raise 

1432 phrase.append(token) 

1433 return phrase, value 
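# For example, a period inside a phrase is accepted but flagged as obsolete
# syntax (illustrative input):
#
#   >>> phrase, rest = get_phrase('A. Harried User <a@b>')
#   >>> rest
#   '<a@b>'
#   >>> [d.__class__.__name__ for d in phrase.all_defects]
#   ['ObsoleteHeaderDefect']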

1434 

1435def get_local_part(value): 

1436 """ local-part = dot-atom / quoted-string / obs-local-part 

1437 

1438 """ 

1439 local_part = LocalPart() 

1440 leader = None 

1441 if value[0] in CFWS_LEADER: 

1442 leader, value = get_cfws(value) 

1443 if not value: 

1444 raise errors.HeaderParseError( 

1445 "expected local-part but found '{}'".format(value)) 

1446 try: 

1447 token, value = get_dot_atom(value) 

1448 except errors.HeaderParseError: 

1449 try: 

1450 token, value = get_word(value) 

1451 except errors.HeaderParseError: 

1452 if value[0] != '\\' and value[0] in PHRASE_ENDS: 

1453 raise 

1454 token = TokenList() 

1455 if leader is not None: 

1456 token[:0] = [leader] 

1457 local_part.append(token) 

1458 if value and (value[0]=='\\' or value[0] not in PHRASE_ENDS): 

1459 obs_local_part, value = get_obs_local_part(str(local_part) + value) 

1460 if obs_local_part.token_type == 'invalid-obs-local-part': 

1461 local_part.defects.append(errors.InvalidHeaderDefect( 

1462 "local-part is not dot-atom, quoted-string, or obs-local-part")) 

1463 else: 

1464 local_part.defects.append(errors.ObsoleteHeaderDefect( 

1465 "local-part is not a dot-atom (contains CFWS)")) 

1466 local_part[0] = obs_local_part 

1467 try: 

1468 local_part.value.encode('ascii') 

1469 except UnicodeEncodeError: 

1470 local_part.defects.append(errors.NonASCIILocalPartDefect( 

1471 "local-part contains non-ASCII characters"))

1472 return local_part, value 

1473 

1474def get_obs_local_part(value): 

1475 """ obs-local-part = word *("." word) 

1476 """ 

1477 obs_local_part = ObsLocalPart() 

1478 last_non_ws_was_dot = False 

1479 while value and (value[0]=='\\' or value[0] not in PHRASE_ENDS): 

1480 if value[0] == '.': 

1481 if last_non_ws_was_dot: 

1482 obs_local_part.defects.append(errors.InvalidHeaderDefect( 

1483 "invalid repeated '.'")) 

1484 obs_local_part.append(DOT) 

1485 last_non_ws_was_dot = True 

1486 value = value[1:] 

1487 continue 

1488 elif value[0]=='\\': 

1489 obs_local_part.append(ValueTerminal(value[0], 

1490 'misplaced-special')) 

1491 value = value[1:] 

1492 obs_local_part.defects.append(errors.InvalidHeaderDefect( 

1493 "'\\' character outside of quoted-string/ccontent")) 

1494 last_non_ws_was_dot = False 

1495 continue 

1496 if obs_local_part and obs_local_part[-1].token_type != 'dot': 

1497 obs_local_part.defects.append(errors.InvalidHeaderDefect( 

1498 "missing '.' between words")) 

1499 try: 

1500 token, value = get_word(value) 

1501 last_non_ws_was_dot = False 

1502 except errors.HeaderParseError: 

1503 if value[0] not in CFWS_LEADER: 

1504 raise 

1505 token, value = get_cfws(value) 

1506 obs_local_part.append(token) 

1507 if (obs_local_part[0].token_type == 'dot' or 

1508 obs_local_part[0].token_type=='cfws' and 

1509 obs_local_part[1].token_type=='dot'): 

1510 obs_local_part.defects.append(errors.InvalidHeaderDefect( 

1511 "Invalid leading '.' in local part")) 

1512 if (obs_local_part[-1].token_type == 'dot' or 

1513 obs_local_part[-1].token_type=='cfws' and 

1514 obs_local_part[-2].token_type=='dot'): 

1515 obs_local_part.defects.append(errors.InvalidHeaderDefect( 

1516 "Invalid trailing '.' in local part")) 

1517 if obs_local_part.defects: 

1518 obs_local_part.token_type = 'invalid-obs-local-part' 

1519 return obs_local_part, value 

1520 

1521def get_dtext(value): 

1522 r""" dtext = <printable ascii except \ [ ]> / obs-dtext 

1523 obs-dtext = obs-NO-WS-CTL / quoted-pair 

1524 

1525 We allow anything except the excluded characters, but if we find any 

1526 ASCII other than the RFC defined printable ASCII, a NonPrintableDefect is 

1527 added to the token's defects list. Quoted pairs are converted to their 

1528 unquoted values, so what is returned is a ptext token, in this case a 

1529 ValueTerminal. If there were quoted-printables, an ObsoleteHeaderDefect is 

1530 added to the returned token's defect list. 

1531 

1532 """ 

1533 ptext, value, had_qp = _get_ptext_to_endchars(value, '[]') 

1534 ptext = ValueTerminal(ptext, 'ptext') 

1535 if had_qp: 

1536 ptext.defects.append(errors.ObsoleteHeaderDefect( 

1537 "quoted printable found in domain-literal")) 

1538 _validate_xtext(ptext) 

1539 return ptext, value 

1540 

1541def _check_for_early_dl_end(value, domain_literal): 

1542 if value: 

1543 return False 

1544 domain_literal.append(errors.InvalidHeaderDefect( 

1545 "end of input inside domain-literal")) 

1546 domain_literal.append(ValueTerminal(']', 'domain-literal-end')) 

1547 return True 

1548 

1549def get_domain_literal(value): 

1550 """ domain-literal = [CFWS] "[" *([FWS] dtext) [FWS] "]" [CFWS] 

1551 

1552 """ 

1553 domain_literal = DomainLiteral() 

1554 if value[0] in CFWS_LEADER: 

1555 token, value = get_cfws(value) 

1556 domain_literal.append(token) 

1557 if not value: 

1558 raise errors.HeaderParseError("expected domain-literal") 

1559 if value[0] != '[': 

1560 raise errors.HeaderParseError("expected '[' at start of domain-literal " 

1561 "but found '{}'".format(value)) 

1562 value = value[1:] 

1563 if _check_for_early_dl_end(value, domain_literal): 

1564 return domain_literal, value 

1565 domain_literal.append(ValueTerminal('[', 'domain-literal-start')) 

1566 if value[0] in WSP: 

1567 token, value = get_fws(value) 

1568 domain_literal.append(token) 

1569 token, value = get_dtext(value) 

1570 domain_literal.append(token) 

1571 if _check_for_early_dl_end(value, domain_literal): 

1572 return domain_literal, value 

1573 if value[0] in WSP: 

1574 token, value = get_fws(value) 

1575 domain_literal.append(token) 

1576 if _check_for_early_dl_end(value, domain_literal): 

1577 return domain_literal, value 

1578 if value[0] != ']': 

1579 raise errors.HeaderParseError("expected ']' at end of domain-literal " 

1580 "but found '{}'".format(value)) 

1581 domain_literal.append(ValueTerminal(']', 'domain-literal-end')) 

1582 value = value[1:] 

1583 if value and value[0] in CFWS_LEADER: 

1584 token, value = get_cfws(value) 

1585 domain_literal.append(token) 

1586 return domain_literal, value 

1587 

1588def get_domain(value): 

1589 """ domain = dot-atom / domain-literal / obs-domain 

1590 obs-domain = atom *("." atom)) 

1591 

1592 """ 

1593 domain = Domain() 

1594 leader = None 

1595 if value[0] in CFWS_LEADER: 

1596 leader, value = get_cfws(value) 

1597 if not value: 

1598 raise errors.HeaderParseError( 

1599 "expected domain but found '{}'".format(value)) 

1600 if value[0] == '[': 

1601 token, value = get_domain_literal(value) 

1602 if leader is not None: 

1603 token[:0] = [leader] 

1604 domain.append(token) 

1605 return domain, value 

1606 try: 

1607 token, value = get_dot_atom(value) 

1608 except errors.HeaderParseError: 

1609 token, value = get_atom(value) 

1610 if value and value[0] == '@': 

1611 raise errors.HeaderParseError('Invalid Domain') 

1612 if leader is not None: 

1613 token[:0] = [leader] 

1614 domain.append(token) 

1615 if value and value[0] == '.': 

1616 domain.defects.append(errors.ObsoleteHeaderDefect( 

1617 "domain is not a dot-atom (contains CFWS)")) 

1618 if domain[0].token_type == 'dot-atom': 

1619 domain[:] = domain[0] 

1620 while value and value[0] == '.': 

1621 domain.append(DOT) 

1622 token, value = get_atom(value[1:]) 

1623 domain.append(token) 

1624 return domain, value 

1625 

1626def get_addr_spec(value): 

1627 """ addr-spec = local-part "@" domain 

1628 

1629 """ 

1630 addr_spec = AddrSpec() 

1631 token, value = get_local_part(value) 

1632 addr_spec.append(token) 

1633 if not value or value[0] != '@': 

1634 addr_spec.defects.append(errors.InvalidHeaderDefect( 

1635 "addr-spec local part with no domain")) 

1636 return addr_spec, value 

1637 addr_spec.append(ValueTerminal('@', 'address-at-symbol')) 

1638 token, value = get_domain(value[1:]) 

1639 addr_spec.append(token) 

1640 return addr_spec, value 
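# For example (hypothetical address):
#
#   >>> addr_spec, rest = get_addr_spec('john.q.public@example.com; next')
#   >>> addr_spec.addr_spec, addr_spec.domain, rest
#   ('john.q.public@example.com', 'example.com', '; next')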

1641 

1642def get_obs_route(value): 

1643 """ obs-route = obs-domain-list ":" 

1644 obs-domain-list = *(CFWS / ",") "@" domain *("," [CFWS] ["@" domain]) 

1645 

1646 Returns an obs-route token with the appropriate sub-tokens (that is, 

1647 there is no obs-domain-list in the parse tree). 

1648 """ 

1649 obs_route = ObsRoute() 

1650 while value and (value[0]==',' or value[0] in CFWS_LEADER): 

1651 if value[0] in CFWS_LEADER: 

1652 token, value = get_cfws(value) 

1653 obs_route.append(token) 

1654 elif value[0] == ',': 

1655 obs_route.append(ListSeparator) 

1656 value = value[1:] 

1657 if not value or value[0] != '@': 

1658 raise errors.HeaderParseError( 

1659 "expected obs-route domain but found '{}'".format(value)) 

1660 obs_route.append(RouteComponentMarker) 

1661 token, value = get_domain(value[1:]) 

1662 obs_route.append(token) 

1663 while value and value[0]==',': 

1664 obs_route.append(ListSeparator) 

1665 value = value[1:] 

1666 if not value: 

1667 break 

1668 if value[0] in CFWS_LEADER: 

1669 token, value = get_cfws(value) 

1670 obs_route.append(token) 

1671 if value[0] == '@': 

1672 obs_route.append(RouteComponentMarker) 

1673 token, value = get_domain(value[1:]) 

1674 obs_route.append(token) 

1675 if not value: 

1676 raise errors.HeaderParseError("end of header while parsing obs-route") 

1677 if value[0] != ':': 

1678 raise errors.HeaderParseError( "expected ':' marking end of " 

1679 "obs-route but found '{}'".format(value)) 

1680 obs_route.append(ValueTerminal(':', 'end-of-obs-route-marker')) 

1681 return obs_route, value[1:] 

1682 

1683def get_angle_addr(value): 

1684 """ angle-addr = [CFWS] "<" addr-spec ">" [CFWS] / obs-angle-addr 

1685 obs-angle-addr = [CFWS] "<" obs-route addr-spec ">" [CFWS] 

1686 

1687 """ 

1688 angle_addr = AngleAddr() 

1689 if value[0] in CFWS_LEADER: 

1690 token, value = get_cfws(value) 

1691 angle_addr.append(token) 

1692 if not value or value[0] != '<': 

1693 raise errors.HeaderParseError( 

1694 "expected angle-addr but found '{}'".format(value)) 

1695 angle_addr.append(ValueTerminal('<', 'angle-addr-start')) 

1696 value = value[1:] 

1697 # Although it is not legal per RFC5322, SMTP uses '<>' in certain 

1698 # circumstances. 

1699 if value[0] == '>': 

1700 angle_addr.append(ValueTerminal('>', 'angle-addr-end')) 

1701 angle_addr.defects.append(errors.InvalidHeaderDefect( 

1702 "null addr-spec in angle-addr")) 

1703 value = value[1:] 

1704 return angle_addr, value 

1705 try: 

1706 token, value = get_addr_spec(value) 

1707 except errors.HeaderParseError: 

1708 try: 

1709 token, value = get_obs_route(value) 

1710 angle_addr.defects.append(errors.ObsoleteHeaderDefect( 

1711 "obsolete route specification in angle-addr")) 

1712 except errors.HeaderParseError: 

1713 raise errors.HeaderParseError( 

1714 "expected addr-spec or obs-route but found '{}'".format(value)) 

1715 angle_addr.append(token) 

1716 token, value = get_addr_spec(value) 

1717 angle_addr.append(token) 

1718 if value and value[0] == '>': 

1719 value = value[1:] 

1720 else: 

1721 angle_addr.defects.append(errors.InvalidHeaderDefect( 

1722 "missing trailing '>' on angle-addr")) 

1723 angle_addr.append(ValueTerminal('>', 'angle-addr-end')) 

1724 if value and value[0] in CFWS_LEADER: 

1725 token, value = get_cfws(value) 

1726 angle_addr.append(token) 

1727 return angle_addr, value 

1728 

1729def get_display_name(value): 

1730 """ display-name = phrase 

1731 

1732 Because this is simply a name-rule, we don't return a display-name 

1733 token containing a phrase, but rather a display-name token with 

1734 the content of the phrase. 

1735 

1736 """ 

1737 display_name = DisplayName() 

1738 token, value = get_phrase(value) 

1739 display_name.extend(token[:]) 

1740 display_name.defects = token.defects[:] 

1741 return display_name, value 

1742 

1743 

1744def get_name_addr(value): 

1745 """ name-addr = [display-name] angle-addr 

1746 

1747 """ 

1748 name_addr = NameAddr() 

1749 # Both the optional display name and the angle-addr can start with cfws. 

1750 leader = None 

1751 if value[0] in CFWS_LEADER: 

1752 leader, value = get_cfws(value) 

1753 if not value: 

1754 raise errors.HeaderParseError( 

1755 "expected name-addr but found '{}'".format(leader)) 

1756 if value[0] != '<': 

1757 if value[0] in PHRASE_ENDS: 

1758 raise errors.HeaderParseError( 

1759 "expected name-addr but found '{}'".format(value)) 

1760 token, value = get_display_name(value) 

1761 if not value: 

1762 raise errors.HeaderParseError( 

1763 "expected name-addr but found '{}'".format(token)) 

1764 if leader is not None: 

1765 token[0][:0] = [leader] 

1766 leader = None 

1767 name_addr.append(token) 

1768 token, value = get_angle_addr(value) 

1769 if leader is not None: 

1770 token[:0] = [leader] 

1771 name_addr.append(token) 

1772 return name_addr, value 

1773 

1774def get_mailbox(value): 

1775 """ mailbox = name-addr / addr-spec 

1776 

1777 """ 

1778 # The only way to figure out if we are dealing with a name-addr or an 

1779 # addr-spec is to try parsing each one. 

1780 mailbox = Mailbox() 

1781 try: 

1782 token, value = get_name_addr(value) 

1783 except errors.HeaderParseError: 

1784 try: 

1785 token, value = get_addr_spec(value) 

1786 except errors.HeaderParseError: 

1787 raise errors.HeaderParseError( 

1788 "expected mailbox but found '{}'".format(value)) 

1789 if any(isinstance(x, errors.InvalidHeaderDefect) 

1790 for x in token.all_defects): 

1791 mailbox.token_type = 'invalid-mailbox' 

1792 mailbox.append(token) 

1793 return mailbox, value 

1794 
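To show how the two branches of get_mailbox surface to callers, a minimal
sketch (not part of the module); it assumes the Mailbox token exposes the
display_name and addr_spec data attributes defined earlier in this file, and
the addresses are invented:

    from email._header_value_parser import get_mailbox

    mbox, rest = get_mailbox('Fred Bloggs <fred@example.com>')
    # Parsed via the name-addr branch: both parts are available.
    print(mbox.display_name, mbox.addr_spec)

    bare, rest = get_mailbox('fred@example.com')
    # Parsed via the addr-spec branch: no display name, same addr_spec.
    print(bare.display_name, bare.addr_spec)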

1795def get_invalid_mailbox(value, endchars): 

1796 """ Read everything up to one of the chars in endchars. 

1797 

1798 This is outside the formal grammar. The InvalidMailbox TokenList that is 

1799 returned acts like a Mailbox, but the data attributes are None. 

1800 

1801 """ 

1802 invalid_mailbox = InvalidMailbox() 

1803 while value and value[0] not in endchars: 

1804 if value[0] in PHRASE_ENDS: 

1805 invalid_mailbox.append(ValueTerminal(value[0], 

1806 'misplaced-special')) 

1807 value = value[1:] 

1808 else: 

1809 token, value = get_phrase(value) 

1810 invalid_mailbox.append(token) 

1811 return invalid_mailbox, value 

1812 

1813def get_mailbox_list(value): 

1814 """ mailbox-list = (mailbox *("," mailbox)) / obs-mbox-list 

1815 obs-mbox-list = *([CFWS] ",") mailbox *("," [mailbox / CFWS]) 

1816 

1817 For this routine we go outside the formal grammar in order to improve error 

1818 handling. We recognize the end of the mailbox list only at the end of the 

1819 value or at a ';' (the group terminator). This is so that we can turn 

1820 invalid mailboxes into InvalidMailbox tokens and continue parsing any 

1821 remaining valid mailboxes. We also allow all mailbox entries to be null, 

1822 and this condition is handled appropriately at a higher level. 

1823 

1824 """ 

1825 mailbox_list = MailboxList() 

1826 while value and value[0] != ';': 

1827 try: 

1828 token, value = get_mailbox(value) 

1829 mailbox_list.append(token) 

1830 except errors.HeaderParseError: 

1831 leader = None 

1832 if value[0] in CFWS_LEADER: 

1833 leader, value = get_cfws(value) 

1834 if not value or value[0] in ',;': 

1835 mailbox_list.append(leader) 

1836 mailbox_list.defects.append(errors.ObsoleteHeaderDefect( 

1837 "empty element in mailbox-list")) 

1838 else: 

1839 token, value = get_invalid_mailbox(value, ',;') 

1840 if leader is not None: 

1841 token[:0] = [leader] 

1842 mailbox_list.append(token) 

1843 mailbox_list.defects.append(errors.InvalidHeaderDefect( 

1844 "invalid mailbox in mailbox-list")) 

1845 elif value[0] == ',': 

1846 mailbox_list.defects.append(errors.ObsoleteHeaderDefect( 

1847 "empty element in mailbox-list")) 

1848 else: 

1849 token, value = get_invalid_mailbox(value, ',;') 

1850 if leader is not None: 

1851 token[:0] = [leader] 

1852 mailbox_list.append(token) 

1853 mailbox_list.defects.append(errors.InvalidHeaderDefect( 

1854 "invalid mailbox in mailbox-list")) 

1855 if value and value[0] not in ',;': 

1856 # Crap after mailbox; treat it as an invalid mailbox. 

1857 # The mailbox info will still be available. 

1858 mailbox = mailbox_list[-1] 

1859 mailbox.token_type = 'invalid-mailbox' 

1860 token, value = get_invalid_mailbox(value, ',;') 

1861 mailbox.extend(token) 

1862 mailbox_list.defects.append(errors.InvalidHeaderDefect( 

1863 "invalid mailbox in mailbox-list")) 

1864 if value and value[0] == ',': 

1865 mailbox_list.append(ListSeparator) 

1866 value = value[1:] 

1867 return mailbox_list, value 

1868 
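A minimal sketch (not part of the module) of the error recovery described in
the docstring above: an entry that is not a valid mailbox is kept in the list
rather than aborting the parse. It assumes the MailboxList token exposes the
mailboxes/all_mailboxes attributes defined earlier in this file; the addresses
are invented and the exact defect classes depend on the input:

    from email._header_value_parser import get_mailbox_list

    mlist, rest = get_mailbox_list(
        'alice@example.com, bogus text, bob@example.com')
    # 'mailboxes' holds only the valid entries; 'all_mailboxes' also includes
    # the invalid one, and the problems are recorded as defects on the tree.
    print(len(mlist.mailboxes), len(mlist.all_mailboxes))
    print(sorted({type(d).__name__ for d in mlist.all_defects}))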

1869 

1870def get_group_list(value): 

1871 """ group-list = mailbox-list / CFWS / obs-group-list 

1872 obs-group-list = 1*([CFWS] ",") [CFWS] 

1873 

1874 """ 

1875 group_list = GroupList() 

1876 if not value: 

1877 group_list.defects.append(errors.InvalidHeaderDefect( 

1878 "end of header before group-list")) 

1879 return group_list, value 

1880 leader = None 

1881 if value and value[0] in CFWS_LEADER: 

1882 leader, value = get_cfws(value) 

1883 if not value: 

1884 # This should never happen in email parsing, since CFWS-only is a 

1885 # legal alternative to group-list in a group, which is the only 

1886 # place group-list appears. 

1887 group_list.defects.append(errors.InvalidHeaderDefect( 

1888 "end of header in group-list")) 

1889 group_list.append(leader) 

1890 return group_list, value 

1891 if value[0] == ';': 

1892 group_list.append(leader) 

1893 return group_list, value 

1894 token, value = get_mailbox_list(value) 

1895 if len(token.all_mailboxes)==0: 

1896 if leader is not None: 

1897 group_list.append(leader) 

1898 group_list.extend(token) 

1899 group_list.defects.append(errors.ObsoleteHeaderDefect( 

1900 "group-list with empty entries")) 

1901 return group_list, value 

1902 if leader is not None: 

1903 token[:0] = [leader] 

1904 group_list.append(token) 

1905 return group_list, value 

1906 

1907def get_group(value): 

1908 """ group = display-name ":" [group-list] ";" [CFWS] 

1909 

1910 """ 

1911 group = Group() 

1912 token, value = get_display_name(value) 

1913 if not value or value[0] != ':': 

1914 raise errors.HeaderParseError("expected ':' at end of group " 

1915 "display name but found '{}'".format(value)) 

1916 group.append(token) 

1917 group.append(ValueTerminal(':', 'group-display-name-terminator')) 

1918 value = value[1:] 

1919 if value and value[0] == ';': 

1920 group.append(ValueTerminal(';', 'group-terminator')) 

1921 return group, value[1:] 

1922 token, value = get_group_list(value) 

1923 group.append(token) 

1924 if not value: 

1925 group.defects.append(errors.InvalidHeaderDefect( 

1926 "end of header in group")) 

1927 elif value[0] != ';': 

1928 raise errors.HeaderParseError( 

1929 "expected ';' at end of group but found {}".format(value)) 

1930 group.append(ValueTerminal(';', 'group-terminator')) 

1931 value = value[1:] 

1932 if value and value[0] in CFWS_LEADER: 

1933 token, value = get_cfws(value) 

1934 group.append(token) 

1935 return group, value 

1936 
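A minimal sketch (not part of the module) of parsing groups with get_group,
assuming the Group token exposes the display_name and mailboxes attributes
defined earlier in this file; the group names and addresses are invented:

    from email._header_value_parser import get_group

    empty, rest = get_group('undisclosed-recipients:;')
    # An empty group: display-name, ':' and the terminating ';', no mailboxes.
    print(empty.display_name, len(empty.mailboxes))

    friends, rest = get_group('friends: alice@example.com, bob@example.com;')
    print([m.addr_spec for m in friends.mailboxes])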

1937def get_address(value): 

1938 """ address = mailbox / group 

1939 

1940 Note that counter-intuitively, an address can be either a single address or 

1941 a list of addresses (a group). This is why the returned Address object has 

1942 a 'mailboxes' attribute which treats a single address as a list of length 

1943 one. When you need to differentiate between to two cases, extract the single 

1944 element, which is either a mailbox or a group token. 

1945 

1946 """ 

1947 # The formal grammar isn't very helpful when parsing an address. mailbox 

1948 # and group, especially when allowing for obsolete forms, start off very 

1949 # similarly. It is only when you reach one of @, <, or : that you know 

1950 # what you've got. So, we try each one in turn, starting with the more 

1951 # likely of the two. We could perhaps make this more efficient by looking 

1952 # for a phrase and then branching based on the next character, but that 

1953 # would be a premature optimization. 

1954 address = Address() 

1955 try: 

1956 token, value = get_group(value) 

1957 except errors.HeaderParseError: 

1958 try: 

1959 token, value = get_mailbox(value) 

1960 except errors.HeaderParseError: 

1961 raise errors.HeaderParseError( 

1962 "expected address but found '{}'".format(value)) 

1963 address.append(token) 

1964 return address, value 

1965 

1966def get_address_list(value): 

1967 """ address_list = (address *("," address)) / obs-addr-list 

1968 obs-addr-list = *([CFWS] ",") address *("," [address / CFWS]) 

1969 

1970 We depart from the formal grammar here by continuing to parse until the end 

1971 of the input, assuming the input to be entirely composed of an 

1972 address-list. This is always true in email parsing, and allows us 

1973 to skip invalid addresses and still parse any additional valid ones.

1974 

1975 """ 

1976 address_list = AddressList() 

1977 while value: 

1978 try: 

1979 token, value = get_address(value) 

1980 address_list.append(token) 

1981 except errors.HeaderParseError as err: 

1982 leader = None 

1983 if value[0] in CFWS_LEADER: 

1984 leader, value = get_cfws(value) 

1985 if not value or value[0] == ',': 

1986 address_list.append(leader) 

1987 address_list.defects.append(errors.ObsoleteHeaderDefect( 

1988 "address-list entry with no content")) 

1989 else: 

1990 token, value = get_invalid_mailbox(value, ',') 

1991 if leader is not None: 

1992 token[:0] = [leader] 

1993 address_list.append(Address([token])) 

1994 address_list.defects.append(errors.InvalidHeaderDefect( 

1995 "invalid address in address-list")) 

1996 elif value[0] == ',': 

1997 address_list.defects.append(errors.ObsoleteHeaderDefect( 

1998 "empty element in address-list")) 

1999 else: 

2000 token, value = get_invalid_mailbox(value, ',') 

2001 if leader is not None: 

2002 token[:0] = [leader] 

2003 address_list.append(Address([token])) 

2004 address_list.defects.append(errors.InvalidHeaderDefect( 

2005 "invalid address in address-list")) 

2006 if value and value[0] != ',': 

2007 # Crap after address; treat it as an invalid mailbox. 

2008 # The mailbox info will still be available. 

2009 mailbox = address_list[-1][0] 

2010 mailbox.token_type = 'invalid-mailbox' 

2011 token, value = get_invalid_mailbox(value, ',') 

2012 mailbox.extend(token) 

2013 address_list.defects.append(errors.InvalidHeaderDefect( 

2014 "invalid address in address-list")) 

2015 if value: # Must be a , at this point. 

2016 address_list.append(ValueTerminal(',', 'list-separator')) 

2017 value = value[1:] 

2018 return address_list, value 

2019 
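A minimal sketch (not part of the module) of get_address_list handling a mix
of a plain mailbox and a group, assuming the AddressList token exposes the
addresses and mailboxes attributes defined earlier in this file; the addresses
are invented:

    from email._header_value_parser import get_address_list

    addr_list, rest = get_address_list(
        'Alice <alice@example.com>, staff: bob@example.com, carol@example.com;')
    # Two addresses (one mailbox, one group); 'mailboxes' flattens the group.
    print(len(addr_list.addresses), len(addr_list.mailboxes))
    print([m.addr_spec for m in addr_list.mailboxes])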

2020 

2021def get_no_fold_literal(value): 

2022 """ no-fold-literal = "[" *dtext "]" 

2023 """ 

2024 no_fold_literal = NoFoldLiteral() 

2025 if not value: 

2026 raise errors.HeaderParseError( 

2027 "expected no-fold-literal but found '{}'".format(value)) 

2028 if value[0] != '[': 

2029 raise errors.HeaderParseError( 

2030 "expected '[' at the start of no-fold-literal " 

2031 "but found '{}'".format(value)) 

2032 no_fold_literal.append(ValueTerminal('[', 'no-fold-literal-start')) 

2033 value = value[1:] 

2034 token, value = get_dtext(value) 

2035 no_fold_literal.append(token) 

2036 if not value or value[0] != ']': 

2037 raise errors.HeaderParseError( 

2038 "expected ']' at the end of no-fold-literal " 

2039 "but found '{}'".format(value)) 

2040 no_fold_literal.append(ValueTerminal(']', 'no-fold-literal-end')) 

2041 return no_fold_literal, value[1:] 

2042 

2043def get_msg_id(value): 

2044 """msg-id = [CFWS] "<" id-left '@' id-right ">" [CFWS] 

2045 id-left = dot-atom-text / obs-id-left 

2046 id-right = dot-atom-text / no-fold-literal / obs-id-right 

2047 no-fold-literal = "[" *dtext "]" 

2048 """ 

2049 msg_id = MsgID() 

2050 if value and value[0] in CFWS_LEADER: 

2051 token, value = get_cfws(value) 

2052 msg_id.append(token) 

2053 if not value or value[0] != '<': 

2054 raise errors.HeaderParseError( 

2055 "expected msg-id but found '{}'".format(value)) 

2056 msg_id.append(ValueTerminal('<', 'msg-id-start')) 

2057 value = value[1:] 

2058 # Parse id-left. 

2059 try: 

2060 token, value = get_dot_atom_text(value) 

2061 except errors.HeaderParseError: 

2062 try: 

2063 # obs-id-left is the same as the local-part of addr-spec.

2064 token, value = get_obs_local_part(value) 

2065 msg_id.defects.append(errors.ObsoleteHeaderDefect( 

2066 "obsolete id-left in msg-id")) 

2067 except errors.HeaderParseError: 

2068 raise errors.HeaderParseError( 

2069 "expected dot-atom-text or obs-id-left" 

2070 " but found '{}'".format(value)) 

2071 msg_id.append(token) 

2072 if not value or value[0] != '@': 

2073 msg_id.defects.append(errors.InvalidHeaderDefect( 

2074 "msg-id with no id-right")) 

2075 # Even though there is no id-right, if the local part 

2076 # ends with `>`, let's just parse it too and return

2077 # along with the defect. 

2078 if value and value[0] == '>': 

2079 msg_id.append(ValueTerminal('>', 'msg-id-end')) 

2080 value = value[1:] 

2081 return msg_id, value 

2082 msg_id.append(ValueTerminal('@', 'address-at-symbol')) 

2083 value = value[1:] 

2084 # Parse id-right. 

2085 try: 

2086 token, value = get_dot_atom_text(value) 

2087 except errors.HeaderParseError: 

2088 try: 

2089 token, value = get_no_fold_literal(value) 

2090 except errors.HeaderParseError as e: 

2091 try: 

2092 token, value = get_domain(value) 

2093 msg_id.defects.append(errors.ObsoleteHeaderDefect( 

2094 "obsolete id-right in msg-id")) 

2095 except errors.HeaderParseError: 

2096 raise errors.HeaderParseError( 

2097 "expected dot-atom-text, no-fold-literal or obs-id-right" 

2098 " but found '{}'".format(value)) 

2099 msg_id.append(token) 

2100 if value and value[0] == '>': 

2101 value = value[1:] 

2102 else: 

2103 msg_id.defects.append(errors.InvalidHeaderDefect( 

2104 "missing trailing '>' on msg-id")) 

2105 msg_id.append(ValueTerminal('>', 'msg-id-end')) 

2106 if value and value[0] in CFWS_LEADER: 

2107 token, value = get_cfws(value) 

2108 msg_id.append(token) 

2109 return msg_id, value 

2110 

2111 

2112def parse_message_id(value): 

2113 """message-id = "Message-ID:" msg-id CRLF 

2114 """ 

2115 message_id = MessageID() 

2116 try: 

2117 token, value = get_msg_id(value) 

2118 message_id.append(token) 

2119 except errors.HeaderParseError as ex: 

2120 token = get_unstructured(value) 

2121 message_id = InvalidMessageID(token) 

2122 message_id.defects.append( 

2123 errors.InvalidHeaderDefect("Invalid msg-id: {!r}".format(ex))) 

2124 else: 

2125 # Value after parsing a valid msg_id should be None. 

2126 if value: 

2127 message_id.defects.append(errors.InvalidHeaderDefect( 

2128 "Unexpected {!r}".format(value))) 

2129 

2130 return message_id 

2131 
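A minimal sketch (not part of the module) of parse_message_id on a well-formed
and a malformed value; the identifiers are invented:

    from email._header_value_parser import parse_message_id

    good = parse_message_id('<20221225.120000@mail.example.org>')
    # A well-formed msg-id parses with no defects.
    print(good.token_type, list(good.all_defects))

    bad = parse_message_id('not a message id')
    # The fallback path builds an InvalidMessageID token around the raw text
    # and records an InvalidHeaderDefect explaining why.
    print(bad.token_type, [type(d).__name__ for d in bad.defects])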

2132# 

2133# XXX: As I begin to add additional header parsers, I'm realizing we probably 

2134 # have two levels of parser routines: the get_XXX methods that get a token in

2135# the grammar, and parse_XXX methods that parse an entire field value. So 

2136# get_address_list above should really be a parse_ method, as probably should 

2137# be get_unstructured. 

2138# 

2139 

2140def parse_mime_version(value): 

2141 """ mime-version = [CFWS] 1*digit [CFWS] "." [CFWS] 1*digit [CFWS] 

2142 

2143 """ 

2144 # The [CFWS] is implicit in the RFC 2045 BNF. 

2145 # XXX: This routine is a bit verbose, should factor out a get_int method. 

2146 mime_version = MIMEVersion() 

2147 if not value: 

2148 mime_version.defects.append(errors.HeaderMissingRequiredValue( 

2149 "Missing MIME version number (eg: 1.0)")) 

2150 return mime_version 

2151 if value[0] in CFWS_LEADER: 

2152 token, value = get_cfws(value) 

2153 mime_version.append(token) 

2154 if not value: 

2155 mime_version.defects.append(errors.HeaderMissingRequiredValue( 

2156 "Expected MIME version number but found only CFWS")) 

2157 digits = '' 

2158 while value and value[0] != '.' and value[0] not in CFWS_LEADER: 

2159 digits += value[0] 

2160 value = value[1:] 

2161 if not digits.isdigit(): 

2162 mime_version.defects.append(errors.InvalidHeaderDefect( 

2163 "Expected MIME major version number but found {!r}".format(digits))) 

2164 mime_version.append(ValueTerminal(digits, 'xtext')) 

2165 else: 

2166 mime_version.major = int(digits) 

2167 mime_version.append(ValueTerminal(digits, 'digits')) 

2168 if value and value[0] in CFWS_LEADER: 

2169 token, value = get_cfws(value) 

2170 mime_version.append(token) 

2171 if not value or value[0] != '.': 

2172 if mime_version.major is not None: 

2173 mime_version.defects.append(errors.InvalidHeaderDefect( 

2174 "Incomplete MIME version; found only major number")) 

2175 if value: 

2176 mime_version.append(ValueTerminal(value, 'xtext')) 

2177 return mime_version 

2178 mime_version.append(ValueTerminal('.', 'version-separator')) 

2179 value = value[1:] 

2180 if value and value[0] in CFWS_LEADER: 

2181 token, value = get_cfws(value) 

2182 mime_version.append(token) 

2183 if not value: 

2184 if mime_version.major is not None: 

2185 mime_version.defects.append(errors.InvalidHeaderDefect( 

2186 "Incomplete MIME version; found only major number")) 

2187 return mime_version 

2188 digits = '' 

2189 while value and value[0] not in CFWS_LEADER: 

2190 digits += value[0] 

2191 value = value[1:] 

2192 if not digits.isdigit(): 

2193 mime_version.defects.append(errors.InvalidHeaderDefect( 

2194 "Expected MIME minor version number but found {!r}".format(digits))) 

2195 mime_version.append(ValueTerminal(digits, 'xtext')) 

2196 else: 

2197 mime_version.minor = int(digits) 

2198 mime_version.append(ValueTerminal(digits, 'digits')) 

2199 if value and value[0] in CFWS_LEADER: 

2200 token, value = get_cfws(value) 

2201 mime_version.append(token) 

2202 if value: 

2203 mime_version.defects.append(errors.InvalidHeaderDefect( 

2204 "Excess non-CFWS text after MIME version")) 

2205 mime_version.append(ValueTerminal(value, 'xtext')) 

2206 return mime_version 

2207 
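A minimal sketch (not part of the module) of parse_mime_version, showing both
the normal case and the defect reported for an incomplete version number:

    from email._header_value_parser import parse_mime_version

    mv = parse_mime_version('1.0 (generated for illustration)')
    # major/minor are exposed as ints; the trailing comment is kept as CFWS.
    print(mv.major, mv.minor, list(mv.all_defects))

    partial = parse_mime_version('1.')
    # The token is still returned, with a defect noting the missing minor number.
    print(partial.major, partial.minor,
          [type(d).__name__ for d in partial.defects])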

2208def get_invalid_parameter(value): 

2209 """ Read everything up to the next ';'. 

2210 

2211 This is outside the formal grammar. The InvalidParameter TokenList that is 

2212 returned acts like a Parameter, but the data attributes are None. 

2213 

2214 """ 

2215 invalid_parameter = InvalidParameter() 

2216 while value and value[0] != ';': 

2217 if value[0] in PHRASE_ENDS: 

2218 invalid_parameter.append(ValueTerminal(value[0], 

2219 'misplaced-special')) 

2220 value = value[1:] 

2221 else: 

2222 token, value = get_phrase(value) 

2223 invalid_parameter.append(token) 

2224 return invalid_parameter, value 

2225 

2226def get_ttext(value): 

2227 """ttext = 1*(any non-TOKEN_ENDS character)

2228 

2229 We allow any non-TOKEN_ENDS in ttext, but add defects to the token's 

2230 defects list if we find non-ttext characters. We also register defects for 

2231 *any* non-printables even though the RFC doesn't exclude all of them, 

2232 because we follow the spirit of RFC 5322. 

2233 

2234 """ 

2235 m = _non_token_end_matcher(value) 

2236 if not m: 

2237 raise errors.HeaderParseError( 

2238 "expected ttext but found '{}'".format(value)) 

2239 ttext = m.group() 

2240 value = value[len(ttext):] 

2241 ttext = ValueTerminal(ttext, 'ttext') 

2242 _validate_xtext(ttext) 

2243 return ttext, value 

2244 

2245def get_token(value): 

2246 """token = [CFWS] 1*ttext [CFWS] 

2247 

2248 The RFC equivalent of ttext is any US-ASCII chars except space, ctls, or 

2249 tspecials. We also exclude tabs even though the RFC doesn't. 

2250 

2251 The RFC implies the CFWS but is not explicit about it in the BNF. 

2252 

2253 """ 

2254 mtoken = Token() 

2255 if value and value[0] in CFWS_LEADER: 

2256 token, value = get_cfws(value) 

2257 mtoken.append(token) 

2258 if value and value[0] in TOKEN_ENDS: 

2259 raise errors.HeaderParseError( 

2260 "expected token but found '{}'".format(value)) 

2261 token, value = get_ttext(value) 

2262 mtoken.append(token) 

2263 if value and value[0] in CFWS_LEADER: 

2264 token, value = get_cfws(value) 

2265 mtoken.append(token) 

2266 return mtoken, value 

2267 

2268def get_attrtext(value): 

2269 """attrtext = 1*(any non-ATTRIBUTE_ENDS character) 

2270 

2271 We allow any non-ATTRIBUTE_ENDS in attrtext, but add defects to the 

2272 token's defects list if we find non-attrtext characters. We also register 

2273 defects for *any* non-printables even though the RFC doesn't exclude all of 

2274 them, because we follow the spirit of RFC 5322. 

2275 

2276 """ 

2277 m = _non_attribute_end_matcher(value) 

2278 if not m: 

2279 raise errors.HeaderParseError( 

2280 "expected attrtext but found {!r}".format(value)) 

2281 attrtext = m.group() 

2282 value = value[len(attrtext):] 

2283 attrtext = ValueTerminal(attrtext, 'attrtext') 

2284 _validate_xtext(attrtext) 

2285 return attrtext, value 

2286 

2287def get_attribute(value): 

2288 """ [CFWS] 1*attrtext [CFWS] 

2289 

2290 This version of the BNF makes the CFWS explicit, and as usual we use a 

2291 value terminal for the actual run of characters. The RFC equivalent of 

2292 attrtext is the token characters, with the subtraction of '*', "'", and '%'. 

2293 We include tab in the excluded set just as we do for token. 

2294 

2295 """ 

2296 attribute = Attribute() 

2297 if value and value[0] in CFWS_LEADER: 

2298 token, value = get_cfws(value) 

2299 attribute.append(token) 

2300 if value and value[0] in ATTRIBUTE_ENDS: 

2301 raise errors.HeaderParseError( 

2302 "expected token but found '{}'".format(value)) 

2303 token, value = get_attrtext(value) 

2304 attribute.append(token) 

2305 if value and value[0] in CFWS_LEADER: 

2306 token, value = get_cfws(value) 

2307 attribute.append(token) 

2308 return attribute, value 

2309 

2310def get_extended_attrtext(value): 

2311 """extended-attrtext = 1*(any non-ATTRIBUTE_ENDS character plus '%')

2312 

2313 This is a special parsing routine so that we get a value that 

2314 includes % escapes as a single string (which we decode as a single 

2315 string later). 

2316 

2317 """ 

2318 m = _non_extended_attribute_end_matcher(value) 

2319 if not m: 

2320 raise errors.HeaderParseError( 

2321 "expected extended attrtext but found {!r}".format(value)) 

2322 attrtext = m.group() 

2323 value = value[len(attrtext):] 

2324 attrtext = ValueTerminal(attrtext, 'extended-attrtext') 

2325 _validate_xtext(attrtext) 

2326 return attrtext, value 

2327 

2328def get_extended_attribute(value): 

2329 """ [CFWS] 1*extended_attrtext [CFWS] 

2330 

2331 This is like the non-extended version except we allow % characters, so that 

2332 we can pick up an encoded value as a single string. 

2333 

2334 """ 

2335 # XXX: should we have an ExtendedAttribute TokenList? 

2336 attribute = Attribute() 

2337 if value and value[0] in CFWS_LEADER: 

2338 token, value = get_cfws(value) 

2339 attribute.append(token) 

2340 if value and value[0] in EXTENDED_ATTRIBUTE_ENDS: 

2341 raise errors.HeaderParseError( 

2342 "expected token but found '{}'".format(value)) 

2343 token, value = get_extended_attrtext(value) 

2344 attribute.append(token) 

2345 if value and value[0] in CFWS_LEADER: 

2346 token, value = get_cfws(value) 

2347 attribute.append(token) 

2348 return attribute, value 

2349 

2350def get_section(value): 

2351 """ '*' digits 

2352 

2353 The formal BNF is more complicated because leading 0s are not allowed. We 

2354 check for that and add a defect. We also assume no CFWS is allowed between 

2355 the '*' and the digits, though the RFC is not crystal clear on that. 

2356 The caller should already have dealt with leading CFWS. 

2357 

2358 """ 

2359 section = Section() 

2360 if not value or value[0] != '*': 

2361 raise errors.HeaderParseError("Expected section but found {}".format( 

2362 value)) 

2363 section.append(ValueTerminal('*', 'section-marker')) 

2364 value = value[1:] 

2365 if not value or not value[0].isdigit(): 

2366 raise errors.HeaderParseError("Expected section number but " 

2367 "found {}".format(value)) 

2368 digits = '' 

2369 while value and value[0].isdigit(): 

2370 digits += value[0] 

2371 value = value[1:] 

2372 if digits[0] == '0' and digits != '0': 

2373 section.defects.append(errors.InvalidHeaderDefect(

2374 "section number has an invalid leading 0")) 

2375 section.number = int(digits) 

2376 section.append(ValueTerminal(digits, 'digits')) 

2377 return section, value 

2378 

2379 

2380def get_value(value): 

2381 """ quoted-string / attribute 

2382 

2383 """ 

2384 v = Value() 

2385 if not value: 

2386 raise errors.HeaderParseError("Expected value but found end of string") 

2387 leader = None 

2388 if value[0] in CFWS_LEADER: 

2389 leader, value = get_cfws(value) 

2390 if not value: 

2391 raise errors.HeaderParseError("Expected value but found " 

2392 "only {}".format(leader)) 

2393 if value[0] == '"': 

2394 token, value = get_quoted_string(value) 

2395 else: 

2396 token, value = get_extended_attribute(value) 

2397 if leader is not None: 

2398 token[:0] = [leader] 

2399 v.append(token) 

2400 return v, value 

2401 

2402def get_parameter(value): 

2403 """ attribute [section] ["*"] [CFWS] "=" value 

2404 

2405 The CFWS is implied by the RFC but not made explicit in the BNF. This 

2406 simplified form of the BNF from the RFC is made to conform with the RFC BNF 

2407 through some extra checks. We do it this way because it makes both error 

2408 recovery and working with the resulting parse tree easier. 

2409 """ 

2410 # It is possible CFWS would also be implicitly allowed between the section 

2411 # and the 'extended-attribute' marker (the '*'), but we've never seen that

2412 # in the wild and we will therefore ignore the possibility. 

2413 param = Parameter() 

2414 token, value = get_attribute(value) 

2415 param.append(token) 

2416 if not value or value[0] == ';': 

2417 param.defects.append(errors.InvalidHeaderDefect("Parameter contains " 

2418 "name ({}) but no value".format(token))) 

2419 return param, value 

2420 if value[0] == '*': 

2421 try: 

2422 token, value = get_section(value) 

2423 param.sectioned = True 

2424 param.append(token) 

2425 except errors.HeaderParseError: 

2426 pass 

2427 if not value: 

2428 raise errors.HeaderParseError("Incomplete parameter") 

2429 if value[0] == '*': 

2430 param.append(ValueTerminal('*', 'extended-parameter-marker')) 

2431 value = value[1:] 

2432 param.extended = True 

2433 if value[0] != '=': 

2434 raise errors.HeaderParseError("Parameter not followed by '='") 

2435 param.append(ValueTerminal('=', 'parameter-separator')) 

2436 value = value[1:] 

2437 leader = None 

2438 if value and value[0] in CFWS_LEADER: 

2439 token, value = get_cfws(value) 

2440 param.append(token) 

2441 remainder = None 

2442 appendto = param 

2443 if param.extended and value and value[0] == '"': 

2444 # Now for some serious hackery to handle the common invalid case of 

2445 # double quotes around an extended value. We also accept (with defect) 

2446 # a value marked as encoded that isn't really. 

2447 qstring, remainder = get_quoted_string(value) 

2448 inner_value = qstring.stripped_value 

2449 semi_valid = False 

2450 if param.section_number == 0: 

2451 if inner_value and inner_value[0] == "'": 

2452 semi_valid = True 

2453 else: 

2454 token, rest = get_attrtext(inner_value) 

2455 if rest and rest[0] == "'": 

2456 semi_valid = True 

2457 else: 

2458 try: 

2459 token, rest = get_extended_attrtext(inner_value) 

2460 except errors.HeaderParseError:

2461 pass 

2462 else: 

2463 if not rest: 

2464 semi_valid = True 

2465 if semi_valid: 

2466 param.defects.append(errors.InvalidHeaderDefect( 

2467 "Quoted string value for extended parameter is invalid")) 

2468 param.append(qstring) 

2469 for t in qstring: 

2470 if t.token_type == 'bare-quoted-string': 

2471 t[:] = [] 

2472 appendto = t 

2473 break 

2474 value = inner_value 

2475 else: 

2476 remainder = None 

2477 param.defects.append(errors.InvalidHeaderDefect( 

2478 "Parameter marked as extended but appears to have a " 

2479 "quoted string value that is non-encoded")) 

2480 if value and value[0] == "'": 

2481 token = None 

2482 else: 

2483 token, value = get_value(value) 

2484 if not param.extended or param.section_number > 0: 

2485 if not value or value[0] != "'": 

2486 appendto.append(token) 

2487 if remainder is not None: 

2488 assert not value, value 

2489 value = remainder 

2490 return param, value 

2491 param.defects.append(errors.InvalidHeaderDefect( 

2492 "Apparent initial-extended-value but attribute " 

2493 "was not marked as extended or was not initial section")) 

2494 if not value: 

2495 # Assume the charset/lang is missing and the token is the value. 

2496 param.defects.append(errors.InvalidHeaderDefect( 

2497 "Missing required charset/lang delimiters")) 

2498 appendto.append(token) 

2499 if remainder is None: 

2500 return param, value 

2501 else: 

2502 if token is not None: 

2503 for t in token: 

2504 if t.token_type == 'extended-attrtext': 

2505 break 

2506 t.token_type = 'attrtext'

2507 appendto.append(t) 

2508 param.charset = t.value 

2509 if value[0] != "'": 

2510 raise errors.HeaderParseError("Expected RFC2231 char/lang encoding " 

2511 "delimiter, but found {!r}".format(value)) 

2512 appendto.append(ValueTerminal("'", 'RFC2231-delimiter')) 

2513 value = value[1:] 

2514 if value and value[0] != "'": 

2515 token, value = get_attrtext(value) 

2516 appendto.append(token) 

2517 param.lang = token.value 

2518 if not value or value[0] != "'": 

2519 raise errors.HeaderParseError("Expected RFC2231 char/lang encoding " 

2520 "delimiter, but found {}".format(value)) 

2521 appendto.append(ValueTerminal("'", 'RFC2231-delimiter')) 

2522 value = value[1:] 

2523 if remainder is not None: 

2524 # Treat the rest of value as bare quoted string content. 

2525 v = Value() 

2526 while value: 

2527 if value[0] in WSP: 

2528 token, value = get_fws(value) 

2529 elif value[0] == '"': 

2530 token = ValueTerminal('"', 'DQUOTE') 

2531 value = value[1:] 

2532 else: 

2533 token, value = get_qcontent(value) 

2534 v.append(token) 

2535 token = v 

2536 else: 

2537 token, value = get_value(value) 

2538 appendto.append(token) 

2539 if remainder is not None: 

2540 assert not value, value 

2541 value = remainder 

2542 return param, value 

2543 
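A minimal sketch (not part of the module) of get_parameter on an RFC 2231
extended parameter (the value is the example string from RFC 2231 itself); it
shows the charset/language bookkeeping done by the code above:

    from email._header_value_parser import get_parameter

    param, rest = get_parameter(
        "title*=us-ascii'en'This%20is%20%2A%2A%2Afun%2A%2A%2A")
    # The attribute is marked extended (but not sectioned), and the charset
    # and language from the RFC 2231 delimiters are recorded on the token;
    # the percent-encoded value itself is decoded later, by MimeParameters.
    print(param.extended, param.sectioned, param.charset, param.lang)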

2544def parse_mime_parameters(value): 

2545 """ parameter *( ";" parameter ) 

2546 

2547 That BNF is meant to indicate this routine should only be called after 

2548 finding and handling the leading ';'. There is no corresponding rule in 

2549 the formal RFC grammar, but it is more convenient for us for the set of 

2550 parameters to be treated as its own TokenList. 

2551 

2552 This is a 'parse' routine because it consumes the remaining value, but it

2553 would never be called to parse a full header. Instead it is called to 

2554 parse everything after the non-parameter value of a specific MIME header. 

2555 

2556 """ 

2557 mime_parameters = MimeParameters() 

2558 while value: 

2559 try: 

2560 token, value = get_parameter(value) 

2561 mime_parameters.append(token) 

2562 except errors.HeaderParseError as err: 

2563 leader = None 

2564 if value[0] in CFWS_LEADER: 

2565 leader, value = get_cfws(value) 

2566 if not value: 

2567 mime_parameters.append(leader) 

2568 return mime_parameters 

2569 if value[0] == ';': 

2570 if leader is not None: 

2571 mime_parameters.append(leader) 

2572 mime_parameters.defects.append(errors.InvalidHeaderDefect( 

2573 "parameter entry with no content")) 

2574 else: 

2575 token, value = get_invalid_parameter(value) 

2576 if leader: 

2577 token[:0] = [leader] 

2578 mime_parameters.append(token) 

2579 mime_parameters.defects.append(errors.InvalidHeaderDefect( 

2580 "invalid parameter {!r}".format(token))) 

2581 if value and value[0] != ';': 

2582 # Junk after the otherwise valid parameter. Mark it as 

2583 # invalid, but it will have a value. 

2584 param = mime_parameters[-1] 

2585 param.token_type = 'invalid-parameter' 

2586 token, value = get_invalid_parameter(value) 

2587 param.extend(token) 

2588 mime_parameters.defects.append(errors.InvalidHeaderDefect( 

2589 "parameter with invalid trailing text {!r}".format(token))) 

2590 if value: 

2591 # Must be a ';' at this point. 

2592 mime_parameters.append(ValueTerminal(';', 'parameter-separator')) 

2593 value = value[1:] 

2594 return mime_parameters 

2595 
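A minimal sketch (not part of the module) of parse_mime_parameters; per the
docstring above it is given everything after the leading ';' of a MIME header.
It assumes the params attribute (defined earlier in this file) yields the
decoded (name, value) pairs, including RFC 2231 percent-decoding of extended
values; the filename is invented:

    from email._header_value_parser import parse_mime_parameters

    params = parse_mime_parameters(
        " charset=utf-8; filename*=utf-8''na%C3%AFve%20plan.txt")
    for name, value in params.params:
        print(name, value)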

2596def _find_mime_parameters(tokenlist, value): 

2597 """Do our best to find the parameters in an invalid MIME header 

2598 

2599 """ 

2600 while value and value[0] != ';': 

2601 if value[0] in PHRASE_ENDS: 

2602 tokenlist.append(ValueTerminal(value[0], 'misplaced-special')) 

2603 value = value[1:] 

2604 else: 

2605 token, value = get_phrase(value) 

2606 tokenlist.append(token) 

2607 if not value: 

2608 return 

2609 tokenlist.append(ValueTerminal(';', 'parameter-separator')) 

2610 tokenlist.append(parse_mime_parameters(value[1:])) 

2611 

2612def parse_content_type_header(value): 

2613 """ maintype "/" subtype *( ";" parameter ) 

2614 

2615 The maintype and subtype are tokens. Theoretically they could

2616 be checked against the official IANA list + x-token, but we 

2617 don't do that. 

2618 """ 

2619 ctype = ContentType() 

2620 recover = False 

2621 if not value: 

2622 ctype.defects.append(errors.HeaderMissingRequiredValue( 

2623 "Missing content type specification")) 

2624 return ctype 

2625 try: 

2626 token, value = get_token(value) 

2627 except errors.HeaderParseError: 

2628 ctype.defects.append(errors.InvalidHeaderDefect( 

2629 "Expected content maintype but found {!r}".format(value))) 

2630 _find_mime_parameters(ctype, value) 

2631 return ctype 

2632 ctype.append(token) 

2633 # XXX: If we really want to follow the formal grammar we should make 

2634 # maintype and subtype specialized TokenLists here. Probably not worth it.

2635 if not value or value[0] != '/': 

2636 ctype.defects.append(errors.InvalidHeaderDefect( 

2637 "Invalid content type")) 

2638 if value: 

2639 _find_mime_parameters(ctype, value) 

2640 return ctype 

2641 ctype.maintype = token.value.strip().lower() 

2642 ctype.append(ValueTerminal('/', 'content-type-separator')) 

2643 value = value[1:] 

2644 try: 

2645 token, value = get_token(value) 

2646 except errors.HeaderParseError: 

2647 ctype.defects.append(errors.InvalidHeaderDefect( 

2648 "Expected content subtype but found {!r}".format(value))) 

2649 _find_mime_parameters(ctype, value) 

2650 return ctype 

2651 ctype.append(token) 

2652 ctype.subtype = token.value.strip().lower() 

2653 if not value: 

2654 return ctype 

2655 if value[0] != ';': 

2656 ctype.defects.append(errors.InvalidHeaderDefect( 

2657 "Only parameters are valid after content type, but " 

2658 "found {!r}".format(value))) 

2659 # The RFC requires that a syntactically invalid content-type be treated 

2660 # as text/plain. Perhaps we should postel this, but we should probably 

2661 # only do that if we were checking the subtype value against IANA. 

2662 del ctype.maintype, ctype.subtype 

2663 _find_mime_parameters(ctype, value) 

2664 return ctype 

2665 ctype.append(ValueTerminal(';', 'parameter-separator')) 

2666 ctype.append(parse_mime_parameters(value[1:])) 

2667 return ctype 

2668 

2669def parse_content_disposition_header(value): 

2670 """ disposition-type *( ";" parameter ) 

2671 

2672 """ 

2673 disp_header = ContentDisposition() 

2674 if not value: 

2675 disp_header.defects.append(errors.HeaderMissingRequiredValue( 

2676 "Missing content disposition")) 

2677 return disp_header 

2678 try: 

2679 token, value = get_token(value) 

2680 except errors.HeaderParseError: 

2681 disp_header.defects.append(errors.InvalidHeaderDefect( 

2682 "Expected content disposition but found {!r}".format(value))) 

2683 _find_mime_parameters(disp_header, value) 

2684 return disp_header 

2685 disp_header.append(token) 

2686 disp_header.content_disposition = token.value.strip().lower() 

2687 if not value: 

2688 return disp_header 

2689 if value[0] != ';': 

2690 disp_header.defects.append(errors.InvalidHeaderDefect( 

2691 "Only parameters are valid after content disposition, but " 

2692 "found {!r}".format(value))) 

2693 _find_mime_parameters(disp_header, value) 

2694 return disp_header 

2695 disp_header.append(ValueTerminal(';', 'parameter-separator')) 

2696 disp_header.append(parse_mime_parameters(value[1:])) 

2697 return disp_header 

2698 
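A minimal sketch (not part of the module) of parse_content_disposition_header;
the filename is invented:

    from email._header_value_parser import parse_content_disposition_header

    disp = parse_content_disposition_header('attachment; filename="report.pdf"')
    # content_disposition is the lower-cased disposition type set above.
    print(disp.content_disposition)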

2699def parse_content_transfer_encoding_header(value): 

2700 """ mechanism 

2701 

2702 """ 

2703 # We should probably validate the values, since the list is fixed. 

2704 cte_header = ContentTransferEncoding() 

2705 if not value: 

2706 cte_header.defects.append(errors.HeaderMissingRequiredValue( 

2707 "Missing content transfer encoding")) 

2708 return cte_header 

2709 try: 

2710 token, value = get_token(value) 

2711 except errors.HeaderParseError: 

2712 cte_header.defects.append(errors.InvalidHeaderDefect( 

2713 "Expected content transfer encoding but found {!r}".format(value))) 

2714 else: 

2715 cte_header.append(token) 

2716 cte_header.cte = token.value.strip().lower() 

2717 if not value: 

2718 return cte_header 

2719 while value: 

2720 cte_header.defects.append(errors.InvalidHeaderDefect( 

2721 "Extra text after content transfer encoding")) 

2722 if value[0] in PHRASE_ENDS: 

2723 cte_header.append(ValueTerminal(value[0], 'misplaced-special')) 

2724 value = value[1:] 

2725 else: 

2726 token, value = get_phrase(value) 

2727 cte_header.append(token) 

2728 return cte_header 

2729 
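A minimal sketch (not part of the module) of
parse_content_transfer_encoding_header, including the defect added for
trailing junk:

    from email._header_value_parser import (
        parse_content_transfer_encoding_header)

    cte = parse_content_transfer_encoding_header('base64')
    print(cte.cte, list(cte.defects))

    noisy = parse_content_transfer_encoding_header('base64 oops')
    # The extra text is kept on the token but flagged with a defect.
    print(noisy.cte, [type(d).__name__ for d in noisy.defects])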

2730 

2731# 

2732# Header folding 

2733# 

2734# Header folding is complex, with lots of rules and corner cases. The 

2735# following code does its best to obey the rules and handle the corner 

2736 # cases, but you can be sure there are a few bugs. :)

2737# 

2738# This folder generally canonicalizes as it goes, preferring the stringified 

2739# version of each token. The tokens contain information that supports the 

2740# folder, including which tokens can be encoded in which ways. 

2741# 

2742# Folded text is accumulated in a simple list of strings ('lines'), each 

2743# one of which should be less than policy.max_line_length ('maxlen'). 

2744# 

2745 

2746def _steal_trailing_WSP_if_exists(lines): 

2747 wsp = '' 

2748 if lines and lines[-1] and lines[-1][-1] in WSP: 

2749 wsp = lines[-1][-1] 

2750 lines[-1] = lines[-1][:-1] 

2751 return wsp 

2752 
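A tiny sketch (not part of the module) of what _steal_trailing_WSP_if_exists
does for the folding code below: it removes a trailing blank from the last
line so that the blank can become the leading whitespace of the next folded
line:

    from email._header_value_parser import _steal_trailing_WSP_if_exists

    lines = ['Subject: hello ']
    wsp = _steal_trailing_WSP_if_exists(lines)
    print(repr(wsp), repr(lines[-1]))   # ' ' and 'Subject: hello'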

2753def _refold_parse_tree(parse_tree, *, policy): 

2754 """Return string of contents of parse_tree folded according to RFC rules. 

2755 

2756 """ 

2757 # max_line_length 0/None means no limit, ie: infinitely long. 

2758 maxlen = policy.max_line_length or sys.maxsize 

2759 encoding = 'utf-8' if policy.utf8 else 'us-ascii' 

2760 lines = [''] 

2761 last_ew = None 

2762 wrap_as_ew_blocked = 0 

2763 want_encoding = False 

2764 end_ew_not_allowed = Terminal('', 'wrap_as_ew_blocked') 

2765 parts = list(parse_tree) 

2766 while parts: 

2767 part = parts.pop(0) 

2768 if part is end_ew_not_allowed: 

2769 wrap_as_ew_blocked -= 1 

2770 continue 

2771 tstr = str(part) 

2772 if part.token_type == 'ptext' and set(tstr) & SPECIALS: 

2773 # Encode if tstr contains special characters. 

2774 want_encoding = True 

2775 try: 

2776 tstr.encode(encoding) 

2777 charset = encoding 

2778 except UnicodeEncodeError: 

2779 if any(isinstance(x, errors.UndecodableBytesDefect) 

2780 for x in part.all_defects): 

2781 charset = 'unknown-8bit' 

2782 else: 

2783 # If policy.utf8 is false this should really be taken from a 

2784 # 'charset' property on the policy. 

2785 charset = 'utf-8' 

2786 want_encoding = True 

2787 if part.token_type == 'mime-parameters': 

2788 # Mime parameter folding (using RFC2231) is extra special. 

2789 _fold_mime_parameters(part, lines, maxlen, encoding) 

2790 continue 

2791 if want_encoding and not wrap_as_ew_blocked: 

2792 if not part.as_ew_allowed: 

2793 want_encoding = False 

2794 last_ew = None 

2795 if part.syntactic_break: 

2796 encoded_part = part.fold(policy=policy)[:-len(policy.linesep)] 

2797 if policy.linesep not in encoded_part: 

2798 # It fits on a single line 

2799 if len(encoded_part) > maxlen - len(lines[-1]): 

2800 # But not on this one, so start a new one. 

2801 newline = _steal_trailing_WSP_if_exists(lines) 

2802 # XXX what if encoded_part has no leading FWS? 

2803 lines.append(newline) 

2804 lines[-1] += encoded_part 

2805 continue 

2806 # Either this is not a major syntactic break, so we don't 

2807 # want it on a line by itself even if it fits, or it 

2808 # doesn't fit on a line by itself. Either way, fall through 

2809 # to unpacking the subparts and wrapping them. 

2810 if not hasattr(part, 'encode'): 

2811 # It's not a Terminal, do each piece individually. 

2812 parts = list(part) + parts 

2813 else: 

2814 # It's a terminal, wrap it as an encoded word, possibly 

2815 # combining it with previously encoded words if allowed. 

2816 last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew, 

2817 part.ew_combine_allowed, charset) 

2818 want_encoding = False 

2819 continue 

2820 if len(tstr) <= maxlen - len(lines[-1]): 

2821 lines[-1] += tstr 

2822 continue 

2823 # This part is too long to fit. The RFC wants us to break at 

2824 # "major syntactic breaks", so if we consider this to be one, check

2825 # whether it will fit on the next line by itself.

2826 if (part.syntactic_break and 

2827 len(tstr) + 1 <= maxlen): 

2828 newline = _steal_trailing_WSP_if_exists(lines) 

2829 if newline or part.startswith_fws(): 

2830 lines.append(newline + tstr) 

2831 last_ew = None 

2832 continue 

2833 if not hasattr(part, 'encode'): 

2834 # It's not a terminal, try folding the subparts. 

2835 newparts = list(part) 

2836 if not part.as_ew_allowed: 

2837 wrap_as_ew_blocked += 1 

2838 newparts.append(end_ew_not_allowed) 

2839 parts = newparts + parts 

2840 continue 

2841 if part.as_ew_allowed and not wrap_as_ew_blocked: 

2842 # It doesn't need CTE encoding, but encode it anyway so we can 

2843 # wrap it. 

2844 parts.insert(0, part) 

2845 want_encoding = True 

2846 continue 

2847 # We can't figure out how to wrap it, so give up.

2848 newline = _steal_trailing_WSP_if_exists(lines) 

2849 if newline or part.startswith_fws(): 

2850 lines.append(newline + tstr) 

2851 else: 

2852 # We can't fold it onto the next line either... 

2853 lines[-1] += tstr 

2854 return policy.linesep.join(lines) + policy.linesep 

2855 
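A minimal sketch (not part of the module) of folding in action: build a parse
tree with get_unstructured and fold it. This assumes TokenList.fold (defined
earlier in this file) simply delegates to _refold_parse_tree with the given
policy, as the recursive part.fold() call above suggests; the text is invented:

    from email import policy
    from email._header_value_parser import get_unstructured

    tree = get_unstructured(
        'A subject that is comfortably longer than seventy-eight characters '
        'so that at least one fold is required somewhere in the middle')
    folded = tree.fold(policy=policy.SMTP)
    # Each output line fits within policy.SMTP.max_line_length, and lines are
    # joined (and terminated) with the policy's CRLF line separator.
    print(repr(folded))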

2856def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset): 

2857 """Fold string to_encode into lines as encoded word, combining if allowed. 

2858 Return the new value for last_ew, or None if ew_combine_allowed is False. 

2859 

2860 If there is already an encoded word in the last line of lines (indicated by 

2861 a non-None value for last_ew) and ew_combine_allowed is true, decode the 

2862 existing ew, combine it with to_encode, and re-encode. Otherwise, encode 

2863 to_encode. In either case, split to_encode as necessary so that the 

2864 encoded segments fit within maxlen. 

2865 

2866 """ 

2867 if last_ew is not None and ew_combine_allowed: 

2868 to_encode = str( 

2869 get_unstructured(lines[-1][last_ew:] + to_encode)) 

2870 lines[-1] = lines[-1][:last_ew] 

2871 if to_encode[0] in WSP: 

2872 # We're joining this to non-encoded text, so don't encode 

2873 # the leading blank. 

2874 leading_wsp = to_encode[0] 

2875 to_encode = to_encode[1:] 

2876 if (len(lines[-1]) == maxlen): 

2877 lines.append(_steal_trailing_WSP_if_exists(lines)) 

2878 lines[-1] += leading_wsp 

2879 trailing_wsp = '' 

2880 if to_encode[-1] in WSP: 

2881 # Likewise for the trailing space. 

2882 trailing_wsp = to_encode[-1] 

2883 to_encode = to_encode[:-1] 

2884 new_last_ew = len(lines[-1]) if last_ew is None else last_ew 

2885 

2886 encode_as = 'utf-8' if charset == 'us-ascii' else charset 

2887 

2888 # The RFC2047 chrome takes up 7 characters plus the length 

2889 # of the charset name. 

2890 chrome_len = len(encode_as) + 7 

2891 

2892 if (chrome_len + 1) >= maxlen: 

2893 raise errors.HeaderParseError( 

2894 "max_line_length is too small to fit an encoded word") 

2895 

2896 while to_encode: 

2897 remaining_space = maxlen - len(lines[-1]) 

2898 text_space = remaining_space - chrome_len 

2899 if text_space <= 0: 

2900 lines.append(' ') 

2901 continue 

2902 

2903 to_encode_word = to_encode[:text_space] 

2904 encoded_word = _ew.encode(to_encode_word, charset=encode_as) 

2905 excess = len(encoded_word) - remaining_space 

2906 while excess > 0: 

2907 # Since the chunk to encode is guaranteed to fit into less than 100 characters, 

2908 # shrinking it by one at a time shouldn't take long. 

2909 to_encode_word = to_encode_word[:-1] 

2910 encoded_word = _ew.encode(to_encode_word, charset=encode_as) 

2911 excess = len(encoded_word) - remaining_space 

2912 lines[-1] += encoded_word 

2913 to_encode = to_encode[len(to_encode_word):] 

2914 

2915 if to_encode: 

2916 lines.append(' ') 

2917 new_last_ew = len(lines[-1]) 

2918 lines[-1] += trailing_wsp 

2919 return new_last_ew if ew_combine_allowed else None 

2920 
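A minimal sketch (not part of the module) showing when the encoded-word path
above is taken: non-ASCII runs in an unstructured value are emitted as
RFC 2047 encoded words when folded under a non-UTF-8 policy; the text is
invented:

    from email import policy
    from email._header_value_parser import get_unstructured

    tree = get_unstructured('Café menu for the naïve réunion committee')
    folded = tree.fold(policy=policy.SMTP)
    # The non-ASCII text comes back inside =?utf-8?...?= encoded words
    # (neighbouring words may be combined into the same encoded word), split
    # across lines if needed so each line respects max_line_length.
    print(folded)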

2921def _fold_mime_parameters(part, lines, maxlen, encoding): 

2922 """Fold TokenList 'part' into the 'lines' list as mime parameters. 

2923 

2924 Using the decoded list of parameters and values, format them according to 

2925 the RFC rules, including using RFC2231 encoding if the value cannot be 

2926 expressed in 'encoding' and/or the parameter+value is too long to fit 

2927 within 'maxlen'. 

2928 

2929 """ 

2930 # Special case for RFC2231 encoding: start from decoded values and use 

2931 # RFC2231 encoding iff needed. 

2932 # 

2933 # Note that the 1 and 2s being added to the length calculations are 

2934 # accounting for the possibly-needed spaces and semicolons we'll be adding. 

2935 # 

2936 for name, value in part.params: 

2937 # XXX What if this ';' puts us over maxlen the first time through the 

2938 # loop? We should split the header value onto a newline in that case, 

2939 # but to do that we need to recognize the need earlier or reparse the 

2940 # header, so I'm going to ignore that bug for now. It'll only put us 

2941 # one character over. 

2942 if not lines[-1].rstrip().endswith(';'): 

2943 lines[-1] += ';' 

2944 charset = encoding 

2945 error_handler = 'strict' 

2946 try: 

2947 value.encode(encoding) 

2948 encoding_required = False 

2949 except UnicodeEncodeError: 

2950 encoding_required = True 

2951 if utils._has_surrogates(value): 

2952 charset = 'unknown-8bit' 

2953 error_handler = 'surrogateescape' 

2954 else: 

2955 charset = 'utf-8' 

2956 if encoding_required: 

2957 encoded_value = urllib.parse.quote( 

2958 value, safe='', errors=error_handler) 

2959 tstr = "{}*={}''{}".format(name, charset, encoded_value) 

2960 else: 

2961 tstr = '{}={}'.format(name, quote_string(value)) 

2962 if len(lines[-1]) + len(tstr) + 1 < maxlen: 

2963 lines[-1] = lines[-1] + ' ' + tstr 

2964 continue 

2965 elif len(tstr) + 2 <= maxlen: 

2966 lines.append(' ' + tstr) 

2967 continue 

2968 # We need multiple sections. We are allowed to mix encoded and 

2969 # non-encoded sections, but we aren't going to. We'll encode them all. 

2970 section = 0 

2971 extra_chrome = charset + "''" 

2972 while value: 

2973 chrome_len = len(name) + len(str(section)) + 3 + len(extra_chrome) 

2974 if maxlen <= chrome_len + 3: 

2975 # We need room for the leading blank, the trailing semicolon, 

2976 # and at least one character of the value. If we don't 

2977 # have that, we'd be stuck, so in that case fall back to 

2978 # the RFC standard width. 

2979 maxlen = 78 

2980 splitpoint = maxchars = maxlen - chrome_len - 2 

2981 while True: 

2982 partial = value[:splitpoint] 

2983 encoded_value = urllib.parse.quote( 

2984 partial, safe='', errors=error_handler) 

2985 if len(encoded_value) <= maxchars: 

2986 break 

2987 splitpoint -= 1 

2988 lines.append(" {}*{}*={}{}".format( 

2989 name, section, extra_chrome, encoded_value)) 

2990 extra_chrome = '' 

2991 section += 1 

2992 value = value[splitpoint:] 

2993 if value: 

2994 lines[-1] += ';'