1# Copyright (C) 2002-2007 Python Software Foundation
2# Contact: email-sig@python.org
3
4"""Email address parsing code.
5
6Lifted directly from rfc822.py. This should eventually be rewritten.
7"""
8
9from __future__ import unicode_literals
10from __future__ import print_function
11from __future__ import division
12from __future__ import absolute_import
13from future.builtins import int
14
# Public names exported by a star-import of this module.  Only the date
# helpers and quote() are listed; the address-parsing classes defined in this
# file (AddrlistClass, AddressList) are intentionally not part of this list.
__all__ = [
    'mktime_tz',
    'parsedate',
    'parsedate_tz',
    'quote',
    ]
21
22import time, calendar
23
# Shared string constants, convenient as str.join() separators.
SPACE = ' '
EMPTYSTRING = ''
COMMASPACE = ', '
27
# Lookup tables used when parsing a date field.

# Month names, lower-cased: the twelve three-letter abbreviations first,
# followed by the twelve full names.  'may' occurs in both runs; list.index()
# reports its first position.
_monthnames = (
    'jan feb mar apr may jun jul aug sep oct nov dec '
    'january february march april may june july '
    'august september october november december'
).split()

# Day-of-week abbreviations, lower-cased.
_daynames = 'mon tue wed thu fri sat sun'.split()

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.
#
# Values are offsets written as signed HHMM integers (e.g. -500 is five
# hours west of UTC), not seconds.
_timezones = {
    # Universal time
    'UT': 0,
    'UTC': 0,
    'GMT': 0,
    'Z': 0,
    # Atlantic (used in Canada)
    'AST': -400,
    'ADT': -300,
    # Eastern
    'EST': -500,
    'EDT': -400,
    # Central
    'CST': -600,
    'CDT': -500,
    # Mountain
    'MST': -700,
    'MDT': -600,
    # Pacific
    'PST': -800,
    'PDT': -700,
}
49
50
def parsedate_tz(data):
    """Convert a date string to a 10-item time tuple.

    The parsing itself is done by _parsedate_tz().  Returns None when that
    helper produces no result.  Otherwise the parsed fields are returned as
    a tuple whose last item is the timezone offset; an offset the helper
    left as None is reported as 0.
    """
    parsed = _parsedate_tz(data)
    if not parsed:
        return None
    offset = parsed[9]
    if offset is None:
        offset = 0
    return tuple(parsed[:9]) + (offset,)
62
def _parsedate_tz(data):
    """Convert date to extended time tuple.

    Returns a 10-item list
    ``[year, month, day, hour, minute, second, 0, 1, -1, tzoffset]`` or
    None when `data` is empty, whitespace-only, or cannot be parsed.

    The last (additional) element is the time zone offset in seconds, except
    if the timezone was specified as -0000.  In that case the last element
    is None.  This indicates a UTC timestamp that explicitly declaims
    knowledge of the source timezone, as opposed to a +0000 timestamp that
    indicates the source timezone really was UTC.  The offset is also None
    when the zone is missing or is neither a known name nor an integer.

    """
    if not data:
        return None
    data = data.split()
    if not data:
        # Whitespace-only input splits to nothing; indexing below would fail.
        return None
    # The FWS after the comma after the day-of-week is optional, so search and
    # adjust for this.
    if data[0].endswith(',') or data[0].lower() in _daynames:
        # There's a dayname here. Skip it
        del data[0]
    else:
        i = data[0].rfind(',')
        if i >= 0:
            data[0] = data[0][i+1:]
    if len(data) == 3:  # RFC 850 date, deprecated
        stuff = data[0].split('-')
        if len(stuff) == 3:
            data = stuff + data[1:]
    if len(data) == 4:
        # The zone may be glued to the time ("10:00:00+0100"); split it off.
        s = data[3]
        i = s.find('+')
        if i == -1:
            i = s.find('-')
        if i > 0:
            data[3:] = [s[:i], s[i:]]
        else:
            data.append('')  # Dummy tz
    if len(data) < 5:
        return None
    dd, mm, yy, tm, tz = data[:5]
    if not (dd and mm and yy):
        # The RFC 850 split can yield empty pieces (e.g. "-jan-99"); the
        # [-1] / [0] indexing below would raise on them.
        return None
    mm = mm.lower()
    if mm not in _monthnames:
        # Maybe day and month are swapped ("Jan 1" rather than "1 Jan").
        dd, mm = mm, dd.lower()
        if mm not in _monthnames:
            return None
    mm = _monthnames.index(mm) + 1
    if mm > 12:
        # Full month names occupy the second half of the table.
        mm -= 12
    if dd[-1] == ',':
        dd = dd[:-1]
    i = yy.find(':')
    if i > 0:
        # The year and time fields are in the opposite order.
        yy, tm = tm, yy
    if yy[-1] == ',':
        yy = yy[:-1]
        if not yy:
            # The field was a lone comma; there is no year to inspect.
            return None
    if not yy[0].isdigit():
        # The year and zone fields are in the opposite order.
        yy, tz = tz, yy
    if tm[-1] == ',':
        tm = tm[:-1]
    tm = tm.split(':')
    if len(tm) == 1 and '.' in tm[0]:
        # Some non-compliant MUAs use '.' to separate time elements.
        tm = tm[0].split('.')
    if len(tm) == 2:
        thh, tmm = tm
        tss = '0'
    elif len(tm) == 3:
        thh, tmm, tss = tm
    else:
        # Anything other than HH:MM or HH:MM:SS (with ':' or '.') is invalid.
        return None
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None
    # Check for a yy specified in two-digit format, then convert it to the
    # appropriate four-digit format, according to the POSIX standard. RFC 822
    # calls for a two-digit yy, but RFC 2822 (which obsoletes RFC 822)
    # mandates a 4-digit yy. For more information, see the documentation for
    # the time module.
    if yy < 100:
        # The year is between 1969 and 1999 (inclusive).
        if yy > 68:
            yy += 1900
        # The year is between 2000 and 2068 (inclusive).
        else:
            yy += 2000
    tzoffset = None
    tz = tz.upper()
    if tz in _timezones:
        tzoffset = _timezones[tz]
    else:
        try:
            tzoffset = int(tz)
        except ValueError:
            pass
        if tzoffset == 0 and tz.startswith('-'):
            # "-0000": UTC time, but the source zone is explicitly unknown.
            tzoffset = None
    # Convert a timezone offset into seconds ; -0500 -> -18000
    if tzoffset:
        if tzoffset < 0:
            tzsign = -1
            tzoffset = -tzoffset
        else:
            tzsign = 1
        tzoffset = tzsign * ((tzoffset // 100) * 3600 + (tzoffset % 100) * 60)
    # Daylight Saving Time flag is set to -1, since DST is unknown.
    return [yy, mm, dd, thh, tmm, tss, 0, 1, -1, tzoffset]
178
179
def parsedate(data):
    """Convert a time string to a 9-item time tuple.

    Calls parsedate_tz() and drops the trailing timezone offset from a tuple
    result.  Any non-tuple result is returned unchanged.
    """
    parsed = parsedate_tz(data)
    return parsed[:9] if isinstance(parsed, tuple) else parsed
187
188
def mktime_tz(data):
    """Turn a 10-tuple as returned by parsedate_tz() into a POSIX timestamp.

    `data` may be any 10-item sequence (tuple or list) laid out like the
    result of parsedate_tz(); item 9 is the timezone offset in seconds.
    When the offset is None the fields are interpreted as local time via
    time.mktime(); otherwise they are interpreted as UTC via
    calendar.timegm() and the offset is subtracted.
    """
    if data[9] is None:
        # No zone info, so localtime is better assumption than GMT.
        # tuple() lets a list be passed here just as in the branch below;
        # the DST flag is forced to -1 (unknown).
        return time.mktime(tuple(data[:8]) + (-1,))
    return calendar.timegm(data) - data[9]
197
198
def quote(str):
    """Escape a string for use inside a quoted string.

    Each backslash and each double quote character is preceded by a
    backslash, making it a quoted pair.  These are the only characters that
    need quoting inside a quoted string.  The surrounding double quotes are
    not added.
    """
    # Backslashes must be doubled first so that the backslashes introduced
    # for the double quotes are not escaped a second time.
    escaped = str.replace('\\', '\\\\')
    return escaped.replace('"', '\\"')
207
208
class AddrlistClass(object):
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of RFC 2822 in
    front of you.

    The parser keeps a cursor (`pos`) into the header text (`field`) and the
    get*() methods each consume one syntactic element starting at the cursor.
    Comments encountered along the way are collected in `commentlist`.

    Note: this class interface is deprecated and may be removed in the future.
    Use email.utils.AddressList instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing
        one or more addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r\n'
        self.FWS = self.LWS + self.CR
        self.atomends = self.specials + self.LWS + self.CR
        # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
        # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
        # syntax, so allow dots in phrases.
        self.phraseends = self.atomends.replace('.', '')
        self.field = field
        self.commentlist = []

    def gotonext(self):
        """Skip white space and extract comments.

        Returns the skipped spaces and tabs as a string; CR and LF are
        skipped but not included.  Comments are appended to `commentlist`.
        """
        wslist = []
        while self.pos < len(self.field):
            char = self.field[self.pos]
            if char in self.FWS:
                if char not in '\n\r':
                    wslist.append(char)
                self.pos += 1
            elif char == '(':
                self.commentlist.append(self.getcomment())
            else:
                break
        return ''.join(wslist)

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list containing all of the addresses, each a
        (name, addr-spec) pair.  An address that yields nothing is recorded
        as ('', '').
        """
        result = []
        while self.pos < len(self.field):
            ad = self.getaddress()
            if ad:
                result += ad
            else:
                result.append(('', ''))
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of (name, addr-spec) pairs: one pair for a single
        mailbox, several for a group, or an empty list when nothing usable
        was found.  A trailing comma is consumed.
        """
        self.commentlist = []
        self.gotonext()

        oldpos = self.pos
        # Snapshot (not alias) the comments seen so far: getphraselist()
        # appends to the live list, and the addr-spec branch below re-scans
        # the same text, which would otherwise record each comment twice.
        oldcl = self.commentlist[:]
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []

        if self.pos >= len(self.field):
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(' '.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group
            returnlist = []

            fieldlen = len(self.field)
            self.pos += 1
            while self.pos < len(self.field):
                self.gotonext()
                if self.pos < fieldlen and self.field[self.pos] == ';':
                    self.pos += 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                returnlist = [(' '.join(plist) + ' (' +
                               ' '.join(self.commentlist) + ')', routeaddr)]
            else:
                returnlist = [(' '.join(plist), routeaddr)]

        else:
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Skip a stray special so the caller's loop makes progress.
                self.pos += 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos += 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns None if the cursor is not on '<'.
        """
        if self.field[self.pos] != '<':
            return None

        expectroute = False
        self.pos += 1
        self.gotonext()
        adlist = ''
        while self.pos < len(self.field):
            if expectroute:
                # Domain of a source route; parsed only to be discarded.
                self.getdomain()
                expectroute = False
            elif self.field[self.pos] == '>':
                self.pos += 1
                break
            elif self.field[self.pos] == '@':
                self.pos += 1
                expectroute = True
            elif self.field[self.pos] == ':':
                self.pos += 1
            else:
                adlist = self.getaddrspec()
                # Step over the character that ended the addr-spec.
                self.pos += 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC 2822 addr-spec.

        Returns the addr-spec as a string.  If an '@' is present but no
        valid domain follows it, the empty string is returned rather than a
        bare local part, so that a failed parse is not mistaken for an
        address.
        """
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            preserve_ws = True
            if self.field[self.pos] == '.':
                # Whitespace around a dot in the local part is dropped.
                if aslist and not aslist[-1].strip():
                    aslist.pop()
                aslist.append('.')
                self.pos += 1
                preserve_ws = False
            elif self.field[self.pos] == '"':
                aslist.append('"%s"' % quote(self.getquote()))
            elif self.field[self.pos] in self.atomends:
                if aslist and not aslist[-1].strip():
                    aslist.pop()
                break
            else:
                aslist.append(self.getatom())
            ws = self.gotonext()
            if preserve_ws and ws:
                aslist.append(ws)

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            return ''.join(aslist)

        aslist.append('@')
        self.pos += 1
        self.gotonext()
        domain = self.getdomain()
        if not domain:
            # Invalid domain, return an empty address instead of returning a
            # local part to denote failed parsing.
            return ''
        return ''.join(aslist) + domain

    def getdomain(self):
        """Get the complete domain name from an address.

        Returns the empty string if a second '@' is found, since such a
        domain is invalid.
        """
        sdlist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] == '[':
                sdlist.append(self.getdomainliteral())
            elif self.field[self.pos] == '.':
                self.pos += 1
                sdlist.append('.')
            elif self.field[self.pos] == '@':
                # Don't parse domains with two `@' such as
                # `a@malicious.org@important.com': accepting the first
                # domain would let the visible address differ from the
                # parsed one.
                return ''
            elif self.field[self.pos] in self.atomends:
                break
            else:
                sdlist.append(self.getatom())
        return ''.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments=True):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.
        If self is not looking at an instance of `beginchar' then
        getdelimited returns the empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
        within the parsed fragment.
        """
        if self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        # True right after a backslash: the next character is taken
        # literally, even if it is an end character.
        escaped = False
        self.pos += 1
        while self.pos < len(self.field):
            if escaped:
                slist.append(self.field[self.pos])
                escaped = False
            elif self.field[self.pos] in endchars:
                self.pos += 1
                break
            elif allowcomments and self.field[self.pos] == '(':
                slist.append(self.getcomment())
                continue        # have already advanced pos from getcomment
            elif self.field[self.pos] == '\\':
                escaped = True
            else:
                slist.append(self.field[self.pos])
            self.pos += 1

        return ''.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', False)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', True)

    def getdomainliteral(self):
        """Parse an RFC 2822 domain-literal."""
        return '[%s]' % self.getdelimited('[', ']\r', False)

    def getatom(self, atomends=None):
        """Parse an RFC 2822 atom.

        Optional atomends specifies a different set of end token delimiters
        (the default is to use self.atomends).  This is used e.g. in
        getphraselist() since phrase endings must not include the `.' (which
        is legal in phrases)."""
        atomlist = ['']
        if atomends is None:
            atomends = self.atomends

        while self.pos < len(self.field):
            if self.field[self.pos] in atomends:
                break
            atomlist.append(self.field[self.pos])
            self.pos += 1

        return ''.join(atomlist)

    def getphraselist(self):
        """Parse a sequence of RFC 2822 phrases.

        A phrase is a sequence of words, which are in turn either RFC 2822
        atoms or quoted-strings.  Phrases are canonicalized by squeezing all
        runs of continuous whitespace into one space.

        Returns the list of words; comments met on the way are appended to
        `commentlist`.
        """
        plist = []

        while self.pos < len(self.field):
            if self.field[self.pos] in self.FWS:
                self.pos += 1
            elif self.field[self.pos] == '"':
                plist.append(self.getquote())
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] in self.phraseends:
                break
            else:
                plist.append(self.getatom(self.phraseends))

        return plist
500
class AddressList(AddrlistClass):
    """An AddressList encapsulates a list of parsed RFC 2822 addresses."""

    def __init__(self, field):
        """Parse `field' into `addresslist'; a false `field' gives []."""
        AddrlistClass.__init__(self, field)
        self.addresslist = self.getaddrlist() if field else []

    def __len__(self):
        """Return the number of parsed addresses."""
        return len(self.addresslist)

    def __add__(self, other):
        """Return a new list: self's addresses plus those only in other."""
        # Set union
        union = AddressList(None)
        additions = [addr for addr in other.addresslist
                     if addr not in self.addresslist]
        union.addresslist = self.addresslist[:] + additions
        return union

    def __iadd__(self, other):
        """Append to self the addresses of other that it lacks."""
        # Set union, in-place.  Kept as a loop: the membership test must see
        # entries appended earlier in this same pass.
        for addr in other.addresslist:
            if addr not in self.addresslist:
                self.addresslist.append(addr)
        return self

    def __sub__(self, other):
        """Return a new list of self's addresses that are not in other."""
        # Set difference
        difference = AddressList(None)
        difference.addresslist.extend(
            addr for addr in self.addresslist
            if addr not in other.addresslist)
        return difference

    def __isub__(self, other):
        """Remove from self one occurrence of each address in other."""
        # Set difference, in-place
        for addr in other.addresslist:
            if addr in self.addresslist:
                self.addresslist.remove(addr)
        return self

    def __getitem__(self, index):
        """Return the address (or slice of addresses) at `index'."""
        # Make indexing, slices, and 'in' work
        return self.addresslist[index]