From f93de4a4d5258ea827a73a5b2aaca587c081b81b Mon Sep 17 00:00:00 2001 From: Paul Sokolovsky Date: Sat, 24 May 2014 16:15:27 +0300 Subject: [PATCH] email.internal: Add pristine from CPython 3.3.3. This dist-package hosts few "package private" modules with names starting with "_". Maybe this is not the best grouping, but having a separate dist-package for each such module is a bit verbose. --- email.internal/email/_encoded_words.py | 221 ++++++++++ email.internal/email/_parseaddr.py | 540 +++++++++++++++++++++++++ email.internal/email/_policybase.py | 358 ++++++++++++++++ 3 files changed, 1119 insertions(+) create mode 100644 email.internal/email/_encoded_words.py create mode 100644 email.internal/email/_parseaddr.py create mode 100644 email.internal/email/_policybase.py diff --git a/email.internal/email/_encoded_words.py b/email.internal/email/_encoded_words.py new file mode 100644 index 00000000..9e0cc75b --- /dev/null +++ b/email.internal/email/_encoded_words.py @@ -0,0 +1,221 @@ +""" Routines for manipulating RFC2047 encoded words. + +This is currently a package-private API, but will be considered for promotion +to a public API if there is demand. + +""" + +# An ecoded word looks like this: +# +# =?charset[*lang]?cte?encoded_string?= +# +# for more information about charset see the charset module. Here it is one +# of the preferred MIME charset names (hopefully; you never know when parsing). +# cte (Content Transfer Encoding) is either 'q' or 'b' (ignoring case). In +# theory other letters could be used for other encodings, but in practice this +# (almost?) never happens. There could be a public API for adding entries +# to the CTE tables, but YAGNI for now. 'q' is Quoted Printable, 'b' is +# Base64. The meaning of encoded_string should be obvious. 'lang' is optional +# as indicated by the brackets (they are not part of the syntax) but is almost +# never encountered in practice. +# +# The general interface for a CTE decoder is that it takes the encoded_string +# as its argument, and returns a tuple (cte_decoded_string, defects). The +# cte_decoded_string is the original binary that was encoded using the +# specified cte. 'defects' is a list of MessageDefect instances indicating any +# problems encountered during conversion. 'charset' and 'lang' are the +# corresponding strings extracted from the EW, case preserved. +# +# The general interface for a CTE encoder is that it takes a binary sequence +# as input and returns the cte_encoded_string, which is an ascii-only string. +# +# Each decoder must also supply a length function that takes the binary +# sequence as its argument and returns the length of the resulting encoded +# string. +# +# The main API functions for the module are decode, which calls the decoder +# referenced by the cte specifier, and encode, which adds the appropriate +# RFC 2047 "chrome" to the encoded string, and can optionally automatically +# select the shortest possible encoding. See their docstrings below for +# details. + +import re +import base64 +import binascii +import functools +from string import ascii_letters, digits +from email import errors + +__all__ = ['decode_q', + 'encode_q', + 'decode_b', + 'encode_b', + 'len_q', + 'len_b', + 'decode', + 'encode', + ] + +# +# Quoted Printable +# + +# regex based decoder. +_q_byte_subber = functools.partial(re.compile(br'=([a-fA-F0-9]{2})').sub, + lambda m: bytes([int(m.group(1), 16)])) + +def decode_q(encoded): + encoded = encoded.replace(b'_', b' ') + return _q_byte_subber(encoded), [] + + +# dict mapping bytes to their encoded form +class _QByteMap(dict): + + safe = b'-!*+/' + ascii_letters.encode('ascii') + digits.encode('ascii') + + def __missing__(self, key): + if key in self.safe: + self[key] = chr(key) + else: + self[key] = "={:02X}".format(key) + return self[key] + +_q_byte_map = _QByteMap() + +# In headers spaces are mapped to '_'. +_q_byte_map[ord(' ')] = '_' + +def encode_q(bstring): + return ''.join(_q_byte_map[x] for x in bstring) + +def len_q(bstring): + return sum(len(_q_byte_map[x]) for x in bstring) + + +# +# Base64 +# + +def decode_b(encoded): + defects = [] + pad_err = len(encoded) % 4 + if pad_err: + defects.append(errors.InvalidBase64PaddingDefect()) + padded_encoded = encoded + b'==='[:4-pad_err] + else: + padded_encoded = encoded + try: + return base64.b64decode(padded_encoded, validate=True), defects + except binascii.Error: + # Since we had correct padding, this must an invalid char error. + defects = [errors.InvalidBase64CharactersDefect()] + # The non-alphabet characters are ignored as far as padding + # goes, but we don't know how many there are. So we'll just + # try various padding lengths until something works. + for i in 0, 1, 2, 3: + try: + return base64.b64decode(encoded+b'='*i, validate=False), defects + except binascii.Error: + if i==0: + defects.append(errors.InvalidBase64PaddingDefect()) + else: + # This should never happen. + raise AssertionError("unexpected binascii.Error") + +def encode_b(bstring): + return base64.b64encode(bstring).decode('ascii') + +def len_b(bstring): + groups_of_3, leftover = divmod(len(bstring), 3) + # 4 bytes out for each 3 bytes (or nonzero fraction thereof) in. + return groups_of_3 * 4 + (4 if leftover else 0) + + +_cte_decoders = { + 'q': decode_q, + 'b': decode_b, + } + +def decode(ew): + """Decode encoded word and return (string, charset, lang, defects) tuple. + + An RFC 2047/2243 encoded word has the form: + + =?charset*lang?cte?encoded_string?= + + where '*lang' may be omitted but the other parts may not be. + + This function expects exactly such a string (that is, it does not check the + syntax and may raise errors if the string is not well formed), and returns + the encoded_string decoded first from its Content Transfer Encoding and + then from the resulting bytes into unicode using the specified charset. If + the cte-decoded string does not successfully decode using the specified + character set, a defect is added to the defects list and the unknown octets + are replaced by the unicode 'unknown' character \uFDFF. + + The specified charset and language are returned. The default for language, + which is rarely if ever encountered, is the empty string. + + """ + _, charset, cte, cte_string, _ = ew.split('?') + charset, _, lang = charset.partition('*') + cte = cte.lower() + # Recover the original bytes and do CTE decoding. + bstring = cte_string.encode('ascii', 'surrogateescape') + bstring, defects = _cte_decoders[cte](bstring) + # Turn the CTE decoded bytes into unicode. + try: + string = bstring.decode(charset) + except UnicodeError: + defects.append(errors.UndecodableBytesDefect("Encoded word " + "contains bytes not decodable using {} charset".format(charset))) + string = bstring.decode(charset, 'surrogateescape') + except LookupError: + string = bstring.decode('ascii', 'surrogateescape') + if charset.lower() != 'unknown-8bit': + defects.append(errors.CharsetError("Unknown charset {} " + "in encoded word; decoded as unknown bytes".format(charset))) + return string, charset, lang, defects + + +_cte_encoders = { + 'q': encode_q, + 'b': encode_b, + } + +_cte_encode_length = { + 'q': len_q, + 'b': len_b, + } + +def encode(string, charset='utf-8', encoding=None, lang=''): + """Encode string using the CTE encoding that produces the shorter result. + + Produces an RFC 2047/2243 encoded word of the form: + + =?charset*lang?cte?encoded_string?= + + where '*lang' is omitted unless the 'lang' parameter is given a value. + Optional argument charset (defaults to utf-8) specifies the charset to use + to encode the string to binary before CTE encoding it. Optional argument + 'encoding' is the cte specifier for the encoding that should be used ('q' + or 'b'); if it is None (the default) the encoding which produces the + shortest encoded sequence is used, except that 'q' is preferred if it is up + to five characters longer. Optional argument 'lang' (default '') gives the + RFC 2243 language string to specify in the encoded word. + + """ + if charset == 'unknown-8bit': + bstring = string.encode('ascii', 'surrogateescape') + else: + bstring = string.encode(charset) + if encoding is None: + qlen = _cte_encode_length['q'](bstring) + blen = _cte_encode_length['b'](bstring) + # Bias toward q. 5 is arbitrary. + encoding = 'q' if qlen - blen < 5 else 'b' + encoded = _cte_encoders[encoding](bstring) + if lang: + lang = '*' + lang + return "=?{}{}?{}?{}?=".format(charset, lang, encoding, encoded) diff --git a/email.internal/email/_parseaddr.py b/email.internal/email/_parseaddr.py new file mode 100644 index 00000000..cdfa3729 --- /dev/null +++ b/email.internal/email/_parseaddr.py @@ -0,0 +1,540 @@ +# Copyright (C) 2002-2007 Python Software Foundation +# Contact: email-sig@python.org + +"""Email address parsing code. + +Lifted directly from rfc822.py. This should eventually be rewritten. +""" + +__all__ = [ + 'mktime_tz', + 'parsedate', + 'parsedate_tz', + 'quote', + ] + +import time, calendar + +SPACE = ' ' +EMPTYSTRING = '' +COMMASPACE = ', ' + +# Parse a date field +_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', + 'aug', 'sep', 'oct', 'nov', 'dec', + 'january', 'february', 'march', 'april', 'may', 'june', 'july', + 'august', 'september', 'october', 'november', 'december'] + +_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'] + +# The timezone table does not include the military time zones defined +# in RFC822, other than Z. According to RFC1123, the description in +# RFC822 gets the signs wrong, so we can't rely on any such time +# zones. RFC1123 recommends that numeric timezone indicators be used +# instead of timezone names. + +_timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0, + 'AST': -400, 'ADT': -300, # Atlantic (used in Canada) + 'EST': -500, 'EDT': -400, # Eastern + 'CST': -600, 'CDT': -500, # Central + 'MST': -700, 'MDT': -600, # Mountain + 'PST': -800, 'PDT': -700 # Pacific + } + + +def parsedate_tz(data): + """Convert a date string to a time tuple. + + Accounts for military timezones. + """ + res = _parsedate_tz(data) + if not res: + return + if res[9] is None: + res[9] = 0 + return tuple(res) + +def _parsedate_tz(data): + """Convert date to extended time tuple. + + The last (additional) element is the time zone offset in seconds, except if + the timezone was specified as -0000. In that case the last element is + None. This indicates a UTC timestamp that explicitly declaims knowledge of + the source timezone, as opposed to a +0000 timestamp that indicates the + source timezone really was UTC. + + """ + if not data: + return + data = data.split() + # The FWS after the comma after the day-of-week is optional, so search and + # adjust for this. + if data[0].endswith(',') or data[0].lower() in _daynames: + # There's a dayname here. Skip it + del data[0] + else: + i = data[0].rfind(',') + if i >= 0: + data[0] = data[0][i+1:] + if len(data) == 3: # RFC 850 date, deprecated + stuff = data[0].split('-') + if len(stuff) == 3: + data = stuff + data[1:] + if len(data) == 4: + s = data[3] + i = s.find('+') + if i == -1: + i = s.find('-') + if i > 0: + data[3:] = [s[:i], s[i:]] + else: + data.append('') # Dummy tz + if len(data) < 5: + return None + data = data[:5] + [dd, mm, yy, tm, tz] = data + mm = mm.lower() + if mm not in _monthnames: + dd, mm = mm, dd.lower() + if mm not in _monthnames: + return None + mm = _monthnames.index(mm) + 1 + if mm > 12: + mm -= 12 + if dd[-1] == ',': + dd = dd[:-1] + i = yy.find(':') + if i > 0: + yy, tm = tm, yy + if yy[-1] == ',': + yy = yy[:-1] + if not yy[0].isdigit(): + yy, tz = tz, yy + if tm[-1] == ',': + tm = tm[:-1] + tm = tm.split(':') + if len(tm) == 2: + [thh, tmm] = tm + tss = '0' + elif len(tm) == 3: + [thh, tmm, tss] = tm + elif len(tm) == 1 and '.' in tm[0]: + # Some non-compliant MUAs use '.' to separate time elements. + tm = tm[0].split('.') + if len(tm) == 2: + [thh, tmm] = tm + tss = 0 + elif len(tm) == 3: + [thh, tmm, tss] = tm + else: + return None + try: + yy = int(yy) + dd = int(dd) + thh = int(thh) + tmm = int(tmm) + tss = int(tss) + except ValueError: + return None + # Check for a yy specified in two-digit format, then convert it to the + # appropriate four-digit format, according to the POSIX standard. RFC 822 + # calls for a two-digit yy, but RFC 2822 (which obsoletes RFC 822) + # mandates a 4-digit yy. For more information, see the documentation for + # the time module. + if yy < 100: + # The year is between 1969 and 1999 (inclusive). + if yy > 68: + yy += 1900 + # The year is between 2000 and 2068 (inclusive). + else: + yy += 2000 + tzoffset = None + tz = tz.upper() + if tz in _timezones: + tzoffset = _timezones[tz] + else: + try: + tzoffset = int(tz) + except ValueError: + pass + if tzoffset==0 and tz.startswith('-'): + tzoffset = None + # Convert a timezone offset into seconds ; -0500 -> -18000 + if tzoffset: + if tzoffset < 0: + tzsign = -1 + tzoffset = -tzoffset + else: + tzsign = 1 + tzoffset = tzsign * ( (tzoffset//100)*3600 + (tzoffset % 100)*60) + # Daylight Saving Time flag is set to -1, since DST is unknown. + return [yy, mm, dd, thh, tmm, tss, 0, 1, -1, tzoffset] + + +def parsedate(data): + """Convert a time string to a time tuple.""" + t = parsedate_tz(data) + if isinstance(t, tuple): + return t[:9] + else: + return t + + +def mktime_tz(data): + """Turn a 10-tuple as returned by parsedate_tz() into a POSIX timestamp.""" + if data[9] is None: + # No zone info, so localtime is better assumption than GMT + return time.mktime(data[:8] + (-1,)) + else: + t = calendar.timegm(data) + return t - data[9] + + +def quote(str): + """Prepare string to be used in a quoted string. + + Turns backslash and double quote characters into quoted pairs. These + are the only characters that need to be quoted inside a quoted string. + Does not add the surrounding double quotes. + """ + return str.replace('\\', '\\\\').replace('"', '\\"') + + +class AddrlistClass: + """Address parser class by Ben Escoto. + + To understand what this class does, it helps to have a copy of RFC 2822 in + front of you. + + Note: this class interface is deprecated and may be removed in the future. + Use email.utils.AddressList instead. + """ + + def __init__(self, field): + """Initialize a new instance. + + `field' is an unparsed address header field, containing + one or more addresses. + """ + self.specials = '()<>@,:;.\"[]' + self.pos = 0 + self.LWS = ' \t' + self.CR = '\r\n' + self.FWS = self.LWS + self.CR + self.atomends = self.specials + self.LWS + self.CR + # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it + # is obsolete syntax. RFC 2822 requires that we recognize obsolete + # syntax, so allow dots in phrases. + self.phraseends = self.atomends.replace('.', '') + self.field = field + self.commentlist = [] + + def gotonext(self): + """Skip white space and extract comments.""" + wslist = [] + while self.pos < len(self.field): + if self.field[self.pos] in self.LWS + '\n\r': + if self.field[self.pos] not in '\n\r': + wslist.append(self.field[self.pos]) + self.pos += 1 + elif self.field[self.pos] == '(': + self.commentlist.append(self.getcomment()) + else: + break + return EMPTYSTRING.join(wslist) + + def getaddrlist(self): + """Parse all addresses. + + Returns a list containing all of the addresses. + """ + result = [] + while self.pos < len(self.field): + ad = self.getaddress() + if ad: + result += ad + else: + result.append(('', '')) + return result + + def getaddress(self): + """Parse the next address.""" + self.commentlist = [] + self.gotonext() + + oldpos = self.pos + oldcl = self.commentlist + plist = self.getphraselist() + + self.gotonext() + returnlist = [] + + if self.pos >= len(self.field): + # Bad email address technically, no domain. + if plist: + returnlist = [(SPACE.join(self.commentlist), plist[0])] + + elif self.field[self.pos] in '.@': + # email address is just an addrspec + # this isn't very efficient since we start over + self.pos = oldpos + self.commentlist = oldcl + addrspec = self.getaddrspec() + returnlist = [(SPACE.join(self.commentlist), addrspec)] + + elif self.field[self.pos] == ':': + # address is a group + returnlist = [] + + fieldlen = len(self.field) + self.pos += 1 + while self.pos < len(self.field): + self.gotonext() + if self.pos < fieldlen and self.field[self.pos] == ';': + self.pos += 1 + break + returnlist = returnlist + self.getaddress() + + elif self.field[self.pos] == '<': + # Address is a phrase then a route addr + routeaddr = self.getrouteaddr() + + if self.commentlist: + returnlist = [(SPACE.join(plist) + ' (' + + ' '.join(self.commentlist) + ')', routeaddr)] + else: + returnlist = [(SPACE.join(plist), routeaddr)] + + else: + if plist: + returnlist = [(SPACE.join(self.commentlist), plist[0])] + elif self.field[self.pos] in self.specials: + self.pos += 1 + + self.gotonext() + if self.pos < len(self.field) and self.field[self.pos] == ',': + self.pos += 1 + return returnlist + + def getrouteaddr(self): + """Parse a route address (Return-path value). + + This method just skips all the route stuff and returns the addrspec. + """ + if self.field[self.pos] != '<': + return + + expectroute = False + self.pos += 1 + self.gotonext() + adlist = '' + while self.pos < len(self.field): + if expectroute: + self.getdomain() + expectroute = False + elif self.field[self.pos] == '>': + self.pos += 1 + break + elif self.field[self.pos] == '@': + self.pos += 1 + expectroute = True + elif self.field[self.pos] == ':': + self.pos += 1 + else: + adlist = self.getaddrspec() + self.pos += 1 + break + self.gotonext() + + return adlist + + def getaddrspec(self): + """Parse an RFC 2822 addr-spec.""" + aslist = [] + + self.gotonext() + while self.pos < len(self.field): + preserve_ws = True + if self.field[self.pos] == '.': + if aslist and not aslist[-1].strip(): + aslist.pop() + aslist.append('.') + self.pos += 1 + preserve_ws = False + elif self.field[self.pos] == '"': + aslist.append('"%s"' % quote(self.getquote())) + elif self.field[self.pos] in self.atomends: + if aslist and not aslist[-1].strip(): + aslist.pop() + break + else: + aslist.append(self.getatom()) + ws = self.gotonext() + if preserve_ws and ws: + aslist.append(ws) + + if self.pos >= len(self.field) or self.field[self.pos] != '@': + return EMPTYSTRING.join(aslist) + + aslist.append('@') + self.pos += 1 + self.gotonext() + return EMPTYSTRING.join(aslist) + self.getdomain() + + def getdomain(self): + """Get the complete domain name from an address.""" + sdlist = [] + while self.pos < len(self.field): + if self.field[self.pos] in self.LWS: + self.pos += 1 + elif self.field[self.pos] == '(': + self.commentlist.append(self.getcomment()) + elif self.field[self.pos] == '[': + sdlist.append(self.getdomainliteral()) + elif self.field[self.pos] == '.': + self.pos += 1 + sdlist.append('.') + elif self.field[self.pos] in self.atomends: + break + else: + sdlist.append(self.getatom()) + return EMPTYSTRING.join(sdlist) + + def getdelimited(self, beginchar, endchars, allowcomments=True): + """Parse a header fragment delimited by special characters. + + `beginchar' is the start character for the fragment. + If self is not looking at an instance of `beginchar' then + getdelimited returns the empty string. + + `endchars' is a sequence of allowable end-delimiting characters. + Parsing stops when one of these is encountered. + + If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed + within the parsed fragment. + """ + if self.field[self.pos] != beginchar: + return '' + + slist = [''] + quote = False + self.pos += 1 + while self.pos < len(self.field): + if quote: + slist.append(self.field[self.pos]) + quote = False + elif self.field[self.pos] in endchars: + self.pos += 1 + break + elif allowcomments and self.field[self.pos] == '(': + slist.append(self.getcomment()) + continue # have already advanced pos from getcomment + elif self.field[self.pos] == '\\': + quote = True + else: + slist.append(self.field[self.pos]) + self.pos += 1 + + return EMPTYSTRING.join(slist) + + def getquote(self): + """Get a quote-delimited fragment from self's field.""" + return self.getdelimited('"', '"\r', False) + + def getcomment(self): + """Get a parenthesis-delimited fragment from self's field.""" + return self.getdelimited('(', ')\r', True) + + def getdomainliteral(self): + """Parse an RFC 2822 domain-literal.""" + return '[%s]' % self.getdelimited('[', ']\r', False) + + def getatom(self, atomends=None): + """Parse an RFC 2822 atom. + + Optional atomends specifies a different set of end token delimiters + (the default is to use self.atomends). This is used e.g. in + getphraselist() since phrase endings must not include the `.' (which + is legal in phrases).""" + atomlist = [''] + if atomends is None: + atomends = self.atomends + + while self.pos < len(self.field): + if self.field[self.pos] in atomends: + break + else: + atomlist.append(self.field[self.pos]) + self.pos += 1 + + return EMPTYSTRING.join(atomlist) + + def getphraselist(self): + """Parse a sequence of RFC 2822 phrases. + + A phrase is a sequence of words, which are in turn either RFC 2822 + atoms or quoted-strings. Phrases are canonicalized by squeezing all + runs of continuous whitespace into one space. + """ + plist = [] + + while self.pos < len(self.field): + if self.field[self.pos] in self.FWS: + self.pos += 1 + elif self.field[self.pos] == '"': + plist.append(self.getquote()) + elif self.field[self.pos] == '(': + self.commentlist.append(self.getcomment()) + elif self.field[self.pos] in self.phraseends: + break + else: + plist.append(self.getatom(self.phraseends)) + + return plist + +class AddressList(AddrlistClass): + """An AddressList encapsulates a list of parsed RFC 2822 addresses.""" + def __init__(self, field): + AddrlistClass.__init__(self, field) + if field: + self.addresslist = self.getaddrlist() + else: + self.addresslist = [] + + def __len__(self): + return len(self.addresslist) + + def __add__(self, other): + # Set union + newaddr = AddressList(None) + newaddr.addresslist = self.addresslist[:] + for x in other.addresslist: + if not x in self.addresslist: + newaddr.addresslist.append(x) + return newaddr + + def __iadd__(self, other): + # Set union, in-place + for x in other.addresslist: + if not x in self.addresslist: + self.addresslist.append(x) + return self + + def __sub__(self, other): + # Set difference + newaddr = AddressList(None) + for x in self.addresslist: + if not x in other.addresslist: + newaddr.addresslist.append(x) + return newaddr + + def __isub__(self, other): + # Set difference, in-place + for x in other.addresslist: + if x in self.addresslist: + self.addresslist.remove(x) + return self + + def __getitem__(self, index): + # Make indexing, slices, and 'in' work + return self.addresslist[index] diff --git a/email.internal/email/_policybase.py b/email.internal/email/_policybase.py new file mode 100644 index 00000000..81061149 --- /dev/null +++ b/email.internal/email/_policybase.py @@ -0,0 +1,358 @@ +"""Policy framework for the email package. + +Allows fine grained feature control of how the package parses and emits data. +""" + +import abc +from email import header +from email import charset as _charset +from email.utils import _has_surrogates + +__all__ = [ + 'Policy', + 'Compat32', + 'compat32', + ] + + +class _PolicyBase: + + """Policy Object basic framework. + + This class is useless unless subclassed. A subclass should define + class attributes with defaults for any values that are to be + managed by the Policy object. The constructor will then allow + non-default values to be set for these attributes at instance + creation time. The instance will be callable, taking these same + attributes keyword arguments, and returning a new instance + identical to the called instance except for those values changed + by the keyword arguments. Instances may be added, yielding new + instances with any non-default values from the right hand + operand overriding those in the left hand operand. That is, + + A + B == A() + + The repr of an instance can be used to reconstruct the object + if and only if the repr of the values can be used to reconstruct + those values. + + """ + + def __init__(self, **kw): + """Create new Policy, possibly overriding some defaults. + + See class docstring for a list of overridable attributes. + + """ + for name, value in kw.items(): + if hasattr(self, name): + super(_PolicyBase,self).__setattr__(name, value) + else: + raise TypeError( + "{!r} is an invalid keyword argument for {}".format( + name, self.__class__.__name__)) + + def __repr__(self): + args = [ "{}={!r}".format(name, value) + for name, value in self.__dict__.items() ] + return "{}({})".format(self.__class__.__name__, ', '.join(args)) + + def clone(self, **kw): + """Return a new instance with specified attributes changed. + + The new instance has the same attribute values as the current object, + except for the changes passed in as keyword arguments. + + """ + newpolicy = self.__class__.__new__(self.__class__) + for attr, value in self.__dict__.items(): + object.__setattr__(newpolicy, attr, value) + for attr, value in kw.items(): + if not hasattr(self, attr): + raise TypeError( + "{!r} is an invalid keyword argument for {}".format( + attr, self.__class__.__name__)) + object.__setattr__(newpolicy, attr, value) + return newpolicy + + def __setattr__(self, name, value): + if hasattr(self, name): + msg = "{!r} object attribute {!r} is read-only" + else: + msg = "{!r} object has no attribute {!r}" + raise AttributeError(msg.format(self.__class__.__name__, name)) + + def __add__(self, other): + """Non-default values from right operand override those from left. + + The object returned is a new instance of the subclass. + + """ + return self.clone(**other.__dict__) + + +def _append_doc(doc, added_doc): + doc = doc.rsplit('\n', 1)[0] + added_doc = added_doc.split('\n', 1)[1] + return doc + '\n' + added_doc + +def _extend_docstrings(cls): + if cls.__doc__ and cls.__doc__.startswith('+'): + cls.__doc__ = _append_doc(cls.__bases__[0].__doc__, cls.__doc__) + for name, attr in cls.__dict__.items(): + if attr.__doc__ and attr.__doc__.startswith('+'): + for c in (c for base in cls.__bases__ for c in base.mro()): + doc = getattr(getattr(c, name), '__doc__') + if doc: + attr.__doc__ = _append_doc(doc, attr.__doc__) + break + return cls + + +class Policy(_PolicyBase, metaclass=abc.ABCMeta): + + r"""Controls for how messages are interpreted and formatted. + + Most of the classes and many of the methods in the email package accept + Policy objects as parameters. A Policy object contains a set of values and + functions that control how input is interpreted and how output is rendered. + For example, the parameter 'raise_on_defect' controls whether or not an RFC + violation results in an error being raised or not, while 'max_line_length' + controls the maximum length of output lines when a Message is serialized. + + Any valid attribute may be overridden when a Policy is created by passing + it as a keyword argument to the constructor. Policy objects are immutable, + but a new Policy object can be created with only certain values changed by + calling the Policy instance with keyword arguments. Policy objects can + also be added, producing a new Policy object in which the non-default + attributes set in the right hand operand overwrite those specified in the + left operand. + + Settable attributes: + + raise_on_defect -- If true, then defects should be raised as errors. + Default: False. + + linesep -- string containing the value to use as separation + between output lines. Default '\n'. + + cte_type -- Type of allowed content transfer encodings + + 7bit -- ASCII only + 8bit -- Content-Transfer-Encoding: 8bit is allowed + + Default: 8bit. Also controls the disposition of + (RFC invalid) binary data in headers; see the + documentation of the binary_fold method. + + max_line_length -- maximum length of lines, excluding 'linesep', + during serialization. None or 0 means no line + wrapping is done. Default is 78. + + """ + + raise_on_defect = False + linesep = '\n' + cte_type = '8bit' + max_line_length = 78 + + def handle_defect(self, obj, defect): + """Based on policy, either raise defect or call register_defect. + + handle_defect(obj, defect) + + defect should be a Defect subclass, but in any case must be an + Exception subclass. obj is the object on which the defect should be + registered if it is not raised. If the raise_on_defect is True, the + defect is raised as an error, otherwise the object and the defect are + passed to register_defect. + + This method is intended to be called by parsers that discover defects. + The email package parsers always call it with Defect instances. + + """ + if self.raise_on_defect: + raise defect + self.register_defect(obj, defect) + + def register_defect(self, obj, defect): + """Record 'defect' on 'obj'. + + Called by handle_defect if raise_on_defect is False. This method is + part of the Policy API so that Policy subclasses can implement custom + defect handling. The default implementation calls the append method of + the defects attribute of obj. The objects used by the email package by + default that get passed to this method will always have a defects + attribute with an append method. + + """ + obj.defects.append(defect) + + def header_max_count(self, name): + """Return the maximum allowed number of headers named 'name'. + + Called when a header is added to a Message object. If the returned + value is not 0 or None, and there are already a number of headers with + the name 'name' equal to the value returned, a ValueError is raised. + + Because the default behavior of Message's __setitem__ is to append the + value to the list of headers, it is easy to create duplicate headers + without realizing it. This method allows certain headers to be limited + in the number of instances of that header that may be added to a + Message programmatically. (The limit is not observed by the parser, + which will faithfully produce as many headers as exist in the message + being parsed.) + + The default implementation returns None for all header names. + """ + return None + + @abc.abstractmethod + def header_source_parse(self, sourcelines): + """Given a list of linesep terminated strings constituting the lines of + a single header, return the (name, value) tuple that should be stored + in the model. The input lines should retain their terminating linesep + characters. The lines passed in by the email package may contain + surrogateescaped binary data. + """ + raise NotImplementedError + + @abc.abstractmethod + def header_store_parse(self, name, value): + """Given the header name and the value provided by the application + program, return the (name, value) that should be stored in the model. + """ + raise NotImplementedError + + @abc.abstractmethod + def header_fetch_parse(self, name, value): + """Given the header name and the value from the model, return the value + to be returned to the application program that is requesting that + header. The value passed in by the email package may contain + surrogateescaped binary data if the lines were parsed by a BytesParser. + The returned value should not contain any surrogateescaped data. + + """ + raise NotImplementedError + + @abc.abstractmethod + def fold(self, name, value): + """Given the header name and the value from the model, return a string + containing linesep characters that implement the folding of the header + according to the policy controls. The value passed in by the email + package may contain surrogateescaped binary data if the lines were + parsed by a BytesParser. The returned value should not contain any + surrogateescaped data. + + """ + raise NotImplementedError + + @abc.abstractmethod + def fold_binary(self, name, value): + """Given the header name and the value from the model, return binary + data containing linesep characters that implement the folding of the + header according to the policy controls. The value passed in by the + email package may contain surrogateescaped binary data. + + """ + raise NotImplementedError + + +@_extend_docstrings +class Compat32(Policy): + + """+ + This particular policy is the backward compatibility Policy. It + replicates the behavior of the email package version 5.1. + """ + + def _sanitize_header(self, name, value): + # If the header value contains surrogates, return a Header using + # the unknown-8bit charset to encode the bytes as encoded words. + if not isinstance(value, str): + # Assume it is already a header object + return value + if _has_surrogates(value): + return header.Header(value, charset=_charset.UNKNOWN8BIT, + header_name=name) + else: + return value + + def header_source_parse(self, sourcelines): + """+ + The name is parsed as everything up to the ':' and returned unmodified. + The value is determined by stripping leading whitespace off the + remainder of the first line, joining all subsequent lines together, and + stripping any trailing carriage return or linefeed characters. + + """ + name, value = sourcelines[0].split(':', 1) + value = value.lstrip(' \t') + ''.join(sourcelines[1:]) + return (name, value.rstrip('\r\n')) + + def header_store_parse(self, name, value): + """+ + The name and value are returned unmodified. + """ + return (name, value) + + def header_fetch_parse(self, name, value): + """+ + If the value contains binary data, it is converted into a Header object + using the unknown-8bit charset. Otherwise it is returned unmodified. + """ + return self._sanitize_header(name, value) + + def fold(self, name, value): + """+ + Headers are folded using the Header folding algorithm, which preserves + existing line breaks in the value, and wraps each resulting line to the + max_line_length. Non-ASCII binary data are CTE encoded using the + unknown-8bit charset. + + """ + return self._fold(name, value, sanitize=True) + + def fold_binary(self, name, value): + """+ + Headers are folded using the Header folding algorithm, which preserves + existing line breaks in the value, and wraps each resulting line to the + max_line_length. If cte_type is 7bit, non-ascii binary data is CTE + encoded using the unknown-8bit charset. Otherwise the original source + header is used, with its existing line breaks and/or binary data. + + """ + folded = self._fold(name, value, sanitize=self.cte_type=='7bit') + return folded.encode('ascii', 'surrogateescape') + + def _fold(self, name, value, sanitize): + parts = [] + parts.append('%s: ' % name) + if isinstance(value, str): + if _has_surrogates(value): + if sanitize: + h = header.Header(value, + charset=_charset.UNKNOWN8BIT, + header_name=name) + else: + # If we have raw 8bit data in a byte string, we have no idea + # what the encoding is. There is no safe way to split this + # string. If it's ascii-subset, then we could do a normal + # ascii split, but if it's multibyte then we could break the + # string. There's no way to know so the least harm seems to + # be to not split the string and risk it being too long. + parts.append(value) + h = None + else: + h = header.Header(value, header_name=name) + else: + # Assume it is a Header-like object. + h = value + if h is not None: + parts.append(h.encode(linesep=self.linesep, + maxlinelen=self.max_line_length)) + parts.append(self.linesep) + return ''.join(parts) + + +compat32 = Compat32()