A
aspineux
My goal is to write a parser for these imaginary strings from the SMTP
protocol, following RFC 821 and RFC 1869.
I'm being a little flexible with the BNF from these RFCs.
Any comments?
# Sample SMTP command lines fed to the parser: bare and <>-bracketed
# paths, with and without trailing ESMTP parameters, and with quoted
# local parts containing '>', ' ' and '='.
# NOTE(review): the "[email protected]" / "(e-mail address removed)"
# fragments look like placeholders substituted by the archive this post was
# copied from -- presumably real addresses stood here; replace them with
# real sample addresses before relying on the results.
tests = [
    'MAIL FROM:<[email protected]>',
    'MAIL FROM:[email protected]',
    'MAIL FROM:<[email protected]> SIZE=1234 [email protected]',
    'MAIL FROM:[email protected] SIZE=1234 [email protected]',
    'MAIL FROM:<"(e-mail address removed)> legal=email"@address.com>',
    'MAIL FROM:"(e-mail address removed)> legal=email"@address.com',
    'MAIL FROM:<"(e-mail address removed)> legal=email"@address.com> SIZE=1234 [email protected]',
    'MAIL FROM:"(e-mail address removed)> legal=email"@address.com SIZE=1234 [email protected]',
]
def RN(name, regex):
    """Wrap *regex* in a group so it can be embedded in a larger pattern.

    Grouping keeps alternations and quantifiers inside *regex* from leaking
    into the surrounding pattern.

    name  -- group name; when truthy the result is a named capturing group
             ``(?P<name>...)``, otherwise a non-capturing group ``(?:...)``.
    regex -- the pattern text to wrap.

    Returns the wrapped pattern as a string.
    """
    if name:
        return r'(?P<%s>%s)' % (name, regex)
    return r'(?:%s)' % regex
# Table of sub-patterns, built bottom-up: each entry may interpolate the
# entries defined before it through %(key)s. Patterns that contain no
# placeholder are stored as-is (formatting them would be a no-op).
regex = {}

# <dotnum> ::= <snum> "." <snum> "." <snum> "." <snum>
regex['dotnum'] = RN(
    None,
    r'[012]?\d?\d\.[012]?\d?\d\.[012]?\d?\d\.[012]?\d?\d')

# <dot-string> ::= <string> | <string> "." <dot-string>
regex['dot_string'] = RN(None, r'[a-zA-Z0-9]+(?:\.[a-zA-Z0-9]+)*')

# <domain> ::= <element> | <element> "." <domain>
regex['domain'] = RN('domain', r'%(dotnum)s|%(dot_string)s' % regex)

# <q> ::= any one of the 128 ASCII characters except <CR>, <LF>, quote
#         ("), or backslash (\)
regex['q'] = RN(None, r'[^\n\r"\\]')

# <x> ::= any one of the 128 ASCII characters (no exceptions)
# NOTE(review): '.' does not match a newline unless re.DOTALL is given, so
# this is slightly narrower than the BNF.
regex['x'] = RN(None, r'.')

# <qtext> ::= "\" <x> | "\" <x> <qtext> | <q> | <q> <qtext>
regex['qtext'] = RN(None, r'(?:\\%(x)s|%(q)s)+' % regex)

# <quoted-string> ::= """ <qtext> """
regex['quoted_string'] = RN('quoted_string', r'"%(qtext)s"' % regex)

# <local-part> ::= <dot-string> | <quoted-string>
regex['local_part'] = RN(
    'local_part',
    r'%(quoted_string)s|%(dot_string)s' % regex)

# <mailbox> ::= <local-part> "@" <domain>
regex['mailbox'] = RN('mailbox', r'%(local_part)s@%(domain)s' % regex)

# <path> ::= "<" [ <a-d-l> ":" ] <mailbox> ">"
# also accept address without <>: the closing '>' is required only when the
# optional opening '<' (group path_lt) was matched.
regex['path'] = RN(
    'path',
    r'(?P<path_lt><)?%(mailbox)s(?(path_lt)>)' % regex)

# esmtp-keyword ::= (ALPHA / DIGIT) *(ALPHA / DIGIT / "-")
regex['esmtp_keyword'] = RN(None, r'[a-zA-Z0-9][-a-zA-Z0-9]*')

# esmtp-value ::= 1*<any CHAR excluding "=", SP, and all
#                 control characters (US ASCII 0-31 inclusive)>
#                 ; syntax and values depend on esmtp-keyword
regex['esmtp_value'] = RN(None, r'[^= \t\r\n\f\v]*')

# esmtp-parameter ::= esmtp-keyword ["=" esmtp-value]
regex['esmtp_parameter'] = RN(
    None,
    r'%(esmtp_keyword)s(?:=%(esmtp_value)s)?' % regex)

# esmtp-parameters ::= esmtp-parameter *(SP esmtp-parameter)
# The repetition is '*', not '+': a single parameter (e.g. "SIZE=1234") is
# already a complete parameter list and must be captured.
regex['esmtp_parameters'] = RN(
    'esmtp_parameters',
    r'%(esmtp_parameter)s(?:\s+%(esmtp_parameter)s)*' % regex)

# esmtp-cmd ::= inner-esmtp-cmd [SP esmtp-parameters] CR LF
regex['esmtp_addr'] = RN(
    'esmtp_addr',
    r'%(path)s(?:\s+%(esmtp_parameters)s)?' % regex)
# Driver: strip the SMTP verb from each sample line, match the remainder
# against the assembled 'esmtp_addr' pattern and report the captured parts.
import re  # re.match is used below but was not imported in the posted code

for t in tests:
    # Drop a leading "MAIL FROM:" / "RCPT TO:" (compared case-insensitively)
    # so that only the path and its optional parameters remain.
    for keyword in ['MAIL FROM:', 'RCPT TO:']:
        keylen = len(keyword)
        if t[:keylen].upper() == keyword:
            t = t[keylen:]
            break
    match = re.match(regex['esmtp_addr'], t)
    if match:
        # Groups that did not take part in the match are reported as None.
        print('MATCH local_part=%(local_part)s domain=%(domain)s '
              'esmtp_parameters=%(esmtp_parameters)s' % match.groupdict())
    else:
        print('DONT match %s' % t)
My goal is to write a parser for these imaginary strings from the SMTP
protocol, following RFC 821 and RFC 1869.
I'm being a little flexible with the BNF from these RFCs.
Any comments?
# Sample SMTP command lines fed to the parser: bare and <>-bracketed
# paths, with and without trailing ESMTP parameters, and with quoted
# local parts containing '>', ' ' and '='.
# NOTE(review): the "[email protected]" / "(e-mail address removed)"
# fragments look like placeholders substituted by the archive this post was
# copied from -- presumably real addresses stood here; replace them with
# real sample addresses before relying on the results.
tests = [
    'MAIL FROM:<[email protected]>',
    'MAIL FROM:[email protected]',
    'MAIL FROM:<[email protected]> SIZE=1234 [email protected]',
    'MAIL FROM:[email protected] SIZE=1234 [email protected]',
    'MAIL FROM:<"(e-mail address removed)> legal=email"@address.com>',
    'MAIL FROM:"(e-mail address removed)> legal=email"@address.com',
    'MAIL FROM:<"(e-mail address removed)> legal=email"@address.com> SIZE=1234 [email protected]',
    'MAIL FROM:"(e-mail address removed)> legal=email"@address.com SIZE=1234 [email protected]',
]
def RN(name, regex):
    """Wrap *regex* in a group so it can be embedded in a larger pattern.

    Grouping keeps alternations and quantifiers inside *regex* from leaking
    into the surrounding pattern.

    name  -- group name; when truthy the result is a named capturing group
             ``(?P<name>...)``, otherwise a non-capturing group ``(?:...)``.
    regex -- the pattern text to wrap.

    Returns the wrapped pattern as a string.
    """
    if name:
        return r'(?P<%s>%s)' % (name, regex)
    return r'(?:%s)' % regex
# Table of sub-patterns, built bottom-up: each entry may interpolate the
# entries defined before it through %(key)s. Patterns that contain no
# placeholder are stored as-is (formatting them would be a no-op).
regex = {}

# <dotnum> ::= <snum> "." <snum> "." <snum> "." <snum>
regex['dotnum'] = RN(
    None,
    r'[012]?\d?\d\.[012]?\d?\d\.[012]?\d?\d\.[012]?\d?\d')

# <dot-string> ::= <string> | <string> "." <dot-string>
regex['dot_string'] = RN(None, r'[a-zA-Z0-9]+(?:\.[a-zA-Z0-9]+)*')

# <domain> ::= <element> | <element> "." <domain>
regex['domain'] = RN('domain', r'%(dotnum)s|%(dot_string)s' % regex)

# <q> ::= any one of the 128 ASCII characters except <CR>, <LF>, quote
#         ("), or backslash (\)
regex['q'] = RN(None, r'[^\n\r"\\]')

# <x> ::= any one of the 128 ASCII characters (no exceptions)
# NOTE(review): '.' does not match a newline unless re.DOTALL is given, so
# this is slightly narrower than the BNF.
regex['x'] = RN(None, r'.')

# <qtext> ::= "\" <x> | "\" <x> <qtext> | <q> | <q> <qtext>
regex['qtext'] = RN(None, r'(?:\\%(x)s|%(q)s)+' % regex)

# <quoted-string> ::= """ <qtext> """
regex['quoted_string'] = RN('quoted_string', r'"%(qtext)s"' % regex)

# <local-part> ::= <dot-string> | <quoted-string>
regex['local_part'] = RN(
    'local_part',
    r'%(quoted_string)s|%(dot_string)s' % regex)

# <mailbox> ::= <local-part> "@" <domain>
regex['mailbox'] = RN('mailbox', r'%(local_part)s@%(domain)s' % regex)

# <path> ::= "<" [ <a-d-l> ":" ] <mailbox> ">"
# also accept address without <>: the closing '>' is required only when the
# optional opening '<' (group path_lt) was matched.
regex['path'] = RN(
    'path',
    r'(?P<path_lt><)?%(mailbox)s(?(path_lt)>)' % regex)

# esmtp-keyword ::= (ALPHA / DIGIT) *(ALPHA / DIGIT / "-")
regex['esmtp_keyword'] = RN(None, r'[a-zA-Z0-9][-a-zA-Z0-9]*')

# esmtp-value ::= 1*<any CHAR excluding "=", SP, and all
#                 control characters (US ASCII 0-31 inclusive)>
#                 ; syntax and values depend on esmtp-keyword
regex['esmtp_value'] = RN(None, r'[^= \t\r\n\f\v]*')

# esmtp-parameter ::= esmtp-keyword ["=" esmtp-value]
regex['esmtp_parameter'] = RN(
    None,
    r'%(esmtp_keyword)s(?:=%(esmtp_value)s)?' % regex)

# esmtp-parameters ::= esmtp-parameter *(SP esmtp-parameter)
# The repetition is '*', not '+': a single parameter (e.g. "SIZE=1234") is
# already a complete parameter list and must be captured.
regex['esmtp_parameters'] = RN(
    'esmtp_parameters',
    r'%(esmtp_parameter)s(?:\s+%(esmtp_parameter)s)*' % regex)

# esmtp-cmd ::= inner-esmtp-cmd [SP esmtp-parameters] CR LF
regex['esmtp_addr'] = RN(
    'esmtp_addr',
    r'%(path)s(?:\s+%(esmtp_parameters)s)?' % regex)
# Driver: strip the SMTP verb from each sample line, match the remainder
# against the assembled 'esmtp_addr' pattern and report the captured parts.
import re  # re.match is used below but was not imported in the posted code

for t in tests:
    # Drop a leading "MAIL FROM:" / "RCPT TO:" (compared case-insensitively)
    # so that only the path and its optional parameters remain.
    for keyword in ['MAIL FROM:', 'RCPT TO:']:
        keylen = len(keyword)
        if t[:keylen].upper() == keyword:
            t = t[keylen:]
            break
    match = re.match(regex['esmtp_addr'], t)
    if match:
        # Groups that did not take part in the match are reported as None.
        print('MATCH local_part=%(local_part)s domain=%(domain)s '
              'esmtp_parameters=%(esmtp_parameters)s' % match.groupdict())
    else:
        print('DONT match %s' % t)