P
Paul Rubin
I was surprised there was no obvious way with spamassassin (maybe I
should have looked at spambayes) to split an existing mbox file into its
spam and non-spam messages. So I wrote one. It's pretty slow, taking
around 1.5 seconds per message on a 2 GHz Athlon, making me wonder how
serious ISPs getting thousands of incoming messages per hour can run
anything like spamassassin on all of them. But for my purposes it's ok.
Comments and improvements are welcome.
================================================================
#!/usr/bin/python
# Spam filter for mbox files. Reads mailfile and makes two new
# files, mailfile.spam and mailfile.ham, containing the spam and non-spam
# messages from mailfile as determined by piping through spamc.
# Copyright 2003 Paul Rubin <http://www.paulrubin.com>
# Copying permission: GNU General Public License ver. 2, <http://www.gnu.org>
import mailbox,os,sys
from time import time
def mktemp():
    """Create an empty scratch file in the current directory and return its name.

    The name has the form ``spam<random>.temp`` and is returned without a
    directory component, so it can be dropped unquoted into a shell
    redirection. The file is created atomically by tempfile.mkstemp, which
    avoids both the guessable pid/time-based name and the situation where
    the name is later unlinked without the file ever having existed.
    """
    import os
    import tempfile
    fd, path = tempfile.mkstemp(prefix="spam", suffix=".temp", dir=os.curdir)
    # Only the name is needed; callers reopen the file themselves.
    os.close(fd)
    return os.path.basename(path)
# Scratch file name, computed once when the script is loaded. spam_filter
# redirects spamc's output into this file and the end of the script unlinks it.
tempfilename = mktemp()
def main():
print sys.argv
if len(sys.argv) > 1:
filename = sys.argv[1]
else:
print "Usage: spam.py mboxfile"
print "marking up", filename
mailfile = open(filename, 'r')
ham = open(filename + ".ham", 'w')
spam = open(filename + ".spam", 'w')
mbox = mailbox.UnixMailbox(mailfile)
i = 0
while 1:
i += 1
m1 = mailfile.tell()
msg = mbox.next()
if not msg: break
body = msg.fp.read()
envelope = env_header(mailfile, m1)
print "%5d"%i, m1, mailfile.tell(), msg.startofbody, len(body),
is_spam, txt = spam_filter (envelope, msg, body)
print ['HAM','SPAM'][is_spam]
if is_spam:
spam.write(txt)
else:
ham.write(txt)
def spam_filter(envelope, msg, body, threshold=5):
    """Pipe one message through spamc and report whether it is spam.

    envelope  -- the mbox "From " line that precedes the message
    msg       -- parsed message object; its ``headers`` list is joined verbatim
    body      -- the message body text
    threshold -- minimum length of the X-Spam-Level header value for the
                 message to count as spam (default 5)

    Returns ``(is_spam, txt)`` where *txt* is the complete output of spamc
    for this message. Raises RuntimeError when that output contains no
    parseable message (for example because spamc failed).
    """
    txt = envelope + ''.join(msg.headers) + '\n' + body
    # spamc reads the message on stdin; the shell redirects its output into
    # the scratch file named by the module-level `tempfilename`.
    out = os.popen("spamc > %s" % tempfilename, "w")
    try:
        out.write(txt)
    finally:
        out.close()
    result = open(tempfilename)
    try:
        marked = mailbox.UnixMailbox(result).next()
        if marked is None:
            raise RuntimeError(
                "spamc produced no parseable message in %s" % tempfilename)
        # Presumably X-Spam-Level carries one marker character per score
        # point, so its length is the score -- TODO confirm.
        spam_level = len(marked.get('X-Spam-Level', ''))
        # Rewind and return the whole marked-up message, envelope included.
        result.seek(0)
        txt = result.read()
    finally:
        result.close()
    return (spam_level >= threshold, txt)
def env_header(fp, pos):
    """Return the single line of *fp* that starts at offset *pos*.

    The current file position is remembered before the read and restored
    afterwards, so the caller's position in *fp* is left unchanged.
    """
    saved_offset = fp.tell()
    fp.seek(pos)
    envelope_line = fp.readline()
    fp.seek(saved_offset)
    return envelope_line
try:
t=time()
main()
dt = time()-t
print "elapsed: %d min %d sec"% divmod(int(dt), 60)
finally:
os.unlink(tempfilename)
should have looked at spambayes) to split an existing mbox file into its
spam and non-spam messages. So I wrote one. It's pretty slow, taking
around 1.5 seconds per message on a 2 GHz Athlon, making me wonder how
serious ISPs getting thousands of incoming messages per hour can run
anything like spamassassin on all of them. But for my purposes it's ok.
Comments and improvements are welcome.
================================================================
#!/usr/bin/python
# Spam filter for mbox files. Reads mailfile and makes two new
# files, mailfile.spam and mailfile.ham, containing the spam and non-spam
# messages from mailfile as determined by piping through spamc.
# Copyright 2003 Paul Rubin <http://www.paulrubin.com>
# Copying permission: GNU General Public License ver. 2, <http://www.gnu.org>
import mailbox,os,sys
from time import time
def mktemp():
    """Create an empty scratch file in the current directory and return its name.

    The name has the form ``spam<random>.temp`` and is returned without a
    directory component, so it can be dropped unquoted into a shell
    redirection. The file is created atomically by tempfile.mkstemp, which
    avoids both the guessable pid/time-based name and the situation where
    the name is later unlinked without the file ever having existed.
    """
    import os
    import tempfile
    fd, path = tempfile.mkstemp(prefix="spam", suffix=".temp", dir=os.curdir)
    # Only the name is needed; callers reopen the file themselves.
    os.close(fd)
    return os.path.basename(path)
# Scratch file name, computed once when the script is loaded. spam_filter
# redirects spamc's output into this file and the end of the script unlinks it.
tempfilename = mktemp()
def main():
print sys.argv
if len(sys.argv) > 1:
filename = sys.argv[1]
else:
print "Usage: spam.py mboxfile"
print "marking up", filename
mailfile = open(filename, 'r')
ham = open(filename + ".ham", 'w')
spam = open(filename + ".spam", 'w')
mbox = mailbox.UnixMailbox(mailfile)
i = 0
while 1:
i += 1
m1 = mailfile.tell()
msg = mbox.next()
if not msg: break
body = msg.fp.read()
envelope = env_header(mailfile, m1)
print "%5d"%i, m1, mailfile.tell(), msg.startofbody, len(body),
is_spam, txt = spam_filter (envelope, msg, body)
print ['HAM','SPAM'][is_spam]
if is_spam:
spam.write(txt)
else:
ham.write(txt)
def spam_filter(envelope, msg, body, threshold=5):
    """Pipe one message through spamc and report whether it is spam.

    envelope  -- the mbox "From " line that precedes the message
    msg       -- parsed message object; its ``headers`` list is joined verbatim
    body      -- the message body text
    threshold -- minimum length of the X-Spam-Level header value for the
                 message to count as spam (default 5)

    Returns ``(is_spam, txt)`` where *txt* is the complete output of spamc
    for this message. Raises RuntimeError when that output contains no
    parseable message (for example because spamc failed).
    """
    txt = envelope + ''.join(msg.headers) + '\n' + body
    # spamc reads the message on stdin; the shell redirects its output into
    # the scratch file named by the module-level `tempfilename`.
    out = os.popen("spamc > %s" % tempfilename, "w")
    try:
        out.write(txt)
    finally:
        out.close()
    result = open(tempfilename)
    try:
        marked = mailbox.UnixMailbox(result).next()
        if marked is None:
            raise RuntimeError(
                "spamc produced no parseable message in %s" % tempfilename)
        # Presumably X-Spam-Level carries one marker character per score
        # point, so its length is the score -- TODO confirm.
        spam_level = len(marked.get('X-Spam-Level', ''))
        # Rewind and return the whole marked-up message, envelope included.
        result.seek(0)
        txt = result.read()
    finally:
        result.close()
    return (spam_level >= threshold, txt)
def env_header(fp, pos):
    """Return the single line of *fp* that starts at offset *pos*.

    The current file position is remembered before the read and restored
    afterwards, so the caller's position in *fp* is left unchanged.
    """
    saved_offset = fp.tell()
    fp.seek(pos)
    envelope_line = fp.readline()
    fp.seek(saved_offset)
    return envelope_line
try:
t=time()
main()
dt = time()-t
print "elapsed: %d min %d sec"% divmod(int(dt), 60)
finally:
os.unlink(tempfilename)