Problem with uudecode

J

Juho Saarikko

I made a Python script which takes Usenet message bodies from a database,
decodes uuencoded contents and inserts them as Large Object into a
PostGreSQL database. However, it appears that the to last few bytes
of uudecoded data are always mangled. Take a look of this hexdump output:

Originals (decoded with Pan, each line is from a different file):
000c2c0 e1bf 00ff 2541 a9e4 a724 d9ff
0011a10 ff54 00d9
00093e0 fb4f a80d ffd9 c200 ffef 00d9

Decoded by the script:
000c2c0 e1bf 00ff 2541 a9e4 a724 d0ff
0011a10 ff54 00d8
00093e0 fb4f a80d ffd9 c200 ffef 00d8

As you can see, one of the last two bytes gets altered in all cases.

The script also outputs the decoded file to disk for debugging purposes,
and the database large object and filesystem file match so it can't be a
PostGreSQL problem.

So, if anyone has any idea what is wrong, please tell me ? I can't found
any reason why the bytes would get mangled...

The script follows:

#!/usr/local/bin/python2.3

# Insert message contents into the database, for each message-id already there
#
# Copyright 2004 by Juho Saarikko
# License: GNU General Public License (GPL) version 2
# See www.gnu.org for details

from pyPgSQL import libpq
import nntplib
import sys
import string
import regex
import sha
import imghdr
import binascii
import StringIO
import os

def strip_trailing_dots(n):
tmp = []
for i in range(len(n)):
if n[-1] == "," or n[-1] == ".":
tmp.append(n[:-1])
else:
tmp.append(n)
return tmp

def findmimetype(body, filename):
tail4 = string.lower(filename[-5:])
tail3 = string.lower(filename[-4:])
if tail4 == ".jpeg":
return "image/jpeg"
if tail3 == ".jpg":
return "image/jpeg"
if tail3 == ".png":
return "image/png"
if tail3 == ".jpe":
return "image/jpeg"
if tail3 == ".gif":
return "image/gif"
return None

def insert_picture(conn, image, filename):
hash = sha.new(image)
qhash = libpq.PgQuoteBytea(hash.digest())
candidates = conn.query("SELECT id, picture FROM pictures WHERE hash = " + qhash )
if candidates.ntuples > 0:
print "Found possible mathces " + str(candidates.ntuples)
for x in range(candidates.ntuples):
old = candidates.getvalue(x, 1)
old.open("r")
oldpic = old.read()
old.close()
if oldpic == image:
print "Found a match"
ret = (candidates.getvalue(x,0), 1)
return ret
mime = findmimetype(image, filename)
print "attempting to get mimetype"
if mime == None:
print "No mimetype found"
ret = (0, 0)
return ret
mime = libpq.PgQuoteString(mime)
mimeres = conn.query("SELECT id FROM mimetypes WHERE mimetype = " + mime)
if mimeres.ntuples == 0:
conn.query("INSERT INTO mimetypes (mimetype) VALUES (" + mime + ")")
mimeres = conn.query("SELECT id FROM mimetypes WHERE mimetype = " + mime)
mimetype = mimeres.getvalue(0,0)
picture = conn.lo_creat("rw")
picture.open("rw")
picture.write(image)
picture.close()
tmp = conn.query("INSERT INTO pictures (hash, mimetype, picture) VALUES (" + qhash + ", " +str(mimetype) + ", " + picture.name + ")")
temp = conn.query("SELECT id FROM pictures WHERE OID = " + str(tmp.oidValue))
id = temp.getvalue(0,0)
ret = (id, 0)
return ret

def try_decode_and_insert_uuencoded(conn, id):
begin = regex.compile("begin [0-9]+ \(.*\)")
conn.query("BEGIN")
basedir = "kuvat"
message = conn.query("SELECT data FROM fragments_bodies WHERE message = " + str(id) + " ORDER BY line")
# print message.ntuples

keywords = []
picids = []
newpicids = []
n = 0
s = ""
picid = 0
print 'Starting message id ' + str(id)
while n < message.ntuples:
# print "length of row " + str(n)
# print str(message.getlength(n, 0))
# print "Got length"
abcddummy = message.getvalue(n, 0)
# print "Got value"
s = message.getvalue(n, 0)
# print "Got s"
if begin.match(s) > 0:
# if match_beginning(s) > 0:
# print "Begin matched"
body = []
file = begin.group(1)
# file = get_file_name(s)
# print "Starting to decode, at line " + str(n + 1)
for k in range(n+1, message.ntuples):
# print "Decodind row " + str(k)
s = message.getvalue(k, 0)
if s[:3] == "end":
n = k + 1
break
try:
body.append(binascii.a2b_uu(s))
except:
try:
bytes = (((ord(s[0])-32) & 63) * 4 + 3) / 3
body.append(binascii.a2b_uu(s[:bytes]))
except:
print "Broken attachment in message " + str(id)
conn.query("ROLLBACK")
return
# print "Got to end, at line " + str(n)
# print "Attempting to join body"
body = string.join(body, "")
# print "Attempting to hash body"
# hash = sha.new(body)
# qhash = libpq.PgQuoteBytea(hash.digest())
# qbody = libpq.PgQuoteBytea(body)
# print "Attempting to find whether the pic already exists"
print "Mimetype returned " + str(findmimetype(body, file))
# temporary = open("dummy", "wb")
# temporary.write(body)
# temporary.close()
# dummy.write("dsfds")
print "Calling insert function"
picid, exists = insert_picture(conn, body, file)
print "Returned from insert function with value " + str(picid)
if picid > 0:
# already = conn.query("SELECT id FROM pictures WHERE hash = " + qhash)
# if already.ntuples == 0:
# print "Attempting to find mimetype"
# mimetype = findmimetype(body, file)
# print "Found mimetype"
# if mimetype != None:
# o = conn.query("INSERT INTO pictures (picture, hash, mimetype) VALUES (" + qbody + ", " + qhash + ", " + libpq.PgQuoteString(mimetype) + ")")
# already = conn.query("SELECT id FROM pictures WHERE OID = " + str(o.oidValue()));
# already = conn.query("SELECT id FROM pictures WHERE data = " + qbody)
# already = conn.query("SELECT id FROM pictures WHERE hash = " + qhash)
# print "Attempting to insert hash and mimetype"
# conn.query("INSERT INTO pictures (hash, mimetype) VALUES (" + qhash + ", " + libpq.PgQuoteString(mimetype) + ")")
# print "Attempting to get id"
# already = conn.query("SELECT id FROM pictures WHERE hash = " + qhash)
# print "Attempting to get value"
# picid = already.getvalue(0, 0)
print picid
print "Attempting to OK dir"
if os.access(basedir + "/tmp", os.F_OK) != 1:
os.mkdir(basedir + "/tmp")
fh = open(basedir + "/tmp/" + str(picid), "wb")
fh.write(body)
fh.close()
print "File ok"
picids.append(picid)
if exists == 0:
newpicids.append(picid)
if file != "":
keywords.append(file)
# else:
# picid = already.getvalue(0, 0)
# if already.ntuples == 0:
# conn.query("ROLLBACK")
# return
# picids.append(picid)
# if already.ntuples == 0:
# print "already.ntuples == 0, ROLLBACKing"
# conn.query("ROLLBACK")
# return
# print "Appending picid"
# picids.append(picid)
# print "Picid appended"
else:
tmptmp = string.split(s)
tmpkey = strip_trailing_dots(tmptmp)
if len(tmpkey) > 0:
for j in range(len(tmpkey)):
keywords.append(tmpkey[j])
# print "Adding 1 to n"
n = n + 1
if len(picids) > 0:
print "Found " + str(len(picids)) + " pictures (" + str(len(newpicids)) + " new ones)"
# print "Finding Subject"
head = conn.query("SELECT contents FROM fragments_header_contents WHERE message = " + str(id) + " AND header = (SELECT id FROM fragments_header_names WHERE header ilike 'Subject')")
if head.ntuples > 0:
# print "Splitting Subject"
blah = head.getvalue(0,0)
# print str(blah)
blahblah = string.split(str(blah))
# print "Stripping"
abctmpkey = strip_trailing_dots(blahblah)
# print "Stripping done"
# print "Really"
tmpkey = abctmpkey
# print "Subject split"
if len(tmpkey) > 0:
for j in range(len(tmpkey)):
keywords.append(tmpkey[j])
o = conn.query("INSERT INTO messages DEFAULT VALUES")
mid = conn.query("SELECT id FROM messages WHERE OID = " + str(o.oidValue))
messageid = mid.getvalue(0, 0)
nresult = conn.query("SELECT contents FROM fragments_header_contents WHERE message = " + str(id) + " AND header = (SELECT id FROM fragments_header_names WHERE header ILIKE 'Newsgroups')")
if nresult.ntuples > 0:
for x in range(nresult.ntuples):
newsgroups = string.split(nresult.getvalue(x, 0), ",")
if len(newsgroups) > 0:
for y in range (len(newsgroups)):
newsgroup = libpq.PgQuoteString(newsgroups[y])
ngroupres = conn.query("SELECT id FROM newsgroups WHERE name = " + newsgroup)
if ngroupres.ntuples > 0:
newsgid = ngroupres.getvalue(0, 0)
else:
conn.query("INSERT INTO newsgroups (name) VALUES (" + newsgroup + ")")
ngrtmpres = conn.query("SELECT id FROM newsgroups WHERE name = " + newsgroup)
newsgid = ngrtmpres.getvalue(0, 0)
conn.query("INSERT INTO messages_ngroups_glue (message, newsgroup) VALUES (" + str(messageid) + ", " + str(newsgid) + ")")
else:
print "An empty Newsgroups: header at messag " + str(id)
conn.query("ROLLBACK")
return
else:
print "No Newsgroups: header at message " + str(id)
conn.query("ROLLBACK")
return
for x in range(len(picids)):
conn.query("INSERT INTO messages_pictures_glue (message, picture) VALUES (" + str(messageid) + ", " + str(picids[x]) + ")")
if len(keywords) > 0:
for x in range(len(tmpkey)):
qword = libpq.PgQuoteString(str(keywords[x]))
tmp = conn.query("SELECT id FROM keywords_words WHERE keyword = " + qword)
if tmp.ntuples == 0:
conn.query("INSERT INTO keywords_words (keyword) VALUES (" + qword + ")")
tmp = conn.query("SELECT id FROM keywords_words WHERE keyword = " + qword)
keyid = str(tmp.getvalue(0, 0))
for y in range(len(picids)):
conn.query("INSERT INTO keywords_glue(word, picture) VALUES (" + keyid + ", " + str(picids[y]) + ")")
dummyone = "SELECT fragments_header_contents.line, fragments_header_names.header,"
dummytwo = " fragments_header_contents.contents FROM fragments_header_names, fragments_header_contents"
dummythree = " WHERE fragments_header_contents.message = " + str(id)
dummyfour = " AND fragments_header_contents.header = fragments_header_names.id"
head = conn.query(dummyone + dummytwo + dummythree + dummyfour)
if head.ntuples > 0:
for h in range(head.ntuples):
qhead = libpq.PgQuoteString(str(head.getvalue(h, 1)))
qcont = libpq.PgQuoteString(str(head.getvalue(h, 2)))
tmp = conn.query("SELECT id FROM header_names WHERE header = " + qhead)
if tmp.ntuples == 0:
conn.query("INSERT INTO header_names (header) VALUES (" + qhead + ")")
tmp = conn.query("SELECT id FROM header_names WHERE header = " + qhead)
headid = str(tmp.getvalue(0, 0))
line = str(head.getvalue(0, 0))
conn.query("INSERT INTO header_contents (header, message, line, contents) VALUES (" + headid + ", " + str(messageid) + ", " + line + ", " + qcont + ")")
conn.query("DELETE FROM fragments_header_contents WHERE message = " + str(id))
conn.query("DELETE FROM fragments_bodies WHERE message = " + str(id))
conn.query("COMMIT")
if len(newpicids) > 0:
tmpdir = basedir + "/tmp/"
for i in range(len(newpicids)):
picid = newpicids
tmppicname = tmpdir + str(picid)
permpicname = basedir + "/" + str(picid%1000) + "/" + str(picid)
print tmppicname
print permpicname
if os.access(basedir + "/" + str(picid%1000), os.F_OK) != 1:
os.mkdir(basedir + "/" + str(picid%1000))
os.link(tmppicname, permpicname)
os.unlink(tmpdir +str(picid))
else:
print "No pictures found"
conn.query("ROLLBACK")
return


database = libpq.PQconnectdb('dbname = kuvat')
items = database.query("SELECT message FROM whole_attachments")

# try_decode_and_insert_uuencoded(database, 5407)

for i in range(items.ntuples):
try:
print 'Starting call ' + str(i)
try_decode_and_insert_uuencoded(database, items.getvalue(items.ntuples - 1 - i,0))
print ' returned from call ' + str(i)
except:
print 'Some other error occurred at message " + str(i) + ", trying to continue...'
 
V

Ville Vainio

Juho> I made a Python script which takes Usenet message bodies
Juho> from a database, decodes uuencoded contents and inserts them
Juho> as Large Object into a PostGreSQL database. However, it
Juho> appears that the to last few bytes

I skimmed through your program, and noticed that you use binascii
module uuencode/decode. Have you given the "uu" module a try, to see
if it works better?

Also, get rid of "regex" module, it even gives a DeprecationWarning
suggesting switching to "re".
 
J

Juho Saarikko

Juho> I made a Python script which takes Usenet message bodies
Juho> from a database, decodes uuencoded contents and inserts them
Juho> as Large Object into a PostGreSQL database. However, it
Juho> appears that the to last few bytes

I skimmed through your program, and noticed that you use binascii
module uuencode/decode. Have you given the "uu" module a try, to see
if it works better?

I did examine the uu module, but it would seem that I'd had to parse the
message first anyway to get the file name and the non-binary parts of the
message as keywords. Besides, as I understand it, the uu module uses the
binascii module, so if there's something wrong with the binascii module,
the uu module can't possibly work well.

Oh well, I would had to write the parsing engine anyway (or learn to
use the e-mail classes), to properly handle mime and yenc messages. And I
suppose I'd better start using imagemagic to verify the mimetype of
decoded files, instead of just believing the filename. And join together
files that have been spread over multiple messages. Work, work, work...
Also, get rid of "regex" module, it even gives a DeprecationWarning
suggesting switching to "re".

I would, if I knew how to make regular expressions; I found the uu-parsing
snippet from the net and built my script around it, but the
regular expression doesn't seem to work with the re module.
 
S

Steve Holden

Juho said:
I made a Python script which takes Usenet message bodies from a database,
decodes uuencoded contents and inserts them as Large Object into a
PostGreSQL database. However, it appears that the to last few bytes
of uudecoded data are always mangled. Take a look of this hexdump output:

Originals (decoded with Pan, each line is from a different file):
000c2c0 e1bf 00ff 2541 a9e4 a724 d9ff
0011a10 ff54 00d9
00093e0 fb4f a80d ffd9 c200 ffef 00d9

Decoded by the script:
000c2c0 e1bf 00ff 2541 a9e4 a724 d0ff
0011a10 ff54 00d8
00093e0 fb4f a80d ffd9 c200 ffef 00d8

As you can see, one of the last two bytes gets altered in all cases.

The script also outputs the decoded file to disk for debugging purposes,
and the database large object and filesystem file match so it can't be a
PostGreSQL problem.

So, if anyone has any idea what is wrong, please tell me ? I can't found
any reason why the bytes would get mangled...

The script follows:
[...]
I note that you are dumping words rather than bytes. Is it possible that
the last byte isn't actually a part of the file, that
endianness makes the last byte look like the penultimate byte, and that
what you are seeing is simply noise?

If not then it should probably be looked into ...

regards
Steve
 
J

Juho Saarikko

I note that you are dumping words rather than bytes. Is it possible that
the last byte isn't actually a part of the file, that
endianness makes the last byte look like the penultimate byte, and that
what you are seeing is simply noise?

Well, ImageMagick complains that the image contains errors (altought
Eye of Gnome shows it with no artifacts), so it's likely to be part of the
file itself.

I get both

"display: Premature end of JPEG file"

and

"display: Invalid JPEG file structure: two SOI markers"

errors. The later error prevent ImageMagick's display-command from
displaying the image (but not Eye of Gnome).
If not then it should probably be looked into ...

Looked, looked, but where to start ? The bug could be anywhere from my
script to binascii module to the nntp module to the string.join -function.
 
T

Tim Roberts

Juho Saarikko said:
I made a Python script which takes Usenet message bodies from a database,
decodes uuencoded contents and inserts them as Large Object into a
PostGreSQL database. However, it appears that the to last few bytes
of uudecoded data are always mangled. Take a look of this hexdump output:

Originals (decoded with Pan, each line is from a different file):
000c2c0 e1bf 00ff 2541 a9e4 a724 d9ff
0011a10 ff54 00d9
00093e0 fb4f a80d ffd9 c200 ffef 00d9

Decoded by the script:
000c2c0 e1bf 00ff 2541 a9e4 a724 d0ff
0011a10 ff54 00d8
00093e0 fb4f a80d ffd9 c200 ffef 00d8

As you can see, one of the last two bytes gets altered in all cases.

As others have pointed out, it's really the last byte that is getting
altered.
for k in range(n+1, message.ntuples):
# print "Decodind row " + str(k)
s = message.getvalue(k, 0)
if s[:3] == "end":
n = k + 1
break
try:
body.append(binascii.a2b_uu(s))
except:
try:
bytes = (((ord(s[0])-32) & 63) * 4 + 3) / 3
body.append(binascii.a2b_uu(s[:bytes]))
except:
print "Broken attachment in message " + str(id)
conn.query("ROLLBACK")
return

Your computation of the number of bytes in the uuencoded string will come
up one short: you're not accounting for the length byte. That will have
exactly the effect you describe. You lose the last encoded character,
which means you'll miss the last 6 bits of the file. Change it to this:

bytes = (((ord(s[0])-32) & 63) * 4 + 3) / 3 + 1

However, you should not need to wrap the first binascii.a2b_uu call with
try/except at all. What is happening that causes the error in the first
place? I suspect if you fix the root cause, you could eliminate the except
clause altogether.
 

Ask a Question

Want to reply to this thread or ask your own question?

You'll need to choose a username for the site, which only take a couple of moments. After that, you can post your question and our members will help you out.

Ask a Question

Members online

Forum statistics

Threads
473,997
Messages
2,570,241
Members
46,830
Latest member
HeleneMull

Latest Threads

Top