J
Juho Saarikko
The program attached to this message makes the Python interpreter segfault
randomly. I have tried both Python 2.2 which came with Debian Stable, and
self-compiled Python 2.3.3 (newest I could find on www.python.org,
compiled with default options (./configure && make). I'm using the pyPgSQL
plugin to connect to a PostGreSQL database, and have tried the Debian and
self-compiled newest versions of that as well.
I'm running BitTorrent, and that works perfectly well; btlaunchmany.py has
been running for months continuously without any problems. I've also run
the kernel compile test (compiling the Linux kernel nonstop to find any
inadequeties in processor cooling), and couldn't get any errors in 6 hours.
This makes me thing I'm hitting some weird bug in the interpreter.
Specifically, I'm wondering if my habit of reusing old variable names in a
function once they are no longer needed might be causing the trouble;
maybe it causes confusion on the variable type ?
The program retrieves Usenet News messages from the database (inserted
there by another Python program, which works perfectly and also uses the
pyPgSQL plugin).
So, here's the program. Does anyone know what's wrong with it ?
#!/usr/local/bin/python2.3
# Insert message contents into the database, for each message-id already there
#
# Copyright 2004 by Juho Saarikko
# License: GNU General Public License (GPL) version 2
# See www.gnu.org for details
from pyPgSQL import libpq
import nntplib
import sys
import string
import regex
import sha
import imghdr
import binascii
import StringIO
import os
def strip_trailing_dots(n):
tmp = []
for i in range(len(n)):
if n[-1] == "," or n[-1] == ".":
tmp.append(n[:-1])
else:
tmp.append(n)
return tmp
def findmimetype(body, filename):
what = imghdr.what(StringIO.StringIO(body))
if what == "gif":
return "image/gif"
if what == "png":
return "image/png"
if what == "jpeg":
return "image/jpeg"
return None
def try_decode_and_insert_uuencoded(conn, id):
begin = regex.compile("begin [0-9]+ \(.*\)")
conn.query("BEGIN")
basedir = "kuvat"
message = conn.query("SELECT data FROM fragments_bodies WHERE message = " + str(id) + " ORDER BY line")
print message.ntuples
keywords = []
picids = []
n = 0
s = ""
print 'Starting message id ' + str(id)
while n < message.ntuples:
# print "length of row " + str(n)
# print str(message.getlength(n, 0))
# print "Got length"
s = str(message.getvalue(n, 0))
# print "Got s"
if begin.match(s) > 0:
# print "Begin matched"
body = []
file = begin.group(1)
# print "Starting to decode, at line " + str(n + 1)
for k in range(n+1, message.ntuples):
# print "Decodind row " + str(k)
s = message.getvalue(k, 0)
if s[:3] == "end":
n = k + 1
break
try:
body.append(binascii.a2b_uu(libpq.PgUnquoteBytea(s)))
except:
bytes = (((ord(s[0])-32) & 63) * 4 + 3) / 3
body.append(binascii.a2b_uu(s[:bytes]))
# print "Got to end, at line " + str(n)
# print "Attempting to join body"
body = string.join(body, "")
# print "Attempting to hash body"
hash = sha.new(body)
qhash = libpq.PgQuoteBytea(hash.digest())
# qbody = libpq.PgQuoteBytea(body)
# print "Attempting to find whether the pic already exists"
already = conn.query("SELECT id FROM pictures WHERE hash = " + qhash)
if already.ntuples == 0:
# print "Attempting to find mimetype"
mimetype = findmimetype(body, file)
# print "Found mimetype"
if mimetype != None:
# o = conn.query("INSERT INTO pictures (picture, hash, mimetype) VALUES (" + qbody + ", " + qhash + ", " + libpq.PgQuoteString(mimetype) + ")")
# already = conn.query("SELECT id FROM pictures WHERE OID = " + str(o.oidValue()));
# already = conn.query("SELECT id FROM pictures WHERE data = " + qbody)
# already = conn.query("SELECT id FROM pictures WHERE hash = " + qhash)
# print "Attempting to insert hash and mimetype"
conn.query("INSERT INTO pictures (hash, mimetype) VALUES (" + qhash + ", " + libpq.PgQuoteString(mimetype) + ")")
# print "Attempting to get id"
already = conn.query("SELECT id FROM pictures WHERE hash = " + qhash)
# print "Attempting to get value"
picid = already.getvalue(0, 0)
# print "Attempting to OK dir"
if os.access(basedir + "/tmp", os.F_OK) != 1:
os.mkdir(basedir + "/tmp")
fh = open(basedir + "/tmp/" + str(picid), "wb")
fh.write(body)
fh.close()
# print "File ok"
else:
picid = already.getvalue(0, 0)
if already.ntuples == 0:
# print "already.ntuples == 0, ROLLBACKing"
conn.query("ROLLBACK")
return
# print "Appending picid"
picids.append(picid)
# print "Picid appended"
else:
tmpkey = strip_trailing_dots(string.split(s))
if len(tmpkey) > 0:
for j in range(len(tmpkey)):
keywords.append(tmpkey[j])
# print "Adding 1 to n"
n = n + 1
if len(picids) > 0:
# print "Finding Subject"
head = conn.query("SELECT contents FROM fragments_header_contents WHERE message = " + str(id) + " AND header = (SELECT id FROM fragments_header_names WHERE header ilike 'Subject')")
if head.ntuples > 0:
# print "Splitting Subject"
blah = head.getvalue(0,0)
print str(blah)
blahblah = string.split(str(blah))
# print "Stripping"
abctmpkey = strip_trailing_dots(blahblah)
# print "Stripping done"
# print "Really"
tmpkey = abctmpkey
#B print "Subject split"
if len(tmpkey) > 0:
for j in range(len(tmpkey)):
keywords.append(tmpkey[j])
o = conn.query("INSERT INTO messages DEFAULT VALUES")
mid = conn.query("SELECT id FROM messages WHERE OID = " + str(o.oidValue))
messageid = mid.getvalue(0, 0)
if len(keywords) > 0:
for x in range(len(tmpkey)):
qword = libpq.PgQuoteString(str(keywords[x]))
tmp = conn.query("SELECT id FROM keywords_words WHERE keyword = " + qword)
if tmp.ntuples == 0:
conn.query("INSERT INTO keywords_words (keyword) VALUES (" + qword + ")")
tmp = conn.query("SELECT id FROM keywords_words WHERE keyword = " + qword)
keyid = str(tmp.getvalue(0, 0))
for y in range(len(picids)):
conn.query("INSERT INTO keywords_glue(word, picture) VALUES (" + keyid + ", " + str(picids[y]) + ")")
dummyone = "SELECT fragments_header_contents.line, fragments_header_names.header,"
dummytwo = " fragments_header_contents.contents FROM fragments_header_names, fragments_header_contents"
dummythree = " WHERE fragments_header_contents.message = " + str(id)
dummyfour = " AND fragments_header_contents.header = fragments_header_names.id"
head = conn.query(dummyone + dummytwo + dummythree + dummyfour)
if head.ntuples > 0:
for h in range(head.ntuples):
qhead = libpq.PgQuoteString(str(head.getvalue(h, 1)))
qcont = libpq.PgQuoteString(str(head.getvalue(h, 2)))
tmp = conn.query("SELECT id FROM header_names WHERE header = " + qhead)
if tmp.ntuples == 0:
conn.query("INSERT INTO header_names (header) VALUES (" + qhead + ")")
tmp = conn.query("SELECT id FROM header_names WHERE header = " + qhead)
headid = str(tmp.getvalue(0, 0))
line = str(head.getvalue(0, 0))
conn.query("INSERT INTO header_contents (header, message, line, contents) VALUES (" + headid + ", " + str(messageid) + ", " + line + ", " + qcont + ")")
conn.query("DELETE FROM fragments_header_contents WHERE message = " + str(id))
conn.query("DELETE FROM fragments_bodies WHERE message = " + str(id))
conn.query("COMMIT")
tmpdir = basedir + "/tmp/"
for i in range(len(picids)):
picid = picids
if os.access(basedir + "/" + str(picid%1000), os.F_OK) != 1:
os.mkdir(basedir + "/" + str(picid%1000))
os.link(tmpdir + str(picid), basedir + "/" + str(picid%1000) + "/" + str(picid))
os.unlink(tmpdir +str(picid))
else:
conn.query("ROLLBACK")
return
database = libpq.PQconnectdb('dbname = kuvat')
items = database.query("SELECT message FROM whole_attachments")
# try_decode_and_insert_uuencoded(database, 1167)
for i in range(items.ntuples):
print 'Starting call ' + str(i)
try_decode_and_insert_uuencoded(database, items.getvalue(items.ntuples - 1 - i,0))
print ' returned from call ' + str(i)
# except:
# print 'Some other error occurred, trying to continue...\n'
randomly. I have tried both Python 2.2 which came with Debian Stable, and
self-compiled Python 2.3.3 (newest I could find on www.python.org,
compiled with default options (./configure && make). I'm using the pyPgSQL
plugin to connect to a PostGreSQL database, and have tried the Debian and
self-compiled newest versions of that as well.
I'm running BitTorrent, and that works perfectly well; btlaunchmany.py has
been running for months continuously without any problems. I've also run
the kernel compile test (compiling the Linux kernel nonstop to find any
inadequeties in processor cooling), and couldn't get any errors in 6 hours.
This makes me thing I'm hitting some weird bug in the interpreter.
Specifically, I'm wondering if my habit of reusing old variable names in a
function once they are no longer needed might be causing the trouble;
maybe it causes confusion on the variable type ?
The program retrieves Usenet News messages from the database (inserted
there by another Python program, which works perfectly and also uses the
pyPgSQL plugin).
So, here's the program. Does anyone know what's wrong with it ?
#!/usr/local/bin/python2.3
# Insert message contents into the database, for each message-id already there
#
# Copyright 2004 by Juho Saarikko
# License: GNU General Public License (GPL) version 2
# See www.gnu.org for details
from pyPgSQL import libpq
import nntplib
import sys
import string
import regex
import sha
import imghdr
import binascii
import StringIO
import os
def strip_trailing_dots(n):
tmp = []
for i in range(len(n)):
if n[-1] == "," or n[-1] == ".":
tmp.append(n[:-1])
else:
tmp.append(n)
return tmp
def findmimetype(body, filename):
what = imghdr.what(StringIO.StringIO(body))
if what == "gif":
return "image/gif"
if what == "png":
return "image/png"
if what == "jpeg":
return "image/jpeg"
return None
def try_decode_and_insert_uuencoded(conn, id):
begin = regex.compile("begin [0-9]+ \(.*\)")
conn.query("BEGIN")
basedir = "kuvat"
message = conn.query("SELECT data FROM fragments_bodies WHERE message = " + str(id) + " ORDER BY line")
print message.ntuples
keywords = []
picids = []
n = 0
s = ""
print 'Starting message id ' + str(id)
while n < message.ntuples:
# print "length of row " + str(n)
# print str(message.getlength(n, 0))
# print "Got length"
s = str(message.getvalue(n, 0))
# print "Got s"
if begin.match(s) > 0:
# print "Begin matched"
body = []
file = begin.group(1)
# print "Starting to decode, at line " + str(n + 1)
for k in range(n+1, message.ntuples):
# print "Decodind row " + str(k)
s = message.getvalue(k, 0)
if s[:3] == "end":
n = k + 1
break
try:
body.append(binascii.a2b_uu(libpq.PgUnquoteBytea(s)))
except:
bytes = (((ord(s[0])-32) & 63) * 4 + 3) / 3
body.append(binascii.a2b_uu(s[:bytes]))
# print "Got to end, at line " + str(n)
# print "Attempting to join body"
body = string.join(body, "")
# print "Attempting to hash body"
hash = sha.new(body)
qhash = libpq.PgQuoteBytea(hash.digest())
# qbody = libpq.PgQuoteBytea(body)
# print "Attempting to find whether the pic already exists"
already = conn.query("SELECT id FROM pictures WHERE hash = " + qhash)
if already.ntuples == 0:
# print "Attempting to find mimetype"
mimetype = findmimetype(body, file)
# print "Found mimetype"
if mimetype != None:
# o = conn.query("INSERT INTO pictures (picture, hash, mimetype) VALUES (" + qbody + ", " + qhash + ", " + libpq.PgQuoteString(mimetype) + ")")
# already = conn.query("SELECT id FROM pictures WHERE OID = " + str(o.oidValue()));
# already = conn.query("SELECT id FROM pictures WHERE data = " + qbody)
# already = conn.query("SELECT id FROM pictures WHERE hash = " + qhash)
# print "Attempting to insert hash and mimetype"
conn.query("INSERT INTO pictures (hash, mimetype) VALUES (" + qhash + ", " + libpq.PgQuoteString(mimetype) + ")")
# print "Attempting to get id"
already = conn.query("SELECT id FROM pictures WHERE hash = " + qhash)
# print "Attempting to get value"
picid = already.getvalue(0, 0)
# print "Attempting to OK dir"
if os.access(basedir + "/tmp", os.F_OK) != 1:
os.mkdir(basedir + "/tmp")
fh = open(basedir + "/tmp/" + str(picid), "wb")
fh.write(body)
fh.close()
# print "File ok"
else:
picid = already.getvalue(0, 0)
if already.ntuples == 0:
# print "already.ntuples == 0, ROLLBACKing"
conn.query("ROLLBACK")
return
# print "Appending picid"
picids.append(picid)
# print "Picid appended"
else:
tmpkey = strip_trailing_dots(string.split(s))
if len(tmpkey) > 0:
for j in range(len(tmpkey)):
keywords.append(tmpkey[j])
# print "Adding 1 to n"
n = n + 1
if len(picids) > 0:
# print "Finding Subject"
head = conn.query("SELECT contents FROM fragments_header_contents WHERE message = " + str(id) + " AND header = (SELECT id FROM fragments_header_names WHERE header ilike 'Subject')")
if head.ntuples > 0:
# print "Splitting Subject"
blah = head.getvalue(0,0)
print str(blah)
blahblah = string.split(str(blah))
# print "Stripping"
abctmpkey = strip_trailing_dots(blahblah)
# print "Stripping done"
# print "Really"
tmpkey = abctmpkey
#B print "Subject split"
if len(tmpkey) > 0:
for j in range(len(tmpkey)):
keywords.append(tmpkey[j])
o = conn.query("INSERT INTO messages DEFAULT VALUES")
mid = conn.query("SELECT id FROM messages WHERE OID = " + str(o.oidValue))
messageid = mid.getvalue(0, 0)
if len(keywords) > 0:
for x in range(len(tmpkey)):
qword = libpq.PgQuoteString(str(keywords[x]))
tmp = conn.query("SELECT id FROM keywords_words WHERE keyword = " + qword)
if tmp.ntuples == 0:
conn.query("INSERT INTO keywords_words (keyword) VALUES (" + qword + ")")
tmp = conn.query("SELECT id FROM keywords_words WHERE keyword = " + qword)
keyid = str(tmp.getvalue(0, 0))
for y in range(len(picids)):
conn.query("INSERT INTO keywords_glue(word, picture) VALUES (" + keyid + ", " + str(picids[y]) + ")")
dummyone = "SELECT fragments_header_contents.line, fragments_header_names.header,"
dummytwo = " fragments_header_contents.contents FROM fragments_header_names, fragments_header_contents"
dummythree = " WHERE fragments_header_contents.message = " + str(id)
dummyfour = " AND fragments_header_contents.header = fragments_header_names.id"
head = conn.query(dummyone + dummytwo + dummythree + dummyfour)
if head.ntuples > 0:
for h in range(head.ntuples):
qhead = libpq.PgQuoteString(str(head.getvalue(h, 1)))
qcont = libpq.PgQuoteString(str(head.getvalue(h, 2)))
tmp = conn.query("SELECT id FROM header_names WHERE header = " + qhead)
if tmp.ntuples == 0:
conn.query("INSERT INTO header_names (header) VALUES (" + qhead + ")")
tmp = conn.query("SELECT id FROM header_names WHERE header = " + qhead)
headid = str(tmp.getvalue(0, 0))
line = str(head.getvalue(0, 0))
conn.query("INSERT INTO header_contents (header, message, line, contents) VALUES (" + headid + ", " + str(messageid) + ", " + line + ", " + qcont + ")")
conn.query("DELETE FROM fragments_header_contents WHERE message = " + str(id))
conn.query("DELETE FROM fragments_bodies WHERE message = " + str(id))
conn.query("COMMIT")
tmpdir = basedir + "/tmp/"
for i in range(len(picids)):
picid = picids
if os.access(basedir + "/" + str(picid%1000), os.F_OK) != 1:
os.mkdir(basedir + "/" + str(picid%1000))
os.link(tmpdir + str(picid), basedir + "/" + str(picid%1000) + "/" + str(picid))
os.unlink(tmpdir +str(picid))
else:
conn.query("ROLLBACK")
return
database = libpq.PQconnectdb('dbname = kuvat')
items = database.query("SELECT message FROM whole_attachments")
# try_decode_and_insert_uuencoded(database, 1167)
for i in range(items.ntuples):
print 'Starting call ' + str(i)
try_decode_and_insert_uuencoded(database, items.getvalue(items.ntuples - 1 - i,0))
print ' returned from call ' + str(i)
# except:
# print 'Some other error occurred, trying to continue...\n'