C
Carbon Man
Very new to Python, running 2.5 on windows.
I am processing an XML file (7.2MB). Using the standard library I am
recursively processing each node and parsing it. The branches don't go
particularly deep. What is happening is that the program is running really
really slowly, so slow that even running it over night, it still doesn't
finish.
Stepping through it I have noticed that memory usage has shot up from 190MB
to 624MB and continues to climb. If I set a break point and then stop the
program the memory is not released. It is not until I shutdown PythonWin
that the memory gets released.
I thought this might mean objects were not getting GCed, so through the
interactive window I imported gc. gc.garbage is empty. gc.collect() seems to
fix the problem (after much thinking) and reports 2524104. Running it again
returns 0.
I thought that garbage collection was automatic, if I use variables in a
method do I have to del them?
I tried putting a "del node" in all my for node in .... loops but that
didn't help. collect() reports the same number. Tried putting gc.collect()
at the end of the loops but that didn't help either.
If I have the program at a break and do gc.collect() it doesn't fix it, so
whatever referencing is causing problems is still active.
My program is parsing the XML and generating a Python program for
SQLalchemy, but the program never gets a chance to run the memory problem is
prior to that. It probably has something to do with the way I am string
building.
My apologies for the long post but without being able to see the code I
doubt anyone can give me a solid answer so here it goes (sorry for the lack
of comments):
from xml.dom import minidom
import os
import gc
class xmlProcessing:
""" General class for XML processing"""
def process(self, filename="", xmlString=""):
if xmlString:
pass
elif filename:
xmldoc = minidom.parse(filename)
self.parse( xmldoc.documentElement )
def parseBranch(self, parentNode):
""" Process an XML branch """
for node in parentNode.childNodes:
try:
parseMethod = getattr(self, "parse_%s" %
node.__class__.__name__)
except AttributeError:
continue
if parseMethod(node):
continue
self.parseBranch(node)
del node
def parse_Document(self, node):
pass
def parse_Text(self, node):
pass
def parse_Comment(self, node):
pass
def parse_Element(self, node):
try:
handlerMethod = getattr(self, "do_%s" % node.tagName)
except AttributeError:
return False
handlerMethod(node)
return True
class reptorParsing(xmlProcessing):
""" Specific class for generating a SQLalchemy program to create tables
and populate them with data"""
def __init__(self):
self.schemaPreface = """from sqlalchemy import *
from sqlalchemy.ext.declarative import declarative_base
engine = create_engine('sqlite:///tutorial.db', echo=False)
metadata = MetaData()
Base = declarative_base()"""
self.schemaTables = ""
self.schemaFields = ""
self.dataUpdate = ""
self.tableDict = {}
self.tableName = ""
self.tables = ""
def parse(self, parentNode):
"""Main entry point to begin processing a XML document"""
self.parseBranch(parentNode)
# Properties such as schemaTables and .tables are populated by the
various methods below
fupdate=open(os.path.join(os.getcwd(), "update.py"), 'w')
if self.schemaTables:
fupdate.write("import schema\n")
f=open(os.path.join(os.getcwd(), "schema.py"), 'w')
f.write(self.schemaPreface+"\n"+self.schemaTables+
'\n' + "metadata.create_all(engine)\n"+
"print 'hello 2'")
f.close()
if self.tables:
fupdate.write(self.tables)
# f=open(os.path.join(os.getcwd(), "dataUpdate.py"), 'w')
# f.write(self.dataUpdate)
# f.close()
fupdate.close()
def do_TABLES(self, tableNode):
"""Process schema for tables"""
for node in tableNode.childNodes:
self.tableName = node.tagName
# Define a declaritive mapping class
self.schemaTables += """\nclass %s(Base):
__tablename__ = '%s'
""" % (self.tableName, self.tableName)
self.schemaFields = ""
# allow for userA = users("Billy","Bob") via a __init__()
self.schemaInitPreface = " def __init__(self"
self.schemaInitBody = ""
self.parseBranch(node)
self.schemaInitPreface += "):\n"
self.schemaTables += self.schemaFields + "\n" + \
self.schemaInitPreface + \
self.schemaInitBody + "\n"
gc.collect()
def do_FIELDS(self, fieldsNode):
"""Process schema for fields within tables"""
for node in fieldsNode.childNodes:
if self.schemaFields:
self.schemaFields += "\n"
cType = ""
# The attribute type holds the type of field
crType = node.attributes["type"].value
if crType==u"C":
cType = "String(length=%s)" % node.attributes["len"].value
elif crType==u"N" and node.attributes["dec"].value==u'0':
cType = "Integer"
elif crType==u"N":
cType = "Numeric(precision=%s, scale=%s)" %
(node.attributes["len"].value,node.attributes["dec"].value)
elif crType==u"L":
cType = "Boolean"
elif crType==u"T":
cType = "DateTime"
elif crType==u"D":
cType = "Date"
elif crType==u"M" or crType==u"G":
cType = "Text"
if node.attributes.getNamedItem("primary"):
cType += ", primary_key=True"
self.schemaFields += " %s = Column(%s)" % (node.tagName,
cType)
self.schemaInitPreface += ", \\\n %s" % (node.tagName)
self.schemaInitBody += " self.%s = %s\n" %
(node.tagName, node.tagName)
self.tableDict[self.tableName + "." + node.tagName] = crType
del node
def do_DATA(self, dataNode):
"""This is for processing actual data to be pushed into the tables
Layout is DATA -> TABLE_NAME key='primary_field' -> TUPLE ->
FIELD_NAME -> VALUE"""
for node in dataNode.childNodes:
self.dataUpdate = """
import time
from datetime import *
from sqlalchemy import *
from sqlalchemy.orm import *
engine = create_engine('sqlite:///tutorial.db', echo=False)
Session = sessionmaker()
Session.configure(bind=engine)
session = Session()
"""
self.keyValue = ""
self.keyField = node.attributes["key"].value
self.tableName = node.tagName
self.parseBranch(node)
self.tables += "\nimport %s_update.py" % (self.tableName)
f=open(os.path.join(os.getcwd(), self.tableName + "_update.py"),
'w')
f.write(self.dataUpdate)
f.close()
gc.collect()
def do_TUPLE(self, tupleNode):
""" A TUPLE is what the XML file refers to a table row
Sits below a DATA child"""
self.dataUpdate += """
entry = %s()
session.add(entry)
""" % (self.tableName)
for node in tupleNode.childNodes:
for dataNode in node.childNodes:
crType = self.tableDict[self.tableName + "." + node.tagName]
if crType==u"C" or crType==u"M":
cValue = '"""%s"""' % dataNode.data
elif crType==u"T":
cValue = 'datetime.strptime("'+dataNode.data+'",
"%Y-%m-%d %H:%M")'
elif crType==u"D":
cValue = 'datetime.strptime("'+dataNode.data+'",
"%Y-%m-%d")'
else:
cValue = dataNode.data
self.dataUpdate += "\nentry.%s = %s" % (node.tagName,
cValue)
del dataNode
self.dataUpdate += "\nsession.commit()"
del node
if __name__ == '__main__':
replicate = reptorParsing()
replicate.process(filename=os.path.join(os.getcwd(), "request.xml"))
import update
I am processing an XML file (7.2MB). Using the standard library I am
recursively processing each node and parsing it. The branches don't go
particularly deep. What is happening is that the program is running really
really slowly, so slow that even running it over night, it still doesn't
finish.
Stepping through it I have noticed that memory usage has shot up from 190MB
to 624MB and continues to climb. If I set a break point and then stop the
program the memory is not released. It is not until I shutdown PythonWin
that the memory gets released.
I thought this might mean objects were not getting GCed, so through the
interactive window I imported gc. gc.garbage is empty. gc.collect() seems to
fix the problem (after much thinking) and reports 2524104. Running it again
returns 0.
I thought that garbage collection was automatic, if I use variables in a
method do I have to del them?
I tried putting a "del node" in all my for node in .... loops but that
didn't help. collect() reports the same number. Tried putting gc.collect()
at the end of the loops but that didn't help either.
If I have the program at a break and do gc.collect() it doesn't fix it, so
whatever referencing is causing problems is still active.
My program is parsing the XML and generating a Python program for
SQLalchemy, but the program never gets a chance to run the memory problem is
prior to that. It probably has something to do with the way I am string
building.
My apologies for the long post but without being able to see the code I
doubt anyone can give me a solid answer so here it goes (sorry for the lack
of comments):
from xml.dom import minidom
import os
import gc
class xmlProcessing:
""" General class for XML processing"""
def process(self, filename="", xmlString=""):
if xmlString:
pass
elif filename:
xmldoc = minidom.parse(filename)
self.parse( xmldoc.documentElement )
def parseBranch(self, parentNode):
""" Process an XML branch """
for node in parentNode.childNodes:
try:
parseMethod = getattr(self, "parse_%s" %
node.__class__.__name__)
except AttributeError:
continue
if parseMethod(node):
continue
self.parseBranch(node)
del node
def parse_Document(self, node):
pass
def parse_Text(self, node):
pass
def parse_Comment(self, node):
pass
def parse_Element(self, node):
try:
handlerMethod = getattr(self, "do_%s" % node.tagName)
except AttributeError:
return False
handlerMethod(node)
return True
class reptorParsing(xmlProcessing):
""" Specific class for generating a SQLalchemy program to create tables
and populate them with data"""
def __init__(self):
self.schemaPreface = """from sqlalchemy import *
from sqlalchemy.ext.declarative import declarative_base
engine = create_engine('sqlite:///tutorial.db', echo=False)
metadata = MetaData()
Base = declarative_base()"""
self.schemaTables = ""
self.schemaFields = ""
self.dataUpdate = ""
self.tableDict = {}
self.tableName = ""
self.tables = ""
def parse(self, parentNode):
"""Main entry point to begin processing a XML document"""
self.parseBranch(parentNode)
# Properties such as schemaTables and .tables are populated by the
various methods below
fupdate=open(os.path.join(os.getcwd(), "update.py"), 'w')
if self.schemaTables:
fupdate.write("import schema\n")
f=open(os.path.join(os.getcwd(), "schema.py"), 'w')
f.write(self.schemaPreface+"\n"+self.schemaTables+
'\n' + "metadata.create_all(engine)\n"+
"print 'hello 2'")
f.close()
if self.tables:
fupdate.write(self.tables)
# f=open(os.path.join(os.getcwd(), "dataUpdate.py"), 'w')
# f.write(self.dataUpdate)
# f.close()
fupdate.close()
def do_TABLES(self, tableNode):
"""Process schema for tables"""
for node in tableNode.childNodes:
self.tableName = node.tagName
# Define a declaritive mapping class
self.schemaTables += """\nclass %s(Base):
__tablename__ = '%s'
""" % (self.tableName, self.tableName)
self.schemaFields = ""
# allow for userA = users("Billy","Bob") via a __init__()
self.schemaInitPreface = " def __init__(self"
self.schemaInitBody = ""
self.parseBranch(node)
self.schemaInitPreface += "):\n"
self.schemaTables += self.schemaFields + "\n" + \
self.schemaInitPreface + \
self.schemaInitBody + "\n"
gc.collect()
def do_FIELDS(self, fieldsNode):
"""Process schema for fields within tables"""
for node in fieldsNode.childNodes:
if self.schemaFields:
self.schemaFields += "\n"
cType = ""
# The attribute type holds the type of field
crType = node.attributes["type"].value
if crType==u"C":
cType = "String(length=%s)" % node.attributes["len"].value
elif crType==u"N" and node.attributes["dec"].value==u'0':
cType = "Integer"
elif crType==u"N":
cType = "Numeric(precision=%s, scale=%s)" %
(node.attributes["len"].value,node.attributes["dec"].value)
elif crType==u"L":
cType = "Boolean"
elif crType==u"T":
cType = "DateTime"
elif crType==u"D":
cType = "Date"
elif crType==u"M" or crType==u"G":
cType = "Text"
if node.attributes.getNamedItem("primary"):
cType += ", primary_key=True"
self.schemaFields += " %s = Column(%s)" % (node.tagName,
cType)
self.schemaInitPreface += ", \\\n %s" % (node.tagName)
self.schemaInitBody += " self.%s = %s\n" %
(node.tagName, node.tagName)
self.tableDict[self.tableName + "." + node.tagName] = crType
del node
def do_DATA(self, dataNode):
"""This is for processing actual data to be pushed into the tables
Layout is DATA -> TABLE_NAME key='primary_field' -> TUPLE ->
FIELD_NAME -> VALUE"""
for node in dataNode.childNodes:
self.dataUpdate = """
import time
from datetime import *
from sqlalchemy import *
from sqlalchemy.orm import *
engine = create_engine('sqlite:///tutorial.db', echo=False)
Session = sessionmaker()
Session.configure(bind=engine)
session = Session()
"""
self.keyValue = ""
self.keyField = node.attributes["key"].value
self.tableName = node.tagName
self.parseBranch(node)
self.tables += "\nimport %s_update.py" % (self.tableName)
f=open(os.path.join(os.getcwd(), self.tableName + "_update.py"),
'w')
f.write(self.dataUpdate)
f.close()
gc.collect()
def do_TUPLE(self, tupleNode):
""" A TUPLE is what the XML file refers to a table row
Sits below a DATA child"""
self.dataUpdate += """
entry = %s()
session.add(entry)
""" % (self.tableName)
for node in tupleNode.childNodes:
for dataNode in node.childNodes:
crType = self.tableDict[self.tableName + "." + node.tagName]
if crType==u"C" or crType==u"M":
cValue = '"""%s"""' % dataNode.data
elif crType==u"T":
cValue = 'datetime.strptime("'+dataNode.data+'",
"%Y-%m-%d %H:%M")'
elif crType==u"D":
cValue = 'datetime.strptime("'+dataNode.data+'",
"%Y-%m-%d")'
else:
cValue = dataNode.data
self.dataUpdate += "\nentry.%s = %s" % (node.tagName,
cValue)
del dataNode
self.dataUpdate += "\nsession.commit()"
del node
if __name__ == '__main__':
replicate = reptorParsing()
replicate.process(filename=os.path.join(os.getcwd(), "request.xml"))
import update