Hi Andrew,
The reason I am using mapped objects is that I need to abstract from the
database implementation allowing the replication to target a number of
different platforms. This will definitely slow things down.
process a whole pile in memory and then (perhaps every 10,000 - when your
memory is about to run out and start paging) flush the session.
Under windows how can I tell when memory is about to run out? I guess there
is no cross-platform solution to this.
Writing external files has all come about from a post titled "Memory
problems (garbage collection) by Carbon Man" which I never got a resolution
to.
I was trying to execute gc.collect() when a process was complete because I
was having huge problems with memory (though the problem still remains). If
I stop at "import schema" There are 2524104 objects processed by
gc.collect()
There is a new issue in the code (marked with #### Problem with SQLalchemy),
but here is the program as it now stands:
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Reads an XML file and creates schema.py from the TABLES branch (very
simplistic table structures)
# Processes the nodes within the DATA branch, into SQLalchemy
from xml.dom import minidom
import os
import codecs
from cStringIO import StringIO
import time
from datetime import *
from sqlalchemy import *
from sqlalchemy.orm import *
class xmlProcessing:
""" General class for XML processing"""
def process(self, filename="", xmlString=""):
if xmlString:
pass
elif filename:
xmldoc = minidom.parse(filename)
self.parse( xmldoc.documentElement )
def parseBranch(self, parentNode):
""" Process an XML branch """
for node in parentNode.childNodes:
try:
parseMethod = getattr(self, "parse_%s" %
node.__class__.__name__)
except AttributeError:
continue
if parseMethod(node):
continue
self.parseBranch(node)
def parse_Document(self, node):
pass
def parse_Text(self, node):
pass
def parse_Comment(self, node):
pass
def parse_Element(self, node):
try:
handlerMethod = getattr(self, "do_%s" % node.tagName)
except AttributeError:
return False
handlerMethod(node)
return True
class reptorParsing(xmlProcessing):
""" Specific class for generating a SQLalchemy program to create tables
and populate them with data"""
def __init__(self):
self.schemaPreface = StringIO()
self.schemaPreface.write("""from sqlalchemy import *
import time
from datetime import *
from sqlalchemy.ext.declarative import declarative_base
#engine = create_engine('sqlite:///tutorial.db', echo=False)
#metadata = MetaData()
Base = declarative_base()""")
self.schemaTables = StringIO()
self.schemaTablesCreate = StringIO()
self.schemaFields = StringIO()
self.tableDict = {}
self.tableName = StringIO()
self.tables = StringIO()
def parse(self, parentNode):
"""Main entry point to begin processing a XML document"""
self.parseBranch(parentNode)
# Properties such as schemaTables and .tables are populated by the
various methods below
fupdate=codecs.open(os.path.join(os.getcwd(), "update.py"), 'w',
'UTF-8')
if self.schemaTables:
f=codecs.open(os.path.join(os.getcwd(), "schema.py"), 'w',
'UTF-8')
f.write(self.schemaPreface.getvalue()+u"\n"+self.schemaTables.getvalue()+
u"if __name__ == '__main__':\n" +
self.schemaTablesCreate.getvalue() +
u"\n engine = create_engine('sqlite:///tutorial.db',
echo=False)\n" +
u" metadata = Base.metadata\n" +
u" metadata.create_all(engine)\n")
f.close()
if self.tables:
fupdate.write(self.tables.getvalue())
fupdate.close()
def do_TABLES(self, tableNode):
"""Process schema for tables"""
for node in tableNode.childNodes:
self.tableName = node.tagName
# Define a declaritive mapping class
self.schemaTables.write("""\nclass %s(Base):
__tablename__ = '%s'
""" % (self.tableName, self.tableName))
self.schemaTablesCreate.write("\n tableDef =
"+self.tableName+"()")
self.schemaFields = StringIO()
# allow for userA = users("Billy","Bob") via a __init__()
self.schemaInitPreface = StringIO()
self.schemaInitPreface.write(" def __init__(self")
self.schemaInitBody = StringIO()
self.parseBranch(node)
self.schemaInitPreface.write("):\n")
self.schemaTables.write(self.schemaFields.getvalue() + "\n" + \
self.schemaInitPreface.getvalue() + \
self.schemaInitBody.getvalue() + "\n")
# Need a way to execute schema so that the tables will be created.
def do_FIELDS(self, fieldsNode):
"""Process schema for fields within tables"""
for node in fieldsNode.childNodes:
if self.schemaFields:
self.schemaFields.write("\n")
cType = ""
initType = '""'
# The attribute type holds the type of field
crType = node.attributes["type"].value
if crType==u"C":
cType = "String(length=%s)" % node.attributes["len"].value
initType = '""'
elif crType==u"N" and node.attributes["dec"].value==u'0':
cType = "Integer"
initType = '0'
elif crType==u"N":
cType = "Numeric(precision=%s, scale=%s)" %
(node.attributes["len"].value,node.attributes["dec"].value)
initType = '0'
elif crType==u"L":
cType = "Boolean"
initType = 'True'
elif crType==u"T":
cType = "DateTime"
initType = "datetime.strptime('','')"
elif crType==u"D":
cType = "Date"
initType = "datetime.strptime('','')"
elif crType==u"M" or crType==u"G":
cType = "Text"
initType = '""'
if node.attributes.getNamedItem("primary"):
cType += ", primary_key=True"
self.schemaFields.write(" %s = Column(%s)" % (node.tagName,
cType))
self.schemaInitPreface.write(", \\\n %s=%s" %
(node.tagName, initType))
self.schemaInitBody.write(" self.%s = %s\n" %
(node.tagName, node.tagName))
self.tableDict[self.tableName + "." + node.tagName] = crType
def do_DATA(self, dataNode):
"""This is for processing actual data to be pushed into the tables
Layout is DATA -> TABLE_NAME key='primary_field' -> TUPLE ->
FIELD_NAME -> VALUE"""
# This is the schema program created in the TABLES branch
import schema
engine = create_engine('sqlite:///tutorial.db', echo=False)
Session = sessionmaker(bind=engine)
self.session = Session()
for node in dataNode.childNodes:
self.tableName = node.tagName
self.keyValue = ""
self.keyField = node.attributes["key"].value
self.parseBranch(node)
#self.tables.write("\nimport update_%s" % (self.tableName))
self.fDataUpdate.close()
self.session.commit()
self.session.close()
def do_TUPLE(self, tupleNode):
""" A TUPLE is what the XML file refers to a table row.
Sits below a DATA->table_name branch """
import schema
colValues = {}
TI = eval("schema."+self.tableName+"()")
exists = None
for node in tupleNode.childNodes:
for dataNode in node.childNodes:
crType = self.tableDict[self.tableName + "." + node.tagName]
if crType==u"C" or crType==u"M":
cValue = dataNode.data
elif crType==u"T":
cValue = datetime.strptime(dataNode.data, "%Y-%m-%d
%H:%M")
elif crType==u"D":
cValue = datetime.strptime(dataNode.data, "%Y-%m-%d")
else:
cValue = dataNode.data
if node.tagName == self.keyField:
#### Problem with SQLalchemy.
exists =
self.session.query(TI).filter(getattr(TI,self.keyField)==cValue)
# Set the value onto a list to be processed below
colValues[node.tagName] = cValue
if exists is None:
return
if exists.count == 0:
entry = TI
else:
entry = exists[0]
for col_name in colValues:
setattr(entry, col_name, colValues[col_name])
if exists.count == 0:
# Insert
self.session.add(TI)
if __name__ == '__main__':
replicate = reptorParsing()
replicate.process(filename=os.path.join(os.getcwd(), "request.xml"))
del replicate