The whole story

Mon Apr 27 21:52:29 EDT 2009

Hi Andrew,
The reason I am using mapped objects is that I need to abstract from the 
database implementation allowing the replication to target a number of 
different platforms. This will definitely slow things down.
> process a whole pile in memory and then (perhaps every 10,000 - when your 
> memory is about to run out and start paging) flush the session.
Under windows how can I tell when memory is about to run out? I guess there 
is no cross-platform solution to this.
Writing external files has all come about from a post titled "Memory 
problems (garbage collection) by Carbon Man" which I never got a resolution 
to.
I was trying to execute gc.collect() when a process was complete because I 
was having huge problems with memory (though the problem still remains). If 
I stop at "import schema" There are 2524104 objects processed by 
gc.collect()

There is a new issue in the code (marked with #### Problem with SQLalchemy), 
but here is the program as it now stands:

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Reads an XML file and creates schema.py from the TABLES branch (very 
simplistic table structures)
# Processes the nodes within the DATA branch, into SQLalchemy
from xml.dom import minidom
import os
import codecs
from cStringIO import StringIO
import time
from datetime import *
from sqlalchemy import *
from sqlalchemy.orm import *

class xmlProcessing:
    """ General class for XML processing"""

    def process(self, filename="", xmlString=""):
        if xmlString:
            pass
        elif filename:
            xmldoc = minidom.parse(filename)
        self.parse( xmldoc.documentElement )

    def parseBranch(self, parentNode):
        """ Process an XML branch """
        for node in parentNode.childNodes:
            try:
                parseMethod = getattr(self, "parse_%s" % 
node.__class__.__name__)
            except AttributeError:
                continue
            if parseMethod(node):
                continue
            self.parseBranch(node)

    def parse_Document(self, node):
        pass

    def parse_Text(self, node):
        pass

    def parse_Comment(self, node):
        pass

    def parse_Element(self, node):
        try:
            handlerMethod = getattr(self, "do_%s" % node.tagName)
        except AttributeError:
            return False
        handlerMethod(node)
        return True

class reptorParsing(xmlProcessing):
    """ Specific class for generating a SQLalchemy program to create tables
    and populate them with data"""

    def __init__(self):
        self.schemaPreface = StringIO()
        self.schemaPreface.write("""from sqlalchemy import *
import time
from datetime import *
from sqlalchemy.ext.declarative import declarative_base
#engine = create_engine('sqlite:///tutorial.db', echo=False)
#metadata = MetaData()
Base = declarative_base()""")
        self.schemaTables = StringIO()
        self.schemaTablesCreate = StringIO()
        self.schemaFields = StringIO()
        self.tableDict = {}
        self.tableName = StringIO()
        self.tables = StringIO()

    def parse(self, parentNode):
        """Main entry point to begin processing a XML document"""
        self.parseBranch(parentNode)
        # Properties such as schemaTables and .tables are populated by the 
various methods below
        fupdate=codecs.open(os.path.join(os.getcwd(), "update.py"), 'w', 
'UTF-8')
        if self.schemaTables:
            f=codecs.open(os.path.join(os.getcwd(), "schema.py"), 'w', 
'UTF-8')
            f.write(self.schemaPreface.getvalue()+u"\n"+self.schemaTables.getvalue()+
                u"if __name__ == '__main__':\n" +
                self.schemaTablesCreate.getvalue() +
                u"\n    engine = create_engine('sqlite:///tutorial.db', 
echo=False)\n" +
                u"    metadata = Base.metadata\n" +
                u"    metadata.create_all(engine)\n")
            f.close()
        if self.tables:
            fupdate.write(self.tables.getvalue())
        fupdate.close()

    def do_TABLES(self, tableNode):
        """Process schema for tables"""
        for node in tableNode.childNodes:
            self.tableName = node.tagName
            # Define a declaritive mapping class
            self.schemaTables.write("""\nclass %s(Base):
    __tablename__ = '%s'
""" % (self.tableName, self.tableName))
            self.schemaTablesCreate.write("\n    tableDef = 
"+self.tableName+"()")
            self.schemaFields = StringIO()
            # allow for userA = users("Billy","Bob") via a __init__()
            self.schemaInitPreface = StringIO()
            self.schemaInitPreface.write("    def __init__(self")
            self.schemaInitBody = StringIO()
            self.parseBranch(node)
            self.schemaInitPreface.write("):\n")
            self.schemaTables.write(self.schemaFields.getvalue() + "\n" + \
                self.schemaInitPreface.getvalue() + \
                self.schemaInitBody.getvalue() + "\n")

        # Need a way to execute schema so that the tables will be created.

    def do_FIELDS(self, fieldsNode):
        """Process schema for fields within tables"""
        for node in fieldsNode.childNodes:
            if self.schemaFields:
                self.schemaFields.write("\n")
            cType = ""
            initType = '""'
            # The attribute type holds the type of field
            crType = node.attributes["type"].value
            if crType==u"C":
                cType = "String(length=%s)" % node.attributes["len"].value
                initType = '""'
            elif crType==u"N" and node.attributes["dec"].value==u'0':
                cType = "Integer"
                initType = '0'
            elif crType==u"N":
                cType = "Numeric(precision=%s, scale=%s)" % 
(node.attributes["len"].value,node.attributes["dec"].value)
                initType = '0'
            elif crType==u"L":
                cType = "Boolean"
                initType = 'True'
            elif crType==u"T":
                cType = "DateTime"
                initType = "datetime.strptime('','')"
            elif crType==u"D":
                cType = "Date"
                initType = "datetime.strptime('','')"
            elif crType==u"M" or crType==u"G":
                cType = "Text"
                initType = '""'

            if node.attributes.getNamedItem("primary"):
                cType += ", primary_key=True"
            self.schemaFields.write("    %s = Column(%s)" % (node.tagName, 
cType))
            self.schemaInitPreface.write(", \\\n        %s=%s" % 
(node.tagName, initType))
            self.schemaInitBody.write("            self.%s = %s\n" % 
(node.tagName, node.tagName))
            self.tableDict[self.tableName + "." + node.tagName] = crType

    def do_DATA(self, dataNode):
        """This is for processing actual data to be pushed into the tables

        Layout is DATA -> TABLE_NAME key='primary_field' -> TUPLE -> 
FIELD_NAME -> VALUE"""
        # This is the schema program created in the TABLES branch
        import schema
        engine = create_engine('sqlite:///tutorial.db', echo=False)
        Session = sessionmaker(bind=engine)
        self.session = Session()

        for node in dataNode.childNodes:
            self.tableName = node.tagName

            self.keyValue = ""
            self.keyField = node.attributes["key"].value
            self.parseBranch(node)
            #self.tables.write("\nimport update_%s" % (self.tableName))
            self.fDataUpdate.close()
            self.session.commit()

        self.session.close()

    def do_TUPLE(self, tupleNode):
        """ A TUPLE is what the XML file refers to a table row.
        Sits below a DATA->table_name branch """

        import schema
        colValues = {}
        TI = eval("schema."+self.tableName+"()")
        exists = None
        for node in tupleNode.childNodes:
            for dataNode in node.childNodes:
                crType = self.tableDict[self.tableName + "." + node.tagName]

                if crType==u"C" or crType==u"M":
                    cValue = dataNode.data
                elif crType==u"T":
                    cValue = datetime.strptime(dataNode.data, "%Y-%m-%d 
%H:%M")
                elif crType==u"D":
                    cValue = datetime.strptime(dataNode.data, "%Y-%m-%d")
                else:
                    cValue = dataNode.data
                if node.tagName == self.keyField:
                    #### Problem with SQLalchemy.
                    exists = 
self.session.query(TI).filter(getattr(TI,self.keyField)==cValue)
                # Set the value onto a list to be processed below
                colValues[node.tagName] = cValue

        if exists is None:
            return

        if exists.count == 0:
            entry = TI
        else:
            entry = exists[0]

        for col_name in colValues:
            setattr(entry, col_name, colValues[col_name])
        if exists.count == 0:
            # Insert
            self.session.add(TI)

if __name__ == '__main__':
    replicate = reptorParsing()
    replicate.process(filename=os.path.join(os.getcwd(), "request.xml"))
    del replicate