Memory problems (garbage collection)

Thu Apr 23 02:50:23 EDT 2009

Very new to Python, running 2.5 on windows.
I am processing an XML file (7.2MB). Using the standard library I am 
recursively processing each node and parsing it. The branches don't go 
particularly deep. What is happening is that the program is running really 
really slowly, so slow that even running it over night, it still doesn't 
finish.
Stepping through it I have noticed that memory usage has shot up from 190MB 
to 624MB and continues to climb. If I set a break point and then stop the 
program the memory is not released. It is not until I shutdown PythonWin 
that the memory gets released.
I thought this might mean objects were not getting GCed, so through the 
interactive window I imported gc. gc.garbage is empty. gc.collect() seems to 
fix the problem (after much thinking) and reports 2524104. Running it again 
returns 0.
I thought that garbage collection was automatic, if I use variables in a 
method do I have to del them?
I tried putting a "del node" in all my for node in .... loops but that 
didn't help. collect() reports the same number. Tried putting gc.collect() 
at the end of the loops but that didn't help either.
If I have the program at a break and do gc.collect() it doesn't fix it, so 
whatever referencing is causing problems is still active.
My program is parsing the XML and generating a Python program for 
SQLalchemy, but the program never gets a chance to run the memory problem is 
prior to that. It probably has something to do with the way I am string 
building.

My apologies for the long post but without being able to see the code I 
doubt anyone can give me a solid answer so here it goes (sorry for the lack 
of comments):

from xml.dom import minidom
import os
import gc

class xmlProcessing:
    """ General class for XML processing"""

    def process(self, filename="", xmlString=""):
        if xmlString:
            pass
        elif filename:
            xmldoc = minidom.parse(filename)
        self.parse( xmldoc.documentElement )

    def parseBranch(self, parentNode):
        """ Process an XML branch """
        for node in parentNode.childNodes:
            try:
                parseMethod = getattr(self, "parse_%s" % 
node.__class__.__name__)
            except AttributeError:
                continue
            if parseMethod(node):
                continue
            self.parseBranch(node)
            del node

    def parse_Document(self, node):
        pass

    def parse_Text(self, node):
        pass

    def parse_Comment(self, node):
        pass

    def parse_Element(self, node):
        try:
            handlerMethod = getattr(self, "do_%s" % node.tagName)
        except AttributeError:
            return False
        handlerMethod(node)
        return True

class reptorParsing(xmlProcessing):
    """ Specific class for generating a SQLalchemy program to create tables
    and populate them with data"""

    def __init__(self):
        self.schemaPreface = """from sqlalchemy import *
from sqlalchemy.ext.declarative import declarative_base
engine = create_engine('sqlite:///tutorial.db', echo=False)
metadata = MetaData()
Base = declarative_base()"""
        self.schemaTables = ""
        self.schemaFields = ""
        self.dataUpdate = ""
        self.tableDict = {}
        self.tableName = ""
        self.tables = ""

    def parse(self, parentNode):
        """Main entry point to begin processing a XML document"""
        self.parseBranch(parentNode)
        # Properties such as schemaTables and .tables are populated by the 
various methods below
        fupdate=open(os.path.join(os.getcwd(), "update.py"), 'w')
        if self.schemaTables:
            fupdate.write("import schema\n")
            f=open(os.path.join(os.getcwd(), "schema.py"), 'w')
            f.write(self.schemaPreface+"\n"+self.schemaTables+
                '\n' + "metadata.create_all(engine)\n"+
                "print 'hello 2'")
            f.close()
        if self.tables:
            fupdate.write(self.tables)
#            f=open(os.path.join(os.getcwd(), "dataUpdate.py"), 'w')
#            f.write(self.dataUpdate)
#            f.close()
        fupdate.close()

    def do_TABLES(self, tableNode):
        """Process schema for tables"""
        for node in tableNode.childNodes:
            self.tableName = node.tagName
            # Define a declaritive mapping class
            self.schemaTables += """\nclass %s(Base):
    __tablename__ = '%s'
""" % (self.tableName, self.tableName)
            self.schemaFields = ""
            # allow for userA = users("Billy","Bob") via a __init__()
            self.schemaInitPreface = "    def __init__(self"
            self.schemaInitBody = ""
            self.parseBranch(node)
            self.schemaInitPreface += "):\n"
            self.schemaTables += self.schemaFields + "\n" + \
                self.schemaInitPreface + \
                self.schemaInitBody + "\n"
        gc.collect()

    def do_FIELDS(self, fieldsNode):
        """Process schema for fields within tables"""
        for node in fieldsNode.childNodes:
            if self.schemaFields:
                self.schemaFields += "\n"
            cType = ""
            # The attribute type holds the type of field
            crType = node.attributes["type"].value
            if crType==u"C":
                cType = "String(length=%s)" % node.attributes["len"].value
            elif crType==u"N" and node.attributes["dec"].value==u'0':
                cType = "Integer"
            elif crType==u"N":
                cType = "Numeric(precision=%s, scale=%s)" % 
(node.attributes["len"].value,node.attributes["dec"].value)
            elif crType==u"L":
                cType = "Boolean"
            elif crType==u"T":
                cType = "DateTime"
            elif crType==u"D":
                cType = "Date"
            elif crType==u"M" or crType==u"G":
                cType = "Text"

            if node.attributes.getNamedItem("primary"):
                cType += ", primary_key=True"
            self.schemaFields += "    %s = Column(%s)" % (node.tagName, 
cType)
            self.schemaInitPreface += ", \\\n        %s" % (node.tagName)
            self.schemaInitBody += "            self.%s = %s\n" % 
(node.tagName, node.tagName)
            self.tableDict[self.tableName + "." + node.tagName] = crType
            del node

    def do_DATA(self, dataNode):
        """This is for processing actual data to be pushed into the tables

        Layout is DATA -> TABLE_NAME key='primary_field' -> TUPLE -> 
FIELD_NAME -> VALUE"""
        for node in dataNode.childNodes:
            self.dataUpdate = """
import time
from datetime import *
from sqlalchemy import *
from sqlalchemy.orm import *
engine = create_engine('sqlite:///tutorial.db', echo=False)
Session = sessionmaker()
Session.configure(bind=engine)
session = Session()
"""
            self.keyValue = ""
            self.keyField = node.attributes["key"].value
            self.tableName = node.tagName
            self.parseBranch(node)
            self.tables += "\nimport %s_update.py" % (self.tableName)
            f=open(os.path.join(os.getcwd(), self.tableName + "_update.py"), 
'w')
            f.write(self.dataUpdate)
            f.close()
            gc.collect()

    def do_TUPLE(self, tupleNode):
        """ A TUPLE is what the XML file refers to a table row
        Sits below a DATA child"""
        self.dataUpdate += """
entry = %s()
session.add(entry)
""" % (self.tableName)
        for node in tupleNode.childNodes:
            for dataNode in node.childNodes:
                crType = self.tableDict[self.tableName + "." + node.tagName]

                if crType==u"C" or crType==u"M":
                    cValue = '"""%s"""' % dataNode.data
                elif crType==u"T":
                    cValue = 'datetime.strptime("'+dataNode.data+'", 
"%Y-%m-%d %H:%M")'
                elif crType==u"D":
                    cValue = 'datetime.strptime("'+dataNode.data+'", 
"%Y-%m-%d")'
                else:
                    cValue = dataNode.data

                self.dataUpdate += "\nentry.%s = %s" % (node.tagName, 
cValue)
                del dataNode

        self.dataUpdate += "\nsession.commit()"
        del node

if __name__ == '__main__':
    replicate = reptorParsing()
    replicate.process(filename=os.path.join(os.getcwd(), "request.xml"))
    import update