Program inefficiency?

hall.jeff at gmail.com hall.jeff at gmail.com
Sat Sep 29 17:22:27 CEST 2007


I wrote the following simple program to loop through our help files
and fix some errors (in case you can't see the subtle RE search that's
happening, we're replacing spaces in bookmarks with _'s)

the program works great except for one thing. It's significantly
slower through the later files in the search then through the early
ones... Before anyone criticizes, I recognize that that middle section
could be simplified with a for loop... I just haven't cleaned it
up...

The problem is that the first 300 files take about 10-15 seconds and
the last 300 take about 2 minutes... If we do more than about 1500
files in one run, it just hangs up and never finishes...

Is there a solution here that I'm missing? What am I doing that is so
inefficient?

# File: masseditor.py

import re
import os
import time

def massreplace():
    editfile = open("pathname\editfile.txt")
    filestring = editfile.read()
    filelist = filestring.splitlines()
##    errorcheck = re.compile('(a name=)+(.*)(-)+(.*)(></a>)+')
    for i in range(len(filelist)):
        source = open(filelist[i])
        starttext = source.read()
        interimtext = replacecycle(starttext)
        interimtext = replacecycle(interimtext)
        interimtext = replacecycle(interimtext)
        interimtext = replacecycle(interimtext)
        interimtext = replacecycle(interimtext)
        interimtext = replacecycle(interimtext)
        interimtext = replacecycle(interimtext)
        interimtext = replacecycle(interimtext)
        interimtext = replacecycle(interimtext)
        interimtext = replacecycle(interimtext)
        interimtext = replacecycle(interimtext)
        interimtext = replacecycle(interimtext)
        finaltext = replacecycle(interimtext)
        source.close()
        source = open(filelist[i],"w")
        source.write(finaltext)
        source.close()
##        if errorcheck.findall(finaltext)!=[]:
##            print errorcheck.findall(finaltext)
##            print filelist[i]
        if i == 100:
            print "done 100"
            print time.clock()
        elif i == 300:
            print "done 300"
            print time.clock()
        elif i == 600:
            print "done 600"
            print time.clock()
        elif i == 1000:
            print "done 1000"
            print time.clock()
    print "done"
    print i
    print time.clock()

def replacecycle(starttext):
    p1= re.compile('(href=|HREF=)+(.*)(#)+(.*)( )+(.*)(">)+')
    p2= re.compile('(name=")+(.*)( )+(.*)(">)+')
    p3= re.compile('(href=|HREF=)+(.*)(#)+(.*)(\')+(.*)(">)+')
    p4= re.compile('(name=")+(.*)(\')+(.*)(">)+')
    p5= re.compile('(href=|HREF=)+(.*)(#)+(.*)(-)+(.*)(">)+')
    p6= re.compile('(name=")+(.*)(-)+(.*)(">)+')
    p7= re.compile('(href=|HREF=)+(.*)(#)+(.*)(<)+(.*)(">)+')
    p8= re.compile('(name=")+(.*)(<)+(.*)(">)+')
    p7= re.compile('(href=|HREF=")+(.*)(#)+(.*)(:)+(.*)(">)+')
    p8= re.compile('(name=")+(.*)(:)+(.*)(">)+')
    p9= re.compile('(href=|HREF=")+(.*)(#)+(.*)(\?)+(.*)(">)+')
    p10= re.compile('(name=")+(.*)(\?)+(.*)(">)+')
    p100= re.compile('(a name=)+(.*)(-)+(.*)(></a>)+')
    q1= r"\1\2\3\4_\6\7"
    q2= r"\1\2_\4\5"
    interimtext = p1.sub(q1, starttext)
    interimtext = p2.sub(q2, interimtext)
    interimtext = p3.sub(q1, interimtext)
    interimtext = p4.sub(q2, interimtext)
    interimtext = p5.sub(q1, interimtext)
    interimtext = p6.sub(q2, interimtext)
    interimtext = p7.sub(q1, interimtext)
    interimtext = p8.sub(q2, interimtext)
    interimtext = p9.sub(q1, interimtext)
    interimtext = p10.sub(q2, interimtext)
    interimtext = p100.sub(q2, interimtext)

    return interimtext

massreplace()




More information about the Python-list mailing list