Splitting a list of strings

Wed Sep 18 21:37:55 EDT 2002

Okay, then. For those still interested, here are several implementations of
sieving, based on people's suggestions, plus a sample time trial. The trial
is run on a file with over 8000 lines to process.

class ARFF:
    """Holds the lines of a *.arff file and offers several ways to sieve them.

    Every ``sieve*`` method classifies a line by its first character:
    ``@`` marks an attribute line, ``%`` marks a comment line, and anything
    else is treated as data.  All variants discard the comments and drop the
    first and the last attribute line; they differ only in implementation
    and in the container they return.
    """

    def __init__(self, filename):
        """Read every line of *filename* into memory."""
        handle = open(filename)
        try:
            self.__lines = handle.readlines()
        finally:
            # Release the file handle even if reading fails.
            handle.close()

    # used by sieve2()
    def __classify(self, char):
        """Return the classification key for attributes, comments, and data."""
        if char == "@":
            return "attr"
        elif char == "%":
            return "comm"
        else:
            return "data"

    # based on suggestion by Emile van Sebille
    def sieve(self):
        """Sort lines by kind and return the list ``[data, attributes]``."""
        results = [[], [], []]  # [data, attributes, comments]
        for line in self.__lines:
            # "@%".find() yields 0 for "@", 1 for "%" and -1 for anything
            # else, so adding 1 maps data -> 0, attributes -> 1, comments -> 2.
            results["@%".find(line[0]) + 1].append(line)
        # discard first and last attribute
        results[1] = results[1][1:-1]
        return results[:-1]  # discard comments

    # based on suggestion by Alex Martelli
    def sieve2(self):
        """Sort lines by kind and return a dict with keys "data" and "attr".

        Both keys are always present, even when the input holds no line of
        that kind, so the result has the same shape for every input.
        """
        results = {}
        for line in self.__lines:
            # classify line as attribute, comment, or data by testing the
            # first char
            results.setdefault(self.__classify(line[0]), []).append(line)
        # discard first and last attribute; tolerate input with no attributes
        results["attr"] = results.get("attr", [])[1:-1]
        results.setdefault("data", [])
        # discard comments; tolerate input with no comments
        if "comm" in results:
            del results["comm"]
        return results

    # my original solution augmented with suggestion by Anthony Tuininga
    def sieve3(self):
        """Sort lines by kind and return the tuple ``(data, attributes)``."""
        attributes = [line for line in self.__lines if line[0] == "@"]
        data = [line for line in self.__lines if line[0] not in ("%", "@")]
        return data, attributes[1:-1]

    # using filters and lambda
    def sieve4(self):
        """Sort lines by kind and return the tuple ``(data, attributes)``."""
        # list() guarantees a sliceable sequence even where filter() returns
        # a lazy iterator instead of a list.
        attributes = list(filter(lambda line: line[0] == "@", self.__lines))
        data = list(filter(lambda line: line[0] not in ("%", "@"),
                           self.__lines))
        return data, attributes[1:-1]

    # Emile's idea again, with a list comprehension
    def sieve5(self):
        """Sort lines by kind and return the list ``[data, attributes]``."""
        results = [[], [], []]  # [data, attributes, comments]
        # The comprehension is run purely for its append() side effect; the
        # list of None values it builds is thrown away.
        [results["@%".find(line[0]) + 1].append(line)
         for line in self.__lines]
        # discard first and last attribute
        results[1] = results[1][1:-1]
        return results[:-1]  # discard comments

    # based on Mark McEachern's suggestion
    def sieve6(self):
        """Sort lines by kind and return the tuple ``(data, attributes)``.

        Unlike the other variants, each line is stripped of surrounding
        whitespace first and blank lines are skipped.
        """
        attribute = "@"
        comment = "%"
        # Two distinct lists: a chained ``attributes = data = []`` would bind
        # both names to the same list and mix the two kinds of line.
        attributes = []
        data = []
        for line in self.__lines:
            line = line.strip()
            if not line or line.startswith(comment):
                continue
            elif line.startswith(attribute):
                attributes.append(line)
            else:
                data.append(line)
        return data, attributes[1:-1]
if __name__ == "__main__":
    # Time trial: run each sieve variant once on the same file and report
    # the wall-clock seconds each call took.
    import time
    arff = ARFF('./mushroom.arff')

    # sieve(): index-based sorting into a list of lists
    start = time.time()
    data, attr = arff.sieve()
    elapsed = time.time() - start
    print("sieve time: %r seconds" % (elapsed,))

    # sieve2(): dictionary-based sorting
    start = time.time()
    result = arff.sieve2()
    elapsed = time.time() - start
    print("sieve2 time: %r seconds" % (elapsed,))

    # sieve3(): two list comprehensions
    start = time.time()
    data2, attr2 = arff.sieve3()
    elapsed = time.time() - start
    print("sieve3 time: %r seconds" % (elapsed,))

    # sieve4(): filter() with lambdas
    start = time.time()
    data3, attr3 = arff.sieve4()
    elapsed = time.time() - start
    print("sieve4 time: %r seconds" % (elapsed,))

    # sieve5(): index-based sorting inside a list comprehension
    start = time.time()
    data4, attr4 = arff.sieve5()
    elapsed = time.time() - start
    print("sieve5 time: %r seconds" % (elapsed,))

    # sieve6(): strip() and startswith() in an explicit loop
    start = time.time()
    data5, attr5 = arff.sieve6()
    elapsed = time.time() - start
    print("sieve6 time: %r seconds" % (elapsed,))

====================================================
Trial Time:
sieve time: 0.05000007152557373 seconds
sieve2 time: 0.12999999523162842 seconds
sieve3 time: 0.069999933242797852 seconds
sieve4 time: 0.081000089645385742 seconds
sieve5 time: 0.069999933242797852 seconds
sieve6 time: 0.20000004768371582 seconds

In case you're curious, I'm probably just going to use sieve3().
It's not much slower than sieve(), and it's very easy to understand what's
happening without a lot of commenting.

OK, so, there you are. Thanks to everyone for your time and suggestions,
Sean Ross