Splitting a list of strings
Sean Ross
sross at connectmail.carleton.ca
Wed Sep 18 21:37:55 EDT 2002
Okay, then. For those still interested, here are several implementations of
sieving, based on people's suggestions, plus a sample time trial. The trial
is on a file with over 8000 lines to process.
class ARFF:
    """Holds the lines of a *.arff file and offers several ways to sieve them.

    Every ``sieve*`` method separates the stored lines into data lines and
    attribute lines (first character "@"), drops comment lines (first
    character "%"), and discards the first and last attribute line.  The
    methods differ in technique and in the shape of the value returned.
    """

    def __init__(self, filename):
        """Read every line of *filename* into memory.

        The lines are stored exactly as ``readlines()`` returns them, i.e.
        with their line endings still attached.
        """
        handle = open(filename)
        try:
            self.__lines = handle.readlines()
        finally:
            # Release the handle even when reading fails.
            handle.close()

    # used by sieve2()
    def __classify(self, char):
        """Return the bucket key for a line whose first character is *char*.

        "@" maps to "attr", "%" maps to "comm", anything else to "data".
        """
        if char == "@":
            return "attr"
        elif char == "%":
            return "comm"
        else:
            return "data"

    # based on suggestion by Emile van Sebille
    def sieve(self):
        """Sort lines into data and attributes, discarding comments.

        Returns a two-item list ``[data, attributes]``.  Lines are kept
        unmodified; a blank line ("\\n") counts as data.
        """
        results = [[], [], []]  # [data, attributes, comments]
        for line in self.__lines:
            # "@%".find() gives 0 for "@", 1 for "%" and -1 for anything
            # else, so adding 1 yields the bucket index directly.
            results["@%".find(line[0]) + 1].append(line)
        # discard first and last attribute
        results[1] = results[1][1:-1]
        return results[:-1]  # discard comments

    # based on suggestion by Alex Martelli
    def sieve2(self):
        """Sort lines into a dict of data and attributes, discarding comments.

        Returns a dict that always has an "attr" key and has a "data" key
        when at least one data line was seen.  Lines are kept unmodified.
        """
        results = {}
        for line in self.__lines:
            # classify line as attribute, comment, or data by testing the
            # first char
            results.setdefault(self.__classify(line[0]), []).append(line)
        # discard first and last attribute; tolerate input without any
        results["attr"] = results.get("attr", [])[1:-1]
        # discard comments; tolerate input without any
        if "comm" in results:
            del results["comm"]
        return results

    # my original solution augmented with suggestion by Anthony Tuininga
    def sieve3(self):
        """Sort lines into data and attributes, ignoring comments.

        Returns a tuple ``(data, attributes)`` built with two passes over
        the stored lines.  Lines are kept unmodified.
        """
        attributes = [line for line in self.__lines if line[0] == "@"]
        data = [line for line in self.__lines if line[0] not in ("%", "@")]
        return data, attributes[1:-1]

    # using filters and lambda
    def sieve4(self):
        """Sort lines into data and attributes, ignoring comments.

        Returns a tuple ``(data, attributes)`` of lists.  Lines are kept
        unmodified.
        """
        # list() keeps the results sliceable where filter() returns an
        # iterator instead of a list.
        attributes = list(filter(lambda line: line[0] == "@", self.__lines))
        data = list(
            filter(lambda line: line[0] not in ("%", "@"), self.__lines)
        )
        return data, attributes[1:-1]

    # Emile's idea again, with a list comprehension
    def sieve5(self):
        """Sort lines into data and attributes, discarding comments.

        Same bucketing as ``sieve()``; returns a two-item list
        ``[data, attributes]``.
        """
        results = [[], [], []]  # [data, attributes, comments]
        # The comprehension is run only for its append() side effects; the
        # list it builds is thrown away.
        [results["@%".find(line[0]) + 1].append(line)
         for line in self.__lines]
        # discard first and last attribute
        results[1] = results[1][1:-1]
        return results[:-1]  # discard comments

    # based on Mark McEachern's suggestion
    def sieve6(self):
        """Sort stripped lines into data and attributes, skipping comments.

        Unlike the other variants, each line is stripped of surrounding
        whitespace first and blank lines are skipped.  Returns a tuple
        ``(data, attributes)``.
        """
        attribute = "@"
        comment = "%"
        # Two separate lists: a chained ``attributes = data = []`` would
        # bind both names to one list and mix the two kinds of line.
        attributes = []
        data = []
        for line in self.__lines:
            line = line.strip()
            if not line or line.startswith(comment):
                continue
            elif line.startswith(attribute):
                attributes.append(line)
            else:
                data.append(line)
        return data, attributes[1:-1]
if __name__ == "__main__":
    import time

    # Time one run of each sieving strategy against the same loaded file.
    arff = ARFF('./mushroom.arff')
    for name in ("sieve", "sieve2", "sieve3", "sieve4", "sieve5", "sieve6"):
        method = getattr(arff, name)
        t0 = time.time()
        method()  # result is not needed; only the elapsed time is reported
        t = time.time() - t0
        # Single-argument call form prints the same text whether print is
        # a statement or a function.
        print("%s time: %r seconds" % (name, t))
====================================================
Trial Time:
sieve time: 0.05000007152557373 seconds
sieve2 time: 0.12999999523162842 seconds
sieve3 time: 0.069999933242797852 seconds
sieve4 time: 0.081000089645385742 seconds
sieve5 time: 0.069999933242797852 seconds
sieve6 time: 0.20000004768371582 seconds
In case you're curious, I'm probably just going to use sieve3().
It's not much slower than sieve(), and it's very easy to understand what's
happening without a lot of commenting.
OK, so, there you are. Thanks to everyone for your time and suggestions,
Sean Ross
More information about the Python-list
mailing list