help make it faster please
Ron Adam
rrr at ronadam.com
Sun Nov 13 13:27:13 EST 2005
Fredrik Lundh wrote:
> Lonnie Princehouse wrote:
>
>
>>"[a-z0-9_]" means "match a single character from the set {a through z,
>>0 through 9, underscore}".
>
>
> "\w" should be a bit faster; it's equivalent to "[a-zA-Z0-9_]" (unless you
> specify otherwise using the locale or unicode flags), but is handled more
> efficiently by the RE engine.
>
> (you can use \w in sets too, so you can do e.g. "[\w@]")
>
> </F>
The \w does make a small difference, but not as much as I expected.
Surprisingly a standard Python word iterator works just as well, and is
easier to understand than the re version.
Which one is faster depends on the average word length and number of
ignored characters.
Cheers,
Ron
Character count: 100000
Word count: 16477
Average word size: 6.06906597075
word_counter: 0.06820057 (best of 3)
count_words: 0.07333837 (best of 3)
#########
import string
import re
import time
import random
# Create a really ugly n length string to test with.
n = 100000
random.seed(1)
lines = ''.join([ random.choice(string.ascii_letters * 2
+ '_@$&*()#/<>' + ' \n' * 6) for x in range(n) ])
print 'Character count:', n
print 'Word count:', len(lines.split())
print 'Average word size:', float(n)/len(lines.split())
letters = string.lowercase + string.digits + '_@'
def word_iter(text, letters=letters):
ltrs=letters.lower()
wd = ''
for c in text + ' ':
if c in ltrs:
wd += c
elif wd != '':
yield wd
wd = ''
def word_counter(text):
txt = text.lower()
countDict={}
for wd in word_iter(txt):
if wd in countDict:
countDict[wd] += 1
else:
countDict[wd] = 1
return countDict
word_finder = re.compile('[\w@]+', re.I)
def count_words(string, word_finder = word_finder):
# avoid global lookups
countDict = {}
for match in word_finder.finditer(string.lower()):
word = match.group(0)
countDict[word] = countDict.get(word,0) + 1
return countDict
foos = [word_counter, count_words]
r1 = r2 = None
for foo in foos:
best_time = 0
for n in range(3):
t = time.clock()
for line in lines.splitlines():
countDict = foo(line)
tt = time.clock()-t
if tt > best_time: best_time = tt
r1 = r2
r2 = countDict
if r1 != None:
# change to 1 if assert fails to find problem
if 0:
for k in r1.keys():
if r1[k] != r2[k]:
print k,r1[k],r2[k]
assert r1 == r2
print '%s: %.8f (best of %d)' \
% (foo.__name__, best_time, n+1)
More information about the Python-list
mailing list