help make it faster please

Sun Nov 13 13:27:13 EST 2005

Fredrik Lundh wrote:
> Lonnie Princehouse wrote:
> 
> 
>>"[a-z0-9_]" means "match a single character from the set {a through z,
>>0 through 9, underscore}".
> 
> 
> "\w" should be a bit faster; it's equivalent to "[a-zA-Z0-9_]" (unless you
> specify otherwise using the locale or unicode flags), but is handled more
> efficiently by the RE engine.
> 
> (you can use \w in sets too, so you can do e.g. "[\w@]")
> 
> </F>

The \w does make a small difference, but not as much as I expected. 
Surprisingly a standard Python word iterator works just as well, and is 
easier to understand than the re version.

Which one is faster depends on the average word length and number of 
ignored characters.

Cheers,
    Ron

Character count: 100000
Word count: 16477
Average word size: 6.06906597075
word_counter: 0.06820057 (best of 3)
count_words: 0.07333837 (best of 3)

#########

import string
import re
import time
import random

# Create a really ugly n length string to test with.
n = 100000
random.seed(1)
lines = ''.join([ random.choice(string.ascii_letters * 2
                   + '_@$&*()#/<>' + '    \n' * 6) for x in range(n) ])
print 'Character count:', n
print 'Word count:', len(lines.split())
print 'Average word size:', float(n)/len(lines.split())

letters = string.lowercase + string.digits + '_@'

def word_iter(text, letters=letters):
     ltrs=letters.lower()
     wd = ''
     for c in text + ' ':
         if c in ltrs:
             wd += c
         elif wd != '':
             yield wd
             wd = ''

def word_counter(text):
     txt = text.lower()
     countDict={}
     for wd in word_iter(txt):
         if wd in countDict:
             countDict[wd] += 1
         else:
             countDict[wd] = 1
     return countDict

word_finder = re.compile('[\w@]+', re.I)

def count_words(string, word_finder = word_finder):
     # avoid global lookups
     countDict = {}
     for match in word_finder.finditer(string.lower()):
         word = match.group(0)
         countDict[word] = countDict.get(word,0) + 1
     return countDict

foos = [word_counter, count_words]
r1 = r2 = None
for foo in foos:
     best_time = 0
     for n in range(3):
         t = time.clock()
         for line in lines.splitlines():
             countDict = foo(line)
         tt = time.clock()-t
         if tt > best_time: best_time = tt

     r1 = r2
     r2 = countDict
     if r1 != None:
         # change to 1 if assert fails to find problem
         if 0:
             for k in r1.keys():
                 if r1[k] != r2[k]:
                     print k,r1[k],r2[k]
         assert r1 == r2

     print '%s: %.8f (best of %d)' \
           % (foo.__name__, best_time, n+1)