[Tutor] clean text
spir
denis.spir at free.fr
Tue May 19 19:19:02 CEST 2009
Le Tue, 19 May 2009 11:36:17 +0200,
spir <denis.spir at free.fr> s'exprima ainsi:
[...]
Thank you Albert, Kent, Sanders, Lie, Malcolm.
This time regex wins! I thought it wouldn't, because of the additional function call (too bad we cannot pass a mapping to re.sub). Actually, the difference is very small ;-) The change that really matters is using a dict.
Replacing string concatenation with ''.join() is slower (I also tested with strings 10 times and 100 times bigger). Strange...
Membership test in a set is only very slightly faster than in dict keys.
I ran the test with random strings of a length typical for my app. The timings are roughly stable from run to run.
===================================================
### various funcs ###
# original
def cleanRepr0(text):
    """Return text with every control character replaced by its repr() escape.

    A character counts as a control character when its code point is below
    32 or lies strictly between 126 and 160. Such a character is replaced by
    repr(char) with the surrounding quotes stripped (for example a newline
    becomes the two characters backslash + "n"). All other characters are
    copied unchanged.

    This is the baseline variant of the benchmark: the result is built by
    plain string concatenation.
    """
    result = ""
    for ch in text:
        code = ord(ch)
        is_control = code < 32 or 126 < code < 160
        # repr() wraps the escape in quotes; [1:-1] drops them
        result += repr(ch)[1:-1] if is_control else ch
    return result
# use list
def cleanRepr1(text):
    """Return text with control characters replaced by their repr() escape.

    Same rule as cleanRepr0 (code point below 32, or strictly between 126
    and 160), but the pieces are collected in a list and joined once at the
    end instead of being concatenated one by one.
    """
    pieces = []
    for ch in text:
        code = ord(ch)
        if code < 32 or 126 < code < 160:
            # strip the quotes that repr() puts around the escape
            pieces.append(repr(ch)[1:-1])
        else:
            pieces.append(ch)
    return ''.join(pieces)
# Set of the characters treated as control characters: code points 0-31
# and 127-159.
# The two ranges are turned into sets separately and then united, because
# `range(...) + range(...)` only works where range() returns a list and
# raises TypeError where it returns a lazy range object (Python 3).
control_chars = set(chr(n) for n in range(0, 32)) | set(chr(n) for n in range(127, 160))
# Maps each control character to its repr() escape with the surrounding
# quotes stripped, e.g. a newline maps to backslash + "n".
control_char_map = dict((c, repr(c)[1:-1]) for c in control_chars)
# use map
def cleanRepr2(text):
    """Return text with control characters replaced via control_char_map.

    Every character that is a key of the module-level control_char_map is
    replaced by the mapped escape string; every other character is copied
    unchanged. The result is built by string concatenation.
    """
    result = ""
    for ch in text:
        # membership test on the dict, then a lookup only when it is a key
        result += control_char_map[ch] if ch in control_char_map else ch
    return result
# use map & set -- list + ''.join() variant
# This function used to be named cleanRepr3 as well, so the definition
# further down replaced it and it could never be called. It now has its
# own name so that both variants stay reachable.
def cleanRepr3_join(text):
    """Return text with control characters replaced by their escape string.

    Membership is tested against the control_chars set and the replacement
    is looked up in control_char_map. The pieces are collected in a list
    and joined once at the end.
    """
    chars = []
    for char in text:
        if char in control_chars:
            char = control_char_map[char]
        chars.append(char)
    return ''.join(chars)


# use map & set -- string concatenation variant
# (this is the definition that timeAll() calls as cleanRepr3)
def cleanRepr3(text):
    """Return text with control characters replaced by their escape string.

    Membership is tested against the control_chars set and the replacement
    is looked up in control_char_map. The result is built by string
    concatenation.
    """
    chars = ""
    for char in text:
        if char in control_chars:
            char = control_char_map[char]
        chars += char
    return chars
import re
# Matches a single character in the ranges 0x00-0x1f or 0x7f-0x9f, i.e. the
# same code points that are collected in control_chars.
controlsRe = re.compile(r'[\x00-\x1f\x7f-\x9f]')
# use regex
def substChar(m):
    """Replacement callback for re.sub(): map the matched text to its escape.

    m is the Match object handed over by re.sub(); the whole matched text is
    looked up in control_char_map and the mapped string is returned.
    """
    matched = m.group()
    return control_char_map[matched]
def cleanRepr4(text):
    """Return text with every match of controlsRe replaced through substChar.

    The substitution is done in a single call on the precompiled pattern,
    with substChar supplying the replacement for each match.
    """
    cleaned = controlsRe.sub(substChar, text)
    return cleaned
### timing ###
#helper func to generate random string
from time import time
import random
def randomString():
    """Return a random string for the benchmark.

    The length is drawn with random.randrange(11, 111) and every character
    is chr() of a value drawn with random.randrange(1, 255).
    """
    length = random.randrange(11, 111)
    picked = []
    for _ in range(length):
        picked.append(chr(random.randrange(1, 255)))
    return ''.join(picked)
def timeAll():
    """Time the five cleanRepr* functions and print the accumulated totals.

    For each of 9999 rounds one random string is generated with
    randomString() and passed to cleanRepr0 .. cleanRepr4 in turn. The time
    elapsed around each call, measured with time(), is added to that
    function's running total. The five totals are printed at the end, one
    per line, with three decimals.
    """
    candidates = (cleanRepr0, cleanRepr1, cleanRepr2, cleanRepr3, cleanRepr4)
    totals = [0] * len(candidates)
    for _ in range(9999):
        sample = randomString()
        # every candidate sees the same input string within a round
        for idx, func in enumerate(candidates):
            started = time()
            func(sample)
            totals[idx] += time() - started
    report = ("original: %.3f\n"
              "list: %.3f\n"
              "map: %.3f\n"
              "map & set: %.3f\n"
              "regex: %.3f\n"
              % tuple(totals))
    print(report)
# Run the benchmark as soon as this script is executed.
timeAll()
===================================================
==>
original: 0.692
list: 0.829
map: 0.364
map & set: 0.349
regex: 0.341
===================================================
Denis
------
la vita e estrany
More information about the Tutor
mailing list