[Tutor] clean text

Emile van Sebille emile at fenx.com
Tue May 19 19:49:15 CEST 2009


On 5/19/2009 10:19 AM spir said...
> Le Tue, 19 May 2009 11:36:17 +0200,
> spir <denis.spir at free.fr> s'exprima ainsi:
> 
> [...]
> 
> Thank you Albert, Kent, Sanders, Lie, Malcolm.
> 
> This time regex wins! I thought it wouldn't because of the additional function call (too bad we cannot pass a mapping to re.sub). Actually the difference is very small ;-) The relevant change is indeed using a dict.
> Replacing string concat with ''.join() is slower (tested with 10 times and 100 times bigger strings too). Strange...
> Membership test in a set is only very slightly faster than in dict keys.

Hmm... this seems faster assuming it does the same thing...

# Identity table for every character with ordinal 0-255, so the hot loop can
# index directly instead of testing membership first.
xlate = dict( (chr(c),chr(c)) for c in range(256))
# Overlay the replacement strings for the control characters.
xlate.update(control_char_map)

def cleanRepr5(text):
    """Return text with each character replaced by its entry in ``xlate``.

    Characters that have no entry in the table (ordinals above 255) are
    passed through unchanged instead of raising KeyError.
    """
    try:
        # Fast path: plain indexing, valid when every character is in xlate.
        return "".join([xlate[c] for c in text])
    except KeyError:
        # Some character lies outside the table; fall back to a lookup
        # that leaves unknown characters as they are.
        return "".join([xlate.get(c, c) for c in text])


Emile



> 
> I did a test with random strings of typical length for my app. Timing is roughly stable from run to run.
> 
> ===================================================
> ### various funcs ###
> # original
> def cleanRepr0(text):
> 	''' text with control chars replaced by repr() equivalent '''
> 	chars = ""
> 	for char in text:
> 		n = ord(char)
> 		if (n < 32) or (n > 126 and n < 160):
> 			char = repr(char)[1:-1]
> 		chars += char
> 	return chars
> 
> # use list
> def cleanRepr1(text):
> 	chars = []
> 	for char in text:
> 		n = ord(char)
> 		if (n < 32) or (n > 126 and n < 160):
> 			char = repr(char)[1:-1]
> 		chars.append(char)
> 	return ''.join(chars)
> 
> control_chars = set( chr(n) for n in (range(0, 32) + range(127, 160)) )
> control_char_map = dict( (c, repr(c)[1:-1]) for c in control_chars )
> 
> # use map
> def cleanRepr2(text):
> 	chars = ""
> 	for char in text:
> 		if char in control_char_map:
> 			char = control_char_map[char]
> 		chars += char
> 	return chars
> 
> # use map & set
> def cleanRepr3(text):
> 	chars = []
> 	for char in text:
> 		if char in control_chars:
> 			char = control_char_map[char]
> 		chars.append(char)
> 	return ''.join(chars)
> def cleanRepr3(text):
> 	chars = ""
> 	for char in text:
> 		if char in control_chars:
> 			char = control_char_map[char]
> 		chars += char
> 	return chars
> 
> import re
> controlsRe = re.compile(r'[\x00-\x1f\x7f-\x9f]')
> 
> # use regex
> def substChar(m):
>     ''' Helper function for re.sub(). m will be a Match object. '''
>     return control_char_map[m.group()]
> def cleanRepr4(text):
> 	return controlsRe.sub(substChar, text)
> 
> 
> ### timing ###
> #helper func to generate random string
> from time import time
> import random
> 
> def randomString():
> 	count = random.randrange(11,111)
> 	chars = [chr(random.randrange(1, 255)) for n in range(count)]
> 	return ''.join(chars)
> 
> def timeAll():
> 	t0=t1=t2=t3=t4=0
> 	for n in range(9999):
> 		s = randomString()
> 		t = time() ; cleanRepr0(s) ; t0 += time() - t
> 		t = time() ; cleanRepr1(s) ; t1 += time() - t
> 		t = time() ; cleanRepr2(s) ; t2 += time() - t
> 		t = time() ; cleanRepr3(s) ; t3 += time() - t
> 		t = time() ; cleanRepr4(s) ; t4 += time() - t
> 	print (	"original:	%.3f\n"
> 			"list:		%.3f\n"
> 			"map:		%.3f\n"
> 			"map & set:	%.3f\n"
> 			"regex:		%.3f\n"
> 			%(t0,t1,t2,t3,t4) )
> 
> timeAll()
> ===================================================
> ==>
> original:	0.692
> list:		0.829
> map:		0.364
> map & set:	0.349
> regex:		0.341
> ===================================================
> 
> Denis
> ------
> la vita e estrany
> _______________________________________________
> Tutor maillist  -  Tutor at python.org
> http://mail.python.org/mailman/listinfo/tutor
> 



More information about the Tutor mailing list