[Tutor] clean text

spir denis.spir at free.fr
Tue May 19 19:19:02 CEST 2009


Le Tue, 19 May 2009 11:36:17 +0200,
spir <denis.spir at free.fr> s'exprima ainsi:

[...]

Thank you Albert, Kent, Sanders, Lie, Malcolm.

This time regex wins! I thought it wouldn't because of the additional function call (too bad we cannot pass a mapping to re.sub). Actually the difference is very small ;-) The relevant change is indeed using a dict.
Replacing string concat with ''.join() is slower (tested with 10 times and 100 times bigger strings too). Strange...
Membership test in a set is only very slightly faster than in dict keys.

I did a test with random strings of typical length for my app. Timing is roughly stable.

===================================================
### various funcs ###
# original
def cleanRepr0(text):
	''' Return text with each control char swapped for its repr() escape.

	A char counts as a control char when its code is below 32 or lies
	in 127..159; everything else is copied through unchanged.
	'''
	result = ""
	for ch in text:
		code = ord(ch)
		is_control = code < 32 or 126 < code < 160
		# repr() wraps the escape in quotes; [1:-1] strips them off
		result += repr(ch)[1:-1] if is_control else ch
	return result

# use list
def cleanRepr1(text):
	''' Return text with control chars replaced by their repr() escape.

	Same ord()-based test as the string-concatenation version, but the
	pieces are gathered in a list and joined once at the end.
	'''
	pieces = []
	for ch in text:
		code = ord(ch)
		if code < 32 or 126 < code < 160:
			# drop the quotes that repr() puts around the escape
			pieces.append(repr(ch)[1:-1])
		else:
			pieces.append(ch)
	return ''.join(pieces)

# Set of the chars to escape: codes 0..31 and 127..159.
# list() around each range() keeps the concatenation valid on Python 3,
# where range() no longer returns a list (it still works on Python 2).
control_chars = set( chr(n) for n in (list(range(0, 32)) + list(range(127, 160))) )
# Maps every control char to its repr() escape, without the surrounding quotes.
control_char_map = dict( (c, repr(c)[1:-1]) for c in control_chars )

# use map
def cleanRepr2(text):
	''' Return text with control chars replaced via control_char_map.

	Membership is tested against the dict itself; the result is built
	by string concatenation.
	'''
	result = ""
	for ch in text:
		result += control_char_map[ch] if ch in control_char_map else ch
	return result

# use map & set
def cleanRepr3List(text):
	''' Return text with control chars escaped; list + join variant.

	This definition used to share the name cleanRepr3 with the function
	below, which rebound the name and left this one unreachable. It has
	its own name now so both variants can be called.
	'''
	chars = []
	for char in text:
		# membership is tested against the set, the escape comes from the dict
		if char in control_chars:
			char = control_char_map[char]
		chars.append(char)
	return ''.join(chars)

def cleanRepr3(text):
	''' Return text with control chars escaped; string concatenation variant.

	Membership is tested against the control_chars set and the replacement
	is looked up in control_char_map. This is the definition the name
	cleanRepr3 has always resolved to, so timeAll() measures the same code.
	'''
	chars = ""
	for char in text:
		if char in control_chars:
			char = control_char_map[char]
		chars += char
	return chars

import re
# Pre-compiled character class matching one char with code 0x00-0x1f or
# 0x7f-0x9f, i.e. the same chars that control_chars holds.
controlsRe = re.compile(r'[\x00-\x1f\x7f-\x9f]')

# use regex
def substChar(m):
    ''' Replacement callback for re.sub(): m is the Match for one control char. '''
    matched = m.group()
    # look up the escaped form of the char that was matched
    return control_char_map[matched]
def cleanRepr4(text):
	''' Return text with control chars escaped, using the compiled regex.

	Each match of controlsRe is passed to substChar, whose return value
	replaces it.
	'''
	escaped = controlsRe.sub(substChar, text)
	return escaped


### timing ###
#helper func to generate random string
from time import time
import random

def randomString():
	''' Return a random string of 11..110 chars with codes in 1..254. '''
	length = random.randrange(11,111)
	# one randrange() call per char, in order
	return ''.join(chr(random.randrange(1, 255)) for _ in range(length))

def timeAll():
	''' Time each cleanRepr* variant on 9999 random strings and print the totals.

	Every variant is run on the same string in each round; the elapsed
	wall-clock time per call is accumulated per variant.
	'''
	variants = (cleanRepr0, cleanRepr1, cleanRepr2, cleanRepr3, cleanRepr4)
	totals = [0] * len(variants)
	for _ in range(9999):
		sample = randomString()
		for index, variant in enumerate(variants):
			start = time()
			variant(sample)
			totals[index] += time() - start
	report = (	"original:	%.3f\n"
			"list:		%.3f\n"
			"map:		%.3f\n"
			"map & set:	%.3f\n"
			"regex:		%.3f\n" )
	print (report % tuple(totals))

# Run the benchmark once; timeAll() prints the accumulated time per variant.
timeAll()
===================================================
==>
original:	0.692
list:		0.829
map:		0.364
map & set:	0.349
regex:		0.341
===================================================

Denis
------
la vita e estrany


More information about the Tutor mailing list