[pypy-svn] r49343 - in pypy/dist/pypy/rlib: . test
cfbolz at codespeak.net
cfbolz at codespeak.net
Tue Dec 4 16:59:20 CET 2007
Author: cfbolz
Date: Tue Dec 4 16:59:20 2007
New Revision: 49343
Modified:
pypy/dist/pypy/rlib/rope.py
pypy/dist/pypy/rlib/test/test_rope.py
Log:
Some refactorings of the FringeIterator. Also contains the beginnings of
some crazy ideas about extremely fast string searching.
Modified: pypy/dist/pypy/rlib/rope.py
==============================================================================
--- pypy/dist/pypy/rlib/rope.py (original)
+++ pypy/dist/pypy/rlib/rope.py Tue Dec 4 16:59:20 2007
@@ -53,6 +53,7 @@
class StringNode(object):
hash_cache = 0
+ charbitmask = 0
def length(self):
raise NotImplementedError("base class")
@@ -86,6 +87,9 @@
def getslice(self, start, stop):
raise NotImplementedError("abstract base class")
+ def can_contain_int(self, value):
+ return True #conservative default
+
def view(self):
view([self])
@@ -115,9 +119,13 @@
assert isinstance(s, str)
self.s = s
is_ascii = True
+ charbitmask = 0
for c in s:
- if ord(c) >= 128:
+ ordc = ord(c)
+ if ordc >= 128:
is_ascii = False
+ charbitmask |= 1 << (ordc & 0x1F)
+ self.charbitmask = charbitmask
self._is_ascii = is_ascii
def length(self):
@@ -158,16 +166,22 @@
def getrope(self, index):
return LiteralStringNode.PREBUILT[ord(self.s[index])]
+ def can_contain_int(self, value):
+ if value > 255:
+ return False
+ if self.is_ascii() and value > 127:
+ return False
+ return (1 << (value & 0x1f)) & self.charbitmask
+
def getslice(self, start, stop):
assert 0 <= start <= stop
return LiteralStringNode(self.s[start:stop])
def find_int(self, what, start, stop):
- if what >= 256:
+ if not self.can_contain_int(what):
return -1
- result = self.s.find(chr(what), start, stop)
- return result
+ return self.s.find(chr(what), start, stop)
def literal_concat(self, other):
if (isinstance(other, LiteralStringNode) and
@@ -198,6 +212,13 @@
def __init__(self, u):
assert isinstance(u, unicode)
self.u = u
+ charbitmask = 0
+ for c in u:
+ ordc = ord(c)
+ if ordc >= 128:
+ charbitmask |= 1 # be compatible with LiteralStringNode
+ charbitmask |= 1 << (ordc & 0x1F)
+ self.charbitmask = charbitmask
def length(self):
return len(self.u)
@@ -236,13 +257,17 @@
return self
return LiteralUnicodeNode(unichr(ch))
+ def can_contain_int(self, value):
+ return (1 << (value & 0x1f)) & self.charbitmask
+
def getslice(self, start, stop):
assert 0 <= start <= stop
return LiteralUnicodeNode(self.u[start:stop])
def find_int(self, what, start, stop):
- result = self.u.find(unichr(what), start, stop)
- return result
+ if not self.can_contain_int(what):
+ return -1
+ return self.u.find(unichr(what), start, stop)
def literal_concat(self, other):
if (isinstance(other, LiteralUnicodeNode) and
@@ -278,6 +303,7 @@
self.balanced = False
self._is_ascii = left.is_ascii() and right.is_ascii()
self._is_bytestring = left.is_bytestring() and right.is_bytestring()
+ self.charbitmask = left.charbitmask | right.charbitmask
def is_ascii(self):
return self._is_ascii
@@ -335,6 +361,13 @@
else:
return self.left.getrope(index)
+ def can_contain_int(self, value):
+ if self.is_bytestring() and value > 255:
+ return False
+ if self.is_ascii() and value > 127:
+ return False
+ return (1 << (value & 0x1f)) & self.charbitmask
+
def flatten_string(self):
f = fringe(self)
return "".join([node.flatten_string() for node in f])
@@ -650,11 +683,6 @@
length = node.length()
if stop == -1:
stop = length
- if start != 0 or stop != length:
- newstart, newstop, node = find_straddling(node, start, stop)
- offset = start - newstart
- start = newstart
- stop = newstop
assert 0 <= start <= stop
if isinstance(node, LiteralNode):
pos = node.find_int(what, start, stop)
@@ -662,6 +690,9 @@
return pos
return pos + offset
iter = FringeIterator(node)
+ newstart = iter._seekforward(start)
+ offset += start - newstart
+ start = newstart
#import pdb; pdb.set_trace()
i = 0
while i < stop:
@@ -675,6 +706,8 @@
continue
searchstart = max(0, start - i)
searchstop = min(stop - i, nodelength)
+ if searchstop <= 0:
+ return -1
assert isinstance(fringenode, LiteralNode)
pos = fringenode.find_int(what, searchstart, searchstop)
if pos != -1:
@@ -809,6 +842,29 @@
return curr
raise StopIteration
+ def _seekforward(self, length):
+ """seek forward up to n characters, returning the number remaining chars.
+ experimental api"""
+ curr = None
+ while self.stack:
+ curr = self.stack.pop()
+ if length < curr.length():
+ break
+ length -= curr.length()
+ else:
+ raise StopIteration
+ while isinstance(curr, BinaryConcatNode):
+ left_length = curr.left.length()
+ if length < left_length:
+ self.stack.append(curr.right)
+ curr = curr.left
+ else:
+ length -= left_length
+ curr = curr.right
+ self.stack.append(curr)
+ return length
+
+
def fringe(node):
result = []
iter = FringeIterator(node)
@@ -834,25 +890,6 @@
return curr
raise StopIteration
-class SeekableFringeIterator(FringeIterator):
- def __init__(self, node):
- FringeIterator.__init__(self, node)
- self.fringestack = []
- self.fringe = []
-
- def next(self):
- if self.fringestack:
- result = self.fringestack.pop()
- else:
- result = FringeIterator.next(self)
- self.fringe.append(result)
- return result
-
- def seekback(self):
- result = self.fringe.pop()
- self.fringestack.append(result)
- return result
-
class ItemIterator(object):
def __init__(self, node, start=0):
@@ -864,19 +901,9 @@
self._advance_to(start)
def _advance_to(self, index):
- # XXX this is O(index), should be O(log(index))
- assert index > 0
- assert self.index == 0
- while 1:
- node = self.iter.next()
- length = node.length()
- if index < length:
- self.index = index
- self.node = node
- self.nodelength = length
- break
- index -= length
- assert index >= 0
+ self.index = self.iter._seekforward(index)
+ self.node = self.iter.next()
+ self.nodelength = self.node.length()
def getnode(self):
node = self.node
Modified: pypy/dist/pypy/rlib/test/test_rope.py
==============================================================================
--- pypy/dist/pypy/rlib/test/test_rope.py (original)
+++ pypy/dist/pypy/rlib/test/test_rope.py Tue Dec 4 16:59:20 2007
@@ -350,36 +350,26 @@
n = iter.next()
assert n is GHI
py.test.raises(StopIteration, iter.next)
+ iter = FringeIterator(rope)
-def test_seekable_fringe_iterator():
+def test_fringe_iterator_seekforward():
ABC = LiteralStringNode("abc")
DEF = LiteralStringNode("def")
GHI = LiteralStringNode("ghi")
rope = BinaryConcatNode(BinaryConcatNode(ABC, DEF), GHI)
- iter = SeekableFringeIterator(rope)
- n = iter.next()
- assert n is ABC
- n = iter.seekback()
- assert n is ABC
+ iter = FringeIterator(rope)
n = iter.next()
assert n is ABC
+ i = iter._seekforward(5)
+ assert i == 2
n = iter.next()
- assert n is DEF
- n = iter.next()
- assert n is GHI
- n = iter.seekback()
assert n is GHI
- n = iter.seekback()
- assert n is DEF
- n = iter.seekback()
- assert n is ABC
- n = iter.next()
- assert n is ABC
- n = iter.next()
- assert n is DEF
+ py.test.raises(StopIteration, iter.next)
+ iter = FringeIterator(rope)
+ i = iter._seekforward(7)
+ assert i == 1
n = iter.next()
assert n is GHI
- py.test.raises(StopIteration, iter.next)
def test_seekforward():
More information about the Pypy-commit
mailing list