[pypy-commit] pypy utf8-unicode2: Add a simple index caching scheme
waedt
noreply at buildbot.pypy.org
Sat Sep 6 22:45:14 CEST 2014
Author: Tyler Wade <wayedt at gmail.com>
Branch: utf8-unicode2
Changeset: r73352:26fe9ddc3df9
Date: 2014-09-06 15:38 -0500
http://bitbucket.org/pypy/pypy/changeset/26fe9ddc3df9/
Log: Add a simple index caching scheme
diff --git a/pypy/interpreter/test/test_utf8.py b/pypy/interpreter/test/test_utf8.py
--- a/pypy/interpreter/test/test_utf8.py
+++ b/pypy/interpreter/test/test_utf8.py
@@ -2,7 +2,8 @@
import py
import sys
-from pypy.interpreter.utf8 import Utf8Str, Utf8Builder, utf8chr, utf8ord
+from pypy.interpreter.utf8 import (
+ Utf8Str, Utf8Builder, utf8chr, utf8ord, utf8ord_bytes, LastAccessCache)
from rpython.rtyper.lltypesystem import rffi
from rpython.rtyper.test.test_llinterp import interpret
@@ -221,3 +222,14 @@
rffi.free_wcharp(wcharp)
+def test_mru_cache():
+ s = Utf8Str.from_unicode(u'abcdefg')
+ cacher = LastAccessCache(s)
+
+ for i in range(len(s)):
+ assert i == cacher.byte_index_of_char(i)
+ assert cacher.byte_index_of_char(1) == 1
+
+ l = [unichr(utf8ord_bytes(s.bytes, cacher.byte_index_of_char(i)))
+ for i in range(len(s))]
+ assert u''.join(l)
diff --git a/pypy/interpreter/utf8.py b/pypy/interpreter/utf8.py
--- a/pypy/interpreter/utf8.py
+++ b/pypy/interpreter/utf8.py
@@ -4,6 +4,7 @@
from rpython.rlib.runicode import utf8_code_length
from rpython.rlib.unicodedata import unicodedb_5_2_0 as unicodedb
from rpython.rlib.rarithmetic import r_uint, intmask, base_int
+from rpython.rlib.jit import elidable
from rpython.rtyper.lltypesystem import rffi, lltype
from rpython.tool.sourcetools import func_with_new_name
@@ -132,6 +133,7 @@
# have to iterate the bytes while checking for (& 0b01000000)
self.bytes = data
+ self._cache_scheme = LastAccessCache(self)
self._is_ascii = is_ascii
if length != -1:
@@ -160,22 +162,51 @@
self._len = length
def index_of_char(self, char):
- if char >= len(self):
- return len(self.bytes)
- byte = 0
- pos = 0
- while pos < char:
- pos += 1
- byte += utf8_code_length[ord(self.bytes[byte])]
+ return self._cache_scheme.byte_index_of_char(char)
- return byte
+ def index_of_char_from_known(self, char, start_char, start_byte):
+ if start_char > char:
+ pos = start_char
+ byte_pos = start_byte
+ while pos != char:
+ byte_pos -= 1
+ if utf8_code_length[ord(self.bytes[byte_pos])]:
+ pos -= 1
- def char_index_of_byte(self, byte_):
- byte = 0
- pos = 0
- while byte < byte_:
- pos += 1
- byte += utf8_code_length[ord(self.bytes[byte])]
+ elif start_char < char:
+ diff = char - start_char
+ byte_pos = start_byte
+ while diff:
+ byte_pos = self.next_char(byte_pos)
+ diff -= 1
+
+ else:
+ return start_byte
+
+ return byte_pos
+
+
+ def char_index_of_byte(self, byte_pos):
+ return self._cache_scheme.char_index_of_byte(byte_pos)
+
+ def char_index_of_byte_from_known(self, byte_pos, start_char, start_byte):
+ if start_byte > byte_pos:
+ pos = start_char - 1
+ cur_byte_pos = start_byte - 1
+ while byte_pos != cur_byte_pos:
+ if utf8_code_length[ord(self.bytes[cur_byte_pos])]:
+ pos -= 1
+ cur_byte_pos -= 1
+
+ elif start_byte < byte_pos:
+ pos = start_char
+ cur_byte_pos = start_byte
+ while byte_pos != cur_byte_pos:
+ cur_byte_pos = self.next_char(cur_byte_pos)
+ pos += 1
+
+ else:
+ return start_char
return pos
@@ -870,3 +901,60 @@
return i
+#__________________
+
+
+class LastAccessCache(object):
+ _immutable_fields_ = ['str']
+ def __init__(self, str):
+ self.str = str
+ self.prev_byte_pos = 0
+ self.prev_pos = 0
+
+ def byte_index_of_char(self, pos):
+ if pos == 0:
+ return 0
+
+ # Calculate the distance from the start, the last known position, and
+ # the end
+ # (cost, known char, known byte)
+ start_dist = (pos, 0, 0)
+ end_dist = (2 * (len(self.str) - pos), len(self.str),
+ len(self.str.bytes))
+
+ if pos <= self.prev_pos:
+ min = (2 * (self.prev_pos - pos), self.prev_pos, self.prev_byte_pos)
+ else:
+ min = (pos - self.prev_pos, self.prev_pos, self.prev_byte_pos)
+
+ if start_dist[0] < min[0]:
+ min = start_dist
+ if end_dist[0] < min[0]:
+ min = end_dist
+
+ b = self.str.index_of_char_from_known(pos, min[1], min[2])
+ self.prev_pos = pos
+ self.prev_byte_pos = b
+ return b
+
+ def char_index_of_byte(self, byte_pos):
+ if byte_pos == 0:
+ return 0
+
+ # (cost, known char, known byte)
+ start_dist = (byte_pos, 0, 0)
+ end_dist = (2 * (len(self.str.bytes) - byte_pos), len(self.str),
+ len(self.str.bytes))
+
+ min = (2 * abs(byte_pos - self.prev_byte_pos), self.prev_pos,
+ self.prev_byte_pos)
+
+ if start_dist[0] < min[0]:
+ min = start_dist
+ if end_dist[0] < min[0]:
+ min = end_dist
+
+ i = self.str.char_index_of_byte_from_known(byte_pos, min[1], min[2])
+ self.prev_pos = i
+ self.prev_byte_pos = byte_pos
+ return i
More information about the pypy-commit
mailing list