[pypy-commit] pypy default: don't do one to two dict lookups per parsed character

cfbolz noreply at buildbot.pypy.org
Fri Oct 30 13:32:34 EDT 2015


Author: Carl Friedrich Bolz <cfbolz at gmx.de>
Branch: 
Changeset: r80490:986b29dd1c0b
Date: 2015-10-30 18:30 +0100
http://bitbucket.org/pypy/pypy/changeset/986b29dd1c0b/

Log:	don't do one to two dict lookups per parsed character

	This is done by computing a different representation of the state
	transition table (one big string instead of many dictionaries), so a
	state transition is a string index rather than a dict lookup. A bit
	of a "just-because" commit.

diff --git a/pypy/interpreter/pyparser/automata.py b/pypy/interpreter/pyparser/automata.py
--- a/pypy/interpreter/pyparser/automata.py
+++ b/pypy/interpreter/pyparser/automata.py
@@ -22,27 +22,61 @@
 # PYPY Modification : removed all automata functions (any, maybe,
 #                     newArcPair, etc.)
 
+ERROR_STATE = chr(255)
+
 class DFA:
     # ____________________________________________________________
     def __init__(self, states, accepts, start = 0):
-        self.states = states
+        """ NOT_RPYTHON """
+        assert len(states) < 255 # no support for huge amounts of states
+        # construct string for looking up state transitions
+        string_states = [] * len(states)
+        # compute maximum
+        maximum = 0
+        for state in states:
+            for key in state:
+                if key == DEFAULT:
+                    continue
+                maximum = max(ord(key), maximum)
+        self.max_char = maximum + 1
+
+        defaults = []
+        for i, state in enumerate(states):
+            default = ERROR_STATE
+            if DEFAULT in state:
+                default = chr(state[DEFAULT])
+            defaults.append(default)
+            string_state = [default] * self.max_char
+            for key, value in state.iteritems():
+                if key == DEFAULT:
+                    continue
+                assert len(key) == 1
+                assert ord(key) < self.max_char
+                string_state[ord(key)] = chr(value)
+            string_states.extend(string_state)
+        self.states = "".join(string_states)
+        self.defaults = "".join(defaults)
         self.accepts = accepts
         self.start = start
 
     # ____________________________________________________________
-    def recognize (self, inVec, pos = 0): # greedy = True
+
+    def _next_state(self, item, crntState):
+        if ord(item) >= self.max_char:
+            return self.defaults[crntState]
+        else:
+            return self.states[crntState * self.max_char + ord(item)]
+
+    def recognize(self, inVec, pos = 0):
         crntState = self.start
         lastAccept = False
         i = pos
         for i in range(pos, len(inVec)):
             item = inVec[i]
-            # arcMap, accept = self.states[crntState]
-            arcMap = self.states[crntState]
             accept = self.accepts[crntState]
-            if item in arcMap:
-                crntState = arcMap[item]
-            elif DEFAULT in arcMap:
-                crntState = arcMap[DEFAULT]
+            crntState = self._next_state(item, crntState)
+            if crntState != ERROR_STATE:
+                pass
             elif accept:
                 return i
             elif lastAccept:
@@ -51,6 +85,7 @@
                 return i - 1
             else:
                 return -1
+            crntState = ord(crntState)
             lastAccept = accept
         # if self.states[crntState][1]:
         if self.accepts[crntState]:
@@ -63,24 +98,20 @@
 # ______________________________________________________________________
 
 class NonGreedyDFA (DFA):
-    def recognize (self, inVec, pos = 0):
+
+    def recognize(self, inVec, pos = 0):
         crntState = self.start
         i = pos
         for i in range(pos, len(inVec)):
             item = inVec[i]
-            # arcMap, accept = self.states[crntState]
-            arcMap = self.states[crntState]
             accept = self.accepts[crntState]
             if accept:
                 return i
-            elif item in arcMap:
-                crntState = arcMap[item]
-            elif DEFAULT in arcMap:
-                crntState = arcMap[DEFAULT]
-            else:
+            crntState = self._next_state(item, crntState)
+            if crntState == ERROR_STATE:
                 return -1
+            crntState = ord(crntState)
             i += 1
-        # if self.states[crntState][1]:
         if self.accepts[crntState]:
             return i
         else:
diff --git a/pypy/interpreter/pyparser/test/test_automata.py b/pypy/interpreter/pyparser/test/test_automata.py
new file mode 100644
--- /dev/null
+++ b/pypy/interpreter/pyparser/test/test_automata.py
@@ -0,0 +1,12 @@
+from pypy.interpreter.pyparser.automata import DFA, DEFAULT
+
+def test_states():
+    d = DFA([{"\x00": 1}, {"\x01": 0}], [False, True])
+    assert d.states == "\x01\xff\xff\x00"
+    assert d.defaults == "\xff\xff"
+    assert d.max_char == 2
+
+    d = DFA([{"\x00": 1}, {DEFAULT: 0}], [False, True])
+    assert d.states == "\x01\x00"
+    assert d.defaults == "\xff\x00"
+    assert d.max_char == 1


More information about the pypy-commit mailing list