[pypy-commit] pypy default: merge the identity-dict-strategy branch, which optimizes access to dictionary containing only classes which compare 'by identity'
antocuni
noreply at buildbot.pypy.org
Thu Jul 21 15:10:45 CEST 2011
Author: Antonio Cuni <anto.cuni at gmail.com>
Branch:
Changeset: r45817:69a7e76a319a
Date: 2011-07-21 15:10 +0200
http://bitbucket.org/pypy/pypy/changeset/69a7e76a319a/
Log: merge the identity-dict-strategy branch, which optimizes access to
dictionary containing only classes which compare 'by identity'
diff --git a/pypy/config/pypyoption.py b/pypy/config/pypyoption.py
--- a/pypy/config/pypyoption.py
+++ b/pypy/config/pypyoption.py
@@ -327,6 +327,9 @@
BoolOption("mutable_builtintypes",
"Allow the changing of builtin types", default=False,
requires=[("objspace.std.builtinshortcut", True)]),
+ BoolOption("withidentitydict",
+ "track types that override __hash__, __eq__ or __cmp__ and use a special dict strategy for those which do not",
+ default=True),
]),
])
diff --git a/pypy/doc/config/objspace.std.withidentitydict.txt b/pypy/doc/config/objspace.std.withidentitydict.txt
new file mode 100644
--- /dev/null
+++ b/pypy/doc/config/objspace.std.withidentitydict.txt
@@ -0,0 +1,21 @@
+=============================
+objspace.std.withidentitydict
+=============================
+
+* **name:** withidentitydict
+
+* **description:** enable a dictionary strategy for "by identity" comparisons
+
+* **command-line:** --objspace-std-withidentitydict
+
+* **command-line for negation:** --no-objspace-std-withidentitydict
+
+* **option type:** boolean option
+
+* **default:** True
+
+
+Enable a dictionary strategy specialized for instances of classes which
+compares "by identity", which is the default unless you override ``__hash__``,
+``__eq__`` or ``__cmp__``. This strategy will be used only with new-style
+classes.
diff --git a/pypy/doc/cpython_differences.rst b/pypy/doc/cpython_differences.rst
--- a/pypy/doc/cpython_differences.rst
+++ b/pypy/doc/cpython_differences.rst
@@ -211,6 +211,38 @@
>>>> print d1['a']
42
+Mutating classes of objects which are already used as dictionary keys
+---------------------------------------------------------------------
+
+Consider the following snippet of code::
+
+ class X(object):
+ pass
+
+ def __evil_eq__(self, other):
+ print 'hello world'
+ return False
+
+ def evil(y):
+ d = {x(): 1}
+ X.__eq__ = __evil_eq__
+ d[y] # might trigger a call to __eq__?
+
+In CPython, __evil_eq__ **might** be called, although there is no way to write
+a test which reliably calls it. It happens if ``y is not x`` and ``hash(y) ==
+hash(x)``, where ``hash(x)`` is computed when ``x`` is inserted into the
+dictionary. If **by chance** the condition is satisfied, then ``__evil_eq__``
+is called.
+
+PyPy uses a special strategy to optimize dictionaries whose keys are instances
+of user-defined classes which do not override the default ``__hash__``,
+``__eq__`` and ``__cmp__``: when using this strategy, ``__eq__`` and
+``__cmp__`` are never called, but instead the lookup is done by identity, so
+in the case above it is guaranteed that ``__eq__`` won't be called.
+
+Note that in all other cases (e.g., if you have a custom ``__hash__`` and
+``__eq__`` in ``y``) the behavior is exactly the same as CPython.
+
Ignored exceptions
-----------------------
diff --git a/pypy/module/pypyjit/test_pypy_c/test_containers.py b/pypy/module/pypyjit/test_pypy_c/test_containers.py
--- a/pypy/module/pypyjit/test_pypy_c/test_containers.py
+++ b/pypy/module/pypyjit/test_pypy_c/test_containers.py
@@ -23,3 +23,29 @@
ops = loop.ops_by_id('look')
assert log.opnames(ops) == ['setfield_gc',
'guard_not_invalidated']
+
+ def test_identitydict(self):
+ def fn(n):
+ class X(object):
+ pass
+ x = X()
+ d = {}
+ d[x] = 1
+ res = 0
+ for i in range(300):
+ value = d[x] # ID: getitem
+ res += value
+ return res
+ #
+ log = self.run(fn, [1000])
+ assert log.result == 300
+ loop, = log.loops_by_filename(self.filepath)
+ # check that the call to ll_dict_lookup is not a call_may_force
+ assert loop.match_by_id("getitem", """
+ i25 = call(ConstClass(_ll_1_gc_identityhash__objectPtr), p6, descr=...)
+ ...
+ i28 = call(ConstClass(ll_dict_lookup__dicttablePtr_objectPtr_Signed), p18, p6, i25, descr=...)
+ ...
+ p33 = call(ConstClass(ll_get_value__dicttablePtr_Signed), p18, i28, descr=...)
+ ...
+ """)
diff --git a/pypy/objspace/descroperation.py b/pypy/objspace/descroperation.py
--- a/pypy/objspace/descroperation.py
+++ b/pypy/objspace/descroperation.py
@@ -28,6 +28,13 @@
return w_delattr
object_delattr._annspecialcase_ = 'specialize:memo'
+def object_hash(space):
+ "Utility that returns the app-level descriptor object.__hash__."
+ w_src, w_hash = space.lookup_in_type_where(space.w_object,
+ '__hash__')
+ return w_hash
+object_hash._annspecialcase_ = 'specialize:memo'
+
def raiseattrerror(space, w_obj, name, w_descr=None):
w_type = space.type(w_obj)
typename = w_type.getname(space)
diff --git a/pypy/objspace/std/dictmultiobject.py b/pypy/objspace/std/dictmultiobject.py
--- a/pypy/objspace/std/dictmultiobject.py
+++ b/pypy/objspace/std/dictmultiobject.py
@@ -157,11 +157,15 @@
return self.erase(None)
def switch_to_correct_strategy(self, w_dict, w_key):
- #XXX implement other strategies later
+ withidentitydict = self.space.config.objspace.std.withidentitydict
if type(w_key) is self.space.StringObjectCls:
self.switch_to_string_strategy(w_dict)
- elif self.space.is_w(self.space.type(w_key), self.space.w_int):
+ return
+ w_type = self.space.type(w_key)
+ if self.space.is_w(w_type, self.space.w_int):
self.switch_to_int_strategy(w_dict)
+ elif withidentitydict and w_type.compares_by_identity():
+ self.switch_to_identity_strategy(w_dict)
else:
self.switch_to_object_strategy(w_dict)
@@ -177,6 +181,13 @@
w_dict.strategy = strategy
w_dict.dstorage = storage
+ def switch_to_identity_strategy(self, w_dict):
+ from pypy.objspace.std.identitydict import IdentityDictStrategy
+ strategy = self.space.fromcache(IdentityDictStrategy)
+ storage = strategy.get_empty_storage()
+ w_dict.strategy = strategy
+ w_dict.dstorage = storage
+
def switch_to_object_strategy(self, w_dict):
strategy = self.space.fromcache(ObjectDictStrategy)
storage = strategy.get_empty_storage()
@@ -338,7 +349,6 @@
def getitem(self, w_dict, w_key):
space = self.space
-
if self.is_correct_type(w_key):
return self.unerase(w_dict.dstorage).get(self.unwrap(w_key), None)
elif self._never_equal_to(space.type(w_key)):
@@ -404,6 +414,7 @@
def keys(self, w_dict):
return self.unerase(w_dict.dstorage).keys()
+
class StringDictStrategy(AbstractTypedStrategy, DictStrategy):
erase, unerase = rerased.new_erasing_pair("string")
@@ -448,7 +459,9 @@
return StrIteratorImplementation(self.space, self, w_dict)
-class StrIteratorImplementation(IteratorImplementation):
+class _WrappedIteratorMixin(object):
+ _mixin_ = True
+
def __init__(self, space, strategy, dictimplementation):
IteratorImplementation.__init__(self, space, dictimplementation)
self.iterator = strategy.unerase(dictimplementation.dstorage).iteritems()
@@ -460,6 +473,23 @@
else:
return None, None
+class _UnwrappedIteratorMixin:
+ _mixin_ = True
+
+ def __init__(self, space, strategy, dictimplementation):
+ IteratorImplementation.__init__(self, space, dictimplementation)
+ self.iterator = strategy.unerase(dictimplementation.dstorage).iteritems()
+
+ def next_entry(self):
+ # note that this 'for' loop only runs once, at most
+ for w_key, w_value in self.iterator:
+ return w_key, w_value
+ else:
+ return None, None
+
+
+class StrIteratorImplementation(_WrappedIteratorMixin, IteratorImplementation):
+ pass
class IntDictStrategy(AbstractTypedStrategy, DictStrategy):
erase, unerase = rerased.new_erasing_pair("int")
@@ -490,31 +520,11 @@
def iter(self, w_dict):
return IntIteratorImplementation(self.space, self, w_dict)
-class IntIteratorImplementation(IteratorImplementation):
- def __init__(self, space, strategy, dictimplementation):
- IteratorImplementation.__init__(self, space, dictimplementation)
- self.iterator = strategy.unerase(dictimplementation.dstorage).iteritems()
+class IntIteratorImplementation(_WrappedIteratorMixin, IteratorImplementation):
+ pass
- def next_entry(self):
- # note that this 'for' loop only runs once, at most
- for key, w_value in self.iterator:
- return self.space.wrap(key), w_value
- else:
- return None, None
-
-
-class ObjectIteratorImplementation(IteratorImplementation):
- def __init__(self, space, strategy, dictimplementation):
- IteratorImplementation.__init__(self, space, dictimplementation)
- self.iterator = strategy.unerase(dictimplementation.dstorage).iteritems()
-
- def next_entry(self):
- # note that this 'for' loop only runs once, at most
- for w_key, w_value in self.iterator:
- return w_key, w_value
- else:
- return None, None
-
+class ObjectIteratorImplementation(_UnwrappedIteratorMixin, IteratorImplementation):
+ pass
init_signature = Signature(['seq_or_map'], None, 'kwargs')
init_defaults = [None]
diff --git a/pypy/objspace/std/dictproxyobject.py b/pypy/objspace/std/dictproxyobject.py
--- a/pypy/objspace/std/dictproxyobject.py
+++ b/pypy/objspace/std/dictproxyobject.py
@@ -86,7 +86,7 @@
def clear(self, w_dict):
self.unerase(w_dict.dstorage).dict_w.clear()
- self.unerase(w_dict.dstorage).mutated()
+ self.unerase(w_dict.dstorage).mutated(None)
class DictProxyIteratorImplementation(IteratorImplementation):
def __init__(self, space, strategy, dictimplementation):
diff --git a/pypy/objspace/std/identitydict.py b/pypy/objspace/std/identitydict.py
new file mode 100644
--- /dev/null
+++ b/pypy/objspace/std/identitydict.py
@@ -0,0 +1,85 @@
+## ----------------------------------------------------------------------------
+## dict strategy (see dict_multiobject.py)
+
+from pypy.rlib import rerased
+from pypy.objspace.std.dictmultiobject import (AbstractTypedStrategy,
+ DictStrategy,
+ IteratorImplementation,
+ _UnwrappedIteratorMixin)
+
+
+# this strategy is selected by EmptyDictStrategy.switch_to_correct_strategy
+class IdentityDictStrategy(AbstractTypedStrategy, DictStrategy):
+ """
+ Strategy for custom instances which compares by identity (i.e., the
+ default unless you override __hash__, __eq__ or __cmp__). The storage is
+ just a normal RPython dict, which has already the correct by-identity
+ semantics.
+
+ Note that at a first sight, you might have problems if you mutate the
+ class of an object which is already inside an identitydict. Consider this
+ example::
+
+ class X(object):
+ pass
+ d = {x(): 1}
+ X.__eq__ = ...
+ d[y] # might trigger a call to __eq__?
+
+ We want to be sure that x.__eq__ is called in the same cases as in
+ CPython. However, as long as the strategy is IdentityDictStrategy, the
+ __eq__ will never be called.
+
+ It turns out that it's not a problem. In CPython (and in PyPy without
+ this strategy), the __eq__ is called if ``hash(y) == hash(x)`` and ``x is
+ not y``. Note that hash(x) is computed at the time when we insert x in
+ the dict, not at the time we lookup y.
+
+ Now, how can hash(y) == hash(x)? There are two possibilities:
+
+ 1. we write a custom __hash__ for the class of y, thus making it a not
+ "compares by reference" type
+
+ 2. the class of y is "compares by reference" type, and by chance the
+ hash is the same as x
+
+ In the first case, the getitem immediately notice that y is not of the
+ right type, and switches the strategy to ObjectDictStrategy, then the
+ lookup works as usual.
+
+ The second case is completely non-deterministic, even in CPython.
+ Depending on the phase of the moon, you might call the __eq__ or not, so
+ it is perfectly fine to *never* call it. Morever, in practice with the
+ minimar GC we never have two live objects with the same hash, so it would
+ never happen anyway.
+ """
+
+ erase, unerase = rerased.new_erasing_pair("identitydict")
+ erase = staticmethod(erase)
+ unerase = staticmethod(unerase)
+
+ def wrap(self, unwrapped):
+ return unwrapped
+
+ def unwrap(self, wrapped):
+ return wrapped
+
+ def get_empty_storage(self):
+ return self.erase({})
+
+ def is_correct_type(self, w_obj):
+ w_type = self.space.type(w_obj)
+ return w_type.compares_by_identity()
+
+ def _never_equal_to(self, w_lookup_type):
+ return False
+
+ def iter(self, w_dict):
+ return IdentityDictIteratorImplementation(self.space, self, w_dict)
+
+ def keys(self, w_dict):
+ return self.unerase(w_dict.dstorage).keys()
+
+
+class IdentityDictIteratorImplementation(_UnwrappedIteratorMixin, IteratorImplementation):
+ pass
diff --git a/pypy/objspace/std/objecttype.py b/pypy/objspace/std/objecttype.py
--- a/pypy/objspace/std/objecttype.py
+++ b/pypy/objspace/std/objecttype.py
@@ -6,7 +6,7 @@
from pypy.objspace.descroperation import Object
from pypy.objspace.std.stdtypedef import StdTypeDef
from pypy.objspace.std.register_all import register_all
-
+from pypy.objspace.std import identitydict
def descr__repr__(space, w_obj):
w = space.wrap
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -39,7 +39,6 @@
from pypy.objspace.std.stringtype import wrapstr
from pypy.objspace.std.unicodetype import wrapunicode
-
class StdObjSpace(ObjSpace, DescrOperation):
"""The standard object space, implementing a general-purpose object
library in Restricted Python."""
diff --git a/pypy/objspace/std/test/test_dictmultiobject.py b/pypy/objspace/std/test/test_dictmultiobject.py
--- a/pypy/objspace/std/test/test_dictmultiobject.py
+++ b/pypy/objspace/std/test/test_dictmultiobject.py
@@ -898,6 +898,7 @@
withsmalldicts = False
withcelldict = False
withmethodcache = False
+ withidentitydict = False
FakeSpace.config = Config()
@@ -1105,3 +1106,4 @@
fakespace = FakeSpace()
d = fakespace.newdict(module=True)
assert type(d.strategy) is StringDictStrategy
+
diff --git a/pypy/objspace/std/test/test_identitydict.py b/pypy/objspace/std/test/test_identitydict.py
new file mode 100644
--- /dev/null
+++ b/pypy/objspace/std/test/test_identitydict.py
@@ -0,0 +1,135 @@
+from pypy.interpreter.gateway import interp2app
+from pypy.conftest import gettestobjspace
+from pypy.conftest import option
+
+class AppTestComparesByIdentity:
+
+ def setup_class(cls):
+ from pypy.objspace.std import identitydict
+ cls.space = gettestobjspace(
+ **{"objspace.std.withidentitydict": True})
+
+ def compares_by_identity(space, w_cls):
+ return space.wrap(w_cls.compares_by_identity())
+ cls.w_compares_by_identity = cls.space.wrap(interp2app(compares_by_identity))
+
+ def test_compares_by_identity(self):
+ class Plain(object):
+ pass
+
+ class CustomEq(object):
+ def __eq__(self, other):
+ return True
+
+ class CustomCmp (object):
+ def __cmp__(self, other):
+ return 0
+
+ class CustomHash(object):
+ def __hash__(self):
+ return 0
+
+ assert self.compares_by_identity(Plain)
+ assert not self.compares_by_identity(CustomEq)
+ assert not self.compares_by_identity(CustomCmp)
+ assert not self.compares_by_identity(CustomHash)
+
+ def test_modify_class(self):
+ class X(object):
+ pass
+
+ assert self.compares_by_identity(X)
+ X.__eq__ = lambda x: None
+ assert not self.compares_by_identity(X)
+ del X.__eq__
+ assert self.compares_by_identity(X)
+
+
+class AppTestIdentityDict(object):
+ def setup_class(cls):
+ cls.space = gettestobjspace(**{"objspace.std.withidentitydict": True})
+ if option.runappdirect:
+ py.test.skip("__repr__ doesn't work on appdirect")
+
+ def w_uses_identity_strategy(self, obj):
+ import __pypy__
+ return "IdentityDictStrategy" in __pypy__.internal_repr(obj)
+
+ def test_use_strategy(self):
+ class X(object):
+ pass
+ d = {}
+ x = X()
+ d[x] = 1
+ assert self.uses_identity_strategy(d)
+ assert d[x] == 1
+
+ def test_bad_item(self):
+ class X(object):
+ pass
+ class Y(object):
+ def __hash__(self):
+ return 32
+
+ d = {}
+ x = X()
+ y = Y()
+ d[x] = 1
+ assert self.uses_identity_strategy(d)
+ d[y] = 2
+ assert not self.uses_identity_strategy(d)
+ assert d[x] == 1
+ assert d[y] == 2
+
+ def test_bad_key(self):
+ class X(object):
+ pass
+ d = {}
+ x = X()
+
+ class Y(object):
+ def __hash__(self):
+ return hash(x) # to make sure we do x == y
+
+ def __eq__(self, other):
+ return True
+
+ y = Y()
+ d[x] = 1
+ assert self.uses_identity_strategy(d)
+ assert d[y] == 1
+ assert not self.uses_identity_strategy(d)
+
+ def test_iter(self):
+ class X(object):
+ pass
+ x = X()
+ d = {x: 1}
+ assert self.uses_identity_strategy(d)
+ assert list(iter(d)) == [x]
+
+ def test_mutate_class_and_then_compare(self):
+ class X(object):
+ pass
+ class Y(object):
+ pass
+
+ x = X()
+ y = Y()
+ d1 = {x: 1}
+ d2 = {y: 1}
+ assert self.uses_identity_strategy(d1)
+ assert self.uses_identity_strategy(d2)
+ #
+ X.__hash__ = lambda self: hash(y)
+ X.__eq__ = lambda self, other: True
+ #
+ assert d1 == d2
+ assert self.uses_identity_strategy(d1)
+ assert not self.uses_identity_strategy(d2)
+
+ def test_old_style_classes(self):
+ class X:
+ pass
+ d = {X(): 1}
+ assert not self.uses_identity_strategy(d)
diff --git a/pypy/objspace/std/typeobject.py b/pypy/objspace/std/typeobject.py
--- a/pypy/objspace/std/typeobject.py
+++ b/pypy/objspace/std/typeobject.py
@@ -7,6 +7,7 @@
from pypy.interpreter.baseobjspace import W_Root
from pypy.objspace.std.stdtypedef import std_dict_descr, issubtypedef, Member
from pypy.objspace.std.objecttype import object_typedef
+from pypy.objspace.std import identitydict
from pypy.rlib.objectmodel import we_are_translated
from pypy.rlib.objectmodel import current_object_addr_as_int, compute_hash
from pypy.rlib.jit import promote, elidable_promote, we_are_jitted
@@ -76,6 +77,10 @@
for i in range(len(self.lookup_where)):
self.lookup_where[i] = None_None
+# possible values of compares_by_identity_status
+UNKNOWN = 0
+COMPARES_BY_IDENTITY = 1
+OVERRIDES_EQ_CMP_OR_HASH = 2
class W_TypeObject(W_Object):
from pypy.objspace.std.typetype import type_typedef as typedef
@@ -102,6 +107,9 @@
# (False is a conservative default, fixed during real usage)
uses_object_getattribute = False
+ # for config.objspace.std.withidentitydict
+ compares_by_identity_status = UNKNOWN
+
# used to cache the type __new__ function if it comes from a builtin type
# != 'type', in that case call__Type will also assumes the result
# of the __new__ is an instance of the type
@@ -146,11 +154,17 @@
else:
w_self.terminator = NoDictTerminator(space, w_self)
- def mutated(w_self):
+ def mutated(w_self, key):
+ """
+ The type is being mutated. key is either the string containing the
+ specific attribute which is being deleted/set or None to indicate a
+ generic mutation.
+ """
space = w_self.space
assert w_self.is_heaptype() or space.config.objspace.std.mutable_builtintypes
if (not space.config.objspace.std.withtypeversion and
not space.config.objspace.std.getattributeshortcut and
+ not space.config.objspace.std.withidentitydict and
not space.config.objspace.std.newshortcut):
return
@@ -158,6 +172,13 @@
w_self.uses_object_getattribute = False
# ^^^ conservative default, fixed during real usage
+ if space.config.objspace.std.withidentitydict:
+ did_compare_by_identity = (
+ w_self.compares_by_identity_status == COMPARES_BY_IDENTITY)
+ if (key is None or key == '__eq__' or
+ key == '__cmp__' or key == '__hash__'):
+ w_self.compares_by_identity_status = UNKNOWN
+
if space.config.objspace.std.newshortcut:
w_self.w_bltin_new = None
@@ -168,7 +189,7 @@
subclasses_w = w_self.get_subclasses()
for w_subclass in subclasses_w:
assert isinstance(w_subclass, W_TypeObject)
- w_subclass.mutated()
+ w_subclass.mutated(key)
def version_tag(w_self):
if (not we_are_jitted() or w_self.is_heaptype() or
@@ -207,6 +228,25 @@
def has_object_getattribute(w_self):
return w_self.getattribute_if_not_from_object() is None
+ def compares_by_identity(w_self):
+ from pypy.objspace.descroperation import object_hash
+ if not w_self.space.config.objspace.std.withidentitydict:
+ return False # conservative
+ #
+ if w_self.compares_by_identity_status != UNKNOWN:
+ # fast path
+ return w_self.compares_by_identity_status == COMPARES_BY_IDENTITY
+ #
+ default_hash = object_hash(w_self.space)
+ overrides_eq_cmp_or_hash = (w_self.lookup('__eq__') or
+ w_self.lookup('__cmp__') or
+ w_self.lookup('__hash__') is not default_hash)
+ if overrides_eq_cmp_or_hash:
+ w_self.compares_by_identity_status = OVERRIDES_EQ_CMP_OR_HASH
+ else:
+ w_self.compares_by_identity_status = COMPARES_BY_IDENTITY
+ return w_self.compares_by_identity_status == COMPARES_BY_IDENTITY
+
def ready(w_self):
for w_base in w_self.bases_w:
if not isinstance(w_base, W_TypeObject):
@@ -269,7 +309,7 @@
w_curr.w_value = w_value
return True
w_value = TypeCell(w_value)
- w_self.mutated()
+ w_self.mutated(name)
w_self.dict_w[name] = w_value
return True
@@ -286,7 +326,7 @@
except KeyError:
return False
else:
- w_self.mutated()
+ w_self.mutated(key)
return True
def lookup(w_self, name):
diff --git a/pypy/objspace/std/typetype.py b/pypy/objspace/std/typetype.py
--- a/pypy/objspace/std/typetype.py
+++ b/pypy/objspace/std/typetype.py
@@ -141,7 +141,7 @@
w_oldbestbase.getname(space))
# invalidate the version_tag of all the current subclasses
- w_type.mutated()
+ w_type.mutated(None)
# now we can go ahead and change 'w_type.bases_w'
saved_bases_w = w_type.bases_w
More information about the pypy-commit
mailing list