[Python-checkins] cpython: Implement PEP 412: Key-sharing dictionaries (closes #13903)

Mon Apr 23 17:24:57 CEST 2012

http://hg.python.org/cpython/rev/6e5855854a2e
changeset:   76485:6e5855854a2e
user:        Benjamin Peterson <benjamin at python.org>
date:        Mon Apr 23 11:24:50 2012 -0400
summary:
  Implement PEP 412: Key-sharing dictionaries (closes #13903)

Patch from Mark Shannon.

files:
  Include/dictobject.h    |    88 +-
  Include/object.h        |     2 +-
  Lib/test/test_dict.py   |    21 +
  Lib/test/test_pprint.py |     4 +
  Lib/test/test_sys.py    |     6 +-
  Misc/NEWS               |     4 +
  Objects/dictnotes.txt   |   221 +--
  Objects/dictobject.c    |  1783 ++++++++++++++++++--------
  Objects/object.c        |    27 +-
  Objects/typeobject.c    |    17 +-
  Python/ceval.c          |    75 +-
  Tools/gdb/libpython.py  |    11 +-
  12 files changed, 1357 insertions(+), 902 deletions(-)

diff --git a/Include/dictobject.h b/Include/dictobject.h
--- a/Include/dictobject.h
+++ b/Include/dictobject.h
@@ -13,78 +13,20 @@
    tuning dictionaries, and several ideas for possible optimizations.
 */
 
-/*
-There are three kinds of slots in the table:
+#ifndef Py_LIMITED_API
 
-1. Unused.  me_key == me_value == NULL
-   Does not hold an active (key, value) pair now and never did.  Unused can
-   transition to Active upon key insertion.  This is the only case in which
-   me_key is NULL, and is each slot's initial state.
+typedef struct _dictkeysobject PyDictKeysObject;
 
-2. Active.  me_key != NULL and me_key != dummy and me_value != NULL
-   Holds an active (key, value) pair.  Active can transition to Dummy upon
-   key deletion.  This is the only case in which me_value != NULL.
+/* The ma_values pointer is NULL for a combined table
+ * or points to an array of PyObject* for a split table
+ */
+typedef struct {
+    PyObject_HEAD
+    Py_ssize_t ma_used;
+    PyDictKeysObject *ma_keys;
+    PyObject **ma_values;
+} PyDictObject;
 
-3. Dummy.  me_key == dummy and me_value == NULL
-   Previously held an active (key, value) pair, but that was deleted and an
-   active pair has not yet overwritten the slot.  Dummy can transition to
-   Active upon key insertion.  Dummy slots cannot be made Unused again
-   (cannot have me_key set to NULL), else the probe sequence in case of
-   collision would have no way to know they were once active.
-
-Note: .popitem() abuses the me_hash field of an Unused or Dummy slot to
-hold a search finger.  The me_hash field of Unused or Dummy slots has no
-meaning otherwise.
-*/
-
-/* PyDict_MINSIZE is the minimum size of a dictionary.  This many slots are
- * allocated directly in the dict object (in the ma_smalltable member).
- * It must be a power of 2, and at least 4.  8 allows dicts with no more
- * than 5 active entries to live in ma_smalltable (and so avoid an
- * additional malloc); instrumentation suggested this suffices for the
- * majority of dicts (consisting mostly of usually-small instance dicts and
- * usually-small dicts created to pass keyword arguments).
- */
-#ifndef Py_LIMITED_API
-#define PyDict_MINSIZE 8
-
-typedef struct {
-    /* Cached hash code of me_key. */
-    Py_hash_t me_hash;
-    PyObject *me_key;
-    PyObject *me_value;
-} PyDictEntry;
-
-/*
-To ensure the lookup algorithm terminates, there must be at least one Unused
-slot (NULL key) in the table.
-The value ma_fill is the number of non-NULL keys (sum of Active and Dummy);
-ma_used is the number of non-NULL, non-dummy keys (== the number of non-NULL
-values == the number of Active items).
-To avoid slowing down lookups on a near-full table, we resize the table when
-it's two-thirds full.
-*/
-typedef struct _dictobject PyDictObject;
-struct _dictobject {
-    PyObject_HEAD
-    Py_ssize_t ma_fill;  /* # Active + # Dummy */
-    Py_ssize_t ma_used;  /* # Active */
-
-    /* The table contains ma_mask + 1 slots, and that's a power of 2.
-     * We store the mask instead of the size because the mask is more
-     * frequently needed.
-     */
-    Py_ssize_t ma_mask;
-
-    /* ma_table points to ma_smalltable for small tables, else to
-     * additional malloc'ed memory.  ma_table is never NULL!  This rule
-     * saves repeated runtime null-tests in the workhorse getitem and
-     * setitem calls.
-     */
-    PyDictEntry *ma_table;
-    PyDictEntry *(*ma_lookup)(PyDictObject *mp, PyObject *key, Py_hash_t hash);
-    PyDictEntry ma_smalltable[PyDict_MINSIZE];
-};
 #endif /* Py_LIMITED_API */
 
 PyAPI_DATA(PyTypeObject) PyDict_Type;
@@ -117,6 +59,8 @@
 PyAPI_FUNC(int) PyDict_Next(
     PyObject *mp, Py_ssize_t *pos, PyObject **key, PyObject **value);
 #ifndef Py_LIMITED_API
+PyDictKeysObject *_PyDict_NewKeysForClass(void);
+PyAPI_FUNC(PyObject *) PyObject_GenericGetDict(PyObject *, void *);
 PyAPI_FUNC(int) _PyDict_Next(
     PyObject *mp, Py_ssize_t *pos, PyObject **key, PyObject **value, Py_hash_t *hash);
 #endif
@@ -131,6 +75,7 @@
 PyAPI_FUNC(PyObject *) _PyDict_NewPresized(Py_ssize_t minused);
 PyAPI_FUNC(void) _PyDict_MaybeUntrack(PyObject *mp);
 PyAPI_FUNC(int) _PyDict_HasOnlyStringKeys(PyObject *mp);
+#define _PyDict_HasSplitTable(d) ((d)->ma_values != NULL)
 
 PyAPI_FUNC(int) PyDict_ClearFreeList(void);
 #endif
@@ -162,6 +107,11 @@
 PyAPI_FUNC(int) _PyDict_SetItemId(PyObject *dp, struct _Py_Identifier *key, PyObject *item);
 PyAPI_FUNC(int) PyDict_DelItemString(PyObject *dp, const char *key);
 
+#ifndef Py_LIMITED_API
+int _PyObjectDict_SetItem(PyTypeObject *tp, PyObject **dictptr, PyObject *name, PyObject *value);
+PyObject *_PyDict_LoadGlobal(PyDictObject *, PyDictObject *, PyObject *);
+#endif
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/Include/object.h b/Include/object.h
--- a/Include/object.h
+++ b/Include/object.h
@@ -449,6 +449,7 @@
                                       see add_operators() in typeobject.c . */
     PyBufferProcs as_buffer;
     PyObject *ht_name, *ht_slots, *ht_qualname;
+    struct _dictkeysobject *ht_cached_keys;
     /* here are optional user slots, followed by the members. */
 } PyHeapTypeObject;
 
@@ -517,7 +518,6 @@
 PyAPI_FUNC(PyObject *) PyObject_GenericGetAttr(PyObject *, PyObject *);
 PyAPI_FUNC(int) PyObject_GenericSetAttr(PyObject *,
                                               PyObject *, PyObject *);
-PyAPI_FUNC(PyObject *) PyObject_GenericGetDict(PyObject *, void *);
 PyAPI_FUNC(int) PyObject_GenericSetDict(PyObject *, PyObject *, void *);
 PyAPI_FUNC(Py_hash_t) PyObject_Hash(PyObject *);
 PyAPI_FUNC(Py_hash_t) PyObject_HashNotImplemented(PyObject *);
diff --git a/Lib/test/test_dict.py b/Lib/test/test_dict.py
--- a/Lib/test/test_dict.py
+++ b/Lib/test/test_dict.py
@@ -321,6 +321,27 @@
         self.assertEqual(hashed2.hash_count, 1)
         self.assertEqual(hashed1.eq_count + hashed2.eq_count, 1)
 
+    def test_setitem_atomic_at_resize(self):
+        class Hashed(object):
+            def __init__(self):
+                self.hash_count = 0
+                self.eq_count = 0
+            def __hash__(self):
+                self.hash_count += 1
+                return 42
+            def __eq__(self, other):
+                self.eq_count += 1
+                return id(self) == id(other)
+        hashed1 = Hashed()
+        # 5 items
+        y = {hashed1: 5, 0: 0, 1: 1, 2: 2, 3: 3}
+        hashed2 = Hashed()
+        # 6th item forces a resize
+        y[hashed2] = []
+        self.assertEqual(hashed1.hash_count, 1)
+        self.assertEqual(hashed2.hash_count, 1)
+        self.assertEqual(hashed1.eq_count + hashed2.eq_count, 1)
+
     def test_popitem(self):
         # dict.popitem()
         for copymode in -1, +1:
diff --git a/Lib/test/test_pprint.py b/Lib/test/test_pprint.py
--- a/Lib/test/test_pprint.py
+++ b/Lib/test/test_pprint.py
@@ -219,6 +219,8 @@
  others.should.not.be: like.this}"""
         self.assertEqual(DottedPrettyPrinter().pformat(o), exp)
 
+    @unittest.expectedFailure
+    #See http://bugs.python.org/issue13907
     @test.support.cpython_only
     def test_set_reprs(self):
         # This test creates a complex arrangement of frozensets and
@@ -241,10 +243,12 @@
         # Consequently, this test is fragile and
         # implementation-dependent.  Small changes to Python's sort
         # algorithm cause the test to fail when it should pass.
+        # XXX Or changes to the dictionary implmentation...
 
         self.assertEqual(pprint.pformat(set()), 'set()')
         self.assertEqual(pprint.pformat(set(range(3))), '{0, 1, 2}')
         self.assertEqual(pprint.pformat(frozenset()), 'frozenset()')
+
         self.assertEqual(pprint.pformat(frozenset(range(3))), 'frozenset({0, 1, 2})')
         cube_repr_tgt = """\
 {frozenset(): frozenset({frozenset({2}), frozenset({0}), frozenset({1})}),
diff --git a/Lib/test/test_sys.py b/Lib/test/test_sys.py
--- a/Lib/test/test_sys.py
+++ b/Lib/test/test_sys.py
@@ -687,9 +687,9 @@
         # method-wrapper (descriptor object)
         check({}.__iter__, size(h + '2P'))
         # dict
-        check({}, size(h + '3P2P' + 8*'P2P'))
+        check({}, size(h + '3P' + '4P' + 8*'P2P'))
         longdict = {1:1, 2:2, 3:3, 4:4, 5:5, 6:6, 7:7, 8:8}
-        check(longdict, size(h + '3P2P' + 8*'P2P') + 16*size('P2P'))
+        check(longdict, size(h + '3P' + '4P') + 16*size('P2P'))
         # dictionary-keyiterator
         check({}.keys(), size(h + 'P'))
         # dictionary-valueiterator
@@ -831,7 +831,7 @@
         # type
         # (PyTypeObject + PyNumberMethods + PyMappingMethods +
         #  PySequenceMethods + PyBufferProcs)
-        s = size(vh + 'P2P15Pl4PP9PP11PI') + size('16Pi17P 3P 10P 2P 3P')
+        s = size(vh + 'P2P15Pl4PP9PP11PIP') + size('16Pi17P 3P 10P 2P 3P')
         check(int, s)
         # class
         class newstyleclass(object): pass
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,10 @@
 Core and Builtins
 -----------------
 
+- Issue #13903: Implement PEP 412. Individual dictionary instances can now share
+  their keys with other dictionaries. Classes take advantage of this to share
+  their instance dictionary keys for improved memory and performance.
+
 - Issue #14630: Fix a memory access bug for instances of a subclass of int
   with value 0.
 
diff --git a/Objects/dictnotes.txt b/Objects/dictnotes.txt
--- a/Objects/dictnotes.txt
+++ b/Objects/dictnotes.txt
@@ -1,7 +1,6 @@
-NOTES ON OPTIMIZING DICTIONARIES
+NOTES ON DICTIONARIES
 ================================
 
-
 Principal Use Cases for Dictionaries
 ------------------------------------
 
@@ -21,7 +20,7 @@
 
 Builtins
     Frequent reads.  Almost never written.
-    Size 126 interned strings (as of Py2.3b1).
+    About 150 interned strings (as of Py3.3).
     A few keys are accessed much more frequently than others.
 
 Uniquification
@@ -59,44 +58,43 @@
     Characterized by deletions interspersed with adds and replacements.
     Performance benefits greatly from the re-use of dummy entries.
 
+Data Layout
+-----------
 
-Data Layout (assuming a 32-bit box with 64 bytes per cache line)
-----------------------------------------------------------------
-
-Smalldicts (8 entries) are attached to the dictobject structure
-and the whole group nearly fills two consecutive cache lines.
-
-Larger dicts use the first half of the dictobject structure (one cache
-line) and a separate, continuous block of entries (at 12 bytes each
-for a total of 5.333 entries per cache line).
+Dictionaries are composed of 3 components:
+The dictobject struct itself
+A dict-keys object (keys & hashes)
+A values array
 
 
 Tunable Dictionary Parameters
 -----------------------------
 
-* PyDict_MINSIZE.  Currently set to 8.
-    Must be a power of two.  New dicts have to zero-out every cell.
-    Each additional 8 consumes 1.5 cache lines.  Increasing improves
-    the sparseness of small dictionaries but costs time to read in
-    the additional cache lines if they are not already in cache.
-    That case is common when keyword arguments are passed.
+* PyDict_STARTSIZE. Starting size of dict (unless an instance dict).
+    Currently set to 8. Must be a power of two.
+    New dicts have to zero-out every cell.
+    Increasing improves the sparseness of small dictionaries but costs
+    time to read in the additional cache lines if they are not already
+    in cache. That case is common when keyword arguments are passed.
+    Prior to version 3.3, PyDict_MINSIZE was used as the starting size
+    of a new dict.
 
-* Maximum dictionary load in PyDict_SetItem.  Currently set to 2/3.
-    Increasing this ratio makes dictionaries more dense resulting
-    in more collisions.  Decreasing it improves sparseness at the
-    expense of spreading entries over more cache lines and at the
+* PyDict_MINSIZE. Minimum size of a dict.
+    Currently set to 4 (to keep instance dicts small).
+    Must be a power of two. Prior to version 3.3, PyDict_MINSIZE was
+    set to 8.
+
+* USABLE_FRACTION. Maximum dictionary load in PyDict_SetItem.
+    Currently set to 2/3. Increasing this ratio makes dictionaries more
+    dense resulting in more collisions.  Decreasing it improves sparseness
+    at the expense of spreading entries over more cache lines and at the
     cost of total memory consumed.
 
-    The load test occurs in highly time sensitive code.  Efforts
-    to make the test more complex (for example, varying the load
-    for different sizes) have degraded performance.
-
 * Growth rate upon hitting maximum load.  Currently set to *2.
-    Raising this to *4 results in half the number of resizes,
-    less effort to resize, better sparseness for some (but not
-    all dict sizes), and potentially doubles memory consumption
-    depending on the size of the dictionary.  Setting to *4
-    eliminates every other resize step.
+    Raising this to *4 results in half the number of resizes, less
+    effort to resize, better sparseness for some (but not all dict sizes),
+    and potentially doubles memory consumption depending on the size of
+    the dictionary.  Setting to *4 eliminates every other resize step.
 
 * Maximum sparseness (minimum dictionary load).  What percentage
     of entries can be unused before the dictionary shrinks to
@@ -126,8 +124,8 @@
 Also, every dictionary iterates at least twice, once for the memset()
 when it is created and once by dealloc().
 
-Dictionary operations involving only a single key can be O(1) unless 
-resizing is possible.  By checking for a resize only when the 
+Dictionary operations involving only a single key can be O(1) unless
+resizing is possible.  By checking for a resize only when the
 dictionary can grow (and may *require* resizing), other operations
 remain O(1), and the odds of resize thrashing or memory fragmentation
 are reduced. In particular, an algorithm that empties a dictionary
@@ -135,136 +133,51 @@
 not be necessary at all because the dictionary is eventually
 discarded entirely.
 
+The key differences between this implementation and earlier versions are:
+    1. The table can be split into two parts, the keys and the values.
+
+    2. There is an additional key-value combination: (key, NULL).
+       Unlike (<dummy>, NULL) which represents a deleted value, (key, NULL)
+       represented a yet to be inserted value. This combination can only occur
+       when the table is split.
+
+    3. No small table embedded in the dict,
+       as this would make sharing of key-tables impossible.
+
+
+These changes have the following consequences.
+   1. General dictionaries are slightly larger.
+
+   2. All object dictionaries of a single class can share a single key-table,
+      saving about 60% memory for such cases.
 
 Results of Cache Locality Experiments
--------------------------------------
+--------------------------------------
 
-When an entry is retrieved from memory, 4.333 adjacent entries are also
-retrieved into a cache line.  Since accessing items in cache is *much*
-cheaper than a cache miss, an enticing idea is to probe the adjacent
-entries as a first step in collision resolution.  Unfortunately, the
-introduction of any regularity into collision searches results in more
-collisions than the current random chaining approach.
+Experiments on an earlier design of dictionary, in which all tables were
+combined, showed the following:
 
-Exploiting cache locality at the expense of additional collisions fails
-to payoff when the entries are already loaded in cache (the expense
-is paid with no compensating benefit).  This occurs in small dictionaries
-where the whole dictionary fits into a pair of cache lines.  It also
-occurs frequently in large dictionaries which have a common access pattern
-where some keys are accessed much more frequently than others.  The
-more popular entries *and* their collision chains tend to remain in cache.
+  When an entry is retrieved from memory, several adjacent entries are also
+  retrieved into a cache line.  Since accessing items in cache is *much*
+  cheaper than a cache miss, an enticing idea is to probe the adjacent
+  entries as a first step in collision resolution.  Unfortunately, the
+  introduction of any regularity into collision searches results in more
+  collisions than the current random chaining approach.
 
-To exploit cache locality, change the collision resolution section
-in lookdict() and lookdict_string().  Set i^=1 at the top of the
-loop and move the  i = (i << 2) + i + perturb + 1 to an unrolled
-version of the loop.
+  Exploiting cache locality at the expense of additional collisions fails
+  to payoff when the entries are already loaded in cache (the expense
+  is paid with no compensating benefit).  This occurs in small dictionaries
+  where the whole dictionary fits into a pair of cache lines.  It also
+  occurs frequently in large dictionaries which have a common access pattern
+  where some keys are accessed much more frequently than others.  The
+  more popular entries *and* their collision chains tend to remain in cache.
 
-This optimization strategy can be leveraged in several ways:
+  To exploit cache locality, change the collision resolution section
+  in lookdict() and lookdict_string().  Set i^=1 at the top of the
+  loop and move the  i = (i << 2) + i + perturb + 1 to an unrolled
+  version of the loop.
 
-* If the dictionary is kept sparse (through the tunable parameters),
-then the occurrence of additional collisions is lessened.
+For split tables, the above will apply to the keys, but the value will
+always be in a different cache line from the key.
 
-* If lookdict() and lookdict_string() are specialized for small dicts
-and for largedicts, then the versions for large_dicts can be given
-an alternate search strategy without increasing collisions in small dicts
-which already have the maximum benefit of cache locality.
 
-* If the use case for a dictionary is known to have a random key
-access pattern (as opposed to a more common pattern with a Zipf's law
-distribution), then there will be more benefit for large dictionaries
-because any given key is no more likely than another to already be
-in cache.
-
-* In use cases with paired accesses to the same key, the second access
-is always in cache and gets no benefit from efforts to further improve
-cache locality.
-
-Optimizing the Search of Small Dictionaries
--------------------------------------------
-
-If lookdict() and lookdict_string() are specialized for smaller dictionaries,
-then a custom search approach can be implemented that exploits the small
-search space and cache locality.
-
-* The simplest example is a linear search of contiguous entries.  This is
-  simple to implement, guaranteed to terminate rapidly, never searches
-  the same entry twice, and precludes the need to check for dummy entries.
-
-* A more advanced example is a self-organizing search so that the most
-  frequently accessed entries get probed first.  The organization
-  adapts if the access pattern changes over time.  Treaps are ideally
-  suited for self-organization with the most common entries at the
-  top of the heap and a rapid binary search pattern.  Most probes and
-  results are all located at the top of the tree allowing them all to
-  be located in one or two cache lines.
-
-* Also, small dictionaries may be made more dense, perhaps filling all
-  eight cells to take the maximum advantage of two cache lines.
-
-
-Strategy Pattern
-----------------
-
-Consider allowing the user to set the tunable parameters or to select a
-particular search method.  Since some dictionary use cases have known
-sizes and access patterns, the user may be able to provide useful hints.
-
-1) For example, if membership testing or lookups dominate runtime and memory
-   is not at a premium, the user may benefit from setting the maximum load
-   ratio at 5% or 10% instead of the usual 66.7%.  This will sharply
-   curtail the number of collisions but will increase iteration time.
-   The builtin namespace is a prime example of a dictionary that can
-   benefit from being highly sparse.
-
-2) Dictionary creation time can be shortened in cases where the ultimate
-   size of the dictionary is known in advance.  The dictionary can be
-   pre-sized so that no resize operations are required during creation.
-   Not only does this save resizes, but the key insertion will go
-   more quickly because the first half of the keys will be inserted into
-   a more sparse environment than before.  The preconditions for this
-   strategy arise whenever a dictionary is created from a key or item
-   sequence and the number of *unique* keys is known.
-
-3) If the key space is large and the access pattern is known to be random,
-   then search strategies exploiting cache locality can be fruitful.
-   The preconditions for this strategy arise in simulations and
-   numerical analysis.
-
-4) If the keys are fixed and the access pattern strongly favors some of
-   the keys, then the entries can be stored contiguously and accessed
-   with a linear search or treap.  This exploits knowledge of the data,
-   cache locality, and a simplified search routine.  It also eliminates
-   the need to test for dummy entries on each probe.  The preconditions
-   for this strategy arise in symbol tables and in the builtin dictionary.
-
-
-Readonly Dictionaries
----------------------
-Some dictionary use cases pass through a build stage and then move to a
-more heavily exercised lookup stage with no further changes to the
-dictionary.
-
-An idea that emerged on python-dev is to be able to convert a dictionary
-to a read-only state.  This can help prevent programming errors and also
-provide knowledge that can be exploited for lookup optimization.
-
-The dictionary can be immediately rebuilt (eliminating dummy entries),
-resized (to an appropriate level of sparseness), and the keys can be
-jostled (to minimize collisions).  The lookdict() routine can then
-eliminate the test for dummy entries (saving about 1/4 of the time
-spent in the collision resolution loop).
-
-An additional possibility is to insert links into the empty spaces
-so that dictionary iteration can proceed in len(d) steps instead of
-(mp->mask + 1) steps.  Alternatively, a separate tuple of keys can be
-kept just for iteration.
-
-
-Caching Lookups
----------------
-The idea is to exploit key access patterns by anticipating future lookups
-based on previous lookups.
-
-The simplest incarnation is to save the most recently accessed entry.
-This gives optimal performance for use cases where every get is followed
-by a set or del to the same key.
diff --git a/Objects/dictobject.c b/Objects/dictobject.c
--- a/Objects/dictobject.c
+++ b/Objects/dictobject.c
@@ -7,9 +7,93 @@
    tuning dictionaries, and several ideas for possible optimizations.
 */
 
+
+/*
+There are four kinds of slots in the table:
+
+1. Unused.  me_key == me_value == NULL
+   Does not hold an active (key, value) pair now and never did.  Unused can
+   transition to Active upon key insertion.  This is the only case in which
+   me_key is NULL, and is each slot's initial state.
+
+2. Active.  me_key != NULL and me_key != dummy and me_value != NULL
+   Holds an active (key, value) pair.  Active can transition to Dummy or
+   Pending upon key deletion (for combined and split tables respectively).
+   This is the only case in which me_value != NULL.
+
+3. Dummy.  me_key == dummy and me_value == NULL
+   Previously held an active (key, value) pair, but that was deleted and an
+   active pair has not yet overwritten the slot.  Dummy can transition to
+   Active upon key insertion.  Dummy slots cannot be made Unused again
+   (cannot have me_key set to NULL), else the probe sequence in case of
+   collision would have no way to know they were once active.
+
+4. Pending. Not yet inserted or deleted from a split-table.
+   key != NULL, key != dummy and value == NULL
+
+The DictObject can be in one of two forms.
+Either:
+  A combined table:
+    ma_values == NULL, dk_refcnt == 1.
+    Values are stored in the me_value field of the PyDictKeysObject.
+    Slot kind 4 is not allowed i.e.
+        key != NULL, key != dummy and value == NULL is illegal.
+Or:
+  A split table:
+    ma_values != NULL, dk_refcnt >= 1
+    Values are stored in the ma_values array.
+    Only string (unicode) keys are allowed, no <dummy> keys are present.
+
+Note: .popitem() abuses the me_hash field of an Unused or Dummy slot to
+hold a search finger.  The me_hash field of Unused or Dummy slots has no
+meaning otherwise. As a consequence of this popitem always converts the dict
+to the combined-table form.
+*/
+
+/* PyDict_MINSIZE_SPLIT is the minimum size of a split dictionary.
+ * It must be a power of 2, and at least 4.
+ * Resizing of split dictionaries is very rare, so the saving memory is more
+ * important than the cost of resizing.
+ */
+#define PyDict_MINSIZE_SPLIT 4
+
+/* PyDict_MINSIZE_COMBINED is the starting size for any new, non-split dict.
+ * 8 allows dicts with no more than 5 active entries; experiments suggested
+ * this suffices for the majority of dicts (consisting mostly of usually-small
+ * dicts created to pass keyword arguments).
+ * Making this 8, rather than 4 reduces the number of resizes for most
+ * dictionaries, without any significant extra memory use.
+ */
+#define PyDict_MINSIZE_COMBINED 8
+
 #include "Python.h"
 #include "stringlib/eq.h"
 
+typedef struct {
+    /* Cached hash code of me_key. */
+    Py_hash_t me_hash;
+    PyObject *me_key;
+    PyObject *me_value; /* This field is only meaningful for combined tables */
+} PyDictKeyEntry;
+
+typedef PyDictKeyEntry *(*dict_lookup_func)
+(PyDictObject *mp, PyObject *key, Py_hash_t hash, PyObject ***value_addr);
+
+struct _dictkeysobject {
+    Py_ssize_t dk_refcnt;
+    Py_ssize_t dk_size;
+    dict_lookup_func dk_lookup;
+    Py_ssize_t dk_usable;
+    PyDictKeyEntry dk_entries[1];
+};
+
+
+/*
+To ensure the lookup algorithm terminates, there must be at least one Unused
+slot (NULL key) in the table.
+To avoid slowing down lookups on a near-full table, we resize the table when
+it's USABLE_FRACTION (currently two-thirds) full.
+*/
 
 /* Set a key error with the specified argument, wrapping it in a
  * tuple automatically so that tuple keys are not unpacked as the
@@ -25,10 +109,6 @@
     Py_DECREF(tup);
 }
 
-/* Define this out if you don't want conversion statistics on exit. */
-#undef SHOW_CONVERSION_COUNTS
-
-/* See large comment block below.  This must be >= 1. */
 #define PERTURB_SHIFT 5
 
 /*
@@ -126,8 +206,13 @@
 
 */
 
-/* Object used as dummy key to fill deleted entries */
-static PyObject *dummy = NULL; /* Initialized by first call to newPyDictObject() */
+/* Object used as dummy key to fill deleted entries
+ * This could be any unique object,
+ * use a custom type in order to minimise coupling.
+*/
+static PyObject _dummy_struct;
+
+#define dummy (&_dummy_struct)
 
 #ifdef Py_REF_DEBUG
 PyObject *
@@ -138,77 +223,17 @@
 #endif
 
 /* forward declarations */
-static PyDictEntry *
-lookdict_unicode(PyDictObject *mp, PyObject *key, Py_hash_t hash);
-
-#ifdef SHOW_CONVERSION_COUNTS
-static long created = 0L;
-static long converted = 0L;
-
-static void
-show_counts(void)
-{
-    fprintf(stderr, "created %ld string dicts\n", created);
-    fprintf(stderr, "converted %ld to normal dicts\n", converted);
-    fprintf(stderr, "%.2f%% conversion rate\n", (100.0*converted)/created);
-}
-#endif
-
-/* Debug statistic to compare allocations with reuse through the free list */
-#undef SHOW_ALLOC_COUNT
-#ifdef SHOW_ALLOC_COUNT
-static size_t count_alloc = 0;
-static size_t count_reuse = 0;
-
-static void
-show_alloc(void)
-{
-    fprintf(stderr, "Dict allocations: %" PY_FORMAT_SIZE_T "d\n",
-        count_alloc);
-    fprintf(stderr, "Dict reuse through freelist: %" PY_FORMAT_SIZE_T
-        "d\n", count_reuse);
-    fprintf(stderr, "%.2f%% reuse rate\n\n",
-        (100.0*count_reuse/(count_alloc+count_reuse)));
-}
-#endif
-
-/* Debug statistic to count GC tracking of dicts */
-#ifdef SHOW_TRACK_COUNT
-static Py_ssize_t count_untracked = 0;
-static Py_ssize_t count_tracked = 0;
-
-static void
-show_track(void)
-{
-    fprintf(stderr, "Dicts created: %" PY_FORMAT_SIZE_T "d\n",
-        count_tracked + count_untracked);
-    fprintf(stderr, "Dicts tracked by the GC: %" PY_FORMAT_SIZE_T
-        "d\n", count_tracked);
-    fprintf(stderr, "%.2f%% dict tracking rate\n\n",
-        (100.0*count_tracked/(count_untracked+count_tracked)));
-}
-#endif
-
-
-/* Initialization macros.
-   There are two ways to create a dict:  PyDict_New() is the main C API
-   function, and the tp_new slot maps to dict_new().  In the latter case we
-   can save a little time over what PyDict_New does because it's guaranteed
-   that the PyDictObject struct is already zeroed out.
-   Everyone except dict_new() should use EMPTY_TO_MINSIZE (unless they have
-   an excellent reason not to).
-*/
-
-#define INIT_NONZERO_DICT_SLOTS(mp) do {                                \
-    (mp)->ma_table = (mp)->ma_smalltable;                               \
-    (mp)->ma_mask = PyDict_MINSIZE - 1;                                 \
-    } while(0)
-
-#define EMPTY_TO_MINSIZE(mp) do {                                       \
-    memset((mp)->ma_smalltable, 0, sizeof((mp)->ma_smalltable));        \
-    (mp)->ma_used = (mp)->ma_fill = 0;                                  \
-    INIT_NONZERO_DICT_SLOTS(mp);                                        \
-    } while(0)
+static PyDictKeyEntry *lookdict(PyDictObject *mp, PyObject *key,
+                                Py_hash_t hash, PyObject ***value_addr);
+static PyDictKeyEntry *lookdict_unicode(PyDictObject *mp, PyObject *key,
+                                        Py_hash_t hash, PyObject ***value_addr);
+static PyDictKeyEntry *
+lookdict_unicode_nodummy(PyDictObject *mp, PyObject *key,
+                         Py_hash_t hash, PyObject ***value_addr);
+static PyDictKeyEntry *lookdict_split(PyDictObject *mp, PyObject *key,
+                                      Py_hash_t hash, PyObject ***value_addr);
+
+static int dictresize(PyDictObject *mp, Py_ssize_t minused);
 
 /* Dictionary reuse scheme to save calls to malloc, free, and memset */
 #ifndef PyDict_MAXFREELIST
@@ -236,61 +261,149 @@
     PyDict_ClearFreeList();
 }
 
-PyObject *
-PyDict_New(void)
+#define DK_INCREF(dk) (++(dk)->dk_refcnt)
+#define DK_DECREF(dk) if ((--(dk)->dk_refcnt) == 0) free_keys_object(dk)
+#define DK_SIZE(dk) ((dk)->dk_size)
+#define DK_MASK(dk) (((dk)->dk_size)-1)
+#define IS_POWER_OF_2(x) (((x) & (x-1)) == 0)
+
+/* USABLE_FRACTION must obey the following:
+ *     (0 < USABLE_FRACTION(n) < n) for all n >= 2
+ *
+ * USABLE_FRACTION should be very quick to calculate.
+ * Fractions around 5/8 to 2/3 seem to work well in practice.
+ */
+
+/* Use (2n+1)/3 rather than 2n+3 because: it makes no difference for
+ * combined tables (the two fractions round to the same number n < ),
+ * but 2*4/3 is 2 whereas (2*4+1)/3 is 3 which potentially saves quite
+ * a lot of space for small, split tables */
+#define USABLE_FRACTION(n) ((((n) << 1)+1)/3)
+
+/* Alternative fraction that is otherwise close enough to (2n+1)/3 to make
+ * little difference. 8 * 2/3 == 8 * 5/8 == 5. 16 * 2/3 == 16 * 5/8 == 10.
+ * 32 * 2/3 = 21, 32 * 5/8 = 20.
+ * Its advantage is that it is faster to compute on machines with slow division.
+ * #define USABLE_FRACTION(n) (((n) >> 1) + ((n) >> 2) - ((n) >> 3))
+*/
+
+
+#define ENSURE_ALLOWS_DELETIONS(d) \
+    if ((d)->ma_keys->dk_lookup == lookdict_unicode_nodummy) { \
+        (d)->ma_keys->dk_lookup = lookdict_unicode; \
+    }
+
+/* This immutable, empty PyDictKeysObject is used for PyDict_Clear()
+ * (which cannot fail and thus can do no allocation).
+ */
+static PyDictKeysObject empty_keys_struct = {
+        2, /* dk_refcnt 1 for this struct, 1 for dummy_struct */
+        1, /* dk_size */
+        lookdict_split, /* dk_lookup */
+        0, /* dk_usable (immutable) */
+        {
+            { 0, 0, 0 } /* dk_entries (empty) */
+        }
+};
+
+static PyObject *empty_values[1] = { NULL };
+
+#define Py_EMPTY_KEYS &empty_keys_struct
+
+static PyDictKeysObject *new_keys_object(Py_ssize_t size)
 {
-    register PyDictObject *mp;
-    if (dummy == NULL) { /* Auto-initialize dummy */
-        dummy = PyUnicode_FromString("<dummy key>");
-        if (dummy == NULL)
-            return NULL;
-#ifdef SHOW_CONVERSION_COUNTS
-        Py_AtExit(show_counts);
-#endif
-#ifdef SHOW_ALLOC_COUNT
-        Py_AtExit(show_alloc);
-#endif
-#ifdef SHOW_TRACK_COUNT
-        Py_AtExit(show_track);
-#endif
+    PyDictKeysObject *dk;
+    Py_ssize_t i;
+    PyDictKeyEntry *ep0;
+
+    assert(size >= PyDict_MINSIZE_SPLIT);
+    assert(IS_POWER_OF_2(size));
+    dk = PyMem_MALLOC(sizeof(PyDictKeysObject) +
+                      sizeof(PyDictKeyEntry) * (size-1));
+    if (dk == NULL) {
+        PyErr_NoMemory();
+        return NULL;
     }
+    dk->dk_refcnt = 1;
+    dk->dk_size = size;
+    dk->dk_usable = USABLE_FRACTION(size);
+    ep0 = &dk->dk_entries[0];
+    /* Hash value of slot 0 is used by popitem, so it must be initialized */
+    ep0->me_hash = 0;
+    for (i = 0; i < size; i++) {
+        ep0[i].me_key = NULL;
+        ep0[i].me_value = NULL;
+    }
+    dk->dk_lookup = lookdict_unicode_nodummy;
+    return dk;
+}
+
+static void
+free_keys_object(PyDictKeysObject *keys)
+{
+    PyDictKeyEntry *entries = &keys->dk_entries[0];
+    Py_ssize_t i, n;
+    for (i = 0, n = DK_SIZE(keys); i < n; i++) {
+        Py_XDECREF(entries[i].me_key);
+        Py_XDECREF(entries[i].me_value);
+    }
+    PyMem_FREE(keys);
+}
+
+#define new_values(size) PyMem_NEW(PyObject *, size)
+
+#define free_values(values) PyMem_FREE(values)
+
+/* Consumes a reference to the keys object */
+static PyObject *
+new_dict(PyDictKeysObject *keys, PyObject **values)
+{
+    PyDictObject *mp;
     if (numfree) {
         mp = free_list[--numfree];
         assert (mp != NULL);
         assert (Py_TYPE(mp) == &PyDict_Type);
         _Py_NewReference((PyObject *)mp);
-        if (mp->ma_fill) {
-            EMPTY_TO_MINSIZE(mp);
-        } else {
-            /* At least set ma_table and ma_mask; these are wrong
-               if an empty but presized dict is added to freelist */
-            INIT_NONZERO_DICT_SLOTS(mp);
+    }
+    else {
+        mp = PyObject_GC_New(PyDictObject, &PyDict_Type);
+        if (mp == NULL) {
+            DK_DECREF(keys);
+            free_values(values);
+            return NULL;
         }
-        assert (mp->ma_used == 0);
-        assert (mp->ma_table == mp->ma_smalltable);
-        assert (mp->ma_mask == PyDict_MINSIZE - 1);
-#ifdef SHOW_ALLOC_COUNT
-        count_reuse++;
-#endif
-    } else {
-        mp = PyObject_GC_New(PyDictObject, &PyDict_Type);
-        if (mp == NULL)
-            return NULL;
-        EMPTY_TO_MINSIZE(mp);
-#ifdef SHOW_ALLOC_COUNT
-        count_alloc++;
-#endif
     }
-    mp->ma_lookup = lookdict_unicode;
-#ifdef SHOW_TRACK_COUNT
-    count_untracked++;
-#endif
-#ifdef SHOW_CONVERSION_COUNTS
-    ++created;
-#endif
+    mp->ma_keys = keys;
+    mp->ma_values = values;
+    mp->ma_used = 0;
     return (PyObject *)mp;
 }
 
+/* Consumes a reference to the keys object */
+static PyObject *
+new_dict_with_shared_keys(PyDictKeysObject *keys)
+{
+    PyObject **values;
+    Py_ssize_t i, size;
+
+    size = DK_SIZE(keys);
+    values = new_values(size);
+    if (values == NULL) {
+        DK_DECREF(keys);
+        return PyErr_NoMemory();
+    }
+    for (i = 0; i < size; i++) {
+        values[i] = NULL;
+    }
+    return new_dict(keys, values);
+}
+
+PyObject *
+PyDict_New(void)
+{
+    return new_dict(new_keys_object(PyDict_MINSIZE_COMBINED), NULL);
+}
+
 /*
 The basic lookup function used by all operations.
 This is based on Algorithm D from Knuth Vol. 3, Sec. 6.4.
@@ -309,29 +422,32 @@
 lookdict() is general-purpose, and may return NULL if (and only if) a
 comparison raises an exception (this was new in Python 2.5).
 lookdict_unicode() below is specialized to string keys, comparison of which can
-never raise an exception; that function can never return NULL.  For both, when
-the key isn't found a PyDictEntry* is returned for which the me_value field is
-NULL; this is the slot in the dict at which the key would have been found, and
-the caller can (if it wishes) add the <key, value> pair to the returned
-PyDictEntry*.
+never raise an exception; that function can never return NULL.
+lookdict_unicode_nodummy is further specialized for string keys that cannot be
+the <dummy> value.
+For both, when the key isn't found a PyDictEntry* is returned
+where the key would have been found, *value_addr points to the matching value
+slot.
 */
-static PyDictEntry *
-lookdict(PyDictObject *mp, PyObject *key, register Py_hash_t hash)
+static PyDictKeyEntry *
+lookdict(PyDictObject *mp, PyObject *key,
+         Py_hash_t hash, PyObject ***value_addr)
 {
     register size_t i;
     register size_t perturb;
-    register PyDictEntry *freeslot;
-    register size_t mask = (size_t)mp->ma_mask;
-    PyDictEntry *ep0 = mp->ma_table;
-    register PyDictEntry *ep;
+    register PyDictKeyEntry *freeslot;
+    register size_t mask = DK_MASK(mp->ma_keys);
+    PyDictKeyEntry *ep0 = &mp->ma_keys->dk_entries[0];
+    register PyDictKeyEntry *ep;
     register int cmp;
     PyObject *startkey;
 
     i = (size_t)hash & mask;
     ep = &ep0[i];
-    if (ep->me_key == NULL || ep->me_key == key)
+    if (ep->me_key == NULL || ep->me_key == key) {
+        *value_addr = &ep->me_value;
         return ep;
-
+    }
     if (ep->me_key == dummy)
         freeslot = ep;
     else {
@@ -342,9 +458,11 @@
             Py_DECREF(startkey);
             if (cmp < 0)
                 return NULL;
-            if (ep0 == mp->ma_table && ep->me_key == startkey) {
-                if (cmp > 0)
+            if (ep0 == mp->ma_keys->dk_entries && ep->me_key == startkey) {
+                if (cmp > 0) {
+                    *value_addr = &ep->me_value;
                     return ep;
+                }
             }
             else {
                 PyErr_SetString(PyExc_RuntimeError,
@@ -360,20 +478,33 @@
     for (perturb = hash; ; perturb >>= PERTURB_SHIFT) {
         i = (i << 2) + i + perturb + 1;
         ep = &ep0[i & mask];
-        if (ep->me_key == NULL)
-            return freeslot == NULL ? ep : freeslot;
-        if (ep->me_key == key)
+        if (ep->me_key == NULL) {
+            if (freeslot == NULL) {
+                *value_addr = &ep->me_value;
+                return ep;
+            } else {
+                *value_addr = &freeslot->me_value;
+                return freeslot;
+            }
+        }
+        if (ep->me_key == key) {
+            *value_addr = &ep->me_value;
             return ep;
+        }
         if (ep->me_hash == hash && ep->me_key != dummy) {
             startkey = ep->me_key;
             Py_INCREF(startkey);
             cmp = PyObject_RichCompareBool(startkey, key, Py_EQ);
             Py_DECREF(startkey);
-            if (cmp < 0)
+            if (cmp < 0) {
+                *value_addr = NULL;
                 return NULL;
-            if (ep0 == mp->ma_table && ep->me_key == startkey) {
-                if (cmp > 0)
+            }
+            if (ep0 == mp->ma_keys->dk_entries && ep->me_key == startkey) {
+                if (cmp > 0) {
+                    *value_addr = &ep->me_value;
                     return ep;
+                }
             }
             else {
                 PyErr_SetString(PyExc_RuntimeError,
@@ -388,46 +519,39 @@
     return 0;
 }
 
-/*
- * Hacked up version of lookdict which can assume keys are always
- * unicodes; this assumption allows testing for errors during
- * PyObject_RichCompareBool() to be dropped; unicode-unicode
- * comparisons never raise exceptions.  This also means we don't need
- * to go through PyObject_RichCompareBool(); we can always use
- * unicode_eq() directly.
- *
- * This is valuable because dicts with only unicode keys are very common.
- */
-static PyDictEntry *
-lookdict_unicode(PyDictObject *mp, PyObject *key, register Py_hash_t hash)
+/* Specialized version for string-only keys */
+static PyDictKeyEntry *
+lookdict_unicode(PyDictObject *mp, PyObject *key,
+                 Py_hash_t hash, PyObject ***value_addr)
 {
     register size_t i;
     register size_t perturb;
-    register PyDictEntry *freeslot;
-    register size_t mask = (size_t)mp->ma_mask;
-    PyDictEntry *ep0 = mp->ma_table;
-    register PyDictEntry *ep;
+    register PyDictKeyEntry *freeslot;
+    register size_t mask = DK_MASK(mp->ma_keys);
+    PyDictKeyEntry *ep0 = &mp->ma_keys->dk_entries[0];
+    register PyDictKeyEntry *ep;
 
     /* Make sure this function doesn't have to handle non-unicode keys,
        including subclasses of str; e.g., one reason to subclass
        unicodes is to override __eq__, and for speed we don't cater to
        that here. */
     if (!PyUnicode_CheckExact(key)) {
-#ifdef SHOW_CONVERSION_COUNTS
-        ++converted;
-#endif
-        mp->ma_lookup = lookdict;
-        return lookdict(mp, key, hash);
+        mp->ma_keys->dk_lookup = lookdict;
+        return lookdict(mp, key, hash, value_addr);
     }
     i = (size_t)hash & mask;
     ep = &ep0[i];
-    if (ep->me_key == NULL || ep->me_key == key)
+    if (ep->me_key == NULL || ep->me_key == key) {
+        *value_addr = &ep->me_value;
         return ep;
+    }
     if (ep->me_key == dummy)
         freeslot = ep;
     else {
-        if (ep->me_hash == hash && unicode_eq(ep->me_key, key))
+        if (ep->me_hash == hash && unicode_eq(ep->me_key, key)) {
+            *value_addr = &ep->me_value;
             return ep;
+        }
         freeslot = NULL;
     }
 
@@ -436,13 +560,22 @@
     for (perturb = hash; ; perturb >>= PERTURB_SHIFT) {
         i = (i << 2) + i + perturb + 1;
         ep = &ep0[i & mask];
-        if (ep->me_key == NULL)
-            return freeslot == NULL ? ep : freeslot;
+        if (ep->me_key == NULL) {
+            if (freeslot == NULL) {
+                *value_addr = &ep->me_value;
+                return ep;
+            } else {
+                *value_addr = &freeslot->me_value;
+                return freeslot;
+            }
+        }
         if (ep->me_key == key
             || (ep->me_hash == hash
             && ep->me_key != dummy
-            && unicode_eq(ep->me_key, key)))
+            && unicode_eq(ep->me_key, key))) {
+            *value_addr = &ep->me_value;
             return ep;
+        }
         if (ep->me_key == dummy && freeslot == NULL)
             freeslot = ep;
     }
@@ -450,6 +583,88 @@
     return 0;
 }
 
+/* Faster version of lookdict_unicode when it is known that no <dummy> keys
+ * will be present. */
+static PyDictKeyEntry *
+lookdict_unicode_nodummy(PyDictObject *mp, PyObject *key,
+                         Py_hash_t hash, PyObject ***value_addr)
+{
+    register size_t i;
+    register size_t perturb;
+    register size_t mask = DK_MASK(mp->ma_keys);
+    PyDictKeyEntry *ep0 = &mp->ma_keys->dk_entries[0];
+    register PyDictKeyEntry *ep;
+
+    /* Make sure this function doesn't have to handle non-unicode keys,
+       including subclasses of str; e.g., one reason to subclass
+       unicodes is to override __eq__, and for speed we don't cater to
+       that here. */
+    if (!PyUnicode_CheckExact(key)) {
+        mp->ma_keys->dk_lookup = lookdict;
+        return lookdict(mp, key, hash, value_addr);
+    }
+    i = (size_t)hash & mask;
+    ep = &ep0[i];
+    assert(ep->me_key == NULL || PyUnicode_CheckExact(ep->me_key));
+    if (ep->me_key == NULL || ep->me_key == key ||
+        (ep->me_hash == hash && unicode_eq(ep->me_key, key))) {
+        *value_addr = &ep->me_value;
+        return ep;
+    }
+    for (perturb = hash; ; perturb >>= PERTURB_SHIFT) {
+        i = (i << 2) + i + perturb + 1;
+        ep = &ep0[i & mask];
+        assert(ep->me_key == NULL || PyUnicode_CheckExact(ep->me_key));
+        if (ep->me_key == NULL || ep->me_key == key ||
+            (ep->me_hash == hash && unicode_eq(ep->me_key, key))) {
+            *value_addr = &ep->me_value;
+            return ep;
+        }
+    }
+    assert(0);          /* NOT REACHED */
+    return 0;
+}
+
+/* Version of lookdict for split tables.
+ * All split tables and only split tables use this lookup function.
+ * Split tables only contain unicode keys and no dummy keys,
+ * so algorithm is the same as lookdict_unicode_nodummy.
+ */
+static PyDictKeyEntry *
+lookdict_split(PyDictObject *mp, PyObject *key,
+               Py_hash_t hash, PyObject ***value_addr)
+{
+    register size_t i;
+    register size_t perturb;
+    register size_t mask = DK_MASK(mp->ma_keys);
+    PyDictKeyEntry *ep0 = &mp->ma_keys->dk_entries[0];
+    register PyDictKeyEntry *ep;
+
+    if (!PyUnicode_CheckExact(key)) {
+        return lookdict(mp, key, hash, value_addr);
+    }
+    i = (size_t)hash & mask;
+    ep = &ep0[i];
+    assert(ep->me_key == NULL || PyUnicode_CheckExact(ep->me_key));
+    if (ep->me_key == NULL || ep->me_key == key ||
+        (ep->me_hash == hash && unicode_eq(ep->me_key, key))) {
+        *value_addr = &mp->ma_values[i];
+        return ep;
+    }
+    for (perturb = hash; ; perturb >>= PERTURB_SHIFT) {
+        i = (i << 2) + i + perturb + 1;
+        ep = &ep0[i & mask];
+        assert(ep->me_key == NULL || PyUnicode_CheckExact(ep->me_key));
+        if (ep->me_key == NULL || ep->me_key == key ||
+            (ep->me_hash == hash && unicode_eq(ep->me_key, key))) {
+            *value_addr = &mp->ma_values[i & mask];
+            return ep;
+        }
+    }
+    assert(0);          /* NOT REACHED */
+    return 0;
+}
+
 int
 _PyDict_HasOnlyStringKeys(PyObject *dict)
 {
@@ -457,7 +672,7 @@
     PyObject *key, *value;
     assert(PyDict_Check(dict));
     /* Shortcut */
-    if (((PyDictObject *)dict)->ma_lookup == lookdict_unicode)
+    if (((PyDictObject *)dict)->ma_keys->dk_lookup != lookdict)
         return 1;
     while (PyDict_Next(dict, &pos, &key, &value))
         if (!PyUnicode_Check(key))
@@ -465,23 +680,12 @@
     return 1;
 }
 
-#ifdef SHOW_TRACK_COUNT
-#define INCREASE_TRACK_COUNT \
-    (count_tracked++, count_untracked--);
-#define DECREASE_TRACK_COUNT \
-    (count_tracked--, count_untracked++);
-#else
-#define INCREASE_TRACK_COUNT
-#define DECREASE_TRACK_COUNT
-#endif
-
 #define MAINTAIN_TRACKING(mp, key, value) \
     do { \
         if (!_PyObject_GC_IS_TRACKED(mp)) { \
             if (_PyObject_GC_MAY_BE_TRACKED(key) || \
                 _PyObject_GC_MAY_BE_TRACKED(value)) { \
                 _PyObject_GC_TRACK(mp); \
-                INCREASE_TRACK_COUNT \
             } \
         } \
     } while(0)
@@ -491,59 +695,79 @@
 {
     PyDictObject *mp;
     PyObject *value;
-    Py_ssize_t mask, i;
-    PyDictEntry *ep;
+    Py_ssize_t i, size;
 
     if (!PyDict_CheckExact(op) || !_PyObject_GC_IS_TRACKED(op))
         return;
 
     mp = (PyDictObject *) op;
-    ep = mp->ma_table;
-    mask = mp->ma_mask;
-    for (i = 0; i <= mask; i++) {
-        if ((value = ep[i].me_value) == NULL)
-            continue;
-        if (_PyObject_GC_MAY_BE_TRACKED(value) ||
-            _PyObject_GC_MAY_BE_TRACKED(ep[i].me_key))
-            return;
+    size = DK_SIZE(mp->ma_keys);
+    if (_PyDict_HasSplitTable(mp)) {
+        for (i = 0; i < size; i++) {
+            if ((value = mp->ma_values[i]) == NULL)
+                continue;
+            if (_PyObject_GC_MAY_BE_TRACKED(value)) {
+                assert(!_PyObject_GC_MAY_BE_TRACKED(
+                    mp->ma_keys->dk_entries[i].me_key));
+                return;
+            }
+        }
     }
-    DECREASE_TRACK_COUNT
+    else {
+        PyDictKeyEntry *ep0 = &mp->ma_keys->dk_entries[0];
+        for (i = 0; i < size; i++) {
+            if ((value = ep0[i].me_value) == NULL)
+                continue;
+            if (_PyObject_GC_MAY_BE_TRACKED(value) ||
+                _PyObject_GC_MAY_BE_TRACKED(ep0[i].me_key))
+                return;
+        }
+    }
     _PyObject_GC_UNTRACK(op);
 }
 
-/*
-Internal routine to insert a new item into the table when you have entry object.
-Used by insertdict.
-*/
+/* Internal function to find slot for an item from its hash
+ * when it is known that the key is not present in the dict.
+ */
+static PyDictKeyEntry *
+find_empty_slot(PyDictObject *mp, PyObject *key, Py_hash_t hash,
+                PyObject ***value_addr)
+{
+    size_t i;
+    size_t perturb;
+    size_t mask = DK_MASK(mp->ma_keys);
+    PyDictKeyEntry *ep0 = &mp->ma_keys->dk_entries[0];
+    PyDictKeyEntry *ep;
+
+    assert(key != NULL);
+    if (!PyUnicode_CheckExact(key))
+        mp->ma_keys->dk_lookup = lookdict;
+    i = hash & mask;
+    ep = &ep0[i];
+    for (perturb = hash; ep->me_key != NULL; perturb >>= PERTURB_SHIFT) {
+        i = (i << 2) + i + perturb + 1;
+        ep = &ep0[i & mask];
+    }
+    assert(ep->me_value == NULL);
+    if (mp->ma_values)
+        *value_addr = &mp->ma_values[i & mask];
+    else
+        *value_addr = &ep->me_value;
+    return ep;
+}
+
 static int
-insertdict_by_entry(register PyDictObject *mp, PyObject *key, Py_hash_t hash,
-                    PyDictEntry *ep, PyObject *value)
+insertion_resize(PyDictObject *mp)
 {
-    PyObject *old_value;
-
-    MAINTAIN_TRACKING(mp, key, value);
-    if (ep->me_value != NULL) {
-        old_value = ep->me_value;
-        ep->me_value = value;
-        Py_DECREF(old_value); /* which **CAN** re-enter */
-        Py_DECREF(key);
-    }
-    else {
-        if (ep->me_key == NULL)
-            mp->ma_fill++;
-        else {
-            assert(ep->me_key == dummy);
-            Py_DECREF(dummy);
-        }
-        ep->me_key = key;
-        ep->me_hash = hash;
-        ep->me_value = value;
-        mp->ma_used++;
-    }
-    return 0;
+    /*
+    * Double the size of the dict,
+    * Previous versions quadrupled size, but doing so may result in excessive
+    * memory use. Doubling keeps the number of resizes low without wasting
+    * too much memory.
+    */
+    return dictresize(mp, 2 * mp->ma_used);
 }
 
-
 /*
 Internal routine to insert a new item into the table.
 Used both by the internal resize routine and by the public insert routine.
@@ -551,18 +775,64 @@
 Returns -1 if an error occurred, or 0 on success.
 */
 static int
-insertdict(register PyDictObject *mp, PyObject *key, Py_hash_t hash, PyObject *value)
+insertdict(PyDictObject *mp, PyObject *key, Py_hash_t hash, PyObject *value)
 {
-    register PyDictEntry *ep;
-
-    assert(mp->ma_lookup != NULL);
-    ep = mp->ma_lookup(mp, key, hash);
+    PyObject *old_value;
+    PyObject **value_addr;
+    PyDictKeyEntry *ep;
+    assert(key != dummy);
+
+    if (mp->ma_values != NULL && !PyUnicode_CheckExact(key)) {
+        if (insertion_resize(mp) < 0)
+            return -1;
+    }
+
+    ep = mp->ma_keys->dk_lookup(mp, key, hash, &value_addr);
     if (ep == NULL) {
         Py_DECREF(key);
         Py_DECREF(value);
         return -1;
     }
-    return insertdict_by_entry(mp, key, hash, ep, value);
+    MAINTAIN_TRACKING(mp, key, value);
+    old_value = *value_addr;
+    if (old_value != NULL) {
+        assert(ep->me_key != NULL && ep->me_key != dummy);
+        *value_addr = value;
+        Py_DECREF(old_value); /* which **CAN** re-enter */
+        Py_DECREF(key);
+    }
+    else {
+        if (ep->me_key == NULL) {
+            if (mp->ma_keys->dk_usable <= 0) {
+                /* Need to resize. */
+                if (insertion_resize(mp) < 0) {
+                    Py_DECREF(key);
+                    Py_DECREF(value);
+                    return -1;
+                }
+                ep = find_empty_slot(mp, key, hash, &value_addr);
+            }
+            mp->ma_keys->dk_usable--;
+            assert(mp->ma_keys->dk_usable >= 0);
+            ep->me_key = key;
+            ep->me_hash = hash;
+        }
+        else {
+            if (ep->me_key == dummy) {
+                ep->me_key = key;
+                ep->me_hash = hash;
+                Py_DECREF(dummy);
+            } else {
+                Py_DECREF(key);
+                assert(_PyDict_HasSplitTable(mp));
+            }
+        }
+        mp->ma_used++;
+        *value_addr = value;
+    }
+    assert(ep->me_key != NULL && ep->me_key != dummy);
+    assert(PyUnicode_CheckExact(key) || mp->ma_keys->dk_lookup == lookdict);
+    return 0;
 }
 
 /*
@@ -572,50 +842,57 @@
 using insertdict() in dictresize() is dangerous (SF bug #1456209).
 Note that no refcounts are changed by this routine; if needed, the caller
 is responsible for incref'ing `key` and `value`.
+Neither mp->ma_used nor k->dk_usable are modified by this routine; the caller
+must set them correctly
 */
 static void
-insertdict_clean(register PyDictObject *mp, PyObject *key, Py_hash_t hash,
+insertdict_clean(PyDictObject *mp, PyObject *key, Py_hash_t hash,
                  PyObject *value)
 {
-    register size_t i;
-    register size_t perturb;
-    register size_t mask = (size_t)mp->ma_mask;
-    PyDictEntry *ep0 = mp->ma_table;
-    register PyDictEntry *ep;
-
-    MAINTAIN_TRACKING(mp, key, value);
-    i = (size_t)hash & mask;
+    size_t i;
+    size_t perturb;
+    PyDictKeysObject *k = mp->ma_keys;
+    size_t mask = (size_t)DK_SIZE(k)-1;
+    PyDictKeyEntry *ep0 = &k->dk_entries[0];
+    PyDictKeyEntry *ep;
+
+    assert(k->dk_lookup != NULL);
+    assert(value != NULL);
+    assert(key != NULL);
+    assert(key != dummy);
+    assert(PyUnicode_CheckExact(key) || k->dk_lookup == lookdict);
+    i = hash & mask;
     ep = &ep0[i];
     for (perturb = hash; ep->me_key != NULL; perturb >>= PERTURB_SHIFT) {
         i = (i << 2) + i + perturb + 1;
         ep = &ep0[i & mask];
     }
     assert(ep->me_value == NULL);
-    mp->ma_fill++;
     ep->me_key = key;
     ep->me_hash = hash;
     ep->me_value = value;
-    mp->ma_used++;
 }
 
 /*
 Restructure the table by allocating a new table and reinserting all
 items again.  When entries have been deleted, the new table may
 actually be smaller than the old one.
+If a table is split (its keys and hashes are shared, its values are not),
+then the values are temporarily copied into the table, it is resized as
+a combined table, then the me_value slots in the old table are NULLed out.
+After resizing a table is always combined,
+but can be resplit by make_keys_shared().
 */
 static int
 dictresize(PyDictObject *mp, Py_ssize_t minused)
 {
     Py_ssize_t newsize;
-    PyDictEntry *oldtable, *newtable, *ep;
-    Py_ssize_t i;
-    int is_oldtable_malloced;
-    PyDictEntry small_copy[PyDict_MINSIZE];
-
-    assert(minused >= 0);
-
-    /* Find the smallest table size > minused. */
-    for (newsize = PyDict_MINSIZE;
+    PyDictKeysObject *oldkeys;
+    PyObject **oldvalues;
+    Py_ssize_t i, oldsize;
+
+/* Find the smallest table size > minused. */
+    for (newsize = PyDict_MINSIZE_COMBINED;
          newsize <= minused && newsize > 0;
          newsize <<= 1)
         ;
@@ -623,83 +900,122 @@
         PyErr_NoMemory();
         return -1;
     }
-
-    /* Get space for a new table. */
-    oldtable = mp->ma_table;
-    assert(oldtable != NULL);
-    is_oldtable_malloced = oldtable != mp->ma_smalltable;
-
-    if (newsize == PyDict_MINSIZE) {
-        /* A large table is shrinking, or we can't get any smaller. */
-        newtable = mp->ma_smalltable;
-        if (newtable == oldtable) {
-            if (mp->ma_fill == mp->ma_used) {
-                /* No dummies, so no point doing anything. */
-                return 0;
+    oldkeys = mp->ma_keys;
+    oldvalues = mp->ma_values;
+    /* Allocate a new table. */
+    mp->ma_keys = new_keys_object(newsize);
+    if (mp->ma_keys == NULL) {
+        mp->ma_keys = oldkeys;
+        return -1;
+    }
+    if (oldkeys->dk_lookup == lookdict)
+        mp->ma_keys->dk_lookup = lookdict;
+    oldsize = DK_SIZE(oldkeys);
+    mp->ma_values = NULL;
+    /* If empty then nothing to copy so just return */
+    if (oldsize == 1) {
+        assert(oldkeys == Py_EMPTY_KEYS);
+        DK_DECREF(oldkeys);
+        return 0;
+    }
+    /* Main loop below assumes we can transfer refcount to new keys
+     * and that value is stored in me_value.
+     * Increment ref-counts and copy values here to compensate
+     * This (resizing a split table) should be relatively rare */
+    if (oldvalues != NULL) {
+        for (i = 0; i < oldsize; i++) {
+            if (oldvalues[i] != NULL) {
+                Py_INCREF(oldkeys->dk_entries[i].me_key);
+                oldkeys->dk_entries[i].me_value = oldvalues[i];
             }
-            /* We're not going to resize it, but rebuild the
-               table anyway to purge old dummy entries.
-               Subtle:  This is *necessary* if fill==size,
-               as lookdict needs at least one virgin slot to
-               terminate failing searches.  If fill < size, it's
-               merely desirable, as dummies slow searches. */
-            assert(mp->ma_fill > mp->ma_used);
-            memcpy(small_copy, oldtable, sizeof(small_copy));
-            oldtable = small_copy;
         }
     }
-    else {
-        newtable = PyMem_NEW(PyDictEntry, newsize);
-        if (newtable == NULL) {
-            PyErr_NoMemory();
-            return -1;
+    /* Main loop */
+    for (i = 0; i < oldsize; i++) {
+        PyDictKeyEntry *ep = &oldkeys->dk_entries[i];
+        if (ep->me_value != NULL) {
+            assert(ep->me_key != dummy);
+            insertdict_clean(mp, ep->me_key, ep->me_hash, ep->me_value);
         }
     }
-
-    /* Make the dict empty, using the new table. */
-    assert(newtable != oldtable);
-    mp->ma_table = newtable;
-    mp->ma_mask = newsize - 1;
-    memset(newtable, 0, sizeof(PyDictEntry) * newsize);
-    mp->ma_used = 0;
-    i = mp->ma_fill;
-    mp->ma_fill = 0;
-
-    /* Copy the data over; this is refcount-neutral for active entries;
-       dummy entries aren't copied over, of course */
-    for (ep = oldtable; i > 0; ep++) {
-        if (ep->me_value != NULL) {             /* active entry */
-            --i;
-            insertdict_clean(mp, ep->me_key, ep->me_hash, ep->me_value);
+    mp->ma_keys->dk_usable -= mp->ma_used;
+    if (oldvalues != NULL) {
+        /* NULL out me_value slot in oldkeys, in case it was shared */
+        for (i = 0; i < oldsize; i++)
+            oldkeys->dk_entries[i].me_value = NULL;
+        assert(oldvalues != empty_values);
+        free_values(oldvalues);
+        DK_DECREF(oldkeys);
+    }
+    else {
+        assert(oldkeys->dk_lookup != lookdict_split);
+        if (oldkeys->dk_lookup != lookdict_unicode_nodummy) {
+            PyDictKeyEntry *ep0 = &oldkeys->dk_entries[0];
+            for (i = 0; i < oldsize; i++) {
+                if (ep0[i].me_key == dummy)
+                    Py_DECREF(dummy);
+            }
         }
-        else if (ep->me_key != NULL) {          /* dummy entry */
-            --i;
-            assert(ep->me_key == dummy);
-            Py_DECREF(ep->me_key);
-        }
-        /* else key == value == NULL:  nothing to do */
+        assert(oldkeys->dk_refcnt == 1);
+        PyMem_FREE(oldkeys);
     }
-
-    if (is_oldtable_malloced)
-        PyMem_DEL(oldtable);
     return 0;
 }
 
-/* Create a new dictionary pre-sized to hold an estimated number of elements.
-   Underestimates are okay because the dictionary will resize as necessary.
-   Overestimates just mean the dictionary will be more sparse than usual.
-*/
+static PyDictKeysObject *
+make_keys_shared(PyObject *op)
+{
+    Py_ssize_t i;
+    Py_ssize_t size;
+    PyDictObject *mp = (PyDictObject *)op;
+
+    assert(PyDict_CheckExact(op));
+    if (!_PyDict_HasSplitTable(mp)) {
+        PyDictKeyEntry *ep0;
+        PyObject **values;
+        assert(mp->ma_keys->dk_refcnt == 1);
+        if (mp->ma_keys->dk_lookup == lookdict) {
+            return NULL;
+        }
+        else if (mp->ma_keys->dk_lookup == lookdict_unicode) {
+            /* Remove dummy keys */
+            if (dictresize(mp, DK_SIZE(mp->ma_keys)))
+                return NULL;
+        }
+        assert(mp->ma_keys->dk_lookup == lookdict_unicode_nodummy);
+        /* Copy values into a new array */
+        ep0 = &mp->ma_keys->dk_entries[0];
+        size = DK_SIZE(mp->ma_keys);
+        values = new_values(size);
+        if (values == NULL) {
+            PyErr_SetString(PyExc_MemoryError,
+                "Not enough memory to allocate new values array");
+            return NULL;
+        }
+        for (i = 0; i < size; i++) {
+            values[i] = ep0[i].me_value;
+            ep0[i].me_value = NULL;
+        }
+        mp->ma_keys->dk_lookup = lookdict_split;
+        mp->ma_values = values;
+    }
+    DK_INCREF(mp->ma_keys);
+    return mp->ma_keys;
+}
 
 PyObject *
 _PyDict_NewPresized(Py_ssize_t minused)
 {
-    PyObject *op = PyDict_New();
-
-    if (minused>5 && op != NULL && dictresize((PyDictObject *)op, minused) == -1) {
-        Py_DECREF(op);
+    Py_ssize_t newsize;
+    PyDictKeysObject *new_keys;
+    for (newsize = PyDict_MINSIZE_COMBINED;
+         newsize <= minused && newsize > 0;
+         newsize <<= 1)
+        ;
+    new_keys = new_keys_object(newsize);
+    if (new_keys == NULL)
         return NULL;
-    }
-    return op;
+    return new_dict(new_keys, NULL);
 }
 
 /* Note that, for historical reasons, PyDict_GetItem() suppresses all errors
@@ -717,8 +1033,10 @@
 {
     Py_hash_t hash;
     PyDictObject *mp = (PyDictObject *)op;
-    PyDictEntry *ep;
+    PyDictKeyEntry *ep;
     PyThreadState *tstate;
+    PyObject **value_addr;
+
     if (!PyDict_Check(op))
         return NULL;
     if (!PyUnicode_CheckExact(key) ||
@@ -742,20 +1060,20 @@
         /* preserve the existing exception */
         PyObject *err_type, *err_value, *err_tb;
         PyErr_Fetch(&err_type, &err_value, &err_tb);
-        ep = (mp->ma_lookup)(mp, key, hash);
+        ep = (mp->ma_keys->dk_lookup)(mp, key, hash, &value_addr);
         /* ignore errors */
         PyErr_Restore(err_type, err_value, err_tb);
         if (ep == NULL)
             return NULL;
     }
     else {
-        ep = (mp->ma_lookup)(mp, key, hash);
+        ep = (mp->ma_keys->dk_lookup)(mp, key, hash, &value_addr);
         if (ep == NULL) {
             PyErr_Clear();
             return NULL;
         }
     }
-    return ep->me_value;
+    return *value_addr;
 }
 
 /* Variant of PyDict_GetItem() that doesn't suppress exceptions.
@@ -767,7 +1085,8 @@
 {
     Py_hash_t hash;
     PyDictObject*mp = (PyDictObject *)op;
-    PyDictEntry *ep;
+    PyDictKeyEntry *ep;
+    PyObject **value_addr;
 
     if (!PyDict_Check(op)) {
         PyErr_BadInternalCall();
@@ -782,10 +1101,10 @@
         }
     }
 
-    ep = (mp->ma_lookup)(mp, key, hash);
+    ep = (mp->ma_keys->dk_lookup)(mp, key, hash, &value_addr);
     if (ep == NULL)
         return NULL;
-    return ep->me_value;
+    return *value_addr;
 }
 
 PyObject *
@@ -798,43 +1117,39 @@
     return PyDict_GetItemWithError(dp, kv);
 }
 
-static int
-dict_set_item_by_hash_or_entry(register PyObject *op, PyObject *key,
-                               Py_hash_t hash, PyDictEntry *ep, PyObject *value)
+/* Fast version of global value lookup.
+ * Lookup in globals, then builtins.
+ */
+PyObject *
+_PyDict_LoadGlobal(PyDictObject *globals, PyDictObject *builtins, PyObject *key)
 {
-    register PyDictObject *mp;
-    register Py_ssize_t n_used;
-
-    mp = (PyDictObject *)op;
-    assert(mp->ma_fill <= mp->ma_mask);  /* at least one empty slot */
-    n_used = mp->ma_used;
-    Py_INCREF(value);
-    Py_INCREF(key);
-    if (ep == NULL) {
-        if (insertdict(mp, key, hash, value) != 0)
-            return -1;
+    PyObject *x;
+    if (PyUnicode_CheckExact(key)) {
+        PyObject **value_addr;
+        Py_hash_t hash = ((PyASCIIObject *)key)->hash;
+        if (hash != -1) {
+            PyDictKeyEntry *e;
+            e = globals->ma_keys->dk_lookup(globals, key, hash, &value_addr);
+            if (e == NULL) {
+                return NULL;
+            }
+            x = *value_addr;
+            if (x != NULL)
+                return x;
+            e = builtins->ma_keys->dk_lookup(builtins, key, hash, &value_addr);
+            if (e == NULL) {
+                return NULL;
+            }
+            x = *value_addr;
+            return x;
+        }
     }
-    else {
-        if (insertdict_by_entry(mp, key, hash, ep, value) != 0)
-            return -1;
-    }
-    /* If we added a key, we can safely resize.  Otherwise just return!
-     * If fill >= 2/3 size, adjust size.  Normally, this doubles or
-     * quaduples the size, but it's also possible for the dict to shrink
-     * (if ma_fill is much larger than ma_used, meaning a lot of dict
-     * keys have been * deleted).
-     *
-     * Quadrupling the size improves average dictionary sparseness
-     * (reducing collisions) at the cost of some memory and iteration
-     * speed (which loops over every possible entry).  It also halves
-     * the number of expensive resize operations in a growing dictionary.
-     *
-     * Very large dictionaries (over 50K items) use doubling instead.
-     * This may help applications with severe memory constraints.
-     */
-    if (!(mp->ma_used > n_used && mp->ma_fill*3 >= (mp->ma_mask+1)*2))
-        return 0;
-    return dictresize(mp, (mp->ma_used > 50000 ? 2 : 4) * mp->ma_used);
+    x = PyDict_GetItemWithError((PyObject *)globals, key);
+    if (x != NULL)
+        return x;
+    if (PyErr_Occurred())
+        return NULL;
+    return PyDict_GetItemWithError((PyObject *)builtins, key);
 }
 
 /* CAUTION: PyDict_SetItem() must guarantee that it won't resize the
@@ -844,36 +1159,39 @@
  * remove them.
  */
 int
-PyDict_SetItem(register PyObject *op, PyObject *key, PyObject *value)
+PyDict_SetItem(PyObject *op, PyObject *key, PyObject *value)
 {
-    register Py_hash_t hash;
-
+    PyDictObject *mp;
+    Py_hash_t hash;
     if (!PyDict_Check(op)) {
         PyErr_BadInternalCall();
         return -1;
     }
     assert(key);
     assert(value);
-    if (PyUnicode_CheckExact(key)) {
-        hash = ((PyASCIIObject *) key)->hash;
-        if (hash == -1)
-            hash = PyObject_Hash(key);
-    }
-    else {
+    mp = (PyDictObject *)op;
+    if (!PyUnicode_CheckExact(key) ||
+        (hash = ((PyASCIIObject *) key)->hash) == -1)
+    {
         hash = PyObject_Hash(key);
         if (hash == -1)
             return -1;
     }
-    return dict_set_item_by_hash_or_entry(op, key, hash, NULL, value);
+    Py_INCREF(value);
+    Py_INCREF(key);
+
+    /* insertdict() handles any resizing that might be necessary */
+    return insertdict(mp, key, hash, value);
 }
 
 int
 PyDict_DelItem(PyObject *op, PyObject *key)
 {
-    register PyDictObject *mp;
-    register Py_hash_t hash;
-    register PyDictEntry *ep;
-    PyObject *old_value, *old_key;
+    PyDictObject *mp;
+    Py_hash_t hash;
+    PyDictKeyEntry *ep;
+    PyObject *old_key, *old_value;
+    PyObject **value_addr;
 
     if (!PyDict_Check(op)) {
         PyErr_BadInternalCall();
@@ -887,21 +1205,24 @@
             return -1;
     }
     mp = (PyDictObject *)op;
-    ep = (mp->ma_lookup)(mp, key, hash);
+    ep = (mp->ma_keys->dk_lookup)(mp, key, hash, &value_addr);
     if (ep == NULL)
         return -1;
-    if (ep->me_value == NULL) {
+    if (*value_addr == NULL) {
         set_key_error(key);
         return -1;
     }
-    old_key = ep->me_key;
-    Py_INCREF(dummy);
-    ep->me_key = dummy;
-    old_value = ep->me_value;
-    ep->me_value = NULL;
+    old_value = *value_addr;
+    *value_addr = NULL;
     mp->ma_used--;
+    if (!_PyDict_HasSplitTable(mp)) {
+        ENSURE_ALLOWS_DELETIONS(mp);
+        old_key = ep->me_key;
+        Py_INCREF(dummy);
+        ep->me_key = dummy;
+        Py_DECREF(old_key);
+    }
     Py_DECREF(old_value);
-    Py_DECREF(old_key);
     return 0;
 }
 
@@ -909,69 +1230,70 @@
 PyDict_Clear(PyObject *op)
 {
     PyDictObject *mp;
-    PyDictEntry *ep, *table;
-    int table_is_malloced;
-    Py_ssize_t fill;
-    PyDictEntry small_copy[PyDict_MINSIZE];
-#ifdef Py_DEBUG
+    PyDictKeysObject *oldkeys;
+    PyObject **oldvalues;
     Py_ssize_t i, n;
-#endif
 
     if (!PyDict_Check(op))
         return;
+    mp = ((PyDictObject *)op);
+    oldkeys = mp->ma_keys;
+    oldvalues = mp->ma_values;
+    if (oldvalues == empty_values)
+        return;
+    /* Empty the dict... */
+    DK_INCREF(Py_EMPTY_KEYS);
+    mp->ma_keys = Py_EMPTY_KEYS;
+    mp->ma_values = empty_values;
+    mp->ma_used = 0;
+    /* ...then clear the keys and values */
+    if (oldvalues != NULL) {
+        n = DK_SIZE(oldkeys);
+        for (i = 0; i < n; i++)
+            Py_CLEAR(oldvalues[i]);
+        free_values(oldvalues);
+        DK_DECREF(oldkeys);
+    }
+    else {
+       assert(oldkeys->dk_refcnt == 1);
+       free_keys_object(oldkeys);
+    }
+}
+
+/* Returns -1 if no more items (or op is not a dict),
+ * index of item otherwise. Stores value in pvalue
+ */
+Py_LOCAL_INLINE(Py_ssize_t)
+dict_next(PyObject *op, Py_ssize_t i, PyObject **pvalue)
+{
+    Py_ssize_t mask, offset;
+    PyDictObject *mp;
+    PyObject **value_ptr;
+
+
+    if (!PyDict_Check(op))
+        return -1;
     mp = (PyDictObject *)op;
-#ifdef Py_DEBUG
-    n = mp->ma_mask + 1;
-    i = 0;
-#endif
-
-    table = mp->ma_table;
-    assert(table != NULL);
-    table_is_malloced = table != mp->ma_smalltable;
-
-    /* This is delicate.  During the process of clearing the dict,
-     * decrefs can cause the dict to mutate.  To avoid fatal confusion
-     * (voice of experience), we have to make the dict empty before
-     * clearing the slots, and never refer to anything via mp->xxx while
-     * clearing.
-     */
-    fill = mp->ma_fill;
-    if (table_is_malloced)
-        EMPTY_TO_MINSIZE(mp);
-
-    else if (fill > 0) {
-        /* It's a small table with something that needs to be cleared.
-         * Afraid the only safe way is to copy the dict entries into
-         * another small table first.
-         */
-        memcpy(small_copy, table, sizeof(small_copy));
-        table = small_copy;
-        EMPTY_TO_MINSIZE(mp);
+    if (i < 0)
+        return -1;
+    if (mp->ma_values) {
+        value_ptr = &mp->ma_values[i];
+        offset = sizeof(PyObject *);
     }
-    /* else it's a small table that's already empty */
-
-    /* Now we can finally clear things.  If C had refcounts, we could
-     * assert that the refcount on table is 1 now, i.e. that this function
-     * has unique access to it, so decref side-effects can't alter it.
-     */
-    for (ep = table; fill > 0; ++ep) {
-#ifdef Py_DEBUG
-        assert(i < n);
-        ++i;
-#endif
-        if (ep->me_key) {
-            --fill;
-            Py_DECREF(ep->me_key);
-            Py_XDECREF(ep->me_value);
-        }
-#ifdef Py_DEBUG
-        else
-            assert(ep->me_value == NULL);
-#endif
+    else {
+        value_ptr = &mp->ma_keys->dk_entries[i].me_value;
+        offset = sizeof(PyDictKeyEntry);
     }
-
-    if (table_is_malloced)
-        PyMem_DEL(table);
+    mask = DK_MASK(mp->ma_keys);
+    while (i <= mask && *value_ptr == NULL) {
+        value_ptr = (PyObject **)(((char *)value_ptr) + offset);
+        i++;
+    }
+    if (i > mask)
+        return -1;
+    if (pvalue)
+        *pvalue = *value_ptr;
+    return i;
 }
 
 /*
@@ -992,75 +1314,58 @@
 int
 PyDict_Next(PyObject *op, Py_ssize_t *ppos, PyObject **pkey, PyObject **pvalue)
 {
-    register Py_ssize_t i;
-    register Py_ssize_t mask;
-    register PyDictEntry *ep;
-
-    if (!PyDict_Check(op))
-        return 0;
-    i = *ppos;
+    PyDictObject *mp;
+    Py_ssize_t i = dict_next(op, *ppos, pvalue);
     if (i < 0)
         return 0;
-    ep = ((PyDictObject *)op)->ma_table;
-    mask = ((PyDictObject *)op)->ma_mask;
-    while (i <= mask && ep[i].me_value == NULL)
-        i++;
+    mp = (PyDictObject *)op;
     *ppos = i+1;
-    if (i > mask)
-        return 0;
     if (pkey)
-        *pkey = ep[i].me_key;
-    if (pvalue)
-        *pvalue = ep[i].me_value;
+        *pkey = mp->ma_keys->dk_entries[i].me_key;
     return 1;
 }
 
-/* Internal version of PyDict_Next that returns a hash value in addition to the key and value.*/
+/* Internal version of PyDict_Next that returns a hash value in addition
+ * to the key and value.
+ */
 int
-_PyDict_Next(PyObject *op, Py_ssize_t *ppos, PyObject **pkey, PyObject **pvalue, Py_hash_t *phash)
+_PyDict_Next(PyObject *op, Py_ssize_t *ppos, PyObject **pkey,
+             PyObject **pvalue, Py_hash_t *phash)
 {
-    register Py_ssize_t i;
-    register Py_ssize_t mask;
-    register PyDictEntry *ep;
-
-    if (!PyDict_Check(op))
-        return 0;
-    i = *ppos;
+    PyDictObject *mp;
+    Py_ssize_t i = dict_next(op, *ppos, pvalue);
     if (i < 0)
         return 0;
-    ep = ((PyDictObject *)op)->ma_table;
-    mask = ((PyDictObject *)op)->ma_mask;
-    while (i <= mask && ep[i].me_value == NULL)
-        i++;
+    mp = (PyDictObject *)op;
     *ppos = i+1;
-    if (i > mask)
-        return 0;
-    *phash = ep[i].me_hash;
+    *phash = mp->ma_keys->dk_entries[i].me_hash;
     if (pkey)
-        *pkey = ep[i].me_key;
-    if (pvalue)
-        *pvalue = ep[i].me_value;
+        *pkey = mp->ma_keys->dk_entries[i].me_key;
     return 1;
 }
 
 /* Methods */
 
 static void
-dict_dealloc(register PyDictObject *mp)
+dict_dealloc(PyDictObject *mp)
 {
-    register PyDictEntry *ep;
-    Py_ssize_t fill = mp->ma_fill;
+    PyObject **values = mp->ma_values;
+    PyDictKeysObject *keys = mp->ma_keys;
+    Py_ssize_t i, n;
     PyObject_GC_UnTrack(mp);
     Py_TRASHCAN_SAFE_BEGIN(mp)
-    for (ep = mp->ma_table; fill > 0; ep++) {
-        if (ep->me_key) {
-            --fill;
-            Py_DECREF(ep->me_key);
-            Py_XDECREF(ep->me_value);
+    if (values != NULL) {
+        if (values != empty_values) {
+            for (i = 0, n = DK_SIZE(mp->ma_keys); i < n; i++) {
+                Py_XDECREF(values[i]);
+            }
+            free_values(values);
         }
+        DK_DECREF(keys);
     }
-    if (mp->ma_table != mp->ma_smalltable)
-        PyMem_DEL(mp->ma_table);
+    else {
+        free_keys_object(keys);
+    }
     if (numfree < PyDict_MAXFREELIST && Py_TYPE(mp) == &PyDict_Type)
         free_list[numfree++] = mp;
     else
@@ -1068,6 +1373,7 @@
     Py_TRASHCAN_SAFE_END(mp)
 }
 
+
 static PyObject *
 dict_repr(PyDictObject *mp)
 {
@@ -1099,11 +1405,13 @@
     i = 0;
     while (PyDict_Next((PyObject *)mp, &i, &key, &value)) {
         int status;
-        /* Prevent repr from deleting value during key format. */
+        /* Prevent repr from deleting key or value during key format. */
+        Py_INCREF(key);
         Py_INCREF(value);
         s = PyObject_Repr(key);
         PyUnicode_Append(&s, colon);
         PyUnicode_AppendAndDel(&s, PyObject_Repr(value));
+        Py_DECREF(key);
         Py_DECREF(value);
         if (s == NULL)
             goto Done;
@@ -1158,18 +1466,19 @@
 {
     PyObject *v;
     Py_hash_t hash;
-    PyDictEntry *ep;
-    assert(mp->ma_table != NULL);
+    PyDictKeyEntry *ep;
+    PyObject **value_addr;
+
     if (!PyUnicode_CheckExact(key) ||
         (hash = ((PyASCIIObject *) key)->hash) == -1) {
         hash = PyObject_Hash(key);
         if (hash == -1)
             return NULL;
     }
-    ep = (mp->ma_lookup)(mp, key, hash);
+    ep = (mp->ma_keys->dk_lookup)(mp, key, hash, &value_addr);
     if (ep == NULL)
         return NULL;
-    v = ep->me_value;
+    v = *value_addr;
     if (v == NULL) {
         if (!PyDict_CheckExact(mp)) {
             /* Look up __missing__ method if we're a subclass. */
@@ -1213,8 +1522,9 @@
 {
     register PyObject *v;
     register Py_ssize_t i, j;
-    PyDictEntry *ep;
-    Py_ssize_t mask, n;
+    PyDictKeyEntry *ep;
+    Py_ssize_t size, n, offset;
+    PyObject **value_ptr;
 
   again:
     n = mp->ma_used;
@@ -1228,15 +1538,24 @@
         Py_DECREF(v);
         goto again;
     }
-    ep = mp->ma_table;
-    mask = mp->ma_mask;
-    for (i = 0, j = 0; i <= mask; i++) {
-        if (ep[i].me_value != NULL) {
+    ep = &mp->ma_keys->dk_entries[0];
+    size = DK_SIZE(mp->ma_keys);
+    if (mp->ma_values) {
+        value_ptr = mp->ma_values;
+        offset = sizeof(PyObject *);
+    }
+    else {
+        value_ptr = &ep[0].me_value;
+        offset = sizeof(PyDictKeyEntry);
+    }
+    for (i = 0, j = 0; i < size; i++) {
+        if (*value_ptr != NULL) {
             PyObject *key = ep[i].me_key;
             Py_INCREF(key);
             PyList_SET_ITEM(v, j, key);
             j++;
         }
+        value_ptr = (PyObject **)(((char *)value_ptr) + offset);
     }
     assert(j == n);
     return v;
@@ -1247,8 +1566,8 @@
 {
     register PyObject *v;
     register Py_ssize_t i, j;
-    PyDictEntry *ep;
-    Py_ssize_t mask, n;
+    Py_ssize_t size, n, offset;
+    PyObject **value_ptr;
 
   again:
     n = mp->ma_used;
@@ -1262,11 +1581,19 @@
         Py_DECREF(v);
         goto again;
     }
-    ep = mp->ma_table;
-    mask = mp->ma_mask;
-    for (i = 0, j = 0; i <= mask; i++) {
-        if (ep[i].me_value != NULL) {
-            PyObject *value = ep[i].me_value;
+    size = DK_SIZE(mp->ma_keys);
+    if (mp->ma_values) {
+        value_ptr = mp->ma_values;
+        offset = sizeof(PyObject *);
+    }
+    else {
+        value_ptr = &mp->ma_keys->dk_entries[0].me_value;
+        offset = sizeof(PyDictKeyEntry);
+    }
+    for (i = 0, j = 0; i < size; i++) {
+        PyObject *value = *value_ptr;
+        value_ptr = (PyObject **)(((char *)value_ptr) + offset);
+        if (value != NULL) {
             Py_INCREF(value);
             PyList_SET_ITEM(v, j, value);
             j++;
@@ -1281,9 +1608,10 @@
 {
     register PyObject *v;
     register Py_ssize_t i, j, n;
-    Py_ssize_t mask;
-    PyObject *item, *key, *value;
-    PyDictEntry *ep;
+    Py_ssize_t size, offset;
+    PyObject *item, *key;
+    PyDictKeyEntry *ep;
+    PyObject **value_ptr;
 
     /* Preallocate the list of tuples, to avoid allocations during
      * the loop over the items, which could trigger GC, which
@@ -1310,10 +1638,20 @@
         goto again;
     }
     /* Nothing we do below makes any function calls. */
-    ep = mp->ma_table;
-    mask = mp->ma_mask;
-    for (i = 0, j = 0; i <= mask; i++) {
-        if ((value=ep[i].me_value) != NULL) {
+    ep = mp->ma_keys->dk_entries;
+    size = DK_SIZE(mp->ma_keys);
+    if (mp->ma_values) {
+        value_ptr = mp->ma_values;
+        offset = sizeof(PyObject *);
+    }
+    else {
+        value_ptr = &ep[0].me_value;
+        offset = sizeof(PyDictKeyEntry);
+    }
+    for (i = 0, j = 0; i < size; i++) {
+        PyObject *value = *value_ptr;
+        value_ptr = (PyObject **)(((char *)value_ptr) + offset);
+        if (value != NULL) {
             key = ep[i].me_key;
             item = PyList_GET_ITEM(v, j);
             Py_INCREF(key);
@@ -1545,8 +1883,8 @@
 PyDict_Merge(PyObject *a, PyObject *b, int override)
 {
     register PyDictObject *mp, *other;
-    register Py_ssize_t i;
-    PyDictEntry *entry;
+    register Py_ssize_t i, n;
+    PyDictKeyEntry *entry;
 
     /* We accept for the argument either a concrete dictionary object,
      * or an abstract "mapping" object.  For the former, we can do
@@ -1573,20 +1911,25 @@
          * incrementally resizing as we insert new items.  Expect
          * that there will be no (or few) overlapping keys.
          */
-        if ((mp->ma_fill + other->ma_used)*3 >= (mp->ma_mask+1)*2) {
-           if (dictresize(mp, (mp->ma_used + other->ma_used)*2) != 0)
+        if (mp->ma_keys->dk_usable * 3 < other->ma_used * 2)
+            if (dictresize(mp, (mp->ma_used + other->ma_used)*2) != 0)
                return -1;
-        }
-        for (i = 0; i <= other->ma_mask; i++) {
-            entry = &other->ma_table[i];
-            if (entry->me_value != NULL &&
+        for (i = 0, n = DK_SIZE(other->ma_keys); i < n; i++) {
+            PyObject *value;
+            entry = &other->ma_keys->dk_entries[i];
+            if (other->ma_values)
+                value = other->ma_values[i];
+            else
+                value = entry->me_value;
+
+            if (value != NULL &&
                 (override ||
                  PyDict_GetItem(a, entry->me_key) == NULL)) {
                 Py_INCREF(entry->me_key);
-                Py_INCREF(entry->me_value);
+                Py_INCREF(value);
                 if (insertdict(mp, entry->me_key,
                                entry->me_hash,
-                               entry->me_value) != 0)
+                               value) != 0)
                     return -1;
             }
         }
@@ -1648,11 +1991,35 @@
 PyDict_Copy(PyObject *o)
 {
     PyObject *copy;
+    PyDictObject *mp;
+    Py_ssize_t i, n;
 
     if (o == NULL || !PyDict_Check(o)) {
         PyErr_BadInternalCall();
         return NULL;
     }
+    mp = (PyDictObject *)o;
+    if (_PyDict_HasSplitTable(mp)) {
+        PyDictObject *split_copy;
+        PyObject **newvalues = new_values(DK_SIZE(mp->ma_keys));
+        if (newvalues == NULL)
+            return PyErr_NoMemory();
+        split_copy = PyObject_GC_New(PyDictObject, &PyDict_Type);
+        if (split_copy == NULL) {
+            free_values(newvalues);
+            return NULL;
+        }
+        split_copy->ma_values = newvalues;
+        split_copy->ma_keys = mp->ma_keys;
+        split_copy->ma_used = mp->ma_used;
+        DK_INCREF(mp->ma_keys);
+        for (i = 0, n = DK_SIZE(mp->ma_keys); i < n; i++) {
+            PyObject *value = mp->ma_values[i];
+            Py_XINCREF(value);
+            split_copy->ma_values[i] = value;
+        }
+        return (PyObject *)split_copy;
+    }
     copy = PyDict_New();
     if (copy == NULL)
         return NULL;
@@ -1714,14 +2081,18 @@
     if (a->ma_used != b->ma_used)
         /* can't be equal if # of entries differ */
         return 0;
-
     /* Same # of entries -- check all of 'em.  Exit early on any diff. */
-    for (i = 0; i <= a->ma_mask; i++) {
-        PyObject *aval = a->ma_table[i].me_value;
+    for (i = 0; i < DK_SIZE(a->ma_keys); i++) {
+        PyDictKeyEntry *ep = &a->ma_keys->dk_entries[i];
+        PyObject *aval;
+        if (a->ma_values)
+            aval = a->ma_values[i];
+        else
+            aval = ep->me_value;
         if (aval != NULL) {
             int cmp;
             PyObject *bval;
-            PyObject *key = a->ma_table[i].me_key;
+            PyObject *key = ep->me_key;
             /* temporarily bump aval's refcount to ensure it stays
                alive until we're done with it */
             Py_INCREF(aval);
@@ -1742,7 +2113,7 @@
         }
     }
     return 1;
- }
+}
 
 static PyObject *
 dict_richcompare(PyObject *v, PyObject *w, int op)
@@ -1763,13 +2134,14 @@
         res = Py_NotImplemented;
     Py_INCREF(res);
     return res;
- }
+}
 
 static PyObject *
 dict_contains(register PyDictObject *mp, PyObject *key)
 {
     Py_hash_t hash;
-    PyDictEntry *ep;
+    PyDictKeyEntry *ep;
+    PyObject **value_addr;
 
     if (!PyUnicode_CheckExact(key) ||
         (hash = ((PyASCIIObject *) key)->hash) == -1) {
@@ -1777,10 +2149,10 @@
         if (hash == -1)
             return NULL;
     }
-    ep = (mp->ma_lookup)(mp, key, hash);
+    ep = (mp->ma_keys->dk_lookup)(mp, key, hash, &value_addr);
     if (ep == NULL)
         return NULL;
-    return PyBool_FromLong(ep->me_value != NULL);
+    return PyBool_FromLong(*value_addr != NULL);
 }
 
 static PyObject *
@@ -1790,7 +2162,8 @@
     PyObject *failobj = Py_None;
     PyObject *val = NULL;
     Py_hash_t hash;
-    PyDictEntry *ep;
+    PyDictKeyEntry *ep;
+    PyObject **value_addr;
 
     if (!PyArg_UnpackTuple(args, "get", 1, 2, &key, &failobj))
         return NULL;
@@ -1801,17 +2174,16 @@
         if (hash == -1)
             return NULL;
     }
-    ep = (mp->ma_lookup)(mp, key, hash);
+    ep = (mp->ma_keys->dk_lookup)(mp, key, hash, &value_addr);
     if (ep == NULL)
         return NULL;
-    val = ep->me_value;
+    val = *value_addr;
     if (val == NULL)
         val = failobj;
     Py_INCREF(val);
     return val;
 }
 
-
 static PyObject *
 dict_setdefault(register PyDictObject *mp, PyObject *args)
 {
@@ -1819,7 +2191,8 @@
     PyObject *failobj = Py_None;
     PyObject *val = NULL;
     Py_hash_t hash;
-    PyDictEntry *ep;
+    PyDictKeyEntry *ep;
+    PyObject **value_addr;
 
     if (!PyArg_UnpackTuple(args, "setdefault", 1, 2, &key, &failobj))
         return NULL;
@@ -1830,16 +2203,27 @@
         if (hash == -1)
             return NULL;
     }
-    ep = (mp->ma_lookup)(mp, key, hash);
+    ep = (mp->ma_keys->dk_lookup)(mp, key, hash, &value_addr);
     if (ep == NULL)
         return NULL;
-    val = ep->me_value;
+    val = *value_addr;
     if (val == NULL) {
-        if (dict_set_item_by_hash_or_entry((PyObject*)mp, key, hash, ep,
-                                           failobj) == 0)
-            val = failobj;
+        Py_INCREF(failobj);
+        Py_INCREF(key);
+        if (mp->ma_keys->dk_usable <= 0) {
+            /* Need to resize. */
+            if (insertion_resize(mp) < 0)
+                return NULL;
+            ep = find_empty_slot(mp, key, hash, &value_addr);
+        }
+        ep->me_key = key;
+        ep->me_hash = hash;
+        *value_addr = failobj;
+        val = failobj;
+        mp->ma_keys->dk_usable--;
+        mp->ma_used++;
     }
-    Py_XINCREF(val);
+    Py_INCREF(val);
     return val;
 }
 
@@ -1855,9 +2239,10 @@
 dict_pop(PyDictObject *mp, PyObject *args)
 {
     Py_hash_t hash;
-    PyDictEntry *ep;
     PyObject *old_value, *old_key;
     PyObject *key, *deflt = NULL;
+    PyDictKeyEntry *ep;
+    PyObject **value_addr;
 
     if(!PyArg_UnpackTuple(args, "pop", 1, 2, &key, &deflt))
         return NULL;
@@ -1875,10 +2260,11 @@
         if (hash == -1)
             return NULL;
     }
-    ep = (mp->ma_lookup)(mp, key, hash);
+    ep = (mp->ma_keys->dk_lookup)(mp, key, hash, &value_addr);
     if (ep == NULL)
         return NULL;
-    if (ep->me_value == NULL) {
+    old_value = *value_addr;
+    if (old_value == NULL) {
         if (deflt) {
             Py_INCREF(deflt);
             return deflt;
@@ -1886,13 +2272,15 @@
         set_key_error(key);
         return NULL;
     }
-    old_key = ep->me_key;
-    Py_INCREF(dummy);
-    ep->me_key = dummy;
-    old_value = ep->me_value;
-    ep->me_value = NULL;
+    *value_addr = NULL;
     mp->ma_used--;
-    Py_DECREF(old_key);
+    if (!_PyDict_HasSplitTable(mp)) {
+        ENSURE_ALLOWS_DELETIONS(mp);
+        old_key = ep->me_key;
+        Py_INCREF(dummy);
+        ep->me_key = dummy;
+        Py_DECREF(old_key);
+    }
     return old_value;
 }
 
@@ -1900,9 +2288,10 @@
 dict_popitem(PyDictObject *mp)
 {
     Py_hash_t i = 0;
-    PyDictEntry *ep;
+    PyDictKeyEntry *ep;
     PyObject *res;
 
+
     /* Allocate the result tuple before checking the size.  Believe it
      * or not, this allocation could trigger a garbage collection which
      * could empty the dict, so if we checked the size first and that
@@ -1921,25 +2310,34 @@
                         "popitem(): dictionary is empty");
         return NULL;
     }
+    /* Convert split table to combined table */
+    if (mp->ma_keys->dk_lookup == lookdict_split) {
+        if (dictresize(mp, DK_SIZE(mp->ma_keys))) {
+            Py_DECREF(res);
+            return NULL;
+        }
+    }
+    ENSURE_ALLOWS_DELETIONS(mp);
     /* Set ep to "the first" dict entry with a value.  We abuse the hash
      * field of slot 0 to hold a search finger:
      * If slot 0 has a value, use slot 0.
      * Else slot 0 is being used to hold a search finger,
      * and we use its hash value as the first index to look.
      */
-    ep = &mp->ma_table[0];
+    ep = &mp->ma_keys->dk_entries[0];
     if (ep->me_value == NULL) {
+        Py_ssize_t mask = DK_MASK(mp->ma_keys);
         i = ep->me_hash;
         /* The hash field may be a real hash value, or it may be a
          * legit search finger, or it may be a once-legit search
          * finger that's out of bounds now because it wrapped around
          * or the table shrunk -- simply make sure it's in bounds now.
          */
-        if (i > mp->ma_mask || i < 1)
+        if (i > mask || i < 1)
             i = 1;              /* skip slot 0 */
-        while ((ep = &mp->ma_table[i])->me_value == NULL) {
+        while ((ep = &mp->ma_keys->dk_entries[i])->me_value == NULL) {
             i++;
-            if (i > mp->ma_mask)
+            if (i > mask)
                 i = 1;
         }
     }
@@ -1949,21 +2347,34 @@
     ep->me_key = dummy;
     ep->me_value = NULL;
     mp->ma_used--;
-    assert(mp->ma_table[0].me_value == NULL);
-    mp->ma_table[0].me_hash = i + 1;  /* next place to start */
+    assert(mp->ma_keys->dk_entries[0].me_value == NULL);
+    mp->ma_keys->dk_entries[0].me_hash = i + 1;  /* next place to start */
     return res;
 }
 
 static int
 dict_traverse(PyObject *op, visitproc visit, void *arg)
 {
-    Py_ssize_t i = 0;
-    PyObject *pk;
-    PyObject *pv;
-
-    while (PyDict_Next(op, &i, &pk, &pv)) {
-        Py_VISIT(pk);
-        Py_VISIT(pv);
+    Py_ssize_t i, n;
+    PyDictObject *mp = (PyDictObject *)op;
+    if (mp->ma_keys->dk_lookup == lookdict) {
+        for (i = 0; i < DK_SIZE(mp->ma_keys); i++) {
+            if (mp->ma_keys->dk_entries[i].me_value != NULL) {
+                Py_VISIT(mp->ma_keys->dk_entries[i].me_value);
+                Py_VISIT(mp->ma_keys->dk_entries[i].me_key);
+            }
+        }
+    } else {
+        if (mp->ma_values != NULL) {
+            for (i = 0, n = DK_SIZE(mp->ma_keys); i < n; i++) {
+                Py_VISIT(mp->ma_values[i]);
+            }
+        }
+        else {
+            for (i = 0, n = DK_SIZE(mp->ma_keys); i < n; i++) {
+                Py_VISIT(mp->ma_keys->dk_entries[i].me_value);
+            }
+        }
     }
     return 0;
 }
@@ -1980,12 +2391,22 @@
 static PyObject *
 dict_sizeof(PyDictObject *mp)
 {
-    Py_ssize_t res;
-
+    Py_ssize_t size;
+    double res, keys_size;
+
+    size = DK_SIZE(mp->ma_keys);
     res = sizeof(PyDictObject);
-    if (mp->ma_table != mp->ma_smalltable)
-        res = res + (mp->ma_mask + 1) * sizeof(PyDictEntry);
-    return PyLong_FromSsize_t(res);
+    if (mp->ma_values)
+        res += size * sizeof(PyObject*);
+    /* Count our share of the keys object -- with rounding errors. */
+    keys_size = sizeof(PyDictKeysObject) + (size-1) * sizeof(PyDictKeyEntry);
+    /* If refcnt > 1, then one count is (probably) held by a type */
+    /* XXX  This is somewhat approximate :) */
+    if (mp->ma_keys->dk_refcnt < 3)
+        res += keys_size;
+    else
+        res += keys_size / (mp->ma_keys->dk_refcnt - 1);
+    return PyFloat_FromDouble(res);
 }
 
 PyDoc_STRVAR(contains__doc__,
@@ -2076,7 +2497,8 @@
 {
     Py_hash_t hash;
     PyDictObject *mp = (PyDictObject *)op;
-    PyDictEntry *ep;
+    PyDictKeyEntry *ep;
+    PyObject **value_addr;
 
     if (!PyUnicode_CheckExact(key) ||
         (hash = ((PyASCIIObject *) key)->hash) == -1) {
@@ -2084,8 +2506,8 @@
         if (hash == -1)
             return -1;
     }
-    ep = (mp->ma_lookup)(mp, key, hash);
-    return ep == NULL ? -1 : (ep->me_value != NULL);
+    ep = (mp->ma_keys->dk_lookup)(mp, key, hash, &value_addr);
+    return (ep == NULL) ? -1 : (*value_addr != NULL);
 }
 
 /* Internal version of PyDict_Contains used when the hash value is already known */
@@ -2093,10 +2515,11 @@
 _PyDict_Contains(PyObject *op, PyObject *key, Py_hash_t hash)
 {
     PyDictObject *mp = (PyDictObject *)op;
-    PyDictEntry *ep;
-
-    ep = (mp->ma_lookup)(mp, key, hash);
-    return ep == NULL ? -1 : (ep->me_value != NULL);
+    PyDictKeyEntry *ep;
+    PyObject **value_addr;
+
+    ep = (mp->ma_keys->dk_lookup)(mp, key, hash, &value_addr);
+    return (ep == NULL) ? -1 : (*value_addr != NULL);
 }
 
 /* Hack to implement "key in dict" */
@@ -2122,22 +2545,17 @@
     self = type->tp_alloc(type, 0);
     if (self != NULL) {
         PyDictObject *d = (PyDictObject *)self;
-        /* It's guaranteed that tp->alloc zeroed out the struct. */
-        assert(d->ma_table == NULL && d->ma_fill == 0 && d->ma_used == 0);
-        INIT_NONZERO_DICT_SLOTS(d);
-        d->ma_lookup = lookdict_unicode;
+        d->ma_keys = new_keys_object(PyDict_MINSIZE_COMBINED);
+        /* XXX - Should we raise a no-memory error? */
+        if (d->ma_keys == NULL) {
+            DK_INCREF(Py_EMPTY_KEYS);
+            d->ma_keys = Py_EMPTY_KEYS;
+            d->ma_values = empty_values;
+        }
+        d->ma_used = 0;
         /* The object has been implicitly tracked by tp_alloc */
         if (type == &PyDict_Type)
             _PyObject_GC_UNTRACK(d);
-#ifdef SHOW_CONVERSION_COUNTS
-        ++created;
-#endif
-#ifdef SHOW_TRACK_COUNT
-        if (_PyObject_GC_IS_TRACKED(d))
-            count_tracked++;
-        else
-            count_untracked++;
-#endif
     }
     return self;
 }
@@ -2349,9 +2767,10 @@
 static PyObject *dictiter_iternextkey(dictiterobject *di)
 {
     PyObject *key;
-    register Py_ssize_t i, mask;
-    register PyDictEntry *ep;
+    register Py_ssize_t i, mask, offset;
+    register PyDictKeysObject *k;
     PyDictObject *d = di->di_dict;
+    PyObject **value_ptr;
 
     if (d == NULL)
         return NULL;
@@ -2367,15 +2786,25 @@
     i = di->di_pos;
     if (i < 0)
         goto fail;
-    ep = d->ma_table;
-    mask = d->ma_mask;
-    while (i <= mask && ep[i].me_value == NULL)
+    k = d->ma_keys;
+    if (d->ma_values) {
+        value_ptr = &d->ma_values[i];
+        offset = sizeof(PyObject *);
+    }
+    else {
+        value_ptr = &k->dk_entries[i].me_value;
+        offset = sizeof(PyDictKeyEntry);
+    }
+    mask = DK_SIZE(k)-1;
+    while (i <= mask && *value_ptr == NULL) {
+        value_ptr = (PyObject **)(((char *)value_ptr) + offset);
         i++;
+    }
     di->di_pos = i+1;
     if (i > mask)
         goto fail;
     di->len--;
-    key = ep[i].me_key;
+    key = k->dk_entries[i].me_key;
     Py_INCREF(key);
     return key;
 
@@ -2421,9 +2850,9 @@
 static PyObject *dictiter_iternextvalue(dictiterobject *di)
 {
     PyObject *value;
-    register Py_ssize_t i, mask;
-    register PyDictEntry *ep;
+    register Py_ssize_t i, mask, offset;
     PyDictObject *d = di->di_dict;
+    PyObject **value_ptr;
 
     if (d == NULL)
         return NULL;
@@ -2437,17 +2866,26 @@
     }
 
     i = di->di_pos;
-    mask = d->ma_mask;
+    mask = DK_SIZE(d->ma_keys)-1;
     if (i < 0 || i > mask)
         goto fail;
-    ep = d->ma_table;
-    while ((value=ep[i].me_value) == NULL) {
+    if (d->ma_values) {
+        value_ptr = &d->ma_values[i];
+        offset = sizeof(PyObject *);
+    }
+    else {
+        value_ptr = &d->ma_keys->dk_entries[i].me_value;
+        offset = sizeof(PyDictKeyEntry);
+    }
+    while (i <= mask && *value_ptr == NULL) {
+        value_ptr = (PyObject **)(((char *)value_ptr) + offset);
         i++;
         if (i > mask)
             goto fail;
     }
     di->di_pos = i+1;
     di->len--;
+    value = *value_ptr;
     Py_INCREF(value);
     return value;
 
@@ -2493,9 +2931,9 @@
 static PyObject *dictiter_iternextitem(dictiterobject *di)
 {
     PyObject *key, *value, *result = di->di_result;
-    register Py_ssize_t i, mask;
-    register PyDictEntry *ep;
+    register Py_ssize_t i, mask, offset;
     PyDictObject *d = di->di_dict;
+    PyObject **value_ptr;
 
     if (d == NULL)
         return NULL;
@@ -2511,10 +2949,19 @@
     i = di->di_pos;
     if (i < 0)
         goto fail;
-    ep = d->ma_table;
-    mask = d->ma_mask;
-    while (i <= mask && ep[i].me_value == NULL)
+    mask = DK_SIZE(d->ma_keys)-1;
+    if (d->ma_values) {
+        value_ptr = &d->ma_values[i];
+        offset = sizeof(PyObject *);
+    }
+    else {
+        value_ptr = &d->ma_keys->dk_entries[i].me_value;
+        offset = sizeof(PyDictKeyEntry);
+    }
+    while (i <= mask && *value_ptr == NULL) {
+        value_ptr = (PyObject **)(((char *)value_ptr) + offset);
         i++;
+    }
     di->di_pos = i+1;
     if (i > mask)
         goto fail;
@@ -2529,8 +2976,8 @@
             return NULL;
     }
     di->len--;
-    key = ep[i].me_key;
-    value = ep[i].me_value;
+    key = d->ma_keys->dk_entries[i].me_key;
+    value = *value_ptr;
     Py_INCREF(key);
     Py_INCREF(value);
     PyTuple_SET_ITEM(result, 0, key);
@@ -2590,7 +3037,7 @@
     /* copy the itertor state */
     tmp = *di;
     Py_XINCREF(tmp.di_dict);
-    
+
     /* iterate the temporary into a list */
     for(;;) {
         PyObject *element = 0;
@@ -3170,3 +3617,151 @@
 {
     return dictview_new(dict, &PyDictValues_Type);
 }
+
+/* Returns NULL if cannot allocate a new PyDictKeysObject,
+   but does not set an error */
+PyDictKeysObject *
+_PyDict_NewKeysForClass(void)
+{
+    PyDictKeysObject *keys = new_keys_object(PyDict_MINSIZE_SPLIT);
+    if (keys == NULL)
+        PyErr_Clear();
+    else
+        keys->dk_lookup = lookdict_split;
+    return keys;
+}
+
+#define CACHED_KEYS(tp) (((PyHeapTypeObject*)tp)->ht_cached_keys)
+
+PyObject *
+PyObject_GenericGetDict(PyObject *obj, void *context)
+{
+    PyObject *dict, **dictptr = _PyObject_GetDictPtr(obj);
+    if (dictptr == NULL) {
+        PyErr_SetString(PyExc_AttributeError,
+                        "This object has no __dict__");
+        return NULL;
+    }
+    dict = *dictptr;
+    if (dict == NULL) {
+        PyTypeObject *tp = Py_TYPE(obj);
+        if ((tp->tp_flags & Py_TPFLAGS_HEAPTYPE) && CACHED_KEYS(tp)) {
+            DK_INCREF(CACHED_KEYS(tp));
+            *dictptr = dict = new_dict_with_shared_keys(CACHED_KEYS(tp));
+        }
+        else {
+            *dictptr = dict = PyDict_New();
+        }
+    }
+    Py_XINCREF(dict);
+    return dict;
+}
+
+int
+_PyObjectDict_SetItem(PyTypeObject *tp, PyObject **dictptr,
+                     PyObject *key, PyObject *value)
+{
+    PyObject *dict;
+    int res;
+    PyDictKeysObject *cached;
+
+    assert(dictptr != NULL);
+    if ((tp->tp_flags & Py_TPFLAGS_HEAPTYPE) && (cached = CACHED_KEYS(tp))) {
+        assert(dictptr != NULL);
+        dict = *dictptr;
+        if (dict == NULL) {
+            DK_INCREF(cached);
+            dict = new_dict_with_shared_keys(cached);
+            if (dict == NULL)
+                return -1;
+            *dictptr = dict;
+        }
+        if (value == NULL) {
+            res = PyDict_DelItem(dict, key);
+            if (cached != ((PyDictObject *)dict)->ma_keys) {
+                CACHED_KEYS(tp) = NULL;
+                DK_DECREF(cached);
+            }
+        } else {
+            res = PyDict_SetItem(dict, key, value);
+            if (cached != ((PyDictObject *)dict)->ma_keys) {
+                /* Either update tp->ht_cached_keys or delete it */
+                if (cached->dk_refcnt == 1) {
+                    CACHED_KEYS(tp) = make_keys_shared(dict);
+                    if (CACHED_KEYS(tp) == NULL)
+                        return -1;
+                } else {
+                    CACHED_KEYS(tp) = NULL;
+                }
+                DK_DECREF(cached);
+            }
+        }
+    } else {
+        dict = *dictptr;
+        if (dict == NULL) {
+            dict = PyDict_New();
+            if (dict == NULL)
+                return -1;
+            *dictptr = dict;
+        }
+        if (value == NULL) {
+            res = PyDict_DelItem(dict, key);
+        } else {
+            res = PyDict_SetItem(dict, key, value);
+        }
+    }
+    return res;
+}
+
+void
+_PyDictKeys_DecRef(PyDictKeysObject *keys)
+{
+    DK_DECREF(keys);
+}
+
+
+/* ARGSUSED */
+static PyObject *
+dummy_repr(PyObject *op)
+{
+    return PyUnicode_FromString("<dummy key>");
+}
+
+/* ARGUSED */
+static void
+dummy_dealloc(PyObject* ignore)
+{
+    /* This should never get called, but we also don't want to SEGV if
+     * we accidentally decref dummy-key out of existence.
+     */
+    Py_FatalError("deallocating <dummy key>");
+}
+
+static PyTypeObject PyDictDummy_Type = {
+    PyVarObject_HEAD_INIT(&PyType_Type, 0)
+    "<dummy key> type",
+    0,
+    0,
+    dummy_dealloc,      /*tp_dealloc*/ /*never called*/
+    0,                  /*tp_print*/
+    0,                  /*tp_getattr*/
+    0,                  /*tp_setattr*/
+    0,                  /*tp_reserved*/
+    dummy_repr,         /*tp_repr*/
+    0,                  /*tp_as_number*/
+    0,                  /*tp_as_sequence*/
+    0,                  /*tp_as_mapping*/
+    0,                  /*tp_hash */
+    0,                  /*tp_call */
+    0,                  /*tp_str */
+    0,                  /*tp_getattro */
+    0,                  /*tp_setattro */
+    0,                  /*tp_as_buffer */
+    Py_TPFLAGS_DEFAULT, /*tp_flags */
+};
+
+static PyObject _dummy_struct = {
+  _PyObject_EXTRA_INIT
+  2, &PyDictDummy_Type
+};
+
diff --git a/Objects/object.c b/Objects/object.c
--- a/Objects/object.c
+++ b/Objects/object.c
@@ -1188,13 +1188,10 @@
     if (dict == NULL) {
         dictptr = _PyObject_GetDictPtr(obj);
         if (dictptr != NULL) {
-            dict = *dictptr;
-            if (dict == NULL && value != NULL) {
-                dict = PyDict_New();
-                if (dict == NULL)
-                    goto done;
-                *dictptr = dict;
-            }
+            res = _PyObjectDict_SetItem(Py_TYPE(obj), dictptr, name, value);
+            if (res < 0 && PyErr_ExceptionMatches(PyExc_KeyError))
+                PyErr_SetObject(PyExc_AttributeError, name);
+            goto done;
         }
     }
     if (dict != NULL) {
@@ -1236,22 +1233,6 @@
     return _PyObject_GenericSetAttrWithDict(obj, name, value, NULL);
 }
 
-PyObject *
-PyObject_GenericGetDict(PyObject *obj, void *context)
-{
-    PyObject *dict, **dictptr = _PyObject_GetDictPtr(obj);
-    if (dictptr == NULL) {
-        PyErr_SetString(PyExc_AttributeError,
-                        "This object has no __dict__");
-        return NULL;
-    }
-    dict = *dictptr;
-    if (dict == NULL)
-        *dictptr = dict = PyDict_New();
-    Py_XINCREF(dict);
-    return dict;
-}
-
 int
 PyObject_GenericSetDict(PyObject *obj, PyObject *value, void *context)
 {
diff --git a/Objects/typeobject.c b/Objects/typeobject.c
--- a/Objects/typeobject.c
+++ b/Objects/typeobject.c
@@ -14,7 +14,7 @@
    MCACHE_MAX_ATTR_SIZE, since it might be a problem if very large
    strings are used as attribute names. */
 #define MCACHE_MAX_ATTR_SIZE    100
-#define MCACHE_SIZE_EXP         10
+#define MCACHE_SIZE_EXP         9
 #define MCACHE_HASH(version, name_hash)                                 \
         (((unsigned int)(version) * (unsigned int)(name_hash))          \
          >> (8*sizeof(unsigned int) - MCACHE_SIZE_EXP))
@@ -2306,6 +2306,9 @@
             type->tp_dictoffset = slotoffset;
         slotoffset += sizeof(PyObject *);
     }
+    if (type->tp_dictoffset) {
+        et->ht_cached_keys = _PyDict_NewKeysForClass();
+    }
     if (add_weak) {
         assert(!base->tp_itemsize);
         type->tp_weaklistoffset = slotoffset;
@@ -2411,6 +2414,9 @@
             res->ht_type.tp_doc = tp_doc;
         }
     }
+    if (res->ht_type.tp_dictoffset) {
+        res->ht_cached_keys = _PyDict_NewKeysForClass();
+    }
 
     if (PyType_Ready(&res->ht_type) < 0)
         goto fail;
@@ -2767,9 +2773,13 @@
     return 0;
 }
 
+extern void
+_PyDictKeys_DecRef(PyDictKeysObject *keys);
+
 static int
 type_clear(PyTypeObject *type)
 {
+    PyDictKeysObject *cached_keys;
     /* Because of type_is_gc(), the collector only calls this
        for heaptypes. */
     assert(type->tp_flags & Py_TPFLAGS_HEAPTYPE);
@@ -2801,6 +2811,11 @@
     */
 
     PyType_Modified(type);
+    cached_keys = ((PyHeapTypeObject *)type)->ht_cached_keys;
+    if (cached_keys != NULL) {
+        ((PyHeapTypeObject *)type)->ht_cached_keys = NULL;
+        _PyDictKeys_DecRef(cached_keys);
+    }
     if (type->tp_dict)
         PyDict_Clear(type->tp_dict);
     Py_CLEAR(type->tp_mro);
diff --git a/Python/ceval.c b/Python/ceval.c
--- a/Python/ceval.c
+++ b/Python/ceval.c
@@ -2123,70 +2123,31 @@
             w = GETITEM(names, oparg);
             if (PyDict_CheckExact(f->f_globals)
                 && PyDict_CheckExact(f->f_builtins)) {
-                if (PyUnicode_CheckExact(w)) {
-                    /* Inline the PyDict_GetItem() calls.
-                       WARNING: this is an extreme speed hack.
-                       Do not try this at home. */
-                    Py_hash_t hash = ((PyASCIIObject *)w)->hash;
-                    if (hash != -1) {
-                        PyDictObject *d;
-                        PyDictEntry *e;
-                        d = (PyDictObject *)(f->f_globals);
-                        e = d->ma_lookup(d, w, hash);
-                        if (e == NULL) {
-                            x = NULL;
-                            break;
-                        }
-                        x = e->me_value;
-                        if (x != NULL) {
-                            Py_INCREF(x);
-                            PUSH(x);
-                            DISPATCH();
-                        }
-                        d = (PyDictObject *)(f->f_builtins);
-                        e = d->ma_lookup(d, w, hash);
-                        if (e == NULL) {
-                            x = NULL;
-                            break;
-                        }
-                        x = e->me_value;
-                        if (x != NULL) {
-                            Py_INCREF(x);
-                            PUSH(x);
-                            DISPATCH();
-                        }
-                        goto load_global_error;
-                    }
+                x = _PyDict_LoadGlobal((PyDictObject *)f->f_globals,
+                                       (PyDictObject *)f->f_builtins,
+                                       w);
+                if (x == NULL) {
+                    if (!PyErr_Occurred())
+                        format_exc_check_arg(PyExc_NameError,
+                                             GLOBAL_NAME_ERROR_MSG, w);
+                    break;
                 }
-                /* This is the un-inlined version of the code above */
-                x = PyDict_GetItem(f->f_globals, w);
+            }
+            else {
+                /* Slow-path if globals or builtins is not a dict */
+                x = PyObject_GetItem(f->f_globals, w);
                 if (x == NULL) {
-                    x = PyDict_GetItem(f->f_builtins, w);
+                    x = PyObject_GetItem(f->f_builtins, w);
                     if (x == NULL) {
-                      load_global_error:
-                        format_exc_check_arg(
-                                    PyExc_NameError,
-                                    GLOBAL_NAME_ERROR_MSG, w);
+                        if (PyErr_ExceptionMatches(PyExc_KeyError))
+                            format_exc_check_arg(
+                                        PyExc_NameError,
+                                        GLOBAL_NAME_ERROR_MSG, w);
                         break;
                     }
                 }
-                Py_INCREF(x);
-                PUSH(x);
-                DISPATCH();
             }
-
-            /* Slow-path if globals or builtins is not a dict */
-            x = PyObject_GetItem(f->f_globals, w);
-            if (x == NULL) {
-                x = PyObject_GetItem(f->f_builtins, w);
-                if (x == NULL) {
-                    if (PyErr_ExceptionMatches(PyExc_KeyError))
-                        format_exc_check_arg(
-                                    PyExc_NameError,
-                                    GLOBAL_NAME_ERROR_MSG, w);
-                    break;
-                }
-            }
+            Py_INCREF(x);
             PUSH(x);
             DISPATCH();
 
diff --git a/Tools/gdb/libpython.py b/Tools/gdb/libpython.py
--- a/Tools/gdb/libpython.py
+++ b/Tools/gdb/libpython.py
@@ -634,9 +634,14 @@
         Yields a sequence of (PyObjectPtr key, PyObjectPtr value) pairs,
         analagous to dict.iteritems()
         '''
-        for i in safe_range(self.field('ma_mask') + 1):
-            ep = self.field('ma_table') + i
-            pyop_value = PyObjectPtr.from_pyobject_ptr(ep['me_value'])
+        keys = self.field('ma_keys')
+        values = self.field('ma_values')
+        for i in safe_range(keys['dk_size']):
+            ep = keys['dk_entries'].address + i
+            if long(values):
+                pyop_value = PyObjectPtr.from_pyobject_ptr(values[i])
+            else:
+                pyop_value = PyObjectPtr.from_pyobject_ptr(ep['me_value'])
             if not pyop_value.is_null():
                 pyop_key = PyObjectPtr.from_pyobject_ptr(ep['me_key'])
                 yield (pyop_key, pyop_value)

-- 
Repository URL: http://hg.python.org/cpython