[Python-checkins] cpython: Issue #24762: Speed-up frozenset_hash() and greatly beef-up the comments.

raymond.hettinger python-checkins at python.org
Sat Aug 1 18:53:08 CEST 2015


https://hg.python.org/cpython/rev/cf707dd190a9
changeset:   97178:cf707dd190a9
user:        Raymond Hettinger <python at rcn.com>
date:        Sat Aug 01 09:53:00 2015 -0700
summary:
  Issue #24762:  Speed-up frozenset_hash() and greatly beef-up the comments.

files:
  Objects/setobject.c |  70 ++++++++++++++++++++------------
  1 files changed, 43 insertions(+), 27 deletions(-)


diff --git a/Objects/setobject.c b/Objects/setobject.c
--- a/Objects/setobject.c
+++ b/Objects/setobject.c
@@ -739,41 +739,57 @@
     return 0;
 }
 
+/* Work to increase the bit dispersion for closely spaced hash values.
+   This is important because some use cases have many combinations of a
+   small number of elements with nearby hashes so that many distinct
+   combinations collapse to only a handful of distinct hash values. */
+
+static Py_uhash_t
+_shuffle_bits(Py_uhash_t h)
+{
+    return ((h ^ 89869747UL) ^ (h << 16)) * 3644798167UL;
+}
+
+/* Most of the constants in this hash algorithm are randomly chosen
+   large primes with "interesting bit patterns" and that passed tests
+   for good collision statistics on a variety of problematic datasets
+   including powersets and graph structures (such as David Eppstein's
+   graph recipes in Lib/test/test_set.py) */
+
 static Py_hash_t
 frozenset_hash(PyObject *self)
 {
-    /* Most of the constants in this hash algorithm are randomly choosen
-       large primes with "interesting bit patterns" and that passed
-       tests for good collision statistics on a variety of problematic
-       datasets such as:
+    PySetObject *so = (PySetObject *)self;
+    Py_uhash_t hash = 1927868237UL;
+    setentry *entry;
 
-          ps = []
-          for r in range(21):
-              ps += itertools.combinations(range(20), r)
-          num_distinct_hashes = len({hash(frozenset(s)) for s in ps})
+    /* Make hash(frozenset({0})) distinct from hash(frozenset()) */
+    hash *= (Py_uhash_t)PySet_GET_SIZE(self) + 1;
 
-    */
-    PySetObject *so = (PySetObject *)self;
-    Py_uhash_t h, hash = 1927868237UL;
-    setentry *entry;
-    Py_ssize_t pos = 0;
+    /* Xor-in shuffled bits from every entry's hash field because xor is
+       commutative and a frozenset hash should be independent of order.
 
-    if (so->hash != -1)
-        return so->hash;
+       For speed, include null entries and dummy entries and then
+       subtract out their effect afterwards so that the final hash
+       depends only on active entries.  This allows the code to be
+       vectorized by the compiler and it saves the unpredictable
+       branches that would arise when trying to exclude null and dummy
+       entries on every iteration. */
 
-    hash *= (Py_uhash_t)PySet_GET_SIZE(self) + 1;
-    while (set_next(so, &pos, &entry)) {
-        /* Work to increase the bit dispersion for closely spaced hash
-           values.  This is important because some use cases have many
-           combinations of a small number of elements with nearby
-           hashes so that many distinct combinations collapse to only
-           a handful of distinct hash values. */
-        h = entry->hash;
-        hash ^= ((h ^ 89869747UL) ^ (h << 16)) * 3644798167UL;
-    }
-    /* Make the final result spread-out in a different pattern
-       than the algorithm for tuples or other python objects. */
+    for (entry = so->table; entry <= &so->table[so->mask]; entry++)
+        hash ^= _shuffle_bits(entry->hash);
+
+    /* Remove the effect of an odd number of NULL entries */
+    if ((so->mask + 1 - so->fill) & 1)
+        hash ^= _shuffle_bits(0);
+
+    /* Remove the effect of an odd number of dummy entries */
+    if ((so->fill - so->used) & 1)
+        hash ^= _shuffle_bits(-1);
+
+    /* Disperse patterns arising in nested frozensets */
     hash = hash * 69069U + 907133923UL;
+
     if (hash == (Py_uhash_t)-1)
         hash = 590923713UL;
     so->hash = hash;

-- 
Repository URL: https://hg.python.org/cpython


More information about the Python-checkins mailing list