[issue13703] Hash collision security issue

Mon Jan 23 17:43:24 CET 2012

Marc-Andre Lemburg <mal at egenix.com> added the comment:

Here's a version of the collision counting patch that takes both hash
and slot collisions into account.

I've also added a test script which demonstrates both types of
collisions using integer objects (since it's trivial to calculate
their hashes).

To see the collision counting, enable the DEBUG_DICT_COLLISIONS
macro variable.

----------
Added file: http://bugs.python.org/file24299/hash-attack-3.patch
Added file: http://bugs.python.org/file24300/integercollision.py

_______________________________________
Python tracker <report at bugs.python.org>
<http://bugs.python.org/issue13703>
_______________________________________
-------------- next part --------------
Index: Objects/dictobject.c
===================================================================

--- Objects/dictobject.c	(revision 88933)
+++ Objects/dictobject.c	(working copy)
@@ -9,7 +9,13 @@
 
 #include "Python.h"
 
+/* Maximum number of allowed collisions. */
+#define Py_MAX_DICT_HASH_COLLISIONS 1000
+#define Py_MAX_DICT_SLOT_COLLISIONS 1000
 
+/* Debug collision detection */
+#define DEBUG_DICT_COLLISIONS 0
+
 /* Set a key error with the specified argument, wrapping it in a
  * tuple automatically so that tuple keys are not unpacked as the
  * exception arguments. */
@@ -327,6 +333,7 @@
     register PyDictEntry *ep;
     register int cmp;
     PyObject *startkey;
+    size_t hash_collisions, slot_collisions;
 
     i = (size_t)hash & mask;
     ep = &ep0[i];
@@ -361,6 +368,8 @@
 
     /* In the loop, me_key == dummy is by far (factor of 100s) the
        least likely outcome, so test for that last. */
+    hash_collisions = 1;
+    slot_collisions = 1;
     for (perturb = hash; ; perturb >>= PERTURB_SHIFT) {
         i = (i << 2) + i + perturb + 1;
         ep = &ep0[i & mask];
@@ -387,9 +396,27 @@
                  */
                 return lookdict(mp, key, hash);
             }
+	    #if DEBUG_DICT_COLLISIONS
+	    printf("hash collisions = %zu (i=%zu)\n", hash_collisions, i);
+	    #endif
+	    if (++hash_collisions > Py_MAX_DICT_HASH_COLLISIONS) {
+		PyErr_SetString(PyExc_KeyError,
+				"too many hash collisions");
+		return NULL;
+	    }
         }
-        else if (ep->me_key == dummy && freeslot == NULL)
-            freeslot = ep;
+        else {
+	    if (ep->me_key == dummy && freeslot == NULL)
+		freeslot = ep;
+	    #if DEBUG_DICT_COLLISIONS
+	    printf("slot collisions = %zu (i=%zu)\n", slot_collisions, i);
+	    #endif
+	    if (++slot_collisions > Py_MAX_DICT_SLOT_COLLISIONS) {
+		PyErr_SetString(PyExc_KeyError,
+				"too many slot collisions");
+		return NULL;
+	    }
+	}
     }
     assert(0);          /* NOT REACHED */
     return 0;
@@ -413,6 +440,7 @@
     register size_t mask = (size_t)mp->ma_mask;
     PyDictEntry *ep0 = mp->ma_table;
     register PyDictEntry *ep;
+    size_t hash_collisions, slot_collisions;
 
     /* Make sure this function doesn't have to handle non-string keys,
        including subclasses of str; e.g., one reason to subclass
@@ -439,18 +467,39 @@
 
     /* In the loop, me_key == dummy is by far (factor of 100s) the
        least likely outcome, so test for that last. */
+    hash_collisions = 1;
+    slot_collisions = 1;
     for (perturb = hash; ; perturb >>= PERTURB_SHIFT) {
         i = (i << 2) + i + perturb + 1;
         ep = &ep0[i & mask];
         if (ep->me_key == NULL)
             return freeslot == NULL ? ep : freeslot;
-        if (ep->me_key == key
-            || (ep->me_hash == hash
-            && ep->me_key != dummy
-            && _PyString_Eq(ep->me_key, key)))
+        if (ep->me_key == key)
             return ep;
-        if (ep->me_key == dummy && freeslot == NULL)
-            freeslot = ep;
+        if (ep->me_hash == hash && ep->me_key != dummy) {
+	    if (_PyString_Eq(ep->me_key, key))
+		return ep;
+	    #if DEBUG_DICT_COLLISIONS
+	    printf("hash collisions = %zu (i=%zu)\n", hash_collisions, i);
+	    #endif
+	    if (++hash_collisions > Py_MAX_DICT_HASH_COLLISIONS) {
+		PyErr_SetString(PyExc_KeyError,
+				"too many hash collisions");
+		return NULL;
+	    }
+	}
+        else {
+	    if (ep->me_key == dummy && freeslot == NULL)
+		freeslot = ep;
+	    #if DEBUG_DICT_COLLISIONS
+	    printf("slot collisions = %zu (i=%zu)\n", slot_collisions, i);
+	    #endif
+	    if (++slot_collisions > Py_MAX_DICT_SLOT_COLLISIONS) {
+		PyErr_SetString(PyExc_KeyError,
+				"too many slot collisions");
+		return NULL;
+	    }
+	}
     }
     assert(0);          /* NOT REACHED */
     return 0;
-------------- next part --------------
def integer_hash_collisions(max=1000):

    print 'Hash collision attack:'

    d = {}
    for x in xrange(max):
        i = x*(2**64 - 1) + 1
        #print 'adding %i with hash %i' % (i, hash(i))
        d[i] = 1
    print 'generated dict with %i items' % len(d)
    return d

def integer_slot_collisions(max=1000):

    print 'Slot collision attack:'

    # Fill the dict slots starting at (hash) position 1
    d = {1:1}
    i = 1
    perturb = i
    print 'seeding the dict:'
    for x in xrange(max):
        i = ((i << 2) + i + perturb + 1) & 0xffffffffffffffff
        #print 'adding %i with hash %i' % (i, hash(i))
        d[i] = 1
        perturb >>= 5

    # Add new keys
    print 'generating slot collisions:'
    # cause a hash collision on the first try to enter the
    # prepared slot collision path
    i = 1*(2**64 - 1) + 1
    for x in xrange(max):
        #print 'adding %i with hash %i' % (i, hash(i))
        d[i] = 1
    print 'generated dict with %i items' % len(d)
    return d

integer_hash_collisions(1000)
integer_slot_collisions(1000)