[pypy-commit] pypy default: A latin-1 unicode string needs to have the same hash as the byte string.

Wed Jan 25 17:14:29 EST 2017

Author: Armin Rigo <arigo at tunes.org>
Branch: 
Changeset: r89775:d8c036e10b26
Date: 2017-01-25 23:13 +0100
http://bitbucket.org/pypy/pypy/changeset/d8c036e10b26/

Log:	A latin-1 unicode string needs to have the same hash as the byte
	string.

diff --git a/rpython/rlib/objectmodel.py b/rpython/rlib/objectmodel.py
--- a/rpython/rlib/objectmodel.py
+++ b/rpython/rlib/objectmodel.py
@@ -556,22 +556,20 @@
     from rpython.rtyper.lltypesystem import lltype, rffi
     from rpython.rlib.rarithmetic import intmask
 
-    if isinstance(s, str):
-        pass
-    elif isinstance(s, unicode):
-        if rffi.sizeof(lltype.UniChar) == 4:
-            kind = "I"
+    if not isinstance(s, str):
+        if isinstance(s, unicode):
+            lst = map(ord, s)
         else:
-            kind = "H"
-        s = array.array(kind, map(ord, s)).tostring()
-    else:
-        if lltype.typeOf(s).TO.chars.OF == lltype.Char:
+            lst = map(ord, s.chars)    # for rstr.STR or UNICODE
+        # NOTE: a latin-1 unicode string must have the same hash as the
+        # corresponding byte string.
+        if all(n <= 0xFF for n in lst):
             kind = "B"
         elif rffi.sizeof(lltype.UniChar) == 4:
             kind = "I"
         else:
             kind = "H"
-        s = array.array(kind, map(ord, s.chars)).tostring()
+        s = array.array(kind, lst).tostring()
     ptr = rffi.str2charp(s)
     x = siphash24(ptr, len(s))
     rffi.free_charp(ptr)
@@ -580,16 +578,33 @@
 def ll_hash_string_siphash24(ll_s):
     """Called from lltypesystem/rstr.py.  'll_s' is a rstr.STR or UNICODE."""
     from rpython.rlib.rsiphash import siphash24
-    from rpython.rtyper.lltypesystem import lltype, rffi, rstr
+    from rpython.rtyper.lltypesystem import lltype, llmemory, rffi, rstr
     from rpython.rlib.rarithmetic import intmask
 
     length = len(ll_s.chars)
-    # no GC operation from here!
     if lltype.typeOf(ll_s).TO.chars.OF == lltype.Char:
+        # no GC operation from here!
         addr = rstr._get_raw_buf_string(rstr.STR, ll_s, 0)
     else:
-        addr = rstr._get_raw_buf_unicode(rstr.UNICODE, ll_s, 0)
-        length *= rffi.sizeof(rstr.UNICODE.chars.OF)
+        # NOTE: a latin-1 unicode string must have the same hash as the
+        # corresponding byte string.  If the unicode is all within
+        # 0-255, then we need to allocate a byte buffer and copy the
+        # latin-1 encoding in it manually.
+        for i in range(length):
+            if ord(ll_s.chars[i]) > 0xFF:
+                # no GC operation from here!
+                addr = rstr._get_raw_buf_unicode(rstr.UNICODE, ll_s, 0)
+                length *= rffi.sizeof(rstr.UNICODE.chars.OF)
+                break
+        else:
+            p = lltype.malloc(rffi.CCHARP.TO, length, flavor='raw')
+            i = 0
+            while i < length:
+                p[i] = chr(ord(ll_s.chars[i]))
+                i += 1
+            x = siphash24(llmemory.cast_ptr_to_adr(p), length)
+            lltype.free(p, flavor='raw')
+            return intmask(x)
     x = siphash24(addr, length)
     keepalive_until_here(ll_s)
     return intmask(x)
diff --git a/rpython/translator/c/test/test_typed.py b/rpython/translator/c/test/test_typed.py
--- a/rpython/translator/c/test/test_typed.py
+++ b/rpython/translator/c/test/test_typed.py
@@ -606,25 +606,34 @@
         objectmodel.set_hash_algorithm(algo)
         s = "hello"
         u = u"world"
+        v = u"\u1234\u2318+\u2bcd\u2102"
         hash_s = compute_hash(s)
         hash_u = compute_hash(u)
+        hash_v = compute_hash(v)
+        assert hash_s == compute_hash(u"hello")   # same hash because it's
+        assert hash_u == compute_hash("world")    #    a latin-1 unicode
         #
         def fn(length):
             assert length >= 1
             return str((compute_hash(s),
                         compute_hash(u),
+                        compute_hash(v),
                         compute_hash(s[0] + s[1:length]),
-                        compute_hash(u[0] + u[1:length])))
+                        compute_hash(u[0] + u[1:length]),
+                        compute_hash(v[0] + v[1:length]),
+                        ))
 
-        assert fn(5) == str((hash_s, hash_u, hash_s, hash_u))
+        assert fn(5) == str((hash_s, hash_u, hash_v, hash_s, hash_u, hash_v))
 
         f = self.getcompiled(fn, [int])
         res = f(5)
         res = [int(a) for a in res[1:-1].split(",")]
         assert res[0] == hash_s
         assert res[1] == hash_u
-        assert res[2] == hash_s
-        assert res[3] == hash_u
+        assert res[2] == hash_v
+        assert res[3] == hash_s
+        assert res[4] == hash_u
+        assert res[5] == hash_v
 
     def test_hash_string_rpython(self):
         self._test_hash_string("rpython")