[pypy-commit] pypy default: A latin-1 unicode string needs to have the same hash as the byte string.
arigo
pypy.commits at gmail.com
Wed Jan 25 17:14:29 EST 2017
Author: Armin Rigo <arigo at tunes.org>
Branch:
Changeset: r89775:d8c036e10b26
Date: 2017-01-25 23:13 +0100
http://bitbucket.org/pypy/pypy/changeset/d8c036e10b26/
Log: A latin-1 unicode string needs to have the same hash as the byte string.
diff --git a/rpython/rlib/objectmodel.py b/rpython/rlib/objectmodel.py
--- a/rpython/rlib/objectmodel.py
+++ b/rpython/rlib/objectmodel.py
@@ -556,22 +556,20 @@
from rpython.rtyper.lltypesystem import lltype, rffi
from rpython.rlib.rarithmetic import intmask
- if isinstance(s, str):
- pass
- elif isinstance(s, unicode):
- if rffi.sizeof(lltype.UniChar) == 4:
- kind = "I"
+ if not isinstance(s, str):
+ if isinstance(s, unicode):
+ lst = map(ord, s)
else:
- kind = "H"
- s = array.array(kind, map(ord, s)).tostring()
- else:
- if lltype.typeOf(s).TO.chars.OF == lltype.Char:
+ lst = map(ord, s.chars) # for rstr.STR or UNICODE
+ # NOTE: a latin-1 unicode string must have the same hash as the
+ # corresponding byte string.
+ if all(n <= 0xFF for n in lst):
kind = "B"
elif rffi.sizeof(lltype.UniChar) == 4:
kind = "I"
else:
kind = "H"
- s = array.array(kind, map(ord, s.chars)).tostring()
+ s = array.array(kind, lst).tostring()
ptr = rffi.str2charp(s)
x = siphash24(ptr, len(s))
rffi.free_charp(ptr)
@@ -580,16 +578,33 @@
def ll_hash_string_siphash24(ll_s):
"""Called from lltypesystem/rstr.py. 'll_s' is a rstr.STR or UNICODE."""
from rpython.rlib.rsiphash import siphash24
- from rpython.rtyper.lltypesystem import lltype, rffi, rstr
+ from rpython.rtyper.lltypesystem import lltype, llmemory, rffi, rstr
from rpython.rlib.rarithmetic import intmask
length = len(ll_s.chars)
- # no GC operation from here!
if lltype.typeOf(ll_s).TO.chars.OF == lltype.Char:
+ # no GC operation from here!
addr = rstr._get_raw_buf_string(rstr.STR, ll_s, 0)
else:
- addr = rstr._get_raw_buf_unicode(rstr.UNICODE, ll_s, 0)
- length *= rffi.sizeof(rstr.UNICODE.chars.OF)
+ # NOTE: a latin-1 unicode string must have the same hash as the
+ # corresponding byte string. If the unicode is all within
+ # 0-255, then we need to allocate a byte buffer and copy the
+ # latin-1 encoding in it manually.
+ for i in range(length):
+ if ord(ll_s.chars[i]) > 0xFF:
+ # no GC operation from here!
+ addr = rstr._get_raw_buf_unicode(rstr.UNICODE, ll_s, 0)
+ length *= rffi.sizeof(rstr.UNICODE.chars.OF)
+ break
+ else:
+ p = lltype.malloc(rffi.CCHARP.TO, length, flavor='raw')
+ i = 0
+ while i < length:
+ p[i] = chr(ord(ll_s.chars[i]))
+ i += 1
+ x = siphash24(llmemory.cast_ptr_to_adr(p), length)
+ lltype.free(p, flavor='raw')
+ return intmask(x)
x = siphash24(addr, length)
keepalive_until_here(ll_s)
return intmask(x)
diff --git a/rpython/translator/c/test/test_typed.py b/rpython/translator/c/test/test_typed.py
--- a/rpython/translator/c/test/test_typed.py
+++ b/rpython/translator/c/test/test_typed.py
@@ -606,25 +606,34 @@
objectmodel.set_hash_algorithm(algo)
s = "hello"
u = u"world"
+ v = u"\u1234\u2318+\u2bcd\u2102"
hash_s = compute_hash(s)
hash_u = compute_hash(u)
+ hash_v = compute_hash(v)
+ assert hash_s == compute_hash(u"hello") # same hash because it's
+ assert hash_u == compute_hash("world") # a latin-1 unicode
#
def fn(length):
assert length >= 1
return str((compute_hash(s),
compute_hash(u),
+ compute_hash(v),
compute_hash(s[0] + s[1:length]),
- compute_hash(u[0] + u[1:length])))
+ compute_hash(u[0] + u[1:length]),
+ compute_hash(v[0] + v[1:length]),
+ ))
- assert fn(5) == str((hash_s, hash_u, hash_s, hash_u))
+ assert fn(5) == str((hash_s, hash_u, hash_v, hash_s, hash_u, hash_v))
f = self.getcompiled(fn, [int])
res = f(5)
res = [int(a) for a in res[1:-1].split(",")]
assert res[0] == hash_s
assert res[1] == hash_u
- assert res[2] == hash_s
- assert res[3] == hash_u
+ assert res[2] == hash_v
+ assert res[3] == hash_s
+ assert res[4] == hash_u
+ assert res[5] == hash_v
def test_hash_string_rpython(self):
self._test_hash_string("rpython")
More information about the pypy-commit mailing list