BUG: fix hash collisions from int overflow (#14805)
* BUG: we don't like hash collisions in siphash
xref #14767
* This should be a 64-bit int, not an 8-bit int
* fix tests
(cherry picked from commit 51f725f7e8)
This commit is contained in:
parent
13f28f558b
commit
dc23751b44
|
@ -40,7 +40,8 @@ def hash_object_array(ndarray[object] arr, object key, object encoding='utf8'):
|
|||
Py_ssize_t i, l, n
|
||||
ndarray[uint64_t] result
|
||||
bytes data, k
|
||||
uint8_t *kb, *lens
|
||||
uint8_t *kb
|
||||
uint64_t *lens
|
||||
char **vecs, *cdata
|
||||
object val
|
||||
|
||||
|
@ -55,7 +56,7 @@ def hash_object_array(ndarray[object] arr, object key, object encoding='utf8'):
|
|||
|
||||
# create an array of bytes
|
||||
vecs = <char **> malloc(n * sizeof(char *))
|
||||
lens = <uint8_t*> malloc(n * sizeof(uint8_t))
|
||||
lens = <uint64_t*> malloc(n * sizeof(uint64_t))
|
||||
|
||||
cdef list datas = []
|
||||
for i in range(n):
|
||||
|
|
|
@ -142,7 +142,36 @@ class TestHashing(tm.TestCase):
|
|||
obj = Series(list('abc'))
|
||||
self.check_equal(obj, encoding='ascii')
|
||||
|
||||
def test_long_strings(self):
|
||||
def test_same_len_hash_collisions(self):
|
||||
|
||||
obj = Index(tm.rands_array(nchars=10000, size=100))
|
||||
self.check_equal(obj)
|
||||
for l in range(8):
|
||||
length = 2**(l + 8) + 1
|
||||
s = tm.rands_array(length, 2)
|
||||
result = hash_array(s, 'utf8')
|
||||
self.assertFalse(result[0] == result[1])
|
||||
|
||||
for l in range(8):
|
||||
length = 2**(l + 8)
|
||||
s = tm.rands_array(length, 2)
|
||||
result = hash_array(s, 'utf8')
|
||||
self.assertFalse(result[0] == result[1])
|
||||
|
||||
def test_hash_collisions(self):
    # Regression check: two long, distinct strings must not hash to the
    # same value, whether hashed one at a time or together in one array.
    # https://github.com/pandas-dev/pandas/issues/14711#issuecomment-264885726
    first = 'Ingrid-9Z9fKIZmkO7i7Cn51Li34pJm44fgX6DYGBNj3VPlOH50m7HnBlPxfIwFMrcNJNMP6PSgLmwWnInciMWrCSAlLEvt7JkJl4IxiMrVbXSa8ZQoVaq5xoQPjltuJEfwdNlO6jo8qRRHvD8sBEBMQASrRa6TsdaPTPCBo3nwIBpE7YzzmyH0vMBhjQZLx1aCT7faSEx7PgFxQhHdKFWROcysamgy9iVj8DO2Fmwg1NNl93rIAqC3mdqfrCxrzfvIY8aJdzin2cHVzy3QUJxZgHvtUtOLxoqnUHsYbNTeq0xcLXpTZEZCxD4PGubIuCNf32c33M7HFsnjWSEjE2yVdWKhmSVodyF8hFYVmhYnMCztQnJrt3O8ZvVRXd5IKwlLexiSp4h888w7SzAIcKgc3g5XQJf6MlSMftDXm9lIsE1mJNiJEv6uY6pgvC3fUPhatlR5JPpVAHNSbSEE73MBzJrhCAbOLXQumyOXigZuPoME7QgJcBalliQol7YZ9'  # noqa
    second = 'Tim-b9MddTxOWW2AT1Py6vtVbZwGAmYCjbp89p8mxsiFoVX4FyDOF3wFiAkyQTUgwg9sVqVYOZo09Dh1AzhFHbgij52ylF0SEwgzjzHH8TGY8Lypart4p4onnDoDvVMBa0kdthVGKl6K0BDVGzyOXPXKpmnMF1H6rJzqHJ0HywfwS4XYpVwlAkoeNsiicHkJUFdUAhG229INzvIAiJuAHeJDUoyO4DCBqtoZ5TDend6TK7Y914yHlfH3g1WZu5LksKv68VQHJriWFYusW5e6ZZ6dKaMjTwEGuRgdT66iU5nqWTHRH8WSzpXoCFwGcTOwyuqPSe0fTe21DVtJn1FKj9F9nEnR9xOvJUO7E0piCIF4Ad9yAIDY4DBimpsTfKXCu1vdHpKYerzbndfuFe5AhfMduLYZJi5iAw8qKSwR5h86ttXV0Mc0QmXz8dsRvDgxjXSmupPxBggdlqUlC828hXiTPD7am0yETBV0F3bEtvPiNJfremszcV8NcqAoARMe'  # noqa

    # Hash the first string on its own; the expected value is pinned.
    hashed_first = hash_array(np.asarray([first], dtype=object), 'utf8')
    want_first = np.array([14963968704024874985], dtype=np.uint64)
    self.assert_numpy_array_equal(hashed_first, want_first)

    # Same for the second string; its pinned value differs from the first.
    hashed_second = hash_array(np.asarray([second], dtype=object), 'utf8')
    want_second = np.array([16428432627716348016], dtype=np.uint64)
    self.assert_numpy_array_equal(hashed_second, want_second)

    # Hashing both in a single array must give the same two values, in order.
    hashed_both = hash_array(
        np.asarray([first, second], dtype=object), 'utf8')
    want_both = np.concatenate([want_first, want_second], axis=0)
    self.assert_numpy_array_equal(hashed_both, want_both)
|
||||
|
|
Loading…
Reference in New Issue