[Backport #14767] ERR: raise on python in object hashing, only supporting strings, nulls
xref #14729 Author: Jeff Reback <jeff@reback.net> Closes #14767 from jreback/hashing_object and squashes the following commits: 9a5a5d4
[Jeff Reback] ERR: raise on python in object hashing, only supporting strings, nulls (cherry picked from commit de1132d878)
This commit is contained in:
parent
612508a0ce
commit
560aded980
|
@ -7,6 +7,7 @@ cimport numpy as cnp
|
|||
import numpy as np
|
||||
from numpy cimport ndarray, uint8_t, uint32_t, uint64_t
|
||||
|
||||
from util cimport _checknull
|
||||
from cpython cimport (PyString_Check,
|
||||
PyBytes_Check,
|
||||
PyUnicode_Check)
|
||||
|
@ -29,6 +30,11 @@ def hash_object_array(ndarray[object] arr, object key, object encoding='utf8'):
|
|||
-------
|
||||
1-d uint64 ndarray of hashes
|
||||
|
||||
Notes
|
||||
-----
|
||||
allowed values must be strings, or nulls
|
||||
mixed array types will raise TypeError
|
||||
|
||||
"""
|
||||
cdef:
|
||||
Py_ssize_t i, l, n
|
||||
|
@ -60,10 +66,14 @@ def hash_object_array(ndarray[object] arr, object key, object encoding='utf8'):
|
|||
data = <bytes>val
|
||||
elif PyUnicode_Check(val):
|
||||
data = <bytes>val.encode(encoding)
|
||||
else:
|
||||
# non-strings
|
||||
elif _checknull(val):
|
||||
# null, stringify and encode
|
||||
data = <bytes>str(val).encode(encoding)
|
||||
|
||||
else:
|
||||
raise TypeError("{} of type {} is not a valid type for "
|
||||
"hashing, must be string or null".format(val, type(val)))
|
||||
|
||||
l = len(data)
|
||||
lens[i] = l
|
||||
cdata = data
|
||||
|
|
|
@ -63,6 +63,7 @@ class TestHashing(tm.TestCase):
|
|||
Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]),
|
||||
Series(['a', 'b', 'c']),
|
||||
Series(['a', np.nan, 'c']),
|
||||
Series(['a', None, 'c']),
|
||||
Series([True, False, True]),
|
||||
Index([1, 2, 3]),
|
||||
Index([True, False, True]),
|
||||
|
@ -71,9 +72,7 @@ class TestHashing(tm.TestCase):
|
|||
tm.makeMixedDataFrame(),
|
||||
tm.makeTimeDataFrame(),
|
||||
tm.makeTimeSeries(),
|
||||
tm.makeTimedeltaIndex(),
|
||||
Series([1, 2, 3], index=pd.MultiIndex.from_tuples(
|
||||
[('a', 1), ('a', 2), ('b', 1)]))]:
|
||||
tm.makeTimedeltaIndex()]:
|
||||
self.check_equal(obj)
|
||||
self.check_not_equal_with_index(obj)
|
||||
|
||||
|
@ -115,16 +114,22 @@ class TestHashing(tm.TestCase):
|
|||
hash_pandas_object(Series(list('abc')), hash_key='foo')
|
||||
self.assertRaises(ValueError, f)
|
||||
|
||||
def test_mixed(self):
|
||||
# mixed objects
|
||||
obj = Series(['1', 2, 3])
|
||||
self.check_equal(obj)
|
||||
self.check_not_equal_with_index(obj)
|
||||
def test_unsupported_objects(self):
|
||||
|
||||
# mixed are actually equal when stringified
|
||||
a = hash_pandas_object(obj)
|
||||
b = hash_pandas_object(Series(list('123')))
|
||||
self.assert_series_equal(a, b)
|
||||
# mixed objects are not supported
|
||||
obj = Series(['1', 2, 3])
|
||||
|
||||
def f():
|
||||
hash_pandas_object(obj)
|
||||
self.assertRaises(TypeError, f)
|
||||
|
||||
# MultiIndex are represented as tuples
|
||||
obj = Series([1, 2, 3], index=pd.MultiIndex.from_tuples(
|
||||
[('a', 1), ('a', 2), ('b', 1)]))
|
||||
|
||||
def f():
|
||||
hash_pandas_object(obj)
|
||||
self.assertRaises(TypeError, f)
|
||||
|
||||
def test_alread_encoded(self):
|
||||
# if already encoded then ok
|
||||
|
|
Loading…
Reference in New Issue