ERR: raise on non-string Python objects in object hashing; only strings and nulls are supported

xref #14729

Author: Jeff Reback <jeff@reback.net>

Closes #14767 from jreback/hashing_object and squashes the following commits:

9a5a5d4 [Jeff Reback] ERR: raise on non-string Python objects in object hashing; only strings and nulls are supported
This commit is contained in:
Jeff Reback 2016-11-30 06:01:28 -05:00
parent 423c16a2ee
commit de1132d878
2 changed files with 29 additions and 14 deletions

View File

@ -7,6 +7,7 @@ cimport numpy as cnp
import numpy as np
from numpy cimport ndarray, uint8_t, uint32_t, uint64_t
from util cimport _checknull
from cpython cimport (PyString_Check,
PyBytes_Check,
PyUnicode_Check)
@ -29,6 +30,11 @@ def hash_object_array(ndarray[object] arr, object key, object encoding='utf8'):
-------
1-d uint64 ndarray of hashes
Notes
-----
allowed values must be strings, or nulls
mixed array types will raise TypeError
"""
cdef:
Py_ssize_t i, l, n
@ -60,10 +66,14 @@ def hash_object_array(ndarray[object] arr, object key, object encoding='utf8'):
data = <bytes>val
elif PyUnicode_Check(val):
data = <bytes>val.encode(encoding)
else:
# non-strings
elif _checknull(val):
# null, stringify and encode
data = <bytes>str(val).encode(encoding)
else:
raise TypeError("{} of type {} is not a valid type for "
"hashing, must be string or null".format(val, type(val)))
l = len(data)
lens[i] = l
cdata = data

View File

@ -63,6 +63,7 @@ class TestHashing(tm.TestCase):
Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]),
Series(['a', 'b', 'c']),
Series(['a', np.nan, 'c']),
Series(['a', None, 'c']),
Series([True, False, True]),
Index([1, 2, 3]),
Index([True, False, True]),
@ -71,9 +72,7 @@ class TestHashing(tm.TestCase):
tm.makeMixedDataFrame(),
tm.makeTimeDataFrame(),
tm.makeTimeSeries(),
tm.makeTimedeltaIndex(),
Series([1, 2, 3], index=pd.MultiIndex.from_tuples(
[('a', 1), ('a', 2), ('b', 1)]))]:
tm.makeTimedeltaIndex()]:
self.check_equal(obj)
self.check_not_equal_with_index(obj)
@ -115,16 +114,22 @@ class TestHashing(tm.TestCase):
hash_pandas_object(Series(list('abc')), hash_key='foo')
self.assertRaises(ValueError, f)
def test_mixed(self):
# mixed objects
obj = Series(['1', 2, 3])
self.check_equal(obj)
self.check_not_equal_with_index(obj)
def test_unsupported_objects(self):
# mixed are actually equal when stringified
a = hash_pandas_object(obj)
b = hash_pandas_object(Series(list('123')))
self.assert_series_equal(a, b)
# mixed objects are not supported
obj = Series(['1', 2, 3])
def f():
hash_pandas_object(obj)
self.assertRaises(TypeError, f)
# MultiIndex are represented as tuples
obj = Series([1, 2, 3], index=pd.MultiIndex.from_tuples(
[('a', 1), ('a', 2), ('b', 1)]))
def f():
hash_pandas_object(obj)
self.assertRaises(TypeError, f)
def test_alread_encoded(self):
# if already encoded then ok