ENH: add data hashing routines (#14729)

xref https://github.com/dask/dask/pull/1807
(cherry picked from commit 06f26b51e9)
Jeff Reback 2016-11-28 11:19:05 -05:00 committed by Joris Van den Bossche
parent 6c688b947c
commit 59f633f330
5 changed files with 498 additions and 2 deletions
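In brief: the patch adds deterministic, SipHash-2-4-based hashing of pandas objects, exposed as pandas.tools.hashing.hash_pandas_object and hash_array. A minimal usage sketch, assuming a build that includes this patch (hash values are uint64, one per row; concrete values are omitted, not invented):

    import pandas as pd
    from pandas.tools.hashing import hash_pandas_object

    s = pd.Series(['a', 'b', 'c'])
    h = hash_pandas_object(s)  # Series of uint64, aligned with s.index

    # for object dtypes, a different 16-byte key yields different hashes
    h2 = hash_pandas_object(s, hash_key='9876543210123456')
    assert (h != h2).all()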

asv_bench/benchmarks/algorithms.py (modified)

@@ -1,5 +1,7 @@
import numpy as np
import pandas as pd
from pandas.tools.hashing import hash_pandas_object
from pandas.util import testing as tm


class algorithm(object):
@@ -55,3 +57,35 @@ class algorithm(object):

    def time_add_overflow_mixed_arr(self):
        self.checked_add(self.arr, self.arrmixed)


class hashing(object):

    goal_time = 0.2

    def setup(self):
        N = 100000
        self.df = pd.DataFrame(
            {'A': pd.Series(tm.makeStringIndex(100).take(
                np.random.randint(0, 100, size=N))),
             'B': pd.Series(tm.makeStringIndex(10000).take(
                 np.random.randint(0, 10000, size=N))),
             'D': np.random.randn(N),
             'E': np.arange(N),
             'F': pd.date_range('20110101', freq='s', periods=N),
             'G': pd.timedelta_range('1 day', freq='s', periods=N),
             })
        self.df['C'] = self.df['B'].astype('category')
        self.df.iloc[10:20] = np.nan

    def time_frame(self):
        hash_pandas_object(self.df)

    def time_series_int(self):
        hash_pandas_object(self.df.E)

    def time_series_string(self):
        hash_pandas_object(self.df.B)

    def time_series_categorical(self):
        hash_pandas_object(self.df.C)

pandas/src/hash.pyx (new file, 180 lines)

@@ -0,0 +1,180 @@
# cython: profile=False

# Translated from the reference implementation
# at https://github.com/veorq/SipHash

import cython
cimport numpy as cnp
import numpy as np
from numpy cimport ndarray, uint8_t, uint32_t, uint64_t

from cpython cimport (PyString_Check,
                      PyBytes_Check,
                      PyUnicode_Check)

from libc.stdlib cimport malloc, free

DEF cROUNDS = 2
DEF dROUNDS = 4
@cython.boundscheck(False)
def hash_object_array(ndarray[object] arr, object key, object encoding='utf8'):
    """
    Parameters
    ----------
    arr : 1-d ndarray of objects
    key : hash key, must encode to 16 bytes
    encoding : encoding for key & arr, default to 'utf8'

    Returns
    -------
    1-d uint64 ndarray of hashes
    """
    cdef:
        Py_ssize_t i, l, n
        ndarray[uint64_t] result
        bytes data, k
        uint8_t *kb
        uint64_t *lens
        char **vecs
        char *cdata
        object val

    k = <bytes>key.encode(encoding)
    kb = <uint8_t *>k
    if len(k) != 16:
        raise ValueError(
            'key should be a 16-byte string when encoded, got {!r} '
            '(len {})'.format(k, len(k)))

    n = len(arr)

    # create an array of bytes
    vecs = <char **>malloc(n * sizeof(char *))
    # store lengths as 64-bit ints so strings longer than 255 bytes
    # are not silently truncated
    lens = <uint64_t *>malloc(n * sizeof(uint64_t))

    cdef list datas = []
    for i in range(n):
        val = arr[i]
        if PyString_Check(val):
            data = <bytes>val.encode(encoding)
        elif PyBytes_Check(val):
            data = <bytes>val
        elif PyUnicode_Check(val):
            data = <bytes>val.encode(encoding)
        else:
            # non-strings
            data = <bytes>str(val).encode(encoding)

        l = len(data)
        lens[i] = l
        cdata = data

        # keep the reference alive through the end of the function
        datas.append(data)
        vecs[i] = cdata

    result = np.empty(n, dtype=np.uint64)
    with nogil:
        for i in range(n):
            result[i] = low_level_siphash(<uint8_t *>vecs[i], lens[i], kb)

    free(vecs)
    free(lens)
    return result
cdef inline uint64_t _rotl(uint64_t x, uint64_t b) nogil:
    return (x << b) | (x >> (64 - b))


cdef inline void u32to8_le(uint8_t* p, uint32_t v) nogil:
    p[0] = <uint8_t>(v)
    p[1] = <uint8_t>(v >> 8)
    p[2] = <uint8_t>(v >> 16)
    p[3] = <uint8_t>(v >> 24)


cdef inline void u64to8_le(uint8_t* p, uint64_t v) nogil:
    u32to8_le(p, <uint32_t>v)
    u32to8_le(p + 4, <uint32_t>(v >> 32))


cdef inline uint64_t u8to64_le(uint8_t* p) nogil:
    return (<uint64_t>p[0] |
            <uint64_t>p[1] << 8 |
            <uint64_t>p[2] << 16 |
            <uint64_t>p[3] << 24 |
            <uint64_t>p[4] << 32 |
            <uint64_t>p[5] << 40 |
            <uint64_t>p[6] << 48 |
            <uint64_t>p[7] << 56)


cdef inline void _sipround(uint64_t* v0, uint64_t* v1,
                           uint64_t* v2, uint64_t* v3) nogil:
    v0[0] += v1[0]
    v1[0] = _rotl(v1[0], 13)
    v1[0] ^= v0[0]
    v0[0] = _rotl(v0[0], 32)
    v2[0] += v3[0]
    v3[0] = _rotl(v3[0], 16)
    v3[0] ^= v2[0]
    v0[0] += v3[0]
    v3[0] = _rotl(v3[0], 21)
    v3[0] ^= v0[0]
    v2[0] += v1[0]
    v1[0] = _rotl(v1[0], 17)
    v1[0] ^= v2[0]
    v2[0] = _rotl(v2[0], 32)


cpdef uint64_t siphash(bytes data, bytes key) except? 0:
    if len(key) != 16:
        raise ValueError(
            'key should be a 16-byte bytestring, got {!r} (len {})'.format(
                key, len(key)))
    return low_level_siphash(data, len(data), key)


@cython.cdivision(True)
cdef uint64_t low_level_siphash(uint8_t* data, size_t datalen,
                                uint8_t* key) nogil:
    cdef uint64_t v0 = 0x736f6d6570736575ULL
    cdef uint64_t v1 = 0x646f72616e646f6dULL
    cdef uint64_t v2 = 0x6c7967656e657261ULL
    cdef uint64_t v3 = 0x7465646279746573ULL
    cdef uint64_t b
    cdef uint64_t k0 = u8to64_le(key)
    cdef uint64_t k1 = u8to64_le(key + 8)
    cdef uint64_t m
    cdef int i
    cdef uint8_t* end = data + datalen - (datalen % sizeof(uint64_t))
    cdef int left = datalen & 7
    cdef int left_byte

    b = (<uint64_t>datalen) << 56
    v3 ^= k1
    v2 ^= k0
    v1 ^= k1
    v0 ^= k0

    while (data != end):
        m = u8to64_le(data)
        v3 ^= m
        for i in range(cROUNDS):
            _sipround(&v0, &v1, &v2, &v3)
        v0 ^= m
        data += sizeof(uint64_t)

    for i in range(left - 1, -1, -1):
        b |= (<uint64_t>data[i]) << (i * 8)

    v3 ^= b
    for i in range(cROUNDS):
        _sipround(&v0, &v1, &v2, &v3)
    v0 ^= b
    v2 ^= 0xff
    for i in range(dROUNDS):
        _sipround(&v0, &v1, &v2, &v3)
    b = v0 ^ v1 ^ v2 ^ v3
    return b
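Once built, the extension imports as pandas._hash (per the setup.py entry below), and the cpdef siphash entry point can be sanity-checked against the first published SipHash-2-4 test vector from the reference repository. This is an editorial sketch, not part of the patch:

    import numpy as np
    from pandas import _hash

    # reference key 0x000102...0f; the expected output for the empty
    # message comes from the vectors in https://github.com/veorq/SipHash
    key = bytes(bytearray(range(16)))
    assert _hash.siphash(b'', key) == 0x726fdb47dd0e0e31

    # hash_object_array stringifies non-string elements before hashing
    arr = np.array(['a', 'b', None], dtype=object)
    out = _hash.hash_object_array(arr, '0123456789123456')
    assert out.dtype == np.uint64 and len(out) == 3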

pandas/tools/hashing.py (new file, 137 lines)

@@ -0,0 +1,137 @@
"""
Data hashing routines for pandas / numpy objects
"""

import numpy as np
from pandas import _hash, Series, factorize, Categorical, Index
from pandas.lib import infer_dtype
from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame
from pandas.types.common import is_categorical_dtype

# 16 byte long hashing key
_default_hash_key = '0123456789123456'
def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None):
    """
    Return a data hash of the Index/Series/DataFrame

    .. versionadded:: 0.19.2

    Parameters
    ----------
    obj : Index, Series, or DataFrame
    index : boolean, default True
        include the index in the hash (if Series/DataFrame)
    encoding : string, default 'utf8'
        encoding for data & key when strings
    hash_key : string key to encode, default to _default_hash_key

    Returns
    -------
    Series of uint64, same length as the object
    """
    if hash_key is None:
        hash_key = _default_hash_key

    def adder(h, hashed_to_add):
        h = np.multiply(h, np.uint(3), h)
        return np.add(h, hashed_to_add, h)

    if isinstance(obj, ABCIndexClass):
        h = hash_array(obj.values, encoding, hash_key).astype('uint64')
        h = Series(h, index=obj, dtype='uint64')
    elif isinstance(obj, ABCSeries):
        h = hash_array(obj.values, encoding, hash_key).astype('uint64')
        if index:
            h = adder(h, hash_pandas_object(obj.index,
                                            index=False,
                                            encoding=encoding,
                                            hash_key=hash_key).values)
        h = Series(h, index=obj.index, dtype='uint64')
    elif isinstance(obj, ABCDataFrame):
        cols = obj.iteritems()
        first_series = next(cols)[1]
        h = hash_array(first_series.values, encoding,
                       hash_key).astype('uint64')
        for _, col in cols:
            h = adder(h, hash_array(col.values, encoding, hash_key))
        if index:
            h = adder(h, hash_pandas_object(obj.index,
                                            index=False,
                                            encoding=encoding,
                                            hash_key=hash_key).values)
        h = Series(h, index=obj.index, dtype='uint64')
    else:
        raise TypeError("Unexpected type for hashing %s" % type(obj))
    return h
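As the adder closure above suggests, the per-row hashes of each column (and optionally the index) are folded together, so including the index should change the result. A short sketch mirroring what the tests below assert:

    import pandas as pd
    from pandas.tools.hashing import hash_pandas_object

    df = pd.DataFrame({'x': ['a', 'b', 'c'], 'y': [1, 2, 3]})
    with_index = hash_pandas_object(df)  # index=True is the default
    without_index = hash_pandas_object(df, index=False)
    assert not (with_index == without_index).all()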
def hash_array(vals, encoding='utf8', hash_key=None):
    """
    Given a 1d array, return an array of deterministic integers.

    .. versionadded:: 0.19.2

    Parameters
    ----------
    vals : ndarray
    encoding : string, default 'utf8'
        encoding for data & key when strings
    hash_key : string key to encode, default to _default_hash_key

    Returns
    -------
    1d uint64 numpy array of hash values, same length as vals
    """
    if hash_key is None:
        hash_key = _default_hash_key

    # work with categoricals as ints. (This check is above the complex
    # check so that we don't ask numpy if categorical is a subdtype of
    # complex, as it will choke.)
    if is_categorical_dtype(vals.dtype):
        vals = vals.codes

    # we'll be working with everything as 64-bit values, so handle this
    # 128-bit value early
    if np.issubdtype(vals.dtype, np.complex128):
        return hash_array(vals.real) + 23 * hash_array(vals.imag)

    # MAIN LOGIC:
    inferred = infer_dtype(vals)

    # First, turn whatever array this is into unsigned 64-bit ints, if we can
    # manage it.
    if inferred == 'boolean':
        vals = vals.astype('u8')

    if (np.issubdtype(vals.dtype, np.datetime64) or
            np.issubdtype(vals.dtype, np.timedelta64) or
            np.issubdtype(vals.dtype, np.number)) and vals.dtype.itemsize <= 8:
        vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
    else:
        # it's MUCH faster to categorize object dtypes, then hash and rename
        codes, categories = factorize(vals, sort=False)
        categories = Index(categories)
        c = Series(Categorical(codes, categories,
                               ordered=False, fastpath=True))
        vals = _hash.hash_object_array(categories.values,
                                       hash_key,
                                       encoding)
        # rename & extract
        vals = c.cat.rename_categories(Index(vals)).astype(np.uint64).values

    # Then, redistribute these 64-bit ints within the space of 64-bit ints
    vals ^= vals >> 30
    vals *= np.uint64(0xbf58476d1ce4e5b9)
    vals ^= vals >> 27
    vals *= np.uint64(0x94d049bb133111eb)
    vals ^= vals >> 31
    return vals
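The last five statements form a splitmix64-style finalizer that redistributes nearby integers across the full uint64 space. As a standalone sketch (the helper name _mix64 is hypothetical, for illustration only):

    import numpy as np

    def _mix64(vals):
        # same shifts and constants as the tail of hash_array above
        vals = vals.astype('uint64', copy=True)
        vals ^= vals >> 30
        vals *= np.uint64(0xbf58476d1ce4e5b9)
        vals ^= vals >> 27
        vals *= np.uint64(0x94d049bb133111eb)
        vals ^= vals >> 31
        return vals

    print(_mix64(np.arange(3)))  # consecutive inputs, widely spread outputs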

pandas/tools/tests/test_hashing.py (new file, 143 lines)

@@ -0,0 +1,143 @@
import numpy as np
import pandas as pd

from pandas import DataFrame, Series, Index
from pandas.tools.hashing import hash_array, hash_pandas_object
import pandas.util.testing as tm


class TestHashing(tm.TestCase):

    _multiprocess_can_split_ = True

    def setUp(self):
        self.df = DataFrame(
            {'i32': np.array([1, 2, 3] * 3, dtype='int32'),
             'f32': np.array([None, 2.5, 3.5] * 3, dtype='float32'),
             'cat': Series(['a', 'b', 'c'] * 3).astype('category'),
             'obj': Series(['d', 'e', 'f'] * 3),
             'bool': np.array([True, False, True] * 3),
             'dt': Series(pd.date_range('20130101', periods=9)),
             'dt_tz': Series(pd.date_range('20130101', periods=9,
                                           tz='US/Eastern')),
             'td': Series(pd.timedelta_range('2000', periods=9))})

    def test_consistency(self):
        # check that our hash doesn't change because of a mistake
        # in the actual code; this is the ground truth
        result = hash_pandas_object(Index(['foo', 'bar', 'baz']))
        expected = Series(np.array([3600424527151052760, 1374399572096150070,
                                    477881037637427054], dtype='uint64'),
                          index=['foo', 'bar', 'baz'])
        tm.assert_series_equal(result, expected)

    def test_hash_array(self):
        for name, s in self.df.iteritems():
            a = s.values
            tm.assert_numpy_array_equal(hash_array(a), hash_array(a))

    def check_equal(self, obj, **kwargs):
        a = hash_pandas_object(obj, **kwargs)
        b = hash_pandas_object(obj, **kwargs)
        tm.assert_series_equal(a, b)

        kwargs.pop('index', None)
        a = hash_pandas_object(obj, **kwargs)
        b = hash_pandas_object(obj, **kwargs)
        tm.assert_series_equal(a, b)

    def check_not_equal_with_index(self, obj):
        # check that we are not hashing the same if
        # we include the index
        if not isinstance(obj, Index):
            a = hash_pandas_object(obj, index=True)
            b = hash_pandas_object(obj, index=False)
            self.assertFalse((a == b).all())

    def test_hash_pandas_object(self):
        for obj in [Series([1, 2, 3]),
                    Series([1.0, 1.5, 3.2]),
                    Series([1.0, 1.5, np.nan]),
                    Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]),
                    Series(['a', 'b', 'c']),
                    Series(['a', np.nan, 'c']),
                    Series([True, False, True]),
                    Index([1, 2, 3]),
                    Index([True, False, True]),
                    DataFrame({'x': ['a', 'b', 'c'], 'y': [1, 2, 3]}),
                    tm.makeMissingDataframe(),
                    tm.makeMixedDataFrame(),
                    tm.makeTimeDataFrame(),
                    tm.makeTimeSeries(),
                    tm.makeTimedeltaIndex(),
                    Series([1, 2, 3], index=pd.MultiIndex.from_tuples(
                        [('a', 1), ('a', 2), ('b', 1)]))]:
            self.check_equal(obj)
            self.check_not_equal_with_index(obj)

    def test_hash_pandas_object2(self):
        for name, s in self.df.iteritems():
            self.check_equal(s)
            self.check_not_equal_with_index(s)

    def test_hash_pandas_empty_object(self):
        # these are by-definition the same with
        # or w/o the index as the data is empty
        for obj in [Series([], dtype='float64'),
                    Series([], dtype='object'),
                    Index([])]:
            self.check_equal(obj)

    def test_errors(self):
        for obj in [pd.Timestamp('20130101'), tm.makePanel()]:
            def f():
                hash_pandas_object(obj)

            self.assertRaises(TypeError, f)

    def test_hash_keys(self):
        # using different hash keys, should have different hashes
        # for the same data
        # this only matters for object dtypes
        obj = Series(list('abc'))
        a = hash_pandas_object(obj, hash_key='9876543210123456')
        b = hash_pandas_object(obj, hash_key='9876543210123465')
        self.assertTrue((a != b).all())

    def test_invalid_key(self):
        # this only matters for object dtypes
        def f():
            hash_pandas_object(Series(list('abc')), hash_key='foo')
        self.assertRaises(ValueError, f)

    def test_mixed(self):
        # mixed objects
        obj = Series(['1', 2, 3])
        self.check_equal(obj)
        self.check_not_equal_with_index(obj)

        # mixed are actually equal when stringified
        a = hash_pandas_object(obj)
        b = hash_pandas_object(Series(list('123')))
        self.assert_series_equal(a, b)

    def test_already_encoded(self):
        # if already encoded then ok
        obj = Series(list('abc')).str.encode('utf8')
        self.check_equal(obj)

    def test_alternate_encoding(self):
        obj = Series(list('abc'))
        self.check_equal(obj, encoding='ascii')

    def test_long_strings(self):
        obj = Index(tm.rands_array(nchars=10000, size=100))
        self.check_equal(obj)

setup.py (modified)

@@ -331,6 +331,7 @@ class CheckSDist(sdist_class):
                 'pandas/src/period.pyx',
                 'pandas/src/sparse.pyx',
                 'pandas/src/testing.pyx',
                 'pandas/src/hash.pyx',
                 'pandas/io/sas/saslib.pyx']

    def initialize_options(self):
@@ -501,10 +502,12 @@ ext_data = dict(
              'sources': ['pandas/src/parser/tokenizer.c',
                          'pandas/src/parser/io.c']},
    _sparse={'pyxfile': 'src/sparse',
             'depends': ([srcpath('sparse', suffix='.pyx')] +
                         _pxi_dep['_sparse'])},
    _testing={'pyxfile': 'src/testing',
              'depends': [srcpath('testing', suffix='.pyx')]},
    _hash={'pyxfile': 'src/hash',
           'depends': [srcpath('hash', suffix='.pyx')]},
)

ext_data["io.sas.saslib"] = {'pyxfile': 'io/sas/saslib'}
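With these entries in place, a development checkout rebuilds the extension via the usual python setup.py build_ext --inplace; the new module then imports as pandas._hash, which pandas/tools/hashing.py above relies on.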