ENH: add data hashing routines (#14729)

xref https://github.com/dask/dask/pull/1807

(cherry picked from commit 06f26b51e9)

parent 6c688b947c
commit 59f633f330
@@ -1,5 +1,6 @@
import numpy as np
import pandas as pd
from pandas.util import testing as tm
from pandas.tools.hashing import hash_pandas_object


class algorithm(object):

@@ -55,3 +56,35 @@ class algorithm(object):

    def time_add_overflow_mixed_arr(self):
        self.checked_add(self.arr, self.arrmixed)


class hashing(object):
    goal_time = 0.2

    def setup(self):
        N = 100000

        self.df = pd.DataFrame(
            {'A': pd.Series(tm.makeStringIndex(100).take(
                np.random.randint(0, 100, size=N))),
             'B': pd.Series(tm.makeStringIndex(10000).take(
                 np.random.randint(0, 10000, size=N))),
             'D': np.random.randn(N),
             'E': np.arange(N),
             'F': pd.date_range('20110101', freq='s', periods=N),
             'G': pd.timedelta_range('1 day', freq='s', periods=N),
             })
        self.df['C'] = self.df['B'].astype('category')
        self.df.iloc[10:20] = np.nan

    # NOTE: these benchmarks go through the public hash_pandas_object
    # entry point; this commit does not add a .hash() method on
    # pandas objects.
    def time_frame(self):
        hash_pandas_object(self.df)

    def time_series_int(self):
        hash_pandas_object(self.df.E)

    def time_series_string(self):
        hash_pandas_object(self.df.B)

    def time_series_categorical(self):
        hash_pandas_object(self.df.C)
pandas/src/hash.pyx

@@ -0,0 +1,180 @@
# cython: profile=False

# Translated from the reference implementation
# at https://github.com/veorq/SipHash

import cython
cimport numpy as cnp
import numpy as np
from numpy cimport ndarray, uint8_t, uint32_t, uint64_t

from cpython cimport (PyString_Check,
                      PyBytes_Check,
                      PyUnicode_Check)
from libc.stdlib cimport malloc, free

DEF cROUNDS = 2
DEF dROUNDS = 4


@cython.boundscheck(False)
def hash_object_array(ndarray[object] arr, object key, object encoding='utf8'):
    """
    Parameters
    ----------
    arr : 1-d object ndarray of objects
    key : hash key, must be 16 bytes long once encoded
    encoding : encoding for key & arr, defaults to 'utf8'

    Returns
    -------
    1-d uint64 ndarray of hashes

    """
    cdef:
        Py_ssize_t i, l, n
        ndarray[uint64_t] result
        bytes data, k
        uint8_t *kb, *lens
        char **vecs, *cdata
        object val

    k = <bytes>key.encode(encoding)
    kb = <uint8_t *>k
    if len(k) != 16:
        raise ValueError(
            'key should be a 16-byte encoded string, got {!r} (len {})'.format(
                k, len(k)))

    n = len(arr)

    # create an array of bytes
    vecs = <char **> malloc(n * sizeof(char *))
    lens = <uint8_t*> malloc(n * sizeof(uint8_t))

    cdef list datas = []
    for i in range(n):
        val = arr[i]
        if PyString_Check(val):
            data = <bytes>val.encode(encoding)
        elif PyBytes_Check(val):
            data = <bytes>val
        elif PyUnicode_Check(val):
            data = <bytes>val.encode(encoding)
        else:
            # non-strings
            data = <bytes>str(val).encode(encoding)

        l = len(data)
        lens[i] = l
        cdata = data

        # keep the reference alive through the end of the
        # function
        datas.append(data)
        vecs[i] = cdata

    result = np.empty(n, dtype=np.uint64)
    with nogil:
        for i in range(n):
            result[i] = low_level_siphash(<uint8_t *>vecs[i], lens[i], kb)

    free(vecs)
    free(lens)
    return result


cdef inline uint64_t _rotl(uint64_t x, uint64_t b) nogil:
    return (x << b) | (x >> (64 - b))


cdef inline void u32to8_le(uint8_t* p, uint32_t v) nogil:
    p[0] = <uint8_t>(v)
    p[1] = <uint8_t>(v >> 8)
    p[2] = <uint8_t>(v >> 16)
    p[3] = <uint8_t>(v >> 24)


cdef inline void u64to8_le(uint8_t* p, uint64_t v) nogil:
    u32to8_le(p, <uint32_t>v)
    u32to8_le(p + 4, <uint32_t>(v >> 32))


cdef inline uint64_t u8to64_le(uint8_t* p) nogil:
    return (<uint64_t>p[0] |
            <uint64_t>p[1] << 8 |
            <uint64_t>p[2] << 16 |
            <uint64_t>p[3] << 24 |
            <uint64_t>p[4] << 32 |
            <uint64_t>p[5] << 40 |
            <uint64_t>p[6] << 48 |
            <uint64_t>p[7] << 56)


cdef inline void _sipround(uint64_t* v0, uint64_t* v1,
                           uint64_t* v2, uint64_t* v3) nogil:
    v0[0] += v1[0]
    v1[0] = _rotl(v1[0], 13)
    v1[0] ^= v0[0]
    v0[0] = _rotl(v0[0], 32)
    v2[0] += v3[0]
    v3[0] = _rotl(v3[0], 16)
    v3[0] ^= v2[0]
    v0[0] += v3[0]
    v3[0] = _rotl(v3[0], 21)
    v3[0] ^= v0[0]
    v2[0] += v1[0]
    v1[0] = _rotl(v1[0], 17)
    v1[0] ^= v2[0]
    v2[0] = _rotl(v2[0], 32)


cpdef uint64_t siphash(bytes data, bytes key) except? 0:
    if len(key) != 16:
        raise ValueError(
            'key should be a 16-byte bytestring, got {!r} (len {})'.format(
                key, len(key)))
    return low_level_siphash(data, len(data), key)


@cython.cdivision(True)
cdef uint64_t low_level_siphash(uint8_t* data, size_t datalen,
                                uint8_t* key) nogil:
    cdef uint64_t v0 = 0x736f6d6570736575ULL
    cdef uint64_t v1 = 0x646f72616e646f6dULL
    cdef uint64_t v2 = 0x6c7967656e657261ULL
    cdef uint64_t v3 = 0x7465646279746573ULL
    cdef uint64_t b
    cdef uint64_t k0 = u8to64_le(key)
    cdef uint64_t k1 = u8to64_le(key + 8)
    cdef uint64_t m
    cdef int i
    cdef uint8_t* end = data + datalen - (datalen % sizeof(uint64_t))
    cdef int left = datalen & 7
    cdef int left_byte

    b = (<uint64_t>datalen) << 56
    v3 ^= k1
    v2 ^= k0
    v1 ^= k1
    v0 ^= k0

    while (data != end):
        m = u8to64_le(data)
        v3 ^= m
        for i in range(cROUNDS):
            _sipround(&v0, &v1, &v2, &v3)
        v0 ^= m

        data += sizeof(uint64_t)

    for i in range(left - 1, -1, -1):
        b |= (<uint64_t>data[i]) << (i * 8)

    v3 ^= b

    for i in range(cROUNDS):
        _sipround(&v0, &v1, &v2, &v3)

    v0 ^= b
    v2 ^= 0xff

    for i in range(dROUNDS):
        _sipround(&v0, &v1, &v2, &v3)

    b = v0 ^ v1 ^ v2 ^ v3

    return b
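For reference, a minimal sanity check of the two entry points exposed above;
this is a sketch assuming the extension has been built and is importable as
pandas._hash (as registered in setup.py below), with illustrative values:

import numpy as np
from pandas import _hash

key = '0123456789123456'  # any 16-byte key works
arr = np.array(['foo', 'bar', None], dtype=object)

# deterministic for a fixed key; non-strings are stringified first
h1 = _hash.hash_object_array(arr, key)
h2 = _hash.hash_object_array(arr, key)
assert (h1 == h2).all()

# siphash is keyed: different keys give different hashes
# (with overwhelming probability)
assert _hash.siphash(b'pandas', b'0123456789123456') != \
    _hash.siphash(b'pandas', b'6543219876543210')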
pandas/tools/hashing.py

@@ -0,0 +1,137 @@
"""
data hash pandas / numpy objects
"""

import numpy as np
from pandas import _hash, Series, factorize, Categorical, Index
from pandas.lib import infer_dtype
from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame
from pandas.types.common import is_categorical_dtype

# 16 byte long hashing key
_default_hash_key = '0123456789123456'


def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None):
    """
    Return a data hash of the Index/Series/DataFrame

    .. versionadded:: 0.19.2

    Parameters
    ----------
    obj : Index, Series, or DataFrame
    index : boolean, default True
        include the index in the hash (if Series/DataFrame)
    encoding : string, default 'utf8'
        encoding for data & key when strings
    hash_key : string key to encode, default to _default_hash_key

    Returns
    -------
    Series of uint64, same length as the object

    """
    if hash_key is None:
        hash_key = _default_hash_key

    def adder(h, hashed_to_add):
        # combine a running hash with a new column/index hash as
        # h = 3 * h + hashed_to_add (in-place, modulo 2**64)
        h = np.multiply(h, np.uint(3), h)
        return np.add(h, hashed_to_add, h)

    if isinstance(obj, ABCIndexClass):
        h = hash_array(obj.values, encoding, hash_key).astype('uint64')
        h = Series(h, index=obj, dtype='uint64')
    elif isinstance(obj, ABCSeries):
        h = hash_array(obj.values, encoding, hash_key).astype('uint64')
        if index:
            h = adder(h, hash_pandas_object(obj.index,
                                            index=False,
                                            encoding=encoding,
                                            hash_key=hash_key).values)
        h = Series(h, index=obj.index, dtype='uint64')
    elif isinstance(obj, ABCDataFrame):
        cols = obj.iteritems()
        first_series = next(cols)[1]
        h = hash_array(first_series.values, encoding,
                       hash_key).astype('uint64')
        for _, col in cols:
            h = adder(h, hash_array(col.values, encoding, hash_key))
        if index:
            h = adder(h, hash_pandas_object(obj.index,
                                            index=False,
                                            encoding=encoding,
                                            hash_key=hash_key).values)

        h = Series(h, index=obj.index, dtype='uint64')
    else:
        raise TypeError("Unexpected type for hashing %s" % type(obj))
    return h


def hash_array(vals, encoding='utf8', hash_key=None):
    """
    Given a 1d array, return an array of deterministic integers.

    .. versionadded:: 0.19.2

    Parameters
    ----------
    vals : ndarray
    encoding : string, default 'utf8'
        encoding for data & key when strings
    hash_key : string key to encode, default to _default_hash_key

    Returns
    -------
    1d uint64 numpy array of hash values, same length as the vals

    """

    if hash_key is None:
        hash_key = _default_hash_key

    # work with categoricals as ints. (This check is above the complex
    # check so that we don't ask numpy if categorical is a subdtype of
    # complex, as it will choke.)
    if is_categorical_dtype(vals.dtype):
        vals = vals.codes

    # we'll be working with everything as 64-bit values, so handle this
    # 128-bit value early
    if np.issubdtype(vals.dtype, np.complex128):
        return hash_array(vals.real) + 23 * hash_array(vals.imag)

    # MAIN LOGIC:
    inferred = infer_dtype(vals)

    # First, turn whatever array this is into unsigned 64-bit ints, if we can
    # manage it.
    if inferred == 'boolean':
        vals = vals.astype('u8')

    if (np.issubdtype(vals.dtype, np.datetime64) or
            np.issubdtype(vals.dtype, np.timedelta64) or
            np.issubdtype(vals.dtype, np.number)) and vals.dtype.itemsize <= 8:

        vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
    else:

        # it's MUCH faster to categorize object dtypes, then hash and rename
        codes, categories = factorize(vals, sort=False)
        categories = Index(categories)
        c = Series(Categorical(codes, categories,
                               ordered=False, fastpath=True))
        vals = _hash.hash_object_array(categories.values,
                                       hash_key,
                                       encoding)

        # rename & extract
        vals = c.cat.rename_categories(Index(vals)).astype(np.uint64).values

    # Then, redistribute these 64-bit ints within the space of 64-bit ints
    vals ^= vals >> 30
    vals *= np.uint64(0xbf58476d1ce4e5b9)
    vals ^= vals >> 27
    vals *= np.uint64(0x94d049bb133111eb)
    vals ^= vals >> 31
    return vals
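A short usage sketch of the two helpers above (the behavior is mirrored by
the tests below; names and values here are illustrative only):

import numpy as np
from pandas import Series
from pandas.tools.hashing import hash_array, hash_pandas_object

s = Series(['a', 'b', 'c'])

# deterministic: the same object always hashes the same way
h = hash_pandas_object(s)            # uint64 Series, same length as s
assert (h == hash_pandas_object(s)).all()

# the index participates in the hash by default
assert not (hash_pandas_object(s, index=True).values ==
            hash_pandas_object(s, index=False).values).all()

# hash_array operates on the raw ndarray and returns a uint64 ndarray
assert (hash_array(s.values) == hash_array(s.values)).all()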
@@ -0,0 +1,143 @@
import numpy as np
import pandas as pd

from pandas import DataFrame, Series, Index
from pandas.tools.hashing import hash_array, hash_pandas_object
import pandas.util.testing as tm


class TestHashing(tm.TestCase):

    _multiprocess_can_split_ = True

    def setUp(self):
        self.df = DataFrame(
            {'i32': np.array([1, 2, 3] * 3, dtype='int32'),
             'f32': np.array([None, 2.5, 3.5] * 3, dtype='float32'),
             'cat': Series(['a', 'b', 'c'] * 3).astype('category'),
             'obj': Series(['d', 'e', 'f'] * 3),
             'bool': np.array([True, False, True] * 3),
             'dt': Series(pd.date_range('20130101', periods=9)),
             'dt_tz': Series(pd.date_range('20130101', periods=9,
                                           tz='US/Eastern')),
             'td': Series(pd.timedelta_range('2000', periods=9))})

    def test_consistency(self):
        # check that our hash doesn't change because of a mistake
        # in the actual code; this is the ground truth
        result = hash_pandas_object(Index(['foo', 'bar', 'baz']))
        expected = Series(np.array([3600424527151052760, 1374399572096150070,
                                    477881037637427054], dtype='uint64'),
                          index=['foo', 'bar', 'baz'])
        tm.assert_series_equal(result, expected)

    def test_hash_array(self):
        for name, s in self.df.iteritems():
            a = s.values
            tm.assert_numpy_array_equal(hash_array(a), hash_array(a))

    def check_equal(self, obj, **kwargs):
        a = hash_pandas_object(obj, **kwargs)
        b = hash_pandas_object(obj, **kwargs)
        tm.assert_series_equal(a, b)

        kwargs.pop('index', None)
        a = hash_pandas_object(obj, **kwargs)
        b = hash_pandas_object(obj, **kwargs)
        tm.assert_series_equal(a, b)

    def check_not_equal_with_index(self, obj):

        # check that we are not hashing the same if
        # we include the index
        if not isinstance(obj, Index):
            a = hash_pandas_object(obj, index=True)
            b = hash_pandas_object(obj, index=False)
            self.assertFalse((a == b).all())

    def test_hash_pandas_object(self):

        for obj in [Series([1, 2, 3]),
                    Series([1.0, 1.5, 3.2]),
                    Series([1.0, 1.5, np.nan]),
                    Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]),
                    Series(['a', 'b', 'c']),
                    Series(['a', np.nan, 'c']),
                    Series([True, False, True]),
                    Index([1, 2, 3]),
                    Index([True, False, True]),
                    DataFrame({'x': ['a', 'b', 'c'], 'y': [1, 2, 3]}),
                    tm.makeMissingDataframe(),
                    tm.makeMixedDataFrame(),
                    tm.makeTimeDataFrame(),
                    tm.makeTimeSeries(),
                    tm.makeTimedeltaIndex(),
                    Series([1, 2, 3], index=pd.MultiIndex.from_tuples(
                        [('a', 1), ('a', 2), ('b', 1)]))]:
            self.check_equal(obj)
            self.check_not_equal_with_index(obj)

    def test_hash_pandas_object2(self):
        for name, s in self.df.iteritems():
            self.check_equal(s)
            self.check_not_equal_with_index(s)

    def test_hash_pandas_empty_object(self):
        for obj in [Series([], dtype='float64'),
                    Series([], dtype='object'),
                    Index([])]:
            self.check_equal(obj)

            # these are by-definition the same with
            # or w/o the index as the data is empty

    def test_errors(self):

        for obj in [pd.Timestamp('20130101'), tm.makePanel()]:
            def f():
                hash_pandas_object(obj)

            self.assertRaises(TypeError, f)

    def test_hash_keys(self):
        # using different hash keys, should have different hashes
        # for the same data

        # this only matters for object dtypes
        obj = Series(list('abc'))
        a = hash_pandas_object(obj, hash_key='9876543210123456')
        b = hash_pandas_object(obj, hash_key='9876543210123465')
        self.assertTrue((a != b).all())

    def test_invalid_key(self):
        # this only matters for object dtypes
        def f():
            hash_pandas_object(Series(list('abc')), hash_key='foo')
        self.assertRaises(ValueError, f)

    def test_mixed(self):
        # mixed objects
        obj = Series(['1', 2, 3])
        self.check_equal(obj)
        self.check_not_equal_with_index(obj)

        # mixed are actually equal when stringified
        a = hash_pandas_object(obj)
        b = hash_pandas_object(Series(list('123')))
        self.assert_series_equal(a, b)

    def test_already_encoded(self):
        # if already encoded then ok

        obj = Series(list('abc')).str.encode('utf8')
        self.check_equal(obj)

    def test_alternate_encoding(self):

        obj = Series(list('abc'))
        self.check_equal(obj, encoding='ascii')

    def test_long_strings(self):

        obj = Index(tm.rands_array(nchars=10000, size=100))
        self.check_equal(obj)
setup.py
@@ -331,6 +331,7 @@ class CheckSDist(sdist_class):
                      'pandas/src/period.pyx',
                      'pandas/src/sparse.pyx',
                      'pandas/src/testing.pyx',
                      'pandas/src/hash.pyx',
                      'pandas/io/sas/saslib.pyx']

    def initialize_options(self):

@@ -501,10 +502,12 @@ ext_data = dict(
             'sources': ['pandas/src/parser/tokenizer.c',
                         'pandas/src/parser/io.c']},
    _sparse={'pyxfile': 'src/sparse',
             'depends': ([srcpath('sparse', suffix='.pyx')] +
                         _pxi_dep['_sparse'])},
    _testing={'pyxfile': 'src/testing',
              'depends': [srcpath('testing', suffix='.pyx')]},
    _hash={'pyxfile': 'src/hash',
           'depends': [srcpath('hash', suffix='.pyx')]},
)

ext_data["io.sas.saslib"] = {'pyxfile': 'io/sas/saslib'}