merge_asof() has type specializations and can take multiple 'by' parameters (#13936)

This commit is contained in:
Christopher C. Aycock 2016-12-01 13:52:16 -05:00
parent 3552dc0c45
commit 75157fcbc0
5 changed files with 1762 additions and 59 deletions

View File

@ -310,6 +310,25 @@ class merge_asof_noby(object):
merge_asof(self.df1, self.df2, on='time')
class merge_asof_int32_noby(object):
    """Benchmark ``merge_asof`` on an int32 ``time`` key without a ``by`` column."""

    def setup(self):
        """Build two time-sorted frames of seeded random data with int32 keys."""
        np.random.seed(0)
        one_count = 200000
        two_count = 1000000
        # Floor division keeps randint's upper bound an integer on Python 3,
        # where ``/`` would produce a float.
        self.df1 = pd.DataFrame(
            {'time': np.random.randint(0, one_count // 20, one_count),
             'value1': np.random.randn(one_count)})
        self.df1['time'] = self.df1['time'].astype(np.int32)
        self.df2 = pd.DataFrame(
            {'time': np.random.randint(0, two_count // 20, two_count),
             'value2': np.random.randn(two_count)})
        self.df2['time'] = self.df2['time'].astype(np.int32)
        # merge_asof is called on frames ordered by the ``on`` key
        self.df1 = self.df1.sort_values('time')
        self.df2 = self.df2.sort_values('time')

    def time_merge_asof_int32_noby(self):
        """Timed body: as-of merge of the two frames on ``time``."""
        merge_asof(self.df1, self.df2, on='time')
class merge_asof_by_object(object):
def setup(self):
@ -318,10 +337,10 @@ class merge_asof_by_object(object):
one_count = 200000
two_count = 1000000
self.df1 = pd.DataFrame({'time': np.random.randint(0, one_count/20, one_count),
'key': np.random.choice(list(string.uppercase), one_count),
'key': np.random.choice(list(string.ascii_uppercase), one_count),
'value1': np.random.randn(one_count)})
self.df2 = pd.DataFrame({'time': np.random.randint(0, two_count/20, two_count),
'key': np.random.choice(list(string.uppercase), two_count),
'key': np.random.choice(list(string.ascii_uppercase), two_count),
'value2': np.random.randn(two_count)})
self.df1 = self.df1.sort_values('time')
self.df2 = self.df2.sort_values('time')
@ -349,6 +368,28 @@ class merge_asof_by_int(object):
merge_asof(self.df1, self.df2, on='time', by='key')
class merge_asof_multiby(object):
    """Benchmark ``merge_asof`` with two ``by`` columns (``key1``, ``key2``)."""

    def setup(self):
        """Build two time-sorted frames with two single-letter key columns."""
        import string
        np.random.seed(0)
        one_count = 200000
        two_count = 1000000
        letters = list(string.ascii_uppercase)
        # Floor division keeps randint's upper bound an integer on Python 3,
        # where ``/`` would produce a float.
        self.df1 = pd.DataFrame(
            {'time': np.random.randint(0, one_count // 20, one_count),
             'key1': np.random.choice(letters, one_count),
             'key2': np.random.choice(letters, one_count),
             'value1': np.random.randn(one_count)})
        self.df2 = pd.DataFrame(
            {'time': np.random.randint(0, two_count // 20, two_count),
             'key1': np.random.choice(letters, two_count),
             'key2': np.random.choice(letters, two_count),
             'value2': np.random.randn(two_count)})
        # merge_asof is called on frames ordered by the ``on`` key
        self.df1 = self.df1.sort_values('time')
        self.df2 = self.df2.sort_values('time')

    def time_merge_asof_multiby(self):
        """Timed body: as-of merge on ``time`` grouped by both key columns."""
        merge_asof(self.df1, self.df2, on='time', by=['key1', 'key2'])
class join_non_unique_equal(object):
goal_time = 0.2

View File

@ -1,3 +1,4 @@
# cython: boundscheck=False, wraparound=False
"""
Template for each `dtype` helper function for asof joins
@ -12,10 +13,10 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
from hashtable cimport *
def asof_join_int64_t_by_object(ndarray[int64_t] left_values,
ndarray[int64_t] right_values,
ndarray[object] left_by_values,
ndarray[object] right_by_values,
def asof_join_uint8_t_by_int64_t(ndarray[uint8_t] left_values,
ndarray[uint8_t] right_values,
ndarray[int64_t] left_by_values,
ndarray[int64_t] right_by_values,
bint allow_exact_matches=1,
tolerance=None):
@ -23,9 +24,9 @@ def asof_join_int64_t_by_object(ndarray[int64_t] left_values,
Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos
ndarray[int64_t] left_indexer, right_indexer
bint has_tolerance = 0
int64_t tolerance_
PyObjectHashTable hash_table
object by_value
uint8_t tolerance_
Int64HashTable hash_table
int64_t by_value
# if we are using tolerance, set our objects
if tolerance is not None:
@ -38,7 +39,7 @@ def asof_join_int64_t_by_object(ndarray[int64_t] left_values,
left_indexer = np.empty(left_size, dtype=np.int64)
right_indexer = np.empty(left_size, dtype=np.int64)
hash_table = PyObjectHashTable(right_size)
hash_table = Int64HashTable(right_size)
right_pos = 0
for left_pos in range(left_size):
@ -75,10 +76,10 @@ def asof_join_int64_t_by_object(ndarray[int64_t] left_values,
return left_indexer, right_indexer
def asof_join_double_by_object(ndarray[double] left_values,
ndarray[double] right_values,
ndarray[object] left_by_values,
ndarray[object] right_by_values,
def asof_join_uint16_t_by_int64_t(ndarray[uint16_t] left_values,
ndarray[uint16_t] right_values,
ndarray[int64_t] left_by_values,
ndarray[int64_t] right_by_values,
bint allow_exact_matches=1,
tolerance=None):
@ -86,9 +87,9 @@ def asof_join_double_by_object(ndarray[double] left_values,
Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos
ndarray[int64_t] left_indexer, right_indexer
bint has_tolerance = 0
double tolerance_
PyObjectHashTable hash_table
object by_value
uint16_t tolerance_
Int64HashTable hash_table
int64_t by_value
# if we are using tolerance, set our objects
if tolerance is not None:
@ -101,7 +102,322 @@ def asof_join_double_by_object(ndarray[double] left_values,
left_indexer = np.empty(left_size, dtype=np.int64)
right_indexer = np.empty(left_size, dtype=np.int64)
hash_table = PyObjectHashTable(right_size)
hash_table = Int64HashTable(right_size)
right_pos = 0
for left_pos in range(left_size):
# restart right_pos if it went negative in a previous iteration
if right_pos < 0:
right_pos = 0
# find last position in right whose value is less than left's value
if allow_exact_matches:
while right_pos < right_size and\
right_values[right_pos] <= left_values[left_pos]:
hash_table.set_item(right_by_values[right_pos], right_pos)
right_pos += 1
else:
while right_pos < right_size and\
right_values[right_pos] < left_values[left_pos]:
hash_table.set_item(right_by_values[right_pos], right_pos)
right_pos += 1
right_pos -= 1
# save positions as the desired index
by_value = left_by_values[left_pos]
found_right_pos = hash_table.get_item(by_value)\
if by_value in hash_table else -1
left_indexer[left_pos] = left_pos
right_indexer[left_pos] = found_right_pos
# if needed, verify that tolerance is met
if has_tolerance and found_right_pos != -1:
diff = left_values[left_pos] - right_values[found_right_pos]
if diff > tolerance_:
right_indexer[left_pos] = -1
return left_indexer, right_indexer
def asof_join_uint32_t_by_int64_t(ndarray[uint32_t] left_values,
                                  ndarray[uint32_t] right_values,
                                  ndarray[int64_t] left_by_values,
                                  ndarray[int64_t] right_by_values,
                                  bint allow_exact_matches=1,
                                  tolerance=None):
    """
    Backward as-of join of uint32_t values within int64_t ``by`` groups.

    For every left row, pick the latest right row scanned so far whose value
    is <= the left value (< when ``allow_exact_matches`` is false) and whose
    ``by`` key equals the left row's key.  The scan over the right side only
    moves forward, so results are meaningful only when both value arrays are
    sorted ascending (not verified here).

    Returns
    -------
    (left_indexer, right_indexer) : tuple of ndarray[int64]
        ``left_indexer[i] == i``; ``right_indexer[i]`` is the matched right
        position, or -1 when nothing matched or the gap exceeds ``tolerance``.
    """
    cdef:
        Py_ssize_t i, j, n_left, n_right, match
        ndarray[int64_t] left_indexer, right_indexer
        bint check_tolerance
        uint32_t max_gap
        Int64HashTable last_seen
        int64_t group

    # the tolerance is only coerced to the value type when one was supplied
    check_tolerance = tolerance is not None
    if check_tolerance:
        max_gap = tolerance

    n_left = len(left_values)
    n_right = len(right_values)

    left_indexer = np.empty(n_left, dtype=np.int64)
    right_indexer = np.empty(n_left, dtype=np.int64)

    # by-key -> most recent qualifying right position
    last_seen = Int64HashTable(n_right)

    j = 0
    for i in range(n_left):
        # the step-back below can leave j at -1; clamp before scanning
        if j < 0:
            j = 0

        # consume right rows that are not after the current left value
        if allow_exact_matches:
            while (j < n_right and
                   right_values[j] <= left_values[i]):
                last_seen.set_item(right_by_values[j], j)
                j += 1
        else:
            while (j < n_right and
                   right_values[j] < left_values[i]):
                last_seen.set_item(right_by_values[j], j)
                j += 1
        j -= 1

        # look up the latest right row sharing this left row's key
        group = left_by_values[i]
        if group in last_seen:
            match = last_seen.get_item(group)
        else:
            match = -1
        left_indexer[i] = i
        right_indexer[i] = match

        # discard matches that lie further back than the tolerance allows
        if check_tolerance and match != -1:
            gap = left_values[i] - right_values[match]
            if gap > max_gap:
                right_indexer[i] = -1

    return left_indexer, right_indexer
def asof_join_uint64_t_by_int64_t(ndarray[uint64_t] left_values,
                                  ndarray[uint64_t] right_values,
                                  ndarray[int64_t] left_by_values,
                                  ndarray[int64_t] right_by_values,
                                  bint allow_exact_matches=1,
                                  tolerance=None):
    """
    Backward as-of join of uint64_t values within int64_t ``by`` groups.

    For every left row, pick the latest right row scanned so far whose value
    is <= the left value (< when ``allow_exact_matches`` is false) and whose
    ``by`` key equals the left row's key.  The scan over the right side only
    moves forward, so results are meaningful only when both value arrays are
    sorted ascending (not verified here).

    Returns
    -------
    (left_indexer, right_indexer) : tuple of ndarray[int64]
        ``left_indexer[i] == i``; ``right_indexer[i]`` is the matched right
        position, or -1 when nothing matched or the gap exceeds ``tolerance``.
    """
    cdef:
        Py_ssize_t i, j, n_left, n_right, match
        ndarray[int64_t] left_indexer, right_indexer
        bint check_tolerance
        uint64_t max_gap
        Int64HashTable last_seen
        int64_t group

    # the tolerance is only coerced to the value type when one was supplied
    check_tolerance = tolerance is not None
    if check_tolerance:
        max_gap = tolerance

    n_left = len(left_values)
    n_right = len(right_values)

    left_indexer = np.empty(n_left, dtype=np.int64)
    right_indexer = np.empty(n_left, dtype=np.int64)

    # by-key -> most recent qualifying right position
    last_seen = Int64HashTable(n_right)

    j = 0
    for i in range(n_left):
        # the step-back below can leave j at -1; clamp before scanning
        if j < 0:
            j = 0

        # consume right rows that are not after the current left value
        if allow_exact_matches:
            while (j < n_right and
                   right_values[j] <= left_values[i]):
                last_seen.set_item(right_by_values[j], j)
                j += 1
        else:
            while (j < n_right and
                   right_values[j] < left_values[i]):
                last_seen.set_item(right_by_values[j], j)
                j += 1
        j -= 1

        # look up the latest right row sharing this left row's key
        group = left_by_values[i]
        if group in last_seen:
            match = last_seen.get_item(group)
        else:
            match = -1
        left_indexer[i] = i
        right_indexer[i] = match

        # discard matches that lie further back than the tolerance allows
        if check_tolerance and match != -1:
            gap = left_values[i] - right_values[match]
            if gap > max_gap:
                right_indexer[i] = -1

    return left_indexer, right_indexer
def asof_join_int8_t_by_int64_t(ndarray[int8_t] left_values,
                                ndarray[int8_t] right_values,
                                ndarray[int64_t] left_by_values,
                                ndarray[int64_t] right_by_values,
                                bint allow_exact_matches=1,
                                tolerance=None):
    """
    Backward as-of join of int8_t values within int64_t ``by`` groups.

    For every left row, pick the latest right row scanned so far whose value
    is <= the left value (< when ``allow_exact_matches`` is false) and whose
    ``by`` key equals the left row's key.  The scan over the right side only
    moves forward, so results are meaningful only when both value arrays are
    sorted ascending (not verified here).

    Returns
    -------
    (left_indexer, right_indexer) : tuple of ndarray[int64]
        ``left_indexer[i] == i``; ``right_indexer[i]`` is the matched right
        position, or -1 when nothing matched or the gap exceeds ``tolerance``.
    """
    cdef:
        Py_ssize_t i, j, n_left, n_right, match
        ndarray[int64_t] left_indexer, right_indexer
        bint check_tolerance
        int8_t max_gap
        Int64HashTable last_seen
        int64_t group

    # the tolerance is only coerced to the value type when one was supplied
    check_tolerance = tolerance is not None
    if check_tolerance:
        max_gap = tolerance

    n_left = len(left_values)
    n_right = len(right_values)

    left_indexer = np.empty(n_left, dtype=np.int64)
    right_indexer = np.empty(n_left, dtype=np.int64)

    # by-key -> most recent qualifying right position
    last_seen = Int64HashTable(n_right)

    j = 0
    for i in range(n_left):
        # the step-back below can leave j at -1; clamp before scanning
        if j < 0:
            j = 0

        # consume right rows that are not after the current left value
        if allow_exact_matches:
            while (j < n_right and
                   right_values[j] <= left_values[i]):
                last_seen.set_item(right_by_values[j], j)
                j += 1
        else:
            while (j < n_right and
                   right_values[j] < left_values[i]):
                last_seen.set_item(right_by_values[j], j)
                j += 1
        j -= 1

        # look up the latest right row sharing this left row's key
        group = left_by_values[i]
        if group in last_seen:
            match = last_seen.get_item(group)
        else:
            match = -1
        left_indexer[i] = i
        right_indexer[i] = match

        # discard matches that lie further back than the tolerance allows
        if check_tolerance and match != -1:
            gap = left_values[i] - right_values[match]
            if gap > max_gap:
                right_indexer[i] = -1

    return left_indexer, right_indexer
def asof_join_int16_t_by_int64_t(ndarray[int16_t] left_values,
                                 ndarray[int16_t] right_values,
                                 ndarray[int64_t] left_by_values,
                                 ndarray[int64_t] right_by_values,
                                 bint allow_exact_matches=1,
                                 tolerance=None):
    """
    Backward as-of join of int16_t values within int64_t ``by`` groups.

    For every left row, pick the latest right row scanned so far whose value
    is <= the left value (< when ``allow_exact_matches`` is false) and whose
    ``by`` key equals the left row's key.  The scan over the right side only
    moves forward, so results are meaningful only when both value arrays are
    sorted ascending (not verified here).

    Returns
    -------
    (left_indexer, right_indexer) : tuple of ndarray[int64]
        ``left_indexer[i] == i``; ``right_indexer[i]`` is the matched right
        position, or -1 when nothing matched or the gap exceeds ``tolerance``.
    """
    cdef:
        Py_ssize_t i, j, n_left, n_right, match
        ndarray[int64_t] left_indexer, right_indexer
        bint check_tolerance
        int16_t max_gap
        Int64HashTable last_seen
        int64_t group

    # the tolerance is only coerced to the value type when one was supplied
    check_tolerance = tolerance is not None
    if check_tolerance:
        max_gap = tolerance

    n_left = len(left_values)
    n_right = len(right_values)

    left_indexer = np.empty(n_left, dtype=np.int64)
    right_indexer = np.empty(n_left, dtype=np.int64)

    # by-key -> most recent qualifying right position
    last_seen = Int64HashTable(n_right)

    j = 0
    for i in range(n_left):
        # the step-back below can leave j at -1; clamp before scanning
        if j < 0:
            j = 0

        # consume right rows that are not after the current left value
        if allow_exact_matches:
            while (j < n_right and
                   right_values[j] <= left_values[i]):
                last_seen.set_item(right_by_values[j], j)
                j += 1
        else:
            while (j < n_right and
                   right_values[j] < left_values[i]):
                last_seen.set_item(right_by_values[j], j)
                j += 1
        j -= 1

        # look up the latest right row sharing this left row's key
        group = left_by_values[i]
        if group in last_seen:
            match = last_seen.get_item(group)
        else:
            match = -1
        left_indexer[i] = i
        right_indexer[i] = match

        # discard matches that lie further back than the tolerance allows
        if check_tolerance and match != -1:
            gap = left_values[i] - right_values[match]
            if gap > max_gap:
                right_indexer[i] = -1

    return left_indexer, right_indexer
def asof_join_int32_t_by_int64_t(ndarray[int32_t] left_values,
ndarray[int32_t] right_values,
ndarray[int64_t] left_by_values,
ndarray[int64_t] right_by_values,
bint allow_exact_matches=1,
tolerance=None):
cdef:
Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos
ndarray[int64_t] left_indexer, right_indexer
bint has_tolerance = 0
int32_t tolerance_
Int64HashTable hash_table
int64_t by_value
# if we are using tolerance, set our objects
if tolerance is not None:
has_tolerance = 1
tolerance_ = tolerance
left_size = len(left_values)
right_size = len(right_values)
left_indexer = np.empty(left_size, dtype=np.int64)
right_indexer = np.empty(left_size, dtype=np.int64)
hash_table = Int64HashTable(right_size)
right_pos = 0
for left_pos in range(left_size):
@ -201,6 +517,69 @@ def asof_join_int64_t_by_int64_t(ndarray[int64_t] left_values,
return left_indexer, right_indexer
def asof_join_float_by_int64_t(ndarray[float] left_values,
                               ndarray[float] right_values,
                               ndarray[int64_t] left_by_values,
                               ndarray[int64_t] right_by_values,
                               bint allow_exact_matches=1,
                               tolerance=None):
    """
    Backward as-of join of float values within int64_t ``by`` groups.

    For every left row, pick the latest right row scanned so far whose value
    is <= the left value (< when ``allow_exact_matches`` is false) and whose
    ``by`` key equals the left row's key.  The scan over the right side only
    moves forward, so results are meaningful only when both value arrays are
    sorted ascending (not verified here).

    Returns
    -------
    (left_indexer, right_indexer) : tuple of ndarray[int64]
        ``left_indexer[i] == i``; ``right_indexer[i]`` is the matched right
        position, or -1 when nothing matched or the gap exceeds ``tolerance``.
    """
    cdef:
        Py_ssize_t i, j, n_left, n_right, match
        ndarray[int64_t] left_indexer, right_indexer
        bint check_tolerance
        float max_gap
        Int64HashTable last_seen
        int64_t group

    # the tolerance is only coerced to the value type when one was supplied
    check_tolerance = tolerance is not None
    if check_tolerance:
        max_gap = tolerance

    n_left = len(left_values)
    n_right = len(right_values)

    left_indexer = np.empty(n_left, dtype=np.int64)
    right_indexer = np.empty(n_left, dtype=np.int64)

    # by-key -> most recent qualifying right position
    last_seen = Int64HashTable(n_right)

    j = 0
    for i in range(n_left):
        # the step-back below can leave j at -1; clamp before scanning
        if j < 0:
            j = 0

        # consume right rows that are not after the current left value
        if allow_exact_matches:
            while (j < n_right and
                   right_values[j] <= left_values[i]):
                last_seen.set_item(right_by_values[j], j)
                j += 1
        else:
            while (j < n_right and
                   right_values[j] < left_values[i]):
                last_seen.set_item(right_by_values[j], j)
                j += 1
        j -= 1

        # look up the latest right row sharing this left row's key
        group = left_by_values[i]
        if group in last_seen:
            match = last_seen.get_item(group)
        else:
            match = -1
        left_indexer[i] = i
        right_indexer[i] = match

        # discard matches that lie further back than the tolerance allows
        if check_tolerance and match != -1:
            gap = left_values[i] - right_values[match]
            if gap > max_gap:
                right_indexer[i] = -1

    return left_indexer, right_indexer
def asof_join_double_by_int64_t(ndarray[double] left_values,
ndarray[double] right_values,
ndarray[int64_t] left_by_values,
@ -264,11 +643,1005 @@ def asof_join_double_by_int64_t(ndarray[double] left_values,
return left_indexer, right_indexer
def asof_join_uint8_t_by_object(ndarray[uint8_t] left_values,
                                ndarray[uint8_t] right_values,
                                ndarray[object] left_by_values,
                                ndarray[object] right_by_values,
                                bint allow_exact_matches=1,
                                tolerance=None):
    """
    Backward as-of join of uint8_t values within object-valued ``by`` groups.

    For each left row, find the latest right row scanned so far whose value
    is <= the left value (< when ``allow_exact_matches`` is false) and whose
    ``by`` key equals the left row's key.  The scan over the right side only
    moves forward, so results are meaningful only when both value arrays are
    sorted ascending (not verified here).

    Parameters
    ----------
    left_values, right_values : ndarray[uint8_t]
    left_by_values, right_by_values : ndarray[object]
        Group key per row; keys are used as dict keys.
    allow_exact_matches : bint, default 1
    tolerance : optional
        Largest accepted ``left - right`` gap; stored as uint8_t.

    Returns
    -------
    (left_indexer, right_indexer) : tuple of ndarray[int64]
        ``left_indexer[i] == i``; ``right_indexer[i]`` is the matched right
        position, or -1 when nothing matched or the gap exceeds ``tolerance``.
    """
    cdef:
        Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos
        ndarray[int64_t] left_indexer, right_indexer
        bint has_tolerance = 0
        uint8_t tolerance_
        dict hash_table
        object by_value

    # if we are using tolerance, set our objects
    if tolerance is not None:
        has_tolerance = 1
        tolerance_ = tolerance

    left_size = len(left_values)
    right_size = len(right_values)

    left_indexer = np.empty(left_size, dtype=np.int64)
    right_indexer = np.empty(left_size, dtype=np.int64)

    # maps each by-key to the latest qualifying right position seen so far
    hash_table = {}

    right_pos = 0
    for left_pos in range(left_size):
        # restart right_pos if it went negative in a previous iteration
        if right_pos < 0:
            right_pos = 0

        # advance through right while its value is <= left's value
        # (strictly < when exact matches are not allowed)
        if allow_exact_matches:
            while (right_pos < right_size and
                   right_values[right_pos] <= left_values[left_pos]):
                hash_table[right_by_values[right_pos]] = right_pos
                right_pos += 1
        else:
            while (right_pos < right_size and
                   right_values[right_pos] < left_values[left_pos]):
                hash_table[right_by_values[right_pos]] = right_pos
                right_pos += 1
        # step back onto the last qualifying right row
        right_pos -= 1

        # save positions as the desired index; one dict lookup, with -1
        # meaning no right row has been seen for this key
        by_value = left_by_values[left_pos]
        found_right_pos = hash_table.get(by_value, -1)
        left_indexer[left_pos] = left_pos
        right_indexer[left_pos] = found_right_pos

        # if needed, verify that tolerance is met
        if has_tolerance and found_right_pos != -1:
            diff = left_values[left_pos] - right_values[found_right_pos]
            if diff > tolerance_:
                right_indexer[left_pos] = -1

    return left_indexer, right_indexer
def asof_join_uint16_t_by_object(ndarray[uint16_t] left_values,
                                 ndarray[uint16_t] right_values,
                                 ndarray[object] left_by_values,
                                 ndarray[object] right_by_values,
                                 bint allow_exact_matches=1,
                                 tolerance=None):
    """
    Backward as-of join of uint16_t values within object-valued ``by`` groups.

    For each left row, find the latest right row scanned so far whose value
    is <= the left value (< when ``allow_exact_matches`` is false) and whose
    ``by`` key equals the left row's key.  The scan over the right side only
    moves forward, so results are meaningful only when both value arrays are
    sorted ascending (not verified here).

    Parameters
    ----------
    left_values, right_values : ndarray[uint16_t]
    left_by_values, right_by_values : ndarray[object]
        Group key per row; keys are used as dict keys.
    allow_exact_matches : bint, default 1
    tolerance : optional
        Largest accepted ``left - right`` gap; stored as uint16_t.

    Returns
    -------
    (left_indexer, right_indexer) : tuple of ndarray[int64]
        ``left_indexer[i] == i``; ``right_indexer[i]`` is the matched right
        position, or -1 when nothing matched or the gap exceeds ``tolerance``.
    """
    cdef:
        Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos
        ndarray[int64_t] left_indexer, right_indexer
        bint has_tolerance = 0
        uint16_t tolerance_
        dict hash_table
        object by_value

    # if we are using tolerance, set our objects
    if tolerance is not None:
        has_tolerance = 1
        tolerance_ = tolerance

    left_size = len(left_values)
    right_size = len(right_values)

    left_indexer = np.empty(left_size, dtype=np.int64)
    right_indexer = np.empty(left_size, dtype=np.int64)

    # maps each by-key to the latest qualifying right position seen so far
    hash_table = {}

    right_pos = 0
    for left_pos in range(left_size):
        # restart right_pos if it went negative in a previous iteration
        if right_pos < 0:
            right_pos = 0

        # advance through right while its value is <= left's value
        # (strictly < when exact matches are not allowed)
        if allow_exact_matches:
            while (right_pos < right_size and
                   right_values[right_pos] <= left_values[left_pos]):
                hash_table[right_by_values[right_pos]] = right_pos
                right_pos += 1
        else:
            while (right_pos < right_size and
                   right_values[right_pos] < left_values[left_pos]):
                hash_table[right_by_values[right_pos]] = right_pos
                right_pos += 1
        # step back onto the last qualifying right row
        right_pos -= 1

        # save positions as the desired index; one dict lookup, with -1
        # meaning no right row has been seen for this key
        by_value = left_by_values[left_pos]
        found_right_pos = hash_table.get(by_value, -1)
        left_indexer[left_pos] = left_pos
        right_indexer[left_pos] = found_right_pos

        # if needed, verify that tolerance is met
        if has_tolerance and found_right_pos != -1:
            diff = left_values[left_pos] - right_values[found_right_pos]
            if diff > tolerance_:
                right_indexer[left_pos] = -1

    return left_indexer, right_indexer
def asof_join_uint32_t_by_object(ndarray[uint32_t] left_values,
                                 ndarray[uint32_t] right_values,
                                 ndarray[object] left_by_values,
                                 ndarray[object] right_by_values,
                                 bint allow_exact_matches=1,
                                 tolerance=None):
    """
    Backward as-of join of uint32_t values within object-valued ``by`` groups.

    For each left row, find the latest right row scanned so far whose value
    is <= the left value (< when ``allow_exact_matches`` is false) and whose
    ``by`` key equals the left row's key.  The scan over the right side only
    moves forward, so results are meaningful only when both value arrays are
    sorted ascending (not verified here).

    Parameters
    ----------
    left_values, right_values : ndarray[uint32_t]
    left_by_values, right_by_values : ndarray[object]
        Group key per row; keys are used as dict keys.
    allow_exact_matches : bint, default 1
    tolerance : optional
        Largest accepted ``left - right`` gap; stored as uint32_t.

    Returns
    -------
    (left_indexer, right_indexer) : tuple of ndarray[int64]
        ``left_indexer[i] == i``; ``right_indexer[i]`` is the matched right
        position, or -1 when nothing matched or the gap exceeds ``tolerance``.
    """
    cdef:
        Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos
        ndarray[int64_t] left_indexer, right_indexer
        bint has_tolerance = 0
        uint32_t tolerance_
        dict hash_table
        object by_value

    # if we are using tolerance, set our objects
    if tolerance is not None:
        has_tolerance = 1
        tolerance_ = tolerance

    left_size = len(left_values)
    right_size = len(right_values)

    left_indexer = np.empty(left_size, dtype=np.int64)
    right_indexer = np.empty(left_size, dtype=np.int64)

    # maps each by-key to the latest qualifying right position seen so far
    hash_table = {}

    right_pos = 0
    for left_pos in range(left_size):
        # restart right_pos if it went negative in a previous iteration
        if right_pos < 0:
            right_pos = 0

        # advance through right while its value is <= left's value
        # (strictly < when exact matches are not allowed)
        if allow_exact_matches:
            while (right_pos < right_size and
                   right_values[right_pos] <= left_values[left_pos]):
                hash_table[right_by_values[right_pos]] = right_pos
                right_pos += 1
        else:
            while (right_pos < right_size and
                   right_values[right_pos] < left_values[left_pos]):
                hash_table[right_by_values[right_pos]] = right_pos
                right_pos += 1
        # step back onto the last qualifying right row
        right_pos -= 1

        # save positions as the desired index; one dict lookup, with -1
        # meaning no right row has been seen for this key
        by_value = left_by_values[left_pos]
        found_right_pos = hash_table.get(by_value, -1)
        left_indexer[left_pos] = left_pos
        right_indexer[left_pos] = found_right_pos

        # if needed, verify that tolerance is met
        if has_tolerance and found_right_pos != -1:
            diff = left_values[left_pos] - right_values[found_right_pos]
            if diff > tolerance_:
                right_indexer[left_pos] = -1

    return left_indexer, right_indexer
def asof_join_uint64_t_by_object(ndarray[uint64_t] left_values,
                                 ndarray[uint64_t] right_values,
                                 ndarray[object] left_by_values,
                                 ndarray[object] right_by_values,
                                 bint allow_exact_matches=1,
                                 tolerance=None):
    """
    Backward as-of join of uint64_t values within object-valued ``by`` groups.

    For each left row, find the latest right row scanned so far whose value
    is <= the left value (< when ``allow_exact_matches`` is false) and whose
    ``by`` key equals the left row's key.  The scan over the right side only
    moves forward, so results are meaningful only when both value arrays are
    sorted ascending (not verified here).

    Parameters
    ----------
    left_values, right_values : ndarray[uint64_t]
    left_by_values, right_by_values : ndarray[object]
        Group key per row; keys are used as dict keys.
    allow_exact_matches : bint, default 1
    tolerance : optional
        Largest accepted ``left - right`` gap; stored as uint64_t.

    Returns
    -------
    (left_indexer, right_indexer) : tuple of ndarray[int64]
        ``left_indexer[i] == i``; ``right_indexer[i]`` is the matched right
        position, or -1 when nothing matched or the gap exceeds ``tolerance``.
    """
    cdef:
        Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos
        ndarray[int64_t] left_indexer, right_indexer
        bint has_tolerance = 0
        uint64_t tolerance_
        dict hash_table
        object by_value

    # if we are using tolerance, set our objects
    if tolerance is not None:
        has_tolerance = 1
        tolerance_ = tolerance

    left_size = len(left_values)
    right_size = len(right_values)

    left_indexer = np.empty(left_size, dtype=np.int64)
    right_indexer = np.empty(left_size, dtype=np.int64)

    # maps each by-key to the latest qualifying right position seen so far
    hash_table = {}

    right_pos = 0
    for left_pos in range(left_size):
        # restart right_pos if it went negative in a previous iteration
        if right_pos < 0:
            right_pos = 0

        # advance through right while its value is <= left's value
        # (strictly < when exact matches are not allowed)
        if allow_exact_matches:
            while (right_pos < right_size and
                   right_values[right_pos] <= left_values[left_pos]):
                hash_table[right_by_values[right_pos]] = right_pos
                right_pos += 1
        else:
            while (right_pos < right_size and
                   right_values[right_pos] < left_values[left_pos]):
                hash_table[right_by_values[right_pos]] = right_pos
                right_pos += 1
        # step back onto the last qualifying right row
        right_pos -= 1

        # save positions as the desired index; one dict lookup, with -1
        # meaning no right row has been seen for this key
        by_value = left_by_values[left_pos]
        found_right_pos = hash_table.get(by_value, -1)
        left_indexer[left_pos] = left_pos
        right_indexer[left_pos] = found_right_pos

        # if needed, verify that tolerance is met
        if has_tolerance and found_right_pos != -1:
            diff = left_values[left_pos] - right_values[found_right_pos]
            if diff > tolerance_:
                right_indexer[left_pos] = -1

    return left_indexer, right_indexer
def asof_join_int8_t_by_object(ndarray[int8_t] left_values,
                               ndarray[int8_t] right_values,
                               ndarray[object] left_by_values,
                               ndarray[object] right_by_values,
                               bint allow_exact_matches=1,
                               tolerance=None):
    """
    Backward as-of join of int8_t values within object-valued ``by`` groups.

    For each left row, find the latest right row scanned so far whose value
    is <= the left value (< when ``allow_exact_matches`` is false) and whose
    ``by`` key equals the left row's key.  The scan over the right side only
    moves forward, so results are meaningful only when both value arrays are
    sorted ascending (not verified here).

    Parameters
    ----------
    left_values, right_values : ndarray[int8_t]
    left_by_values, right_by_values : ndarray[object]
        Group key per row; keys are used as dict keys.
    allow_exact_matches : bint, default 1
    tolerance : optional
        Largest accepted ``left - right`` gap; stored as int8_t.

    Returns
    -------
    (left_indexer, right_indexer) : tuple of ndarray[int64]
        ``left_indexer[i] == i``; ``right_indexer[i]`` is the matched right
        position, or -1 when nothing matched or the gap exceeds ``tolerance``.
    """
    cdef:
        Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos
        ndarray[int64_t] left_indexer, right_indexer
        bint has_tolerance = 0
        int8_t tolerance_
        dict hash_table
        object by_value

    # if we are using tolerance, set our objects
    if tolerance is not None:
        has_tolerance = 1
        tolerance_ = tolerance

    left_size = len(left_values)
    right_size = len(right_values)

    left_indexer = np.empty(left_size, dtype=np.int64)
    right_indexer = np.empty(left_size, dtype=np.int64)

    # maps each by-key to the latest qualifying right position seen so far
    hash_table = {}

    right_pos = 0
    for left_pos in range(left_size):
        # restart right_pos if it went negative in a previous iteration
        if right_pos < 0:
            right_pos = 0

        # advance through right while its value is <= left's value
        # (strictly < when exact matches are not allowed)
        if allow_exact_matches:
            while (right_pos < right_size and
                   right_values[right_pos] <= left_values[left_pos]):
                hash_table[right_by_values[right_pos]] = right_pos
                right_pos += 1
        else:
            while (right_pos < right_size and
                   right_values[right_pos] < left_values[left_pos]):
                hash_table[right_by_values[right_pos]] = right_pos
                right_pos += 1
        # step back onto the last qualifying right row
        right_pos -= 1

        # save positions as the desired index; one dict lookup, with -1
        # meaning no right row has been seen for this key
        by_value = left_by_values[left_pos]
        found_right_pos = hash_table.get(by_value, -1)
        left_indexer[left_pos] = left_pos
        right_indexer[left_pos] = found_right_pos

        # if needed, verify that tolerance is met
        if has_tolerance and found_right_pos != -1:
            diff = left_values[left_pos] - right_values[found_right_pos]
            if diff > tolerance_:
                right_indexer[left_pos] = -1

    return left_indexer, right_indexer
def asof_join_int16_t_by_object(ndarray[int16_t] left_values,
                                ndarray[int16_t] right_values,
                                ndarray[object] left_by_values,
                                ndarray[object] right_by_values,
                                bint allow_exact_matches=1,
                                tolerance=None):
    """
    Backward as-of join of int16_t values within object-valued ``by`` groups.

    For each left row, find the latest right row scanned so far whose value
    is <= the left value (< when ``allow_exact_matches`` is false) and whose
    ``by`` key equals the left row's key.  The scan over the right side only
    moves forward, so results are meaningful only when both value arrays are
    sorted ascending (not verified here).

    Parameters
    ----------
    left_values, right_values : ndarray[int16_t]
    left_by_values, right_by_values : ndarray[object]
        Group key per row; keys are used as dict keys.
    allow_exact_matches : bint, default 1
    tolerance : optional
        Largest accepted ``left - right`` gap; stored as int16_t.

    Returns
    -------
    (left_indexer, right_indexer) : tuple of ndarray[int64]
        ``left_indexer[i] == i``; ``right_indexer[i]`` is the matched right
        position, or -1 when nothing matched or the gap exceeds ``tolerance``.
    """
    cdef:
        Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos
        ndarray[int64_t] left_indexer, right_indexer
        bint has_tolerance = 0
        int16_t tolerance_
        dict hash_table
        object by_value

    # if we are using tolerance, set our objects
    if tolerance is not None:
        has_tolerance = 1
        tolerance_ = tolerance

    left_size = len(left_values)
    right_size = len(right_values)

    left_indexer = np.empty(left_size, dtype=np.int64)
    right_indexer = np.empty(left_size, dtype=np.int64)

    # maps each by-key to the latest qualifying right position seen so far
    hash_table = {}

    right_pos = 0
    for left_pos in range(left_size):
        # restart right_pos if it went negative in a previous iteration
        if right_pos < 0:
            right_pos = 0

        # advance through right while its value is <= left's value
        # (strictly < when exact matches are not allowed)
        if allow_exact_matches:
            while (right_pos < right_size and
                   right_values[right_pos] <= left_values[left_pos]):
                hash_table[right_by_values[right_pos]] = right_pos
                right_pos += 1
        else:
            while (right_pos < right_size and
                   right_values[right_pos] < left_values[left_pos]):
                hash_table[right_by_values[right_pos]] = right_pos
                right_pos += 1
        # step back onto the last qualifying right row
        right_pos -= 1

        # save positions as the desired index; one dict lookup, with -1
        # meaning no right row has been seen for this key
        by_value = left_by_values[left_pos]
        found_right_pos = hash_table.get(by_value, -1)
        left_indexer[left_pos] = left_pos
        right_indexer[left_pos] = found_right_pos

        # if needed, verify that tolerance is met
        if has_tolerance and found_right_pos != -1:
            diff = left_values[left_pos] - right_values[found_right_pos]
            if diff > tolerance_:
                right_indexer[left_pos] = -1

    return left_indexer, right_indexer
def asof_join_int32_t_by_object(ndarray[int32_t] left_values,
                                ndarray[int32_t] right_values,
                                ndarray[object] left_by_values,
                                ndarray[object] right_by_values,
                                bint allow_exact_matches=1,
                                tolerance=None):
    """
    Backward as-of join of int32_t values within object-valued ``by`` groups.

    For each left row, find the latest right row scanned so far whose value
    is <= the left value (< when ``allow_exact_matches`` is false) and whose
    ``by`` key equals the left row's key.  The scan over the right side only
    moves forward, so results are meaningful only when both value arrays are
    sorted ascending (not verified here).

    Parameters
    ----------
    left_values, right_values : ndarray[int32_t]
    left_by_values, right_by_values : ndarray[object]
        Group key per row; keys are used as dict keys.
    allow_exact_matches : bint, default 1
    tolerance : optional
        Largest accepted ``left - right`` gap; stored as int32_t.

    Returns
    -------
    (left_indexer, right_indexer) : tuple of ndarray[int64]
        ``left_indexer[i] == i``; ``right_indexer[i]`` is the matched right
        position, or -1 when nothing matched or the gap exceeds ``tolerance``.
    """
    cdef:
        Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos
        ndarray[int64_t] left_indexer, right_indexer
        bint has_tolerance = 0
        int32_t tolerance_
        dict hash_table
        object by_value

    # if we are using tolerance, set our objects
    if tolerance is not None:
        has_tolerance = 1
        tolerance_ = tolerance

    left_size = len(left_values)
    right_size = len(right_values)

    left_indexer = np.empty(left_size, dtype=np.int64)
    right_indexer = np.empty(left_size, dtype=np.int64)

    # maps each by-key to the latest qualifying right position seen so far
    hash_table = {}

    right_pos = 0
    for left_pos in range(left_size):
        # restart right_pos if it went negative in a previous iteration
        if right_pos < 0:
            right_pos = 0

        # advance through right while its value is <= left's value
        # (strictly < when exact matches are not allowed)
        if allow_exact_matches:
            while (right_pos < right_size and
                   right_values[right_pos] <= left_values[left_pos]):
                hash_table[right_by_values[right_pos]] = right_pos
                right_pos += 1
        else:
            while (right_pos < right_size and
                   right_values[right_pos] < left_values[left_pos]):
                hash_table[right_by_values[right_pos]] = right_pos
                right_pos += 1
        # step back onto the last qualifying right row
        right_pos -= 1

        # save positions as the desired index; one dict lookup, with -1
        # meaning no right row has been seen for this key
        by_value = left_by_values[left_pos]
        found_right_pos = hash_table.get(by_value, -1)
        left_indexer[left_pos] = left_pos
        right_indexer[left_pos] = found_right_pos

        # if needed, verify that tolerance is met
        if has_tolerance and found_right_pos != -1:
            diff = left_values[left_pos] - right_values[found_right_pos]
            if diff > tolerance_:
                right_indexer[left_pos] = -1

    return left_indexer, right_indexer
def asof_join_int64_t_by_object(ndarray[int64_t] left_values,
                                ndarray[int64_t] right_values,
                                ndarray[object] left_by_values,
                                ndarray[object] right_by_values,
                                bint allow_exact_matches=1,
                                tolerance=None):
    """
    Backward as-of join of int64_t values within object-valued ``by`` groups.

    For each left row, find the latest right row scanned so far whose value
    is <= the left value (< when ``allow_exact_matches`` is false) and whose
    ``by`` key equals the left row's key.  The scan over the right side only
    moves forward, so results are meaningful only when both value arrays are
    sorted ascending (not verified here).

    Parameters
    ----------
    left_values, right_values : ndarray[int64_t]
    left_by_values, right_by_values : ndarray[object]
        Group key per row; keys are used as dict keys.
    allow_exact_matches : bint, default 1
    tolerance : optional
        Largest accepted ``left - right`` gap; stored as int64_t.

    Returns
    -------
    (left_indexer, right_indexer) : tuple of ndarray[int64]
        ``left_indexer[i] == i``; ``right_indexer[i]`` is the matched right
        position, or -1 when nothing matched or the gap exceeds ``tolerance``.
    """
    cdef:
        Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos
        ndarray[int64_t] left_indexer, right_indexer
        bint has_tolerance = 0
        int64_t tolerance_
        dict hash_table
        object by_value

    # if we are using tolerance, set our objects
    if tolerance is not None:
        has_tolerance = 1
        tolerance_ = tolerance

    left_size = len(left_values)
    right_size = len(right_values)

    left_indexer = np.empty(left_size, dtype=np.int64)
    right_indexer = np.empty(left_size, dtype=np.int64)

    # maps each by-key to the latest qualifying right position seen so far
    hash_table = {}

    right_pos = 0
    for left_pos in range(left_size):
        # restart right_pos if it went negative in a previous iteration
        if right_pos < 0:
            right_pos = 0

        # advance through right while its value is <= left's value
        # (strictly < when exact matches are not allowed)
        if allow_exact_matches:
            while (right_pos < right_size and
                   right_values[right_pos] <= left_values[left_pos]):
                hash_table[right_by_values[right_pos]] = right_pos
                right_pos += 1
        else:
            while (right_pos < right_size and
                   right_values[right_pos] < left_values[left_pos]):
                hash_table[right_by_values[right_pos]] = right_pos
                right_pos += 1
        # step back onto the last qualifying right row
        right_pos -= 1

        # save positions as the desired index; one dict lookup, with -1
        # meaning no right row has been seen for this key
        by_value = left_by_values[left_pos]
        found_right_pos = hash_table.get(by_value, -1)
        left_indexer[left_pos] = left_pos
        right_indexer[left_pos] = found_right_pos

        # if needed, verify that tolerance is met
        if has_tolerance and found_right_pos != -1:
            diff = left_values[left_pos] - right_values[found_right_pos]
            if diff > tolerance_:
                right_indexer[left_pos] = -1

    return left_indexer, right_indexer
def asof_join_float_by_object(ndarray[float] left_values,
                              ndarray[float] right_values,
                              ndarray[object] left_by_values,
                              ndarray[object] right_by_values,
                              bint allow_exact_matches=1,
                              tolerance=None):
    """
    Backward as-of join of float values within object-valued ``by`` groups.

    For each left row, find the latest right row scanned so far whose value
    is <= the left value (< when ``allow_exact_matches`` is false) and whose
    ``by`` key equals the left row's key.  The scan over the right side only
    moves forward, so results are meaningful only when both value arrays are
    sorted ascending (not verified here).

    Parameters
    ----------
    left_values, right_values : ndarray[float]
    left_by_values, right_by_values : ndarray[object]
        Group key per row; keys are used as dict keys.
    allow_exact_matches : bint, default 1
    tolerance : optional
        Largest accepted ``left - right`` gap; stored as a C float.

    Returns
    -------
    (left_indexer, right_indexer) : tuple of ndarray[int64]
        ``left_indexer[i] == i``; ``right_indexer[i]`` is the matched right
        position, or -1 when nothing matched or the gap exceeds ``tolerance``.
    """
    cdef:
        Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos
        ndarray[int64_t] left_indexer, right_indexer
        bint has_tolerance = 0
        float tolerance_
        dict hash_table
        object by_value

    # if we are using tolerance, set our objects
    if tolerance is not None:
        has_tolerance = 1
        tolerance_ = tolerance

    left_size = len(left_values)
    right_size = len(right_values)

    left_indexer = np.empty(left_size, dtype=np.int64)
    right_indexer = np.empty(left_size, dtype=np.int64)

    # maps each by-key to the latest qualifying right position seen so far
    hash_table = {}

    right_pos = 0
    for left_pos in range(left_size):
        # restart right_pos if it went negative in a previous iteration
        if right_pos < 0:
            right_pos = 0

        # advance through right while its value is <= left's value
        # (strictly < when exact matches are not allowed)
        if allow_exact_matches:
            while (right_pos < right_size and
                   right_values[right_pos] <= left_values[left_pos]):
                hash_table[right_by_values[right_pos]] = right_pos
                right_pos += 1
        else:
            while (right_pos < right_size and
                   right_values[right_pos] < left_values[left_pos]):
                hash_table[right_by_values[right_pos]] = right_pos
                right_pos += 1
        # step back onto the last qualifying right row
        right_pos -= 1

        # save positions as the desired index; one dict lookup, with -1
        # meaning no right row has been seen for this key
        by_value = left_by_values[left_pos]
        found_right_pos = hash_table.get(by_value, -1)
        left_indexer[left_pos] = left_pos
        right_indexer[left_pos] = found_right_pos

        # if needed, verify that tolerance is met
        if has_tolerance and found_right_pos != -1:
            diff = left_values[left_pos] - right_values[found_right_pos]
            if diff > tolerance_:
                right_indexer[left_pos] = -1

    return left_indexer, right_indexer
def asof_join_double_by_object(ndarray[double] left_values,
                               ndarray[double] right_values,
                               ndarray[object] left_by_values,
                               ndarray[object] right_by_values,
                               bint allow_exact_matches=1,
                               tolerance=None):
    """
    Backward as-of join of double values within object-valued ``by`` groups.

    For each left row, find the latest right row scanned so far whose value
    is <= the left value (< when ``allow_exact_matches`` is false) and whose
    ``by`` key equals the left row's key.  The scan over the right side only
    moves forward, so results are meaningful only when both value arrays are
    sorted ascending (not verified here).

    Parameters
    ----------
    left_values, right_values : ndarray[double]
    left_by_values, right_by_values : ndarray[object]
        Group key per row; keys are used as dict keys.
    allow_exact_matches : bint, default 1
    tolerance : optional
        Largest accepted ``left - right`` gap; stored as a C double.

    Returns
    -------
    (left_indexer, right_indexer) : tuple of ndarray[int64]
        ``left_indexer[i] == i``; ``right_indexer[i]`` is the matched right
        position, or -1 when nothing matched or the gap exceeds ``tolerance``.
    """
    cdef:
        Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos
        ndarray[int64_t] left_indexer, right_indexer
        bint has_tolerance = 0
        double tolerance_
        dict hash_table
        object by_value

    # if we are using tolerance, set our objects
    if tolerance is not None:
        has_tolerance = 1
        tolerance_ = tolerance

    left_size = len(left_values)
    right_size = len(right_values)

    left_indexer = np.empty(left_size, dtype=np.int64)
    right_indexer = np.empty(left_size, dtype=np.int64)

    # maps each by-key to the latest qualifying right position seen so far
    hash_table = {}

    right_pos = 0
    for left_pos in range(left_size):
        # restart right_pos if it went negative in a previous iteration
        if right_pos < 0:
            right_pos = 0

        # advance through right while its value is <= left's value
        # (strictly < when exact matches are not allowed)
        if allow_exact_matches:
            while (right_pos < right_size and
                   right_values[right_pos] <= left_values[left_pos]):
                hash_table[right_by_values[right_pos]] = right_pos
                right_pos += 1
        else:
            while (right_pos < right_size and
                   right_values[right_pos] < left_values[left_pos]):
                hash_table[right_by_values[right_pos]] = right_pos
                right_pos += 1
        # step back onto the last qualifying right row
        right_pos -= 1

        # save positions as the desired index; one dict lookup, with -1
        # meaning no right row has been seen for this key
        by_value = left_by_values[left_pos]
        found_right_pos = hash_table.get(by_value, -1)
        left_indexer[left_pos] = left_pos
        right_indexer[left_pos] = found_right_pos

        # if needed, verify that tolerance is met
        if has_tolerance and found_right_pos != -1:
            diff = left_values[left_pos] - right_values[found_right_pos]
            if diff > tolerance_:
                right_indexer[left_pos] = -1

    return left_indexer, right_indexer
#----------------------------------------------------------------------
# asof_join
#----------------------------------------------------------------------
def asof_join_uint8_t(ndarray[uint8_t] left_values,
                      ndarray[uint8_t] right_values,
                      bint allow_exact_matches=1,
                      tolerance=None):
    """
    Backward as-of join on uint8 keys.

    For every left key, find the position of the last right key that does
    not exceed it (or is strictly smaller when exact matches are
    disallowed).

    Parameters
    ----------
    left_values, right_values : ndarray[uint8_t]
        Join keys.  NOTE(review): right_values is walked with a single
        forward cursor, so both arrays are presumably sorted ascending by
        the caller -- confirm.
    allow_exact_matches : bint, default 1
        When true a right key equal to the left key qualifies; otherwise
        only strictly smaller right keys do.
    tolerance : optional
        When not None it is coerced to uint8_t; a match whose key lies
        more than this far below the left key is replaced by -1.

    Returns
    -------
    tuple of two int64 ndarrays, each of length len(left_values)
        The first holds 0 .. len(left_values) - 1.  The second holds the
        matched position in right_values, or -1 where nothing matched.
    """
    cdef:
        Py_ssize_t i, j, n_left, n_right, match
        ndarray[int64_t] left_indexer, right_indexer
        bint check_tolerance
        uint8_t max_gap
        uint8_t target

    check_tolerance = tolerance is not None
    if check_tolerance:
        max_gap = tolerance

    n_left = len(left_values)
    n_right = len(right_values)

    # every left row is kept, in its original order
    left_indexer = np.arange(n_left, dtype=np.int64)
    right_indexer = np.empty(n_left, dtype=np.int64)

    j = 0
    for i in range(n_left):
        target = left_values[i]

        # the previous row may have stepped the cursor back to -1
        if j < 0:
            j = 0

        # move the cursor one past the last qualifying right row ...
        if allow_exact_matches:
            while j < n_right and right_values[j] <= target:
                j += 1
        else:
            while j < n_right and right_values[j] < target:
                j += 1
        # ... then step back onto it (-1 when no row qualified)
        j -= 1

        match = j

        # drop a match that lies further back than the tolerance permits
        if check_tolerance and match != -1:
            gap = target - right_values[match]
            if gap > max_gap:
                match = -1

        right_indexer[i] = match

    return left_indexer, right_indexer
def asof_join_uint16_t(ndarray[uint16_t] left_values,
                       ndarray[uint16_t] right_values,
                       bint allow_exact_matches=1,
                       tolerance=None):
    """
    Backward as-of join on uint16 keys.

    For every left key, find the position of the last right key that does
    not exceed it (or is strictly smaller when exact matches are
    disallowed).

    Parameters
    ----------
    left_values, right_values : ndarray[uint16_t]
        Join keys.  NOTE(review): right_values is walked with a single
        forward cursor, so both arrays are presumably sorted ascending by
        the caller -- confirm.
    allow_exact_matches : bint, default 1
        When true a right key equal to the left key qualifies; otherwise
        only strictly smaller right keys do.
    tolerance : optional
        When not None it is coerced to uint16_t; a match whose key lies
        more than this far below the left key is replaced by -1.

    Returns
    -------
    tuple of two int64 ndarrays, each of length len(left_values)
        The first holds 0 .. len(left_values) - 1.  The second holds the
        matched position in right_values, or -1 where nothing matched.
    """
    cdef:
        Py_ssize_t i, j, n_left, n_right, match
        ndarray[int64_t] left_indexer, right_indexer
        bint check_tolerance
        uint16_t max_gap
        uint16_t target

    check_tolerance = tolerance is not None
    if check_tolerance:
        max_gap = tolerance

    n_left = len(left_values)
    n_right = len(right_values)

    # every left row is kept, in its original order
    left_indexer = np.arange(n_left, dtype=np.int64)
    right_indexer = np.empty(n_left, dtype=np.int64)

    j = 0
    for i in range(n_left):
        target = left_values[i]

        # the previous row may have stepped the cursor back to -1
        if j < 0:
            j = 0

        # move the cursor one past the last qualifying right row ...
        if allow_exact_matches:
            while j < n_right and right_values[j] <= target:
                j += 1
        else:
            while j < n_right and right_values[j] < target:
                j += 1
        # ... then step back onto it (-1 when no row qualified)
        j -= 1

        match = j

        # drop a match that lies further back than the tolerance permits
        if check_tolerance and match != -1:
            gap = target - right_values[match]
            if gap > max_gap:
                match = -1

        right_indexer[i] = match

    return left_indexer, right_indexer
def asof_join_uint32_t(ndarray[uint32_t] left_values,
                       ndarray[uint32_t] right_values,
                       bint allow_exact_matches=1,
                       tolerance=None):
    """
    Backward as-of join on uint32 keys.

    For every left key, find the position of the last right key that does
    not exceed it (or is strictly smaller when exact matches are
    disallowed).

    Parameters
    ----------
    left_values, right_values : ndarray[uint32_t]
        Join keys.  NOTE(review): right_values is walked with a single
        forward cursor, so both arrays are presumably sorted ascending by
        the caller -- confirm.
    allow_exact_matches : bint, default 1
        When true a right key equal to the left key qualifies; otherwise
        only strictly smaller right keys do.
    tolerance : optional
        When not None it is coerced to uint32_t; a match whose key lies
        more than this far below the left key is replaced by -1.

    Returns
    -------
    tuple of two int64 ndarrays, each of length len(left_values)
        The first holds 0 .. len(left_values) - 1.  The second holds the
        matched position in right_values, or -1 where nothing matched.
    """
    cdef:
        Py_ssize_t i, j, n_left, n_right, match
        ndarray[int64_t] left_indexer, right_indexer
        bint check_tolerance
        uint32_t max_gap
        uint32_t target

    check_tolerance = tolerance is not None
    if check_tolerance:
        max_gap = tolerance

    n_left = len(left_values)
    n_right = len(right_values)

    # every left row is kept, in its original order
    left_indexer = np.arange(n_left, dtype=np.int64)
    right_indexer = np.empty(n_left, dtype=np.int64)

    j = 0
    for i in range(n_left):
        target = left_values[i]

        # the previous row may have stepped the cursor back to -1
        if j < 0:
            j = 0

        # move the cursor one past the last qualifying right row ...
        if allow_exact_matches:
            while j < n_right and right_values[j] <= target:
                j += 1
        else:
            while j < n_right and right_values[j] < target:
                j += 1
        # ... then step back onto it (-1 when no row qualified)
        j -= 1

        match = j

        # drop a match that lies further back than the tolerance permits
        if check_tolerance and match != -1:
            gap = target - right_values[match]
            if gap > max_gap:
                match = -1

        right_indexer[i] = match

    return left_indexer, right_indexer
def asof_join_uint64_t(ndarray[uint64_t] left_values,
                       ndarray[uint64_t] right_values,
                       bint allow_exact_matches=1,
                       tolerance=None):
    """
    Backward as-of join on uint64 keys.

    For every left key, find the position of the last right key that does
    not exceed it (or is strictly smaller when exact matches are
    disallowed).

    Parameters
    ----------
    left_values, right_values : ndarray[uint64_t]
        Join keys.  NOTE(review): right_values is walked with a single
        forward cursor, so both arrays are presumably sorted ascending by
        the caller -- confirm.
    allow_exact_matches : bint, default 1
        When true a right key equal to the left key qualifies; otherwise
        only strictly smaller right keys do.
    tolerance : optional
        When not None it is coerced to uint64_t; a match whose key lies
        more than this far below the left key is replaced by -1.

    Returns
    -------
    tuple of two int64 ndarrays, each of length len(left_values)
        The first holds 0 .. len(left_values) - 1.  The second holds the
        matched position in right_values, or -1 where nothing matched.
    """
    cdef:
        Py_ssize_t i, j, n_left, n_right, match
        ndarray[int64_t] left_indexer, right_indexer
        bint check_tolerance
        uint64_t max_gap
        uint64_t target

    check_tolerance = tolerance is not None
    if check_tolerance:
        max_gap = tolerance

    n_left = len(left_values)
    n_right = len(right_values)

    # every left row is kept, in its original order
    left_indexer = np.arange(n_left, dtype=np.int64)
    right_indexer = np.empty(n_left, dtype=np.int64)

    j = 0
    for i in range(n_left):
        target = left_values[i]

        # the previous row may have stepped the cursor back to -1
        if j < 0:
            j = 0

        # move the cursor one past the last qualifying right row ...
        if allow_exact_matches:
            while j < n_right and right_values[j] <= target:
                j += 1
        else:
            while j < n_right and right_values[j] < target:
                j += 1
        # ... then step back onto it (-1 when no row qualified)
        j -= 1

        match = j

        # drop a match that lies further back than the tolerance permits
        if check_tolerance and match != -1:
            gap = target - right_values[match]
            if gap > max_gap:
                match = -1

        right_indexer[i] = match

    return left_indexer, right_indexer
def asof_join_int8_t(ndarray[int8_t] left_values,
                     ndarray[int8_t] right_values,
                     bint allow_exact_matches=1,
                     tolerance=None):
    """
    Backward as-of join on int8 keys.

    For every left key, find the position of the last right key that does
    not exceed it (or is strictly smaller when exact matches are
    disallowed).

    Parameters
    ----------
    left_values, right_values : ndarray[int8_t]
        Join keys.  NOTE(review): right_values is walked with a single
        forward cursor, so both arrays are presumably sorted ascending by
        the caller -- confirm.
    allow_exact_matches : bint, default 1
        When true a right key equal to the left key qualifies; otherwise
        only strictly smaller right keys do.
    tolerance : optional
        When not None it is coerced to int8_t; a match whose key lies
        more than this far below the left key is replaced by -1.

    Returns
    -------
    tuple of two int64 ndarrays, each of length len(left_values)
        The first holds 0 .. len(left_values) - 1.  The second holds the
        matched position in right_values, or -1 where nothing matched.
    """
    cdef:
        Py_ssize_t i, j, n_left, n_right, match
        ndarray[int64_t] left_indexer, right_indexer
        bint check_tolerance
        int8_t max_gap
        int8_t target

    check_tolerance = tolerance is not None
    if check_tolerance:
        max_gap = tolerance

    n_left = len(left_values)
    n_right = len(right_values)

    # every left row is kept, in its original order
    left_indexer = np.arange(n_left, dtype=np.int64)
    right_indexer = np.empty(n_left, dtype=np.int64)

    j = 0
    for i in range(n_left):
        target = left_values[i]

        # the previous row may have stepped the cursor back to -1
        if j < 0:
            j = 0

        # move the cursor one past the last qualifying right row ...
        if allow_exact_matches:
            while j < n_right and right_values[j] <= target:
                j += 1
        else:
            while j < n_right and right_values[j] < target:
                j += 1
        # ... then step back onto it (-1 when no row qualified)
        j -= 1

        match = j

        # drop a match that lies further back than the tolerance permits
        if check_tolerance and match != -1:
            gap = target - right_values[match]
            if gap > max_gap:
                match = -1

        right_indexer[i] = match

    return left_indexer, right_indexer
def asof_join_int16_t(ndarray[int16_t] left_values,
                      ndarray[int16_t] right_values,
                      bint allow_exact_matches=1,
                      tolerance=None):
    """
    Backward as-of join on int16 keys.

    For every left key, find the position of the last right key that does
    not exceed it (or is strictly smaller when exact matches are
    disallowed).

    Parameters
    ----------
    left_values, right_values : ndarray[int16_t]
        Join keys.  NOTE(review): right_values is walked with a single
        forward cursor, so both arrays are presumably sorted ascending by
        the caller -- confirm.
    allow_exact_matches : bint, default 1
        When true a right key equal to the left key qualifies; otherwise
        only strictly smaller right keys do.
    tolerance : optional
        When not None it is coerced to int16_t; a match whose key lies
        more than this far below the left key is replaced by -1.

    Returns
    -------
    tuple of two int64 ndarrays, each of length len(left_values)
        The first holds 0 .. len(left_values) - 1.  The second holds the
        matched position in right_values, or -1 where nothing matched.
    """
    cdef:
        Py_ssize_t i, j, n_left, n_right, match
        ndarray[int64_t] left_indexer, right_indexer
        bint check_tolerance
        int16_t max_gap
        int16_t target

    check_tolerance = tolerance is not None
    if check_tolerance:
        max_gap = tolerance

    n_left = len(left_values)
    n_right = len(right_values)

    # every left row is kept, in its original order
    left_indexer = np.arange(n_left, dtype=np.int64)
    right_indexer = np.empty(n_left, dtype=np.int64)

    j = 0
    for i in range(n_left):
        target = left_values[i]

        # the previous row may have stepped the cursor back to -1
        if j < 0:
            j = 0

        # move the cursor one past the last qualifying right row ...
        if allow_exact_matches:
            while j < n_right and right_values[j] <= target:
                j += 1
        else:
            while j < n_right and right_values[j] < target:
                j += 1
        # ... then step back onto it (-1 when no row qualified)
        j -= 1

        match = j

        # drop a match that lies further back than the tolerance permits
        if check_tolerance and match != -1:
            gap = target - right_values[match]
            if gap > max_gap:
                match = -1

        right_indexer[i] = match

    return left_indexer, right_indexer
def asof_join_int32_t(ndarray[int32_t] left_values,
                      ndarray[int32_t] right_values,
                      bint allow_exact_matches=1,
                      tolerance=None):
    """
    Backward as-of join on int32 keys.

    For every left key, find the position of the last right key that does
    not exceed it (or is strictly smaller when exact matches are
    disallowed).

    Parameters
    ----------
    left_values, right_values : ndarray[int32_t]
        Join keys.  NOTE(review): right_values is walked with a single
        forward cursor, so both arrays are presumably sorted ascending by
        the caller -- confirm.
    allow_exact_matches : bint, default 1
        When true a right key equal to the left key qualifies; otherwise
        only strictly smaller right keys do.
    tolerance : optional
        When not None it is coerced to int32_t; a match whose key lies
        more than this far below the left key is replaced by -1.

    Returns
    -------
    tuple of two int64 ndarrays, each of length len(left_values)
        The first holds 0 .. len(left_values) - 1.  The second holds the
        matched position in right_values, or -1 where nothing matched.
    """
    cdef:
        Py_ssize_t left_pos, right_pos, left_size, right_size
        ndarray[int64_t] left_indexer, right_indexer
        bint has_tolerance = 0
        int32_t tolerance_
        # 64-bit on purpose: the distance between two int32 keys can be as
        # large as 2**32 - 1, which a 32-bit subtraction would wrap negative
        int64_t diff

    # if we are using tolerance, set our objects
    if tolerance is not None:
        has_tolerance = 1
        tolerance_ = tolerance

    left_size = len(left_values)
    right_size = len(right_values)

    left_indexer = np.empty(left_size, dtype=np.int64)
    right_indexer = np.empty(left_size, dtype=np.int64)

    right_pos = 0
    for left_pos in range(left_size):
        # restart right_pos if it went negative in a previous iteration
        if right_pos < 0:
            right_pos = 0

        # advance one past the last right row that qualifies for this left
        # key, then step back onto it (-1 when no row qualified)
        if allow_exact_matches:
            while (right_pos < right_size and
                   right_values[right_pos] <= left_values[left_pos]):
                right_pos += 1
        else:
            while (right_pos < right_size and
                   right_values[right_pos] < left_values[left_pos]):
                right_pos += 1
        right_pos -= 1

        # save positions as the desired index
        left_indexer[left_pos] = left_pos
        right_indexer[left_pos] = right_pos

        # if needed, verify that tolerance is met
        if has_tolerance and right_pos != -1:
            # widen before subtracting so a large gap cannot overflow and
            # slip past the tolerance check as a negative number
            diff = left_values[left_pos]
            diff -= right_values[right_pos]
            if diff > tolerance_:
                right_indexer[left_pos] = -1

    return left_indexer, right_indexer
def asof_join_int64_t(ndarray[int64_t] left_values,
ndarray[int64_t] right_values,