ENH: merge_asof() has left_index/right_index and left_by/right_by (#14253) (#14531)

(cherry picked from commit 84cad61556)
This commit is contained in:
Christopher C. Aycock 2016-12-14 11:10:42 -05:00 committed by Joris Van den Bossche
parent f1d43a4b50
commit 9a6a78f36b
3 changed files with 215 additions and 29 deletions

View File

@ -32,8 +32,6 @@ Other Enhancements
- ``pd.merge_asof()`` gained ``left_index``/``right_index`` and ``left_by``/``right_by`` arguments (:issue:`14253`)
>>>>>>> 49e3137... DOC: whatsnew 0.19.2
.. _whatsnew_0192.bug_fixes:
Bug Fixes

View File

@ -259,7 +259,8 @@ ordered_merge.__doc__ = merge_ordered.__doc__
def merge_asof(left, right, on=None,
left_on=None, right_on=None,
by=None,
left_index=False, right_index=False,
by=None, left_by=None, right_by=None,
suffixes=('_x', '_y'),
tolerance=None,
allow_exact_matches=True):
@ -288,9 +289,29 @@ def merge_asof(left, right, on=None,
Field name to join on in left DataFrame.
right_on : label
Field name to join on in right DataFrame.
left_index : boolean
Use the index of the left DataFrame as the join key.
.. versionadded:: 0.19.2
right_index : boolean
Use the index of the right DataFrame as the join key.
.. versionadded:: 0.19.2
by : column name
Group both the left and right DataFrames by the group column; perform
the merge operation on these pieces and recombine.
left_by : column name
Field name to group by in the left DataFrame.
.. versionadded:: 0.19.2
right_by : column name
Field name to group by in the right DataFrame.
.. versionadded:: 0.19.2
suffixes : 2-length sequence (tuple, list, ...)
Suffix to apply to overlapping column names in the left and right
side, respectively
@ -348,6 +369,28 @@ def merge_asof(left, right, on=None,
3 5 b 3.0
6 10 c 7.0
We can use indexed DataFrames as well.
>>> left
left_val
1 a
5 b
10 c
>>> right
right_val
1 1
2 2
3 3
6 6
7 7
>>> pd.merge_asof(left, right, left_index=True, right_index=True)
left_val right_val
1 a 1
5 b 3
10 c 7
Here is a real-world time-series example
>>> quotes
@ -418,7 +461,9 @@ def merge_asof(left, right, on=None,
"""
op = _AsOfMerge(left, right,
on=on, left_on=left_on, right_on=right_on,
by=by, suffixes=suffixes,
left_index=left_index, right_index=right_index,
by=by, left_by=left_by, right_by=right_by,
suffixes=suffixes,
how='asof', tolerance=tolerance,
allow_exact_matches=allow_exact_matches)
return op.get_result()
@ -650,7 +695,7 @@ class _MergeOperation(object):
left_ax = self.left._data.axes[self.axis]
right_ax = self.right._data.axes[self.axis]
if self.left_index and self.right_index:
if self.left_index and self.right_index and self.how != 'asof':
join_index, left_indexer, right_indexer = \
left_ax.join(right_ax, how=self.how, return_indexers=True)
elif self.right_index and self.how == 'left':
@ -731,6 +776,16 @@ class _MergeOperation(object):
is_rkey = lambda x: isinstance(
x, (np.ndarray, ABCSeries)) and len(x) == len(right)
# Note that pd.merge_asof() has separate 'on' and 'by' parameters. A
# user could, for example, request 'left_index' and 'left_by'. In a
# regular pd.merge(), users cannot specify both 'left_index' and
# 'left_on'. (Instead, users have a MultiIndex). That means the
# self.left_on in this function is always empty in a pd.merge(), but
# a pd.merge_asof(left_index=True, left_by=...) will result in a
# self.left_on array with a None in the middle of it. This requires
# a work-around as designated in the code below.
# See _validate_specification() for where this happens.
# ugh, spaghetti re #733
if _any(self.left_on) and _any(self.right_on):
for lk, rk in zip(self.left_on, self.right_on):
@ -740,12 +795,21 @@ class _MergeOperation(object):
right_keys.append(rk)
join_names.append(None) # what to do?
else:
right_keys.append(right[rk]._values)
join_names.append(rk)
if rk is not None:
right_keys.append(right[rk]._values)
join_names.append(rk)
else:
# work-around for merge_asof(right_index=True)
right_keys.append(right.index)
join_names.append(right.index.name)
else:
if not is_rkey(rk):
right_keys.append(right[rk]._values)
if lk == rk:
if rk is not None:
right_keys.append(right[rk]._values)
else:
# work-around for merge_asof(right_index=True)
right_keys.append(right.index)
if lk is not None and lk == rk:
# avoid key upcast in corner case (length-0)
if len(left) > 0:
right_drop.append(rk)
@ -753,8 +817,13 @@ class _MergeOperation(object):
left_drop.append(lk)
else:
right_keys.append(rk)
left_keys.append(left[lk]._values)
join_names.append(lk)
if lk is not None:
left_keys.append(left[lk]._values)
join_names.append(lk)
else:
# work-around for merge_asof(left_index=True)
left_keys.append(left.index)
join_names.append(left.index.name)
elif _any(self.left_on):
for k in self.left_on:
if is_lkey(k):
@ -879,13 +948,15 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how='inner',
class _OrderedMerge(_MergeOperation):
_merge_type = 'ordered_merge'
def __init__(self, left, right, on=None, left_on=None,
right_on=None, axis=1,
def __init__(self, left, right, on=None, left_on=None, right_on=None,
left_index=False, right_index=False, axis=1,
suffixes=('_x', '_y'), copy=True,
fill_method=None, how='outer'):
self.fill_method = fill_method
_MergeOperation.__init__(self, left, right, on=on, left_on=left_on,
left_index=left_index,
right_index=right_index,
right_on=right_on, axis=axis,
how=how, suffixes=suffixes,
sort=True # factorize sorts
@ -958,19 +1029,23 @@ def _get_cython_type(dtype):
class _AsOfMerge(_OrderedMerge):
_merge_type = 'asof_merge'
def __init__(self, left, right, on=None, by=None, left_on=None,
right_on=None, axis=1,
suffixes=('_x', '_y'), copy=True,
def __init__(self, left, right, on=None, left_on=None, right_on=None,
left_index=False, right_index=False,
by=None, left_by=None, right_by=None,
axis=1, suffixes=('_x', '_y'), copy=True,
fill_method=None,
how='asof', tolerance=None,
allow_exact_matches=True):
self.by = by
self.left_by = left_by
self.right_by = right_by
self.tolerance = tolerance
self.allow_exact_matches = allow_exact_matches
_OrderedMerge.__init__(self, left, right, on=on, left_on=left_on,
right_on=right_on, axis=axis,
right_on=right_on, left_index=left_index,
right_index=right_index, axis=axis,
how=how, suffixes=suffixes,
fill_method=fill_method)
@ -978,23 +1053,44 @@ class _AsOfMerge(_OrderedMerge):
super(_AsOfMerge, self)._validate_specification()
# the asof 'on' key must be a single item
if len(self.left_on) != 1:
if len(self.left_on) != 1 and not self.left_index:
raise MergeError("can only asof on a key for left")
if len(self.right_on) != 1:
if len(self.right_on) != 1 and not self.right_index:
raise MergeError("can only asof on a key for right")
if self.left_index and isinstance(self.left.index, MultiIndex):
raise MergeError("left can only have one index")
if self.right_index and isinstance(self.right.index, MultiIndex):
raise MergeError("right can only have one index")
# set 'by' columns
if self.by is not None:
if self.left_by is not None or self.right_by is not None:
raise MergeError('Can only pass by OR left_by '
'and right_by')
self.left_by = self.right_by = self.by
if self.left_by is None and self.right_by is not None:
raise MergeError('missing left_by')
if self.left_by is not None and self.right_by is None:
raise MergeError('missing right_by')
# add by to our key-list so we can have it in the
# output as a key
if self.by is not None:
if not is_list_like(self.by):
self.by = [self.by]
if self.left_by is not None:
if not is_list_like(self.left_by):
self.left_by = [self.left_by]
if not is_list_like(self.right_by):
self.right_by = [self.right_by]
if len(self.by) != 1:
if len(self.left_by) != 1:
raise MergeError("can only asof by a single key")
if len(self.right_by) != 1:
raise MergeError("can only asof by a single key")
self.left_on = self.by + list(self.left_on)
self.right_on = self.by + list(self.right_on)
self.left_on = self.left_by + list(self.left_on)
self.right_on = self.right_by + list(self.right_on)
@property
def _asof_key(self):
@ -1017,7 +1113,7 @@ class _AsOfMerge(_OrderedMerge):
# validate tolerance; must be a Timedelta if we have a DTI
if self.tolerance is not None:
lt = left_join_keys[self.left_on.index(self._asof_key)]
lt = left_join_keys[-1]
msg = "incompatible tolerance, must be compat " \
"with type {0}".format(type(lt))
@ -1047,8 +1143,10 @@ class _AsOfMerge(_OrderedMerge):
""" return the join indexers """
# values to compare
left_values = self.left_join_keys[-1]
right_values = self.right_join_keys[-1]
left_values = (self.left.index.values if self.left_index else
self.left_join_keys[-1])
right_values = (self.right.index.values if self.right_index else
self.right_join_keys[-1])
tolerance = self.tolerance
# we required sortedness in the join keys
@ -1066,7 +1164,7 @@ class _AsOfMerge(_OrderedMerge):
tolerance = tolerance.value
# a "by" parameter requires special handling
if self.by is not None:
if self.left_by is not None:
left_by_values = self.left_join_keys[0]
right_by_values = self.right_join_keys[0]

View File

@ -118,6 +118,96 @@ class TestAsOfMerge(tm.TestCase):
by='ticker')
assert_frame_equal(result, expected)
def test_basic_left_index(self):
    # GH14253: the left frame supplies its asof key through the index
    # while the right frame still uses the 'time' column
    want = self.asof
    indexed_trades = self.trades.set_index('time')

    got = merge_asof(indexed_trades, self.quotes,
                     left_index=True, right_on='time',
                     by='ticker')

    # a left-only index join comes back with the right frame's index,
    # oddly, so adopt the result's index before comparing
    want.index = got.index
    # 'time' is placed after the left frame's columns; reorder to match
    want = want[got.columns]
    assert_frame_equal(got, want)
def test_basic_right_index(self):
    # the right frame supplies its asof key through the index while the
    # left frame joins on its 'time' column
    indexed_quotes = self.quotes.set_index('time')
    merged = merge_asof(self.trades, indexed_quotes,
                        left_on='time', right_index=True,
                        by='ticker')
    assert_frame_equal(merged, self.asof)
def test_basic_left_index_right_index(self):
    # both inputs are indexed by 'time'; the expected frame is indexed
    # the same way for the comparison
    indexed_trades = self.trades.set_index('time')
    indexed_quotes = self.quotes.set_index('time')
    merged = merge_asof(indexed_trades, indexed_quotes,
                        left_index=True, right_index=True,
                        by='ticker')
    assert_frame_equal(merged, self.asof.set_index('time'))
def test_multi_index(self):
    # MultiIndex is prohibited on either side of an index join:
    # first a two-level index on the left, then one on the right
    index_specs = [
        (['time', 'price'], 'time'),
        ('time', ['time', 'bid']),
    ]
    for trade_keys, quote_keys in index_specs:
        trades = self.trades.set_index(trade_keys)
        quotes = self.quotes.set_index(quote_keys)
        with self.assertRaises(MergeError):
            merge_asof(trades, quotes,
                       left_index=True,
                       right_index=True)
def test_on_and_index(self):
    # 'on' parameter and index together is prohibited; try it once for
    # the left side and once for the right side
    for on_kwarg in ({'left_on': 'price'}, {'right_on': 'bid'}):
        trades = self.trades.set_index('time')
        quotes = self.quotes.set_index('time')
        with self.assertRaises(MergeError):
            merge_asof(trades, quotes,
                       left_index=True,
                       right_index=True,
                       **on_kwarg)
def test_basic_left_by_right_by(self):
    # GH14253: group columns are given per side through left_by and
    # right_by instead of the shared 'by' argument
    merged = merge_asof(self.trades, self.quotes,
                        on='time',
                        left_by='ticker', right_by='ticker')
    assert_frame_equal(merged, self.asof)
def test_missing_right_by(self):
expected = self.asof