(cherry picked from commit 84cad61556)
This commit is contained in:
parent
f1d43a4b50
commit
9a6a78f36b
|
@ -32,8 +32,6 @@ Other Enhancements
|
|||
- ``pd.merge_asof()`` gained ``left_index``/``right_index`` and ``left_by``/``right_by`` arguments (:issue:`14253`)
|
||||
|
||||
|
||||
|
||||
>>>>>>> 49e3137... DOC: whatsnew 0.19.2
|
||||
.. _whatsnew_0192.bug_fixes:
|
||||
|
||||
Bug Fixes
|
||||
|
|
|
@ -259,7 +259,8 @@ ordered_merge.__doc__ = merge_ordered.__doc__
|
|||
|
||||
def merge_asof(left, right, on=None,
|
||||
left_on=None, right_on=None,
|
||||
by=None,
|
||||
left_index=False, right_index=False,
|
||||
by=None, left_by=None, right_by=None,
|
||||
suffixes=('_x', '_y'),
|
||||
tolerance=None,
|
||||
allow_exact_matches=True):
|
||||
|
@ -288,9 +289,29 @@ def merge_asof(left, right, on=None,
|
|||
Field name to join on in left DataFrame.
|
||||
right_on : label
|
||||
Field name to join on in right DataFrame.
|
||||
left_index : boolean
|
||||
Use the index of the left DataFrame as the join key.
|
||||
|
||||
.. versionadded:: 0.19.2
|
||||
|
||||
right_index : boolean
|
||||
Use the index of the right DataFrame as the join key.
|
||||
|
||||
.. versionadded:: 0.19.2
|
||||
|
||||
by : column name
|
||||
Group both the left and right DataFrames by the group column; perform
|
||||
the merge operation on these pieces and recombine.
|
||||
left_by : column name
|
||||
Field name to group by in the left DataFrame.
|
||||
|
||||
.. versionadded:: 0.19.2
|
||||
|
||||
right_by : column name
|
||||
Field name to group by in the right DataFrame.
|
||||
|
||||
.. versionadded:: 0.19.2
|
||||
|
||||
suffixes : 2-length sequence (tuple, list, ...)
|
||||
Suffix to apply to overlapping column names in the left and right
|
||||
side, respectively
|
||||
|
@ -348,6 +369,28 @@ def merge_asof(left, right, on=None,
|
|||
3 5 b 3.0
|
||||
6 10 c 7.0
|
||||
|
||||
We can use indexed DataFrames as well.
|
||||
|
||||
>>> left
|
||||
left_val
|
||||
1 a
|
||||
5 b
|
||||
10 c
|
||||
|
||||
>>> right
|
||||
right_val
|
||||
1 1
|
||||
2 2
|
||||
3 3
|
||||
6 6
|
||||
7 7
|
||||
|
||||
>>> pd.merge_asof(left, right, left_index=True, right_index=True)
|
||||
left_val right_val
|
||||
1 a 1
|
||||
5 b 3
|
||||
10 c 7
|
||||
|
||||
Here is a real-world time-series example
|
||||
|
||||
>>> quotes
|
||||
|
@ -418,7 +461,9 @@ def merge_asof(left, right, on=None,
|
|||
"""
|
||||
op = _AsOfMerge(left, right,
|
||||
on=on, left_on=left_on, right_on=right_on,
|
||||
by=by, suffixes=suffixes,
|
||||
left_index=left_index, right_index=right_index,
|
||||
by=by, left_by=left_by, right_by=right_by,
|
||||
suffixes=suffixes,
|
||||
how='asof', tolerance=tolerance,
|
||||
allow_exact_matches=allow_exact_matches)
|
||||
return op.get_result()
|
||||
|
@ -650,7 +695,7 @@ class _MergeOperation(object):
|
|||
left_ax = self.left._data.axes[self.axis]
|
||||
right_ax = self.right._data.axes[self.axis]
|
||||
|
||||
if self.left_index and self.right_index:
|
||||
if self.left_index and self.right_index and self.how != 'asof':
|
||||
join_index, left_indexer, right_indexer = \
|
||||
left_ax.join(right_ax, how=self.how, return_indexers=True)
|
||||
elif self.right_index and self.how == 'left':
|
||||
|
@ -731,6 +776,16 @@ class _MergeOperation(object):
|
|||
is_rkey = lambda x: isinstance(
|
||||
x, (np.ndarray, ABCSeries)) and len(x) == len(right)
|
||||
|
||||
# Note that pd.merge_asof() has separate 'on' and 'by' parameters. A
|
||||
# user could, for example, request 'left_index' and 'left_by'. In a
|
||||
# regular pd.merge(), users cannot specify both 'left_index' and
|
||||
# 'left_on'. (Instead, users have a MultiIndex). That means the
|
||||
# self.left_on in this function is always empty in a pd.merge(), but
|
||||
# a pd.merge_asof(left_index=True, left_by=...) will result in a
|
||||
# self.left_on array with a None in the middle of it. This requires
|
||||
# a work-around as designated in the code below.
|
||||
# See _validate_specification() for where this happens.
|
||||
|
||||
# ugh, spaghetti re #733
|
||||
if _any(self.left_on) and _any(self.right_on):
|
||||
for lk, rk in zip(self.left_on, self.right_on):
|
||||
|
@ -740,12 +795,21 @@ class _MergeOperation(object):
|
|||
right_keys.append(rk)
|
||||
join_names.append(None) # what to do?
|
||||
else:
|
||||
right_keys.append(right[rk]._values)
|
||||
join_names.append(rk)
|
||||
if rk is not None:
|
||||
right_keys.append(right[rk]._values)
|
||||
join_names.append(rk)
|
||||
else:
|
||||
# work-around for merge_asof(right_index=True)
|
||||
right_keys.append(right.index)
|
||||
join_names.append(right.index.name)
|
||||
else:
|
||||
if not is_rkey(rk):
|
||||
right_keys.append(right[rk]._values)
|
||||
if lk == rk:
|
||||
if rk is not None:
|
||||
right_keys.append(right[rk]._values)
|
||||
else:
|
||||
# work-around for merge_asof(right_index=True)
|
||||
right_keys.append(right.index)
|
||||
if lk is not None and lk == rk:
|
||||
# avoid key upcast in corner case (length-0)
|
||||
if len(left) > 0:
|
||||
right_drop.append(rk)
|
||||
|
@ -753,8 +817,13 @@ class _MergeOperation(object):
|
|||
left_drop.append(lk)
|
||||
else:
|
||||
right_keys.append(rk)
|
||||
left_keys.append(left[lk]._values)
|
||||
join_names.append(lk)
|
||||
if lk is not None:
|
||||
left_keys.append(left[lk]._values)
|
||||
join_names.append(lk)
|
||||
else:
|
||||
# work-around for merge_asof(left_index=True)
|
||||
left_keys.append(left.index)
|
||||
join_names.append(left.index.name)
|
||||
elif _any(self.left_on):
|
||||
for k in self.left_on:
|
||||
if is_lkey(k):
|
||||
|
@ -879,13 +948,15 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how='inner',
|
|||
class _OrderedMerge(_MergeOperation):
|
||||
_merge_type = 'ordered_merge'
|
||||
|
||||
def __init__(self, left, right, on=None, left_on=None,
|
||||
right_on=None, axis=1,
|
||||
def __init__(self, left, right, on=None, left_on=None, right_on=None,
|
||||
left_index=False, right_index=False, axis=1,
|
||||
suffixes=('_x', '_y'), copy=True,
|
||||
fill_method=None, how='outer'):
|
||||
|
||||
self.fill_method = fill_method
|
||||
_MergeOperation.__init__(self, left, right, on=on, left_on=left_on,
|
||||
left_index=left_index,
|
||||
right_index=right_index,
|
||||
right_on=right_on, axis=axis,
|
||||
how=how, suffixes=suffixes,
|
||||
sort=True # factorize sorts
|
||||
|
@ -958,19 +1029,23 @@ def _get_cython_type(dtype):
|
|||
class _AsOfMerge(_OrderedMerge):
|
||||
_merge_type = 'asof_merge'
|
||||
|
||||
def __init__(self, left, right, on=None, by=None, left_on=None,
|
||||
right_on=None, axis=1,
|
||||
suffixes=('_x', '_y'), copy=True,
|
||||
def __init__(self, left, right, on=None, left_on=None, right_on=None,
|
||||
left_index=False, right_index=False,
|
||||
by=None, left_by=None, right_by=None,
|
||||
axis=1, suffixes=('_x', '_y'), copy=True,
|
||||
fill_method=None,
|
||||
how='asof', tolerance=None,
|
||||
allow_exact_matches=True):
|
||||
|
||||
self.by = by
|
||||
self.left_by = left_by
|
||||
self.right_by = right_by
|
||||
self.tolerance = tolerance
|
||||
self.allow_exact_matches = allow_exact_matches
|
||||
|
||||
_OrderedMerge.__init__(self, left, right, on=on, left_on=left_on,
|
||||
right_on=right_on, axis=axis,
|
||||
right_on=right_on, left_index=left_index,
|
||||
right_index=right_index, axis=axis,
|
||||
how=how, suffixes=suffixes,
|
||||
fill_method=fill_method)
|
||||
|
||||
|
@ -978,23 +1053,44 @@ class _AsOfMerge(_OrderedMerge):
|
|||
super(_AsOfMerge, self)._validate_specification()
|
||||
|
||||
# 'on' may only be a single item
|
||||
if len(self.left_on) != 1:
|
||||
if len(self.left_on) != 1 and not self.left_index:
|
||||
raise MergeError("can only asof on a key for left")
|
||||
|
||||
if len(self.right_on) != 1:
|
||||
if len(self.right_on) != 1 and not self.right_index:
|
||||
raise MergeError("can only asof on a key for right")
|
||||
|
||||
if self.left_index and isinstance(self.left.index, MultiIndex):
|
||||
raise MergeError("left can only have one index")
|
||||
|
||||
if self.right_index and isinstance(self.right.index, MultiIndex):
|
||||
raise MergeError("right can only have one index")
|
||||
|
||||
# set 'by' columns
|
||||
if self.by is not None:
|
||||
if self.left_by is not None or self.right_by is not None:
|
||||
raise MergeError('Can only pass by OR left_by '
|
||||
'and right_by')
|
||||
self.left_by = self.right_by = self.by
|
||||
if self.left_by is None and self.right_by is not None:
|
||||
raise MergeError('missing left_by')
|
||||
if self.left_by is not None and self.right_by is None:
|
||||
raise MergeError('missing right_by')
|
||||
|
||||
# add by to our key-list so we can have it in the
|
||||
# output as a key
|
||||
if self.by is not None:
|
||||
if not is_list_like(self.by):
|
||||
self.by = [self.by]
|
||||
if self.left_by is not None:
|
||||
if not is_list_like(self.left_by):
|
||||
self.left_by = [self.left_by]
|
||||
if not is_list_like(self.right_by):
|
||||
self.right_by = [self.right_by]
|
||||
|
||||
if len(self.by) != 1:
|
||||
if len(self.left_by) != 1:
|
||||
raise MergeError("can only asof by a single key")
|
||||
if len(self.right_by) != 1:
|
||||
raise MergeError("can only asof by a single key")
|
||||
|
||||
self.left_on = self.by + list(self.left_on)
|
||||
self.right_on = self.by + list(self.right_on)
|
||||
self.left_on = self.left_by + list(self.left_on)
|
||||
self.right_on = self.right_by + list(self.right_on)
|
||||
|
||||
@property
|
||||
def _asof_key(self):
|
||||
|
@ -1017,7 +1113,7 @@ class _AsOfMerge(_OrderedMerge):
|
|||
# validate tolerance; must be a Timedelta if we have a DatetimeIndex
|
||||
if self.tolerance is not None:
|
||||
|
||||
lt = left_join_keys[self.left_on.index(self._asof_key)]
|
||||
lt = left_join_keys[-1]
|
||||
msg = "incompatible tolerance, must be compat " \
|
||||
"with type {0}".format(type(lt))
|
||||
|
||||
|
@ -1047,8 +1143,10 @@ class _AsOfMerge(_OrderedMerge):
|
|||
""" return the join indexers """
|
||||
|
||||
# values to compare
|
||||
left_values = self.left_join_keys[-1]
|
||||
right_values = self.right_join_keys[-1]
|
||||
left_values = (self.left.index.values if self.left_index else
|
||||
self.left_join_keys[-1])
|
||||
right_values = (self.right.index.values if self.right_index else
|
||||
self.right_join_keys[-1])
|
||||
tolerance = self.tolerance
|
||||
|
||||
# we required sortedness in the join keys
|
||||
|
@ -1066,7 +1164,7 @@ class _AsOfMerge(_OrderedMerge):
|
|||
tolerance = tolerance.value
|
||||
|
||||
# a "by" parameter requires special handling
|
||||
if self.by is not None:
|
||||
if self.left_by is not None:
|
||||
left_by_values = self.left_join_keys[0]
|
||||
right_by_values = self.right_join_keys[0]
|
||||
|
||||
|
|
|
@ -118,6 +118,96 @@ class TestAsOfMerge(tm.TestCase):
|
|||
by='ticker')
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_basic_left_index(self):
|
||||
|
||||
# GH14253
|
||||
expected = self.asof
|
||||
trades = self.trades.set_index('time')
|
||||
quotes = self.quotes
|
||||
|
||||
result = merge_asof(trades, quotes,
|
||||
left_index=True,
|
||||
right_on='time',
|
||||
by='ticker')
|
||||
# left-only index uses right's index, oddly
|
||||
expected.index = result.index
|
||||
# time column appears after left's columns
|
||||
expected = expected[result.columns]
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_basic_right_index(self):
|
||||
|
||||
expected = self.asof
|
||||
trades = self.trades
|
||||
quotes = self.quotes.set_index('time')
|
||||
|
||||
result = merge_asof(trades, quotes,
|
||||
left_on='time',
|
||||
right_index=True,
|
||||
by='ticker')
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_basic_left_index_right_index(self):
|
||||
|
||||
expected = self.asof.set_index('time')
|
||||
trades = self.trades.set_index('time')
|
||||
quotes = self.quotes.set_index('time')
|
||||
|
||||
result = merge_asof(trades, quotes,
|
||||
left_index=True,
|
||||
right_index=True,
|
||||
by='ticker')
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_multi_index(self):
|
||||
|
||||
# MultiIndex is prohibited
|
||||
trades = self.trades.set_index(['time', 'price'])
|
||||
quotes = self.quotes.set_index('time')
|
||||
with self.assertRaises(MergeError):
|
||||
merge_asof(trades, quotes,
|
||||
left_index=True,
|
||||
right_index=True)
|
||||
|
||||
trades = self.trades.set_index('time')
|
||||
quotes = self.quotes.set_index(['time', 'bid'])
|
||||
with self.assertRaises(MergeError):
|
||||
merge_asof(trades, quotes,
|
||||
left_index=True,
|
||||
right_index=True)
|
||||
|
||||
def test_on_and_index(self):
|
||||
|
||||
# 'on' parameter and index together is prohibited
|
||||
trades = self.trades.set_index('time')
|
||||
quotes = self.quotes.set_index('time')
|
||||
with self.assertRaises(MergeError):
|
||||
merge_asof(trades, quotes,
|
||||
left_on='price',
|
||||
left_index=True,
|
||||
right_index=True)
|
||||
|
||||
trades = self.trades.set_index('time')
|
||||
quotes = self.quotes.set_index('time')
|
||||
with self.assertRaises(MergeError):
|
||||
merge_asof(trades, quotes,
|
||||
right_on='bid',
|
||||
left_index=True,
|
||||
right_index=True)
|
||||
|
||||
def test_basic_left_by_right_by(self):
|
||||
|
||||
# GH14253
|
||||
expected = self.asof
|
||||
trades = self.trades
|
||||
quotes = self.quotes
|
||||
|
||||
result = merge_asof(trades, quotes,
|
||||
on='time',
|
||||
left_by='ticker',
|
||||
right_by='ticker')
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_missing_right_by(self):
|
||||
|
||||
expected = self.asof
|
||||
|
|
Loading…
Reference in New Issue