BUG: _nsorted incorrect with duplicated values in index
closes #13412 closes #14707
This commit is contained in:
parent
4378f82967
commit
6e514dacc1
|
@ -1012,3 +1012,14 @@ class frame_quantile_axis1(object):
|
|||
|
||||
def time_frame_quantile_axis1(self):
|
||||
self.df.quantile([0.1, 0.5], axis=1)
|
||||
|
||||
|
||||
class frame_nlargest(object):
    """Benchmark for ``DataFrame.nlargest`` on a 1000 x 3 frame of random
    normal values with columns 'A', 'B' and 'C'."""

    goal_time = 0.2

    def setup(self):
        # Build the fixture once so the timed method measures only nlargest.
        values = np.random.randn(1000, 3)
        self.df = DataFrame(values, columns=list('ABC'))

    def time_frame_nlargest(self):
        # Take the 100 rows with the largest values in column 'A'.
        self.df.nlargest(100, 'A')
|
||||
|
|
|
@ -61,6 +61,7 @@ Bug Fixes
|
|||
- Bug in ``HDFStore`` when writing a ``MultiIndex`` when using ``data_columns=True`` (:issue:`14435`)
|
||||
- Bug in ``HDFStore.append()`` when writing a ``Series`` and passing a ``min_itemsize`` argument containing a value for the ``index`` (:issue:`11412`)
|
||||
- Bug in ``Series.groupby.nunique()`` raising an ``IndexError`` for an empty ``Series`` (:issue:`12553`)
|
||||
- Bug in ``DataFrame.nlargest`` and ``DataFrame.nsmallest`` when the index had duplicate values (:issue:`13412`)
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -684,11 +684,12 @@ def select_n_slow(dropped, n, keep, method):
|
|||
_select_methods = {'nsmallest': nsmallest, 'nlargest': nlargest}
|
||||
|
||||
|
||||
def select_n(series, n, keep, method):
|
||||
"""Implement n largest/smallest.
|
||||
def select_n_series(series, n, keep, method):
|
||||
"""Implement n largest/smallest for pandas Series
|
||||
|
||||
Parameters
|
||||
----------
|
||||
series : pandas.Series object
|
||||
n : int
|
||||
keep : {'first', 'last'}, default 'first'
|
||||
method : str, {'nlargest', 'nsmallest'}
|
||||
|
@ -717,6 +718,31 @@ def select_n(series, n, keep, method):
|
|||
return dropped.iloc[inds]
|
||||
|
||||
|
||||
def select_n_frame(frame, columns, n, method, keep):
    """Implement n largest/smallest for pandas DataFrame

    Parameters
    ----------
    frame : pandas.DataFrame object
    columns : list or str
        Column label(s) to order by. Rows are selected on the first
        column; any remaining columns only order rows that tie on it.
    n : int
    keep : {'first', 'last'}, default 'first'
    method : str, {'nlargest', 'nsmallest'}

    Returns
    -------
    nordered : DataFrame
        At most ``n`` rows of ``frame``, with their original index labels
        and all of the frame's columns.

    Notes
    -----
    Rows are picked by position rather than by index label or by value,
    so duplicated index labels and duplicated values in the ranked column
    each yield exactly one output row per selected input row.
    """
    if not is_list_like(columns):
        columns = [columns]
    columns = list(columns)

    # Rank the first column under a positional (0..len-1) index: the index
    # of the Series returned by nlargest/nsmallest is then the ordered list
    # of row positions to take, independent of the frame's own labels.
    by_position = frame[columns[0]].reset_index(drop=True)
    picked = getattr(by_position, method)(n, keep=keep)
    result = frame.iloc[picked.index.values]

    if len(columns) > 1:
        # The selection is already ordered on the first column; a stable
        # sort on all columns only rearranges rows tied on that column.
        ascending = method == 'nsmallest'
        result = result.sort_values(columns, ascending=ascending,
                                    kind='mergesort')
    return result
|
||||
|
||||
|
||||
def _finalize_nsmallest(arr, kth_val, n, keep, narr):
|
||||
ns, = np.nonzero(arr <= kth_val)
|
||||
inds = ns[arr[ns].argsort(kind='mergesort')][:n]
|
||||
|
|
|
@ -3337,15 +3337,6 @@ class DataFrame(NDFrame):
|
|||
return self.sort_index(level=level, axis=axis, ascending=ascending,
|
||||
inplace=inplace, sort_remaining=sort_remaining)
|
||||
|
||||
def _nsorted(self, columns, n, method, keep):
|
||||
if not is_list_like(columns):
|
||||
columns = [columns]
|
||||
columns = list(columns)
|
||||
ser = getattr(self[columns[0]], method)(n, keep=keep)
|
||||
ascending = dict(nlargest=False, nsmallest=True)[method]
|
||||
return self.loc[ser.index].sort_values(columns, ascending=ascending,
|
||||
kind='mergesort')
|
||||
|
||||
def nlargest(self, n, columns, keep='first'):
|
||||
"""Get the rows of a DataFrame sorted by the `n` largest
|
||||
values of `columns`.
|
||||
|
@ -3378,7 +3369,7 @@ class DataFrame(NDFrame):
|
|||
1 10 b 2
|
||||
2 8 d NaN
|
||||
"""
|
||||
return self._nsorted(columns, n, 'nlargest', keep)
|
||||
return algos.select_n_frame(self, columns, n, 'nlargest', keep)
|
||||
|
||||
def nsmallest(self, n, columns, keep='first'):
|
||||
"""Get the rows of a DataFrame sorted by the `n` smallest
|
||||
|
@ -3412,7 +3403,7 @@ class DataFrame(NDFrame):
|
|||
0 1 a 1
|
||||
2 8 d NaN
|
||||
"""
|
||||
return self._nsorted(columns, n, 'nsmallest', keep)
|
||||
return algos.select_n_frame(self, columns, n, 'nsmallest', keep)
|
||||
|
||||
def swaplevel(self, i=-2, j=-1, axis=0):
|
||||
"""
|
||||
|
|
|
@ -1940,7 +1940,7 @@ class Series(base.IndexOpsMixin, strings.StringAccessorMixin,
|
|||
>>> s = pd.Series(np.random.randn(10 ** 6))
|
||||
>>> s.nlargest(10) # only sorts up to the N requested
|
||||
"""
|
||||
return algos.select_n(self, n=n, keep=keep, method='nlargest')
|
||||
return algos.select_n_series(self, n=n, keep=keep, method='nlargest')
|
||||
|
||||
@deprecate_kwarg('take_last', 'keep', mapping={True: 'last',
|
||||
False: 'first'})
|
||||
|
@ -1978,7 +1978,7 @@ class Series(base.IndexOpsMixin, strings.StringAccessorMixin,
|
|||
>>> s = pd.Series(np.random.randn(10 ** 6))
|
||||
>>> s.nsmallest(10) # only sorts up to the N requested
|
||||
"""
|
||||
return algos.select_n(self, n=n, keep=keep, method='nsmallest')
|
||||
return algos.select_n_series(self, n=n, keep=keep, method='nsmallest')
|
||||
|
||||
def sortlevel(self, level=0, ascending=True, sort_remaining=True):
|
||||
"""
|
||||
|
|
|
@ -1323,6 +1323,35 @@ class TestDataFrameAnalytics(tm.TestCase, TestData):
|
|||
expected = df.sort_values(['a', 'c']).head(5)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_nsmallest_nlargest_duplicate_index(self):
|
||||
# GH 13412
|
||||
df = pd.DataFrame({'a': [1, 2, 3, 4],
|
||||
'b': [4, 3, 2, 1],
|
||||
'c': [0, 1, 2, 3]},
|
||||
index=[0, 0, 1, 1])
|
||||
result = df.nsmallest(4, 'a')
|
||||
expected = df.sort_values('a').head(4)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.nlargest(4, 'a')
|
||||
expected = df.sort_values('a', ascending=False).head(4)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.nsmallest(4, ['a', 'c'])
|
||||
expected = df.sort_values(['a', 'c']).head(4)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.nsmallest(4, ['c', 'a'])
|
||||
expected = df.sort_values(['c', 'a']).head(4)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.nlargest(4, ['a', 'c'])
|
||||
expected = df.sort_values(['a', 'c'], ascending=False).head(4)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.nlargest(4, ['c', 'a'])
|
||||
expected = df.sort_values(['c', 'a'], ascending=False).head(4)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
# ----------------------------------------------------------------------
|
||||
# Isin
|
||||
|
||||
|
|
|
@ -1532,6 +1532,15 @@ class TestSeriesAnalytics(TestData, tm.TestCase):
|
|||
with tm.assertRaisesRegexp(ValueError, msg):
|
||||
s.nlargest(keep='invalid')
|
||||
|
||||
# GH 13412
|
||||
s = Series([1, 4, 3, 2], index=[0, 0, 1, 1])
|
||||
result = s.nlargest(3)
|
||||
expected = s.sort_values(ascending=False).head(3)
|
||||
assert_series_equal(result, expected)
|
||||
result = s.nsmallest(3)
|
||||
expected = s.sort_values().head(3)
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
def test_sortlevel(self):
|
||||
mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC'))
|
||||
s = Series([1, 2], mi)
|
||||
|
|
Loading…
Reference in New Issue