[Backport #14777] BUG: Bug in a groupby of a non-lexsorted MultiIndex and multiple grouping levels

closes #14776

Author: Jeff Reback <jeff@reback.net>

Closes #14777 from jreback/mi_sort and squashes the following commits:

cf31905 [Jeff Reback] BUG: Bug in a groupby of a non-lexsorted MultiIndex and multiple grouping levels

(cherry picked from commit f23010aa93)
This commit is contained in:
Jeff Reback 2016-12-04 12:34:14 -05:00 committed by Joris Van den Bossche
parent 7814a6654e
commit 04b83e021b
3 changed files with 31 additions and 2 deletions

View File

@ -36,7 +36,7 @@ Bug Fixes
- Bug in ``pd.read_csv`` for the Python engine in which an unhelpful error message was being raised when ``skipfooter`` was not being respected by Python's CSV library (:issue:`13879`)
- Bug in ``.groupby(..., sort=True)`` of a non-lexsorted MultiIndex when grouping with multiple levels (:issue:`14776`)

View File

@ -861,7 +861,17 @@ class _GroupBy(PandasObject, SelectionMixin):
if isinstance(result, Series):
result = result.reindex(ax)
else:
result = result.reindex_axis(ax, axis=self.axis)
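# GH 14776: this is a very unfortunate situation.
# We have a MultiIndex that is NOT lexsorted and a result whose
# index is duplicated (detected below via ``not ax.is_unique``).
# We can't reindex in that case, so instead we select the rows
# positionally with ``take`` using the indexer of ``ax``'s values.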
if isinstance(ax, MultiIndex) and not ax.is_unique:
result = result.take(result.index.get_indexer_for(
ax.values).unique(), axis=self.axis)
else:
result = result.reindex_axis(ax, axis=self.axis)
elif self.group_keys:

View File

@ -4736,6 +4736,25 @@ class TestGroupBy(tm.TestCase):
result = not_lexsorted_df.groupby('a').mean()
tm.assert_frame_equal(expected, result)
# a transforming function should work regardless of sort
# GH 14776
df = DataFrame({'x': ['a', 'a', 'b', 'a'],
'y': [1, 1, 2, 2],
'z': [1, 2, 3, 4]}).set_index(['x', 'y'])
self.assertFalse(df.index.is_lexsorted())
for level in [0, 1, [0, 1]]:
for sort in [False, True]:
result = df.groupby(level=level, sort=sort).apply(
DataFrame.drop_duplicates)
expected = df
tm.assert_frame_equal(expected, result)
result = df.sort_index().groupby(level=level, sort=sort).apply(
DataFrame.drop_duplicates)
expected = df.sort_index()
tm.assert_frame_equal(expected, result)
def test_groupby_levels_and_columns(self):
# GH9344, GH9049
idx_names = ['x', 'y']