Merge pull request #6734 from sinhrks/ind_nunique

ENH: added nunique function to Index
jreback 9 years ago
commit 657d255393
  1. doc/source/api.rst (6 changed lines)
  2. doc/source/release.rst (2 changed lines)
  3. doc/source/v0.14.0.txt (1 changed line)
  4. pandas/core/base.py (50 changed lines)
  5. pandas/core/index.py (12 changed lines)
  6. pandas/core/series.py (49 changed lines)
  7. pandas/tests/test_base.py (212 changed lines)
  8. pandas/tests/test_series.py (97 changed lines)
  9. pandas/tseries/tests/test_period.py (11 changed lines)
  10. pandas/tseries/tests/test_timeseries.py (17 changed lines)

@@ -348,7 +348,6 @@ Computations / Descriptive Stats
Series.median
Series.min
Series.mode
Series.nunique
Series.pct_change
Series.prod
Series.quantile
@@ -356,8 +355,9 @@ Computations / Descriptive Stats
Series.skew
Series.std
Series.sum
Series.unique
Series.var
Series.unique
Series.nunique
Series.value_counts
Reindexing / Selection / Label manipulation
@@ -1053,6 +1053,8 @@ Modifying and Computations
Index.repeat
Index.set_names
Index.unique
Index.nunique
Index.value_counts
Conversion
~~~~~~~~~~

@@ -159,6 +159,8 @@ API Changes
- Arithmetic ops are now disallowed when passed two bool dtype Series or
DataFrames (:issue:`6762`).
- Added ``nunique`` and ``value_counts`` functions to ``Index`` for counting unique elements. (:issue:`6734`)
Deprecations
~~~~~~~~~~~~

@@ -199,6 +199,7 @@ API changes
- ``Series.iteritems()`` is now lazy (returns an iterator rather than a list). This was the documented behavior prior to 0.14. (:issue:`6760`)
- ``Panel.shift`` now uses ``NDFrame.shift``. It no longer drops the ``nan`` data and retains its original shape. (:issue:`4867`)
- Added ``nunique`` and ``value_counts`` functions to ``Index`` for counting unique elements. (:issue:`6734`)
MultiIndexing Using Slicers
~~~~~~~~~~~~~~~~~~~~~~~~~~~
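
For reference, a minimal sketch of the new ``Index`` API described in the release note above (the sample data and outputs are illustrative, not part of the diff):

    import pandas as pd

    idx = pd.Index(['a', 'b', 'b', 'c'])
    idx.value_counts()   # Series with counts in descending order: b -> 2, a -> 1, c -> 1
    idx.nunique()        # 3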

@@ -269,6 +269,56 @@ class IndexOpsMixin(object):
        self._is_allowed_index_op('min')
        return self.values.min()

    def value_counts(self, normalize=False, sort=True, ascending=False,
                     bins=None):
        """
        Returns object containing counts of unique values. The resulting object
        will be in descending order so that the first element is the most
        frequently-occurring element. Excludes NA values.

        Parameters
        ----------
        normalize : boolean, default False
            If True then the object returned will contain the relative
            frequencies of the unique values.
        sort : boolean, default True
            Sort by values
        ascending : boolean, default False
            Sort in ascending order
        bins : integer, optional
            Rather than count values, group them into half-open bins,
            a convenience for pd.cut, only works with numeric data

        Returns
        -------
        counts : Series
        """
        from pandas.core.algorithms import value_counts
        return value_counts(self.values, sort=sort, ascending=ascending,
                            normalize=normalize, bins=bins)

    def unique(self):
        """
        Return array of unique values in the object. Significantly faster than
        numpy.unique. Includes NA values.

        Returns
        -------
        uniques : ndarray
        """
        from pandas.core.nanops import unique1d
        return unique1d(self.values)

    def nunique(self):
        """
        Return count of unique elements in the object. Excludes NA values.

        Returns
        -------
        nunique : int
        """
        return len(self.value_counts())

    date = _field_accessor('date','Returns numpy array of datetime.date. The date part of the Timestamps')
    time = _field_accessor('time','Returns numpy array of datetime.time. The time part of the Timestamps')
    year = _field_accessor('year', "The year of the datetime")
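
The docstrings above pin down the NA semantics: ``unique`` keeps NA values, while ``value_counts`` and ``nunique`` (implemented as ``len(self.value_counts())``) drop them. A small sketch of that contrast (sample data illustrative):

    import numpy as np
    import pandas as pd

    s = pd.Series(['a', 'b', 'b', np.nan])
    s.unique()        # array(['a', 'b', nan], dtype=object) -- NA included
    s.value_counts()  # b -> 2, a -> 1 -- the NA row is excluded
    s.nunique()       # 2, i.e. len(s.value_counts())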

@@ -1102,18 +1102,6 @@ class Index(IndexOpsMixin, FrozenNDArray):
        the_diff = sorted(set((self - other) + (other - self)))
        return Index(the_diff, name=result_name)

    def unique(self):
        """
        Return array of unique values in the Index. Significantly faster than
        numpy.unique

        Returns
        -------
        uniques : ndarray
        """
        from pandas.core.nanops import unique1d
        return unique1d(self.values)

    def get_loc(self, key):
        """
        Get integer location for requested label
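
``Index.unique`` is deleted here only because ``Index`` now inherits the identical implementation from ``IndexOpsMixin``; call sites are unchanged. A quick sketch (output illustrative):

    import pandas as pd

    # unique() now resolves via IndexOpsMixin rather than Index itself
    pd.Index(['a', 'a', 'b']).unique()   # array(['a', 'b'], dtype=object)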

@@ -1095,34 +1095,6 @@ class Series(base.IndexOpsMixin, generic.NDFrame):
        return notnull(_values_from_object(self)).sum()

    def value_counts(self, normalize=False, sort=True, ascending=False,
                     bins=None):
        """
        Returns Series containing counts of unique values. The resulting Series
        will be in descending order so that the first element is the most
        frequently-occurring element. Excludes NA values

        Parameters
        ----------
        normalize : boolean, default False
            If True then the Series returned will contain the relative
            frequencies of the unique values.
        sort : boolean, default True
            Sort by values
        ascending : boolean, default False
            Sort in ascending order
        bins : integer, optional
            Rather than count values, group them into half-open bins,
            a convenience for pd.cut, only works with numeric data

        Returns
        -------
        counts : Series
        """
        from pandas.core.algorithms import value_counts
        return value_counts(self.values, sort=sort, ascending=ascending,
                            normalize=normalize, bins=bins)

    def mode(self):
        """Returns the mode(s) of the dataset.

@@ -1143,27 +1115,6 @@ class Series(base.IndexOpsMixin, generic.NDFrame):
        from pandas.core.algorithms import mode
        return mode(self)

    def unique(self):
        """
        Return array of unique values in the Series. Significantly faster than
        numpy.unique

        Returns
        -------
        uniques : ndarray
        """
        return nanops.unique1d(self.values)

    def nunique(self):
        """
        Return count of unique elements in the Series

        Returns
        -------
        nunique : int
        """
        return len(self.value_counts())

    def drop_duplicates(self, take_last=False, inplace=False):
        """
        Return Series with duplicate values removed
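
Likewise for ``Series``: ``value_counts``, ``unique`` and ``nunique`` are removed here because the shared ``IndexOpsMixin`` versions take over, so public behavior should be unchanged. A sketch (outputs illustrative):

    import pandas as pd

    s = pd.Series([1, 1, 2, 3])
    s.value_counts()   # 1 -> 2, 2 -> 1, 3 -> 1
    s.unique()         # array([1, 2, 3])
    s.nunique()        # 3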

@@ -1,11 +1,12 @@
import re
from datetime import timedelta
import numpy as np
import pandas.compat as compat
import pandas as pd
from pandas.compat import u
from pandas.compat import u, StringIO
from pandas.core.base import FrozenList, FrozenNDArray
from pandas.util.testing import assertRaisesRegexp, assert_isinstance
from pandas import Series, Index, DatetimeIndex, PeriodIndex
from pandas import Series, Index, Int64Index, DatetimeIndex, PeriodIndex
from pandas import _np_version_under1p7
import nose
@@ -130,6 +131,7 @@ class Ops(tm.TestCase):
        self.int_index = tm.makeIntIndex(10)
        self.float_index = tm.makeFloatIndex(10)
        self.dt_index = tm.makeDateIndex(10)
        self.dt_tz_index = tm.makeDateIndex(10).tz_localize(tz='US/Eastern')
        self.period_index = tm.makePeriodIndex(10)
        self.string_index = tm.makeStringIndex(10)
@@ -137,10 +139,12 @@ class Ops(tm.TestCase):
        self.int_series = Series(arr, index=self.int_index)
        self.float_series = Series(arr, index=self.int_index)
        self.dt_series = Series(arr, index=self.dt_index)
        self.dt_tz_series = self.dt_tz_index.to_series(keep_tz=True)
        self.period_series = Series(arr, index=self.period_index)
        self.string_series = Series(arr, index=self.string_index)

        self.objs = [ getattr(self,"{0}_{1}".format(t,f)) for t in ['int','float','dt','period','string'] for f in ['index','series'] ]
        types = ['int','float','dt', 'dt_tz', 'period','string']
        self.objs = [ getattr(self,"{0}_{1}".format(t,f)) for t in types for f in ['index','series'] ]

    def check_ops_properties(self, props, filter=None, ignore_failures=False):
        for op in props:
@@ -193,7 +197,207 @@ class TestIndexOps(Ops):
            for o in self.objs:
                result = getattr(o,op)()
                expected = getattr(o.values,op)()
                try:
                    self.assertEqual(result, expected)
                except ValueError:
                    # comparing tz-aware series with np.array results in ValueError
                    expected = expected.astype('M8[ns]').astype('int64')
                    self.assertEqual(result.value, expected)

    def test_value_counts_unique_nunique(self):
        for o in self.objs:
            klass = type(o)
            values = o.values

            # create repeated values, 'n'th element is repeated by n+1 times
            if isinstance(o, PeriodIndex):
                # freq must be specified because repeat makes freq ambiguous
                o = klass(np.repeat(values, range(1, len(o) + 1)), freq=o.freq)
            else:
                o = klass(np.repeat(values, range(1, len(o) + 1)))

            expected_s = Series(range(10, 0, -1), index=values[::-1])
            tm.assert_series_equal(o.value_counts(), expected_s)

            if isinstance(o, DatetimeIndex):
                # DatetimeIndex.unique returns DatetimeIndex
                self.assert_(o.unique().equals(klass(values)))
            else:
                self.assert_numpy_array_equal(o.unique(), values)

            self.assertEqual(o.nunique(), len(np.unique(o.values)))

        for null_obj in [np.nan, None]:
            for o in self.objs:
                klass = type(o)
                values = o.values

                if o.values.dtype == 'int64':
                    # skips int64 because it doesn't allow to include nan or None
                    continue

                if o.values.dtype == 'datetime64[ns]' and _np_version_under1p7:
                    # Unable to assign None
                    continue

                values[0:2] = null_obj

                # create repeated values, 'n'th element is repeated by n+1 times
                if isinstance(o, PeriodIndex):
                    o = klass(np.repeat(values, range(1, len(o) + 1)), freq=o.freq)
                else:
                    o = klass(np.repeat(values, range(1, len(o) + 1)))

                if isinstance(o, DatetimeIndex):
                    # DatetimeIndex: nan is casted to Nat and included
                    expected_s = Series(list(range(10, 2, -1)) + [3], index=values[9:0:-1])
                else:
                    # nan is excluded
                    expected_s = Series(range(10, 2, -1), index=values[9:1:-1])
                tm.assert_series_equal(o.value_counts(), expected_s)

                # numpy_array_equal cannot compare arrays includes nan
                result = o.unique()
                self.assert_numpy_array_equal(result[1:], values[2:])
                if isinstance(o, DatetimeIndex):
                    self.assert_(result[0] is pd.NaT)
                else:
                    self.assert_(pd.isnull(result[0]))

                if isinstance(o, DatetimeIndex):
                    self.assertEqual(o.nunique(), 9)
                else:
                    self.assertEqual(o.nunique(), 8)

    def test_value_counts_inferred(self):
        klasses = [Index, Series]
        for klass in klasses:
            s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a']
            s = klass(s_values)
            expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c'])
            tm.assert_series_equal(s.value_counts(), expected)

            self.assert_numpy_array_equal(s.unique(), np.unique(s_values))
            self.assertEquals(s.nunique(), 4)

            # don't sort, have to sort after the fact as not sorting is platform-dep
            hist = s.value_counts(sort=False)
            hist.sort()
            expected = Series([3, 1, 4, 2], index=list('acbd'))
            expected.sort()
            tm.assert_series_equal(hist, expected)

            # sort ascending
            hist = s.value_counts(ascending=True)
            expected = Series([1, 2, 3, 4], index=list('cdab'))
            tm.assert_series_equal(hist, expected)

            # relative histogram.
            hist = s.value_counts(normalize=True)
            expected = Series([.4, .3, .2, .1], index=['b', 'a', 'd', 'c'])
            tm.assert_series_equal(hist, expected)

            # bins
            self.assertRaises(TypeError, lambda bins: s.value_counts(bins=bins), 1)
            s1 = Series([1, 1, 2, 3])
            res1 = s1.value_counts(bins=1)
            exp1 = Series({0.998: 4})
            tm.assert_series_equal(res1, exp1)

            res1n = s1.value_counts(bins=1, normalize=True)
            exp1n = Series({0.998: 1.0})
            tm.assert_series_equal(res1n, exp1n)

            self.assert_numpy_array_equal(s1.unique(), np.array([1, 2, 3]))
            self.assertEquals(s1.nunique(), 3)

            res4 = s1.value_counts(bins=4)
            exp4 = Series({0.998: 2, 1.5: 1, 2.0: 0, 2.5: 1}, index=[0.998, 2.5, 1.5, 2.0])
            tm.assert_series_equal(res4, exp4)

            res4n = s1.value_counts(bins=4, normalize=True)
            exp4n = Series({0.998: 0.5, 1.5: 0.25, 2.0: 0.0, 2.5: 0.25}, index=[0.998, 2.5, 1.5, 2.0])
            tm.assert_series_equal(res4n, exp4n)

            # handle NA's properly
            s_values = ['a', 'b', 'b', 'b', np.nan, np.nan, 'd', 'd', 'a', 'a', 'b']
            s = klass(s_values)
            expected = Series([4, 3, 2], index=['b', 'a', 'd'])
            tm.assert_series_equal(s.value_counts(), expected)

            self.assert_numpy_array_equal(s.unique(), np.array(['a', 'b', np.nan, 'd'], dtype='O'))
            self.assertEquals(s.nunique(), 3)

            s = klass({})
            expected = Series([], dtype=np.int64)
            tm.assert_series_equal(s.value_counts(), expected)
            self.assert_numpy_array_equal(s.unique(), np.array([]))
            self.assertEquals(s.nunique(), 0)

            # GH 3002, datetime64[ns]
            txt = "\n".join(['xxyyzz20100101PIE', 'xxyyzz20100101GUM', 'xxyyzz20100101EGG',
                             'xxyyww20090101EGG', 'foofoo20080909PIE', 'foofoo20080909GUM'])
            f = StringIO(txt)
            df = pd.read_fwf(f, widths=[6, 8, 3], names=["person_id", "dt", "food"],
                             parse_dates=["dt"])

            s = klass(df['dt'].copy())
            idx = pd.to_datetime(['2010-01-01 00:00:00Z', '2008-09-09 00:00:00Z', '2009-01-01 00:00:00X'])
            expected_s = Series([3, 2, 1], index=idx)
            tm.assert_series_equal(s.value_counts(), expected_s)

            expected = np.array(['2010-01-01 00:00:00Z', '2009-01-01 00:00:00Z', '2008-09-09 00:00:00Z'],
                                dtype='datetime64[ns]')
            if isinstance(s, DatetimeIndex):
                expected = DatetimeIndex(expected)
                self.assert_(s.unique().equals(expected))
            else:
                self.assert_numpy_array_equal(s.unique(), expected)

            self.assertEquals(s.nunique(), 3)

            # with NaT
            s = df['dt'].copy()
            s = klass([v for v in s.values] + [pd.NaT])

            result = s.value_counts()
            self.assertEqual(result.index.dtype, 'datetime64[ns]')
            expected_s[pd.NaT] = 1
            tm.assert_series_equal(result, expected_s)

            unique = s.unique()
            self.assertEqual(unique.dtype, 'datetime64[ns]')
            # numpy_array_equal cannot compare pd.NaT
            self.assert_numpy_array_equal(unique[:3], expected)
            self.assertTrue(unique[3] is pd.NaT or unique[3].astype('int64') == pd.tslib.iNaT)

            self.assertEquals(s.nunique(), 4)

            # timedelta64[ns]
            td = df.dt - df.dt + timedelta(1)
            td = klass(td)

            result = td.value_counts()
            expected_s = Series([6], index=[86400000000000])
            self.assertEqual(result.index.dtype, 'int64')
            tm.assert_series_equal(result, expected_s)

            # get nanoseconds to compare
            expected = np.array([86400000000000])
            self.assert_numpy_array_equal(td.unique(), expected)
            self.assertEquals(td.nunique(), 1)

            td2 = timedelta(1) + (df.dt - df.dt)
            td2 = klass(td2)
            result2 = td2.value_counts()
            self.assertEqual(result2.index.dtype, 'int64')
            tm.assert_series_equal(result2, expected_s)
            self.assert_numpy_array_equal(td.unique(), expected)
            self.assertEquals(td.nunique(), 1)

class TestDatetimeIndexOps(Ops):
    _allowed = '_allow_datetime_index_ops'
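
The ``bins`` cases above lean on ``pd.cut``: values are grouped into equal-width half-open bins and the result is keyed by bin edges, which is why a left edge of 0.998 shows up for data spanning 1 to 3. A sketch of what those assertions exercise (outputs illustrative):

    import pandas as pd

    s1 = pd.Series([1, 1, 2, 3])
    s1.value_counts(bins=1)   # a single bin holding all 4 values
    s1.value_counts(bins=4)   # four equal-width bins with counts 2, 1, 0, 1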

@@ -3805,84 +3805,7 @@ class TestSeries(tm.TestCase, CheckNameIntegration):
        self.assertRaises(ValueError, a.dot, b.T)

    def test_value_counts_nunique(self):
        s = Series(['a', 'b', 'b', 'b', 'b', 'a', 'c', 'd', 'd', 'a'])

        hist = s.value_counts()
        expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c'])
        assert_series_equal(hist, expected)

        # don't sort, have to sort after the fact as not sorting is platform-dep
        hist = s.value_counts(sort=False)
        hist.sort()
        expected = Series([3, 1, 4, 2], index=list('acbd'))
        expected.sort()
        assert_series_equal(hist, expected)

        # sort ascending
        hist = s.value_counts(ascending=True)
        expected = Series([1, 2, 3, 4], index=list('cdab'))
        assert_series_equal(hist, expected)

        # relative histogram.
        hist = s.value_counts(normalize=True)
        expected = Series([.4, .3, .2, .1], index=['b', 'a', 'd', 'c'])
        assert_series_equal(hist, expected)

        self.assertEquals(s.nunique(), 4)

        # bins
        self.assertRaises(TypeError, lambda bins: s.value_counts(bins=bins), 1)
        s1 = Series([1, 1, 2, 3])
        res1 = s1.value_counts(bins=1)
        exp1 = Series({0.998: 4})
        assert_series_equal(res1, exp1)

        res1n = s1.value_counts(bins=1, normalize=True)
        exp1n = Series({0.998: 1.0})
        assert_series_equal(res1n, exp1n)

        res4 = s1.value_counts(bins=4)
        exp4 = Series({0.998: 2, 1.5: 1, 2.0: 0, 2.5: 1}, index=[0.998, 2.5, 1.5, 2.0])
        assert_series_equal(res4, exp4)

        res4n = s1.value_counts(bins=4, normalize=True)
        exp4n = Series({0.998: 0.5, 1.5: 0.25, 2.0: 0.0, 2.5: 0.25}, index=[0.998, 2.5, 1.5, 2.0])
        assert_series_equal(res4n, exp4n)

        # handle NA's properly
        s[5:7] = np.nan
        hist = s.value_counts()
        expected = s.dropna().value_counts()
        assert_series_equal(hist, expected)

        s = Series({})
        hist = s.value_counts()
        expected = Series([], dtype=np.int64)
        assert_series_equal(hist, expected)

        # GH 3002, datetime64[ns]
        import pandas as pd
        f = StringIO(
            "xxyyzz20100101PIE\nxxyyzz20100101GUM\nxxyyww20090101EGG\nfoofoo20080909PIE")
        df = pd.read_fwf(f, widths=[6, 8, 3], names=[
            "person_id", "dt", "food"], parse_dates=["dt"])
        s = df.dt.copy()
        result = s.value_counts()
        self.assertEqual(result.index.dtype, 'datetime64[ns]')

        # with NaT
        s = s.append(Series({4: pd.NaT}))
        result = s.value_counts()
        self.assertEqual(result.index.dtype, 'datetime64[ns]')

        # timedelta64[ns]
        from datetime import timedelta
        td = df.dt - df.dt + timedelta(1)
        td2 = timedelta(1) + (df.dt - df.dt)
        result = td.value_counts()
        result2 = td2.value_counts()
        #self.assertEqual(result.index.dtype, 'timedelta64[ns]')
        self.assertEqual(result.index.dtype, 'int64')
        self.assertEqual(result2.index.dtype, 'int64')

        # basics.rst doc example
        series = Series(np.random.randn(500))
        series[20:500] = np.nan
@@ -3909,25 +3832,7 @@ class TestSeries(tm.TestCase, CheckNameIntegration):
        result = s.unique()
        self.assertEqual(len(result), 2)

        # integers
        s = Series(np.random.randint(0, 100, size=100))
        result = np.sort(s.unique())
        expected = np.unique(s.values)
        self.assert_numpy_array_equal(result, expected)

        s = Series(np.random.randint(0, 100, size=100).astype(np.int32))
        result = np.sort(s.unique())
        expected = np.unique(s.values)
        self.assert_numpy_array_equal(result, expected)

        # test string arrays for coverage
        strings = np.tile(np.array([tm.rands(10) for _ in range(10)]), 10)
        result = np.sort(nanops.unique1d(strings))
        expected = np.unique(strings)
        self.assert_numpy_array_equal(result, expected)

        # decision about None
        s = Series([1, 2, 3, None, None, None], dtype=object)
        result = s.unique()
        expected = np.array([1, 2, 3, None], dtype=object)
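
The "basics.rst doc example" retained at the end of the first hunk checks ``nunique`` on mostly-NaN data; a sketch of the idea (the assertion itself falls outside the hunk shown above, so the exact values here are illustrative):

    import numpy as np
    import pandas as pd

    series = pd.Series(np.random.randn(500))
    series[20:500] = np.nan   # NaNs are ignored by nunique
    series[10:20] = 5000      # one repeated value
    series.nunique()          # 11: ten distinct random draws plus 5000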

@@ -1487,6 +1487,17 @@ class TestPeriodIndex(tm.TestCase):
        expected = ts[idx == 2007]
        assert_series_equal(result, expected)

    def test_index_unique(self):
        idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq='A-JUN')
        expected = PeriodIndex([2000, 2007, 2009], freq='A-JUN')
        self.assert_numpy_array_equal(idx.unique(), expected.values)
        self.assertEqual(idx.nunique(), 3)

        idx = PeriodIndex([2000, 2007, 2007, 2009, 2007], freq='A-JUN', tz='US/Eastern')
        expected = PeriodIndex([2000, 2007, 2009], freq='A-JUN', tz='US/Eastern')
        self.assert_numpy_array_equal(idx.unique(), expected.values)
        self.assertEqual(idx.nunique(), 3)

    def test_constructor(self):
        pi = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009')
        assert_equal(len(pi), 9)

@@ -77,7 +77,11 @@ class TestTimeSeriesDuplicates(tm.TestCase):
    def test_index_unique(self):
        uniques = self.dups.index.unique()
        expected = DatetimeIndex([datetime(2000, 1, 2), datetime(2000, 1, 3),
                                  datetime(2000, 1, 4), datetime(2000, 1, 5)])
        self.assertEqual(uniques.dtype, 'M8[ns]')  # sanity
        self.assert_(uniques.equals(expected))
        self.assertEqual(self.dups.index.nunique(), 4)

        # #2563
        self.assertTrue(isinstance(uniques, DatetimeIndex))

@@ -85,8 +89,21 @@ class TestTimeSeriesDuplicates(tm.TestCase):
        dups_local = self.dups.index.tz_localize('US/Eastern')
        dups_local.name = 'foo'
        result = dups_local.unique()
        expected = DatetimeIndex(expected, tz='US/Eastern')
        self.assertTrue(result.tz is not None)
        self.assertEquals(result.name, 'foo')
        self.assert_(result.equals(expected))

        # NaT
        arr = [ 1370745748 + t for t in range(20) ] + [iNaT]
        idx = DatetimeIndex(arr * 3)
        self.assert_(idx.unique().equals(DatetimeIndex(arr)))
        self.assertEqual(idx.nunique(), 21)

        arr = [ Timestamp('2013-06-09 02:42:28') + timedelta(seconds=t) for t in range(20) ] + [NaT]
        idx = DatetimeIndex(arr * 3)
        self.assert_(idx.unique().equals(DatetimeIndex(arr)))
        self.assertEqual(idx.nunique(), 21)

    def test_index_dupes_contains(self):
        d = datetime(2011, 12, 5, 20, 30)
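
Per the NaT tests above, ``DatetimeIndex.unique`` preserves NaT and ``nunique`` counts it (20 distinct timestamps plus NaT gives 21). A condensed sketch (sample data illustrative):

    import pandas as pd

    idx = pd.DatetimeIndex(['2013-06-09', '2013-06-09', pd.NaT])
    idx.unique()    # DatetimeIndex(['2013-06-09', NaT])
    idx.nunique()   # 2 -- NaT is counted here, matching the tests above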
