[Backport #12745] PERF: Improve replace perf

When ``.replace`` is called with a
`dict`, replacements are done per value. The current implementation tries
to soft-convert the dtype after every replacement, but it is enough to do
so only after the final replacement.

Author: sinhrks <sinhrks@gmail.com>

Closes #12745 from sinhrks/replace_perf and squashes the following commits:

ffc59b0 [sinhrks] PERF: Improve replace perf

(cherry picked from commit e299560dff)
This commit is contained in:
sinhrks 2016-11-30 06:44:52 -05:00 committed by Joris Van den Bossche
parent 560aded980
commit 7479d4185f
4 changed files with 52 additions and 13 deletions

View File

@ -32,6 +32,30 @@ class replace_large_dict(object):
self.s.replace(self.to_rep, inplace=True)
class replace_convert(object):
    """Benchmark ``replace`` driven by a dict of int -> datetime-like values.

    Each ``time_*`` method replaces integer data using a mapping with
    ``self.n`` entries whose values are ``Timestamp`` or ``Timedelta``
    objects, once for a ``Series`` and once for a two-column ``DataFrame``.
    """

    goal_time = 0.5

    def setup(self):
        """Build the replacement mappings and the random integer inputs."""
        self.n = 10 ** 3
        # One mapping entry per integer in range(self.n); the value is the
        # Timestamp / Timedelta constructed from that same integer.
        self.to_ts = {i: pd.Timestamp(i) for i in range(self.n)}
        self.to_td = {i: pd.Timedelta(i) for i in range(self.n)}
        # randint(self.n) draws from [0, self.n), so every drawn value is a
        # key of the mappings above.
        self.s = Series(np.random.randint(self.n, size=10 ** 3))
        self.df = DataFrame({'A': np.random.randint(self.n, size=10 ** 3),
                             'B': np.random.randint(self.n, size=10 ** 3)})

    def time_replace_series_timestamp(self):
        """Time ``Series.replace`` with the int -> Timestamp mapping."""
        self.s.replace(self.to_ts)

    def time_replace_series_timedelta(self):
        """Time ``Series.replace`` with the int -> Timedelta mapping."""
        self.s.replace(self.to_td)

    def time_replace_frame_timestamp(self):
        """Time ``DataFrame.replace`` with the int -> Timestamp mapping."""
        self.df.replace(self.to_ts)

    def time_replace_frame_timedelta(self):
        """Time ``DataFrame.replace`` with the int -> Timedelta mapping."""
        self.df.replace(self.to_td)
class replace_replacena(object):
goal_time = 0.2

View File

@ -21,6 +21,7 @@ Highlights include:
Performance Improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Improved performance of ``.replace()`` (:issue:`12745`)
.. _whatsnew_0192.bug_fixes:

View File

@ -3477,20 +3477,27 @@ class NDFrame(PandasObject):
res = self if inplace else self.copy()
for c, src in compat.iteritems(to_replace):
if c in value and c in self:
# object conversion is handled in
# series.replace which is called recursively
res[c] = res[c].replace(to_replace=src,
value=value[c],
inplace=False, regex=regex)
inplace=False,
regex=regex)
return None if inplace else res
# {'A': NA} -> 0
elif not is_list_like(value):
for k, src in compat.iteritems(to_replace):
if k in self:
new_data = new_data.replace(to_replace=src,
value=value,
filter=[k],
inplace=inplace,
regex=regex)
keys = [(k, src) for k, src in compat.iteritems(to_replace)
if k in self]
keys_len = len(keys) - 1
for i, (k, src) in enumerate(keys):
convert = i == keys_len
new_data = new_data.replace(to_replace=src,
value=value,
filter=[k],
inplace=inplace,
regex=regex,
convert=convert)
else:
raise TypeError('value argument must be scalar, dict, or '
'Series')

View File

@ -622,7 +622,6 @@ class Block(PandasObject):
original_to_replace = to_replace
mask = isnull(self.values)
# try to replace, if we raise an error, convert to ObjectBlock and
# retry
try:
@ -1794,13 +1793,14 @@ class BoolBlock(NumericBlock):
return issubclass(value.dtype.type, np.bool_)
def replace(self, to_replace, value, inplace=False, filter=None,
regex=False, mgr=None):
regex=False, convert=True, mgr=None):
to_replace_values = np.atleast_1d(to_replace)
if not np.can_cast(to_replace_values, bool):
return self
return super(BoolBlock, self).replace(to_replace, value,
inplace=inplace, filter=filter,
regex=regex, mgr=mgr)
regex=regex, convert=convert,
mgr=mgr)
class ObjectBlock(Block):
@ -3213,6 +3213,7 @@ class BlockManager(PandasObject):
masks = [comp(s) for i, s in enumerate(src_list)]
result_blocks = []
src_len = len(src_list) - 1
for blk in self.blocks:
# it's possible to get multiple result blocks here
@ -3222,8 +3223,9 @@ class BlockManager(PandasObject):
new_rb = []
for b in rb:
if b.dtype == np.object_:
convert = i == src_len
result = b.replace(s, d, inplace=inplace, regex=regex,
mgr=mgr)
mgr=mgr, convert=convert)
new_rb = _extend_blocks(result, new_rb)
else:
# get our mask for this element, sized to this
@ -4787,7 +4789,12 @@ def _putmask_smart(v, m, n):
# change the dtype
dtype, _ = _maybe_promote(n.dtype)
nv = v.astype(dtype)
if is_extension_type(v.dtype) and is_object_dtype(dtype):
nv = v.get_values(dtype)
else:
nv = v.astype(dtype)
try:
nv[m] = n[m]
except ValueError: