Merge master branch into GH13936

Christopher C. Aycock 2016-12-14 13:53:32 -05:00
commit 77eb47baa4
28 changed files with 1151 additions and 343 deletions

View File

@ -1,5 +1,5 @@
from .pandas_vb_common import *
from pandas.core.reshape import melt
from pandas.core.reshape import melt, wide_to_long
class melt_dataframe(object):
@ -74,3 +74,25 @@ class unstack_sparse_keyspace(object):
def time_unstack_sparse_keyspace(self):
self.idf.unstack()
class wide_to_long_big(object):
    goal_time = 0.2

    def setup(self):
        vars = 'ABCD'
        nyrs = 20
        nidvars = 20
        N = 5000
        yrvars = []
        for var in vars:
            for yr in range(1, nyrs + 1):
                yrvars.append(var + str(yr))
        self.df = pd.DataFrame(np.random.randn(N, nidvars + len(yrvars)),
                               columns=list(range(nidvars)) + yrvars)
        self.vars = vars

    def time_wide_to_long_big(self):
        self.df['id'] = self.df.index
        wide_to_long(self.df, list(self.vars), i='id', j='year')
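
For readers unfamiliar with the reshape being benchmarked, here is a minimal sketch of what ``wide_to_long`` does with data shaped like the benchmark's (the column names and sizes below are illustrative, not the benchmark's exact values)::

    import numpy as np
    import pandas as pd

    # Two stubs ('A', 'B'), each with two yearly suffixes, plus an id column.
    df = pd.DataFrame(np.random.randn(3, 4), columns=['A1', 'A2', 'B1', 'B2'])
    df['id'] = df.index

    # Stack the suffix into a 'year' index level: one row per (id, year).
    long_df = pd.wide_to_long(df, ['A', 'B'], i='id', j='year')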

View File

@ -157,6 +157,7 @@ Data manipulations
concat
get_dummies
factorize
wide_to_long
Top-level missing data
~~~~~~~~~~~~~~~~~~~~~~

View File

@ -486,7 +486,9 @@ standard deviation 1), very concisely:
xs_stand.std(1)
Note that methods like :meth:`~DataFrame.cumsum` and :meth:`~DataFrame.cumprod`
preserve the location of NA values:
preserve the location of ``NaN`` values. This is somewhat different from
:meth:`~DataFrame.expanding` and :meth:`~DataFrame.rolling`.
For more details please see :ref:`this note <stats.moments.expanding.note>`.
.. ipython:: python

View File

@ -691,6 +691,8 @@ Method Summary
:meth:`~Expanding.cov`, Unbiased covariance (binary)
:meth:`~Expanding.corr`, Correlation (binary)
.. currentmodule:: pandas
Aside from not having a ``window`` parameter, these functions have the same
interfaces as their ``.rolling`` counterparts. Like above, the parameters they
all accept are:
@ -700,18 +702,37 @@ all accept are:
``min_periods`` non-null data points have been seen.
- ``center``: boolean, whether to set the labels at the center (default is False)
.. _stats.moments.expanding.note:
.. note::
The output of the ``.rolling`` and ``.expanding`` methods does not return a
``NaN`` if there are at least ``min_periods`` non-null values in the current
window. This differs from ``cumsum``, ``cumprod``, ``cummax``, and
``cummin``, which return ``NaN`` in the output wherever a ``NaN`` is
encountered in the input.
window. For example,
.. ipython:: python
sn = pd.Series([1, 2, np.nan, 3, np.nan, 4])
sn
sn.rolling(2).max()
sn.rolling(2, min_periods=1).max()
In case of expanding functions, this differs from :meth:`~DataFrame.cumsum`,
:meth:`~DataFrame.cumprod`, :meth:`~DataFrame.cummax`,
and :meth:`~DataFrame.cummin`, which return ``NaN`` in the output wherever
a ``NaN`` is encountered in the input. In order to match the output of ``cumsum``
with ``expanding``, use :meth:`~DataFrame.fillna`:
.. ipython:: python
sn.expanding().sum()
sn.cumsum()
sn.cumsum().fillna(method='ffill')
An expanding window statistic will be more stable (and less responsive) than
its rolling window counterpart as the increasing window size decreases the
relative impact of an individual data point. As an example, here is the
:meth:`~Expanding.mean` output for the previous time series dataset:
:meth:`~core.window.Expanding.mean` output for the previous time series dataset:
.. ipython:: python
:suppress:
@ -731,13 +752,14 @@ relative impact of an individual data point. As an example, here is the
Exponentially Weighted Windows
------------------------------
.. currentmodule:: pandas.core.window
A related set of functions are exponentially weighted versions of several of
the above statistics. A similar interface to ``.rolling`` and ``.expanding`` is accessed
thru the ``.ewm`` method to receive an :class:`~pandas.core.window.EWM` object.
through the ``.ewm`` method to receive an :class:`~EWM` object.
A number of expanding EW (exponentially weighted)
methods are provided:
.. currentmodule:: pandas.core.window
.. csv-table::
:header: "Function", "Description"

View File

@ -867,6 +867,12 @@ data columns:
index_col=0) #index is the nominal column
df
.. note::
If a column or index contains an unparseable date, the entire column or
index will be returned unaltered as an object data type. For non-standard
datetime parsing, use :func:`to_datetime` after ``pd.read_csv``.
.. note::
read_csv has a fast_path for parsing datetime strings in iso8601 format,
e.g "2000-01-01T00:01:02+00:00" and similar variations. If you can arrange

View File

@ -18,6 +18,8 @@ What's New
These are new features and improvements of note in each release.
.. include:: whatsnew/v0.20.0.txt
.. include:: whatsnew/v0.19.2.txt
.. include:: whatsnew/v0.19.1.txt

View File

@ -58,4 +58,4 @@ Bug Fixes
- Bug in ``df.groupby`` causing an ``AttributeError`` when grouping a single index frame by a column and the index level (:issue:`14327`)
- Bug in ``df.groupby`` where ``TypeError`` raised when ``pd.Grouper(key=...)`` is passed in a list (:issue:`14334`)
- Bug in ``pd.pivot_table`` may raise ``TypeError`` or ``ValueError`` when ``index`` or ``columns``
is not scalar and ``values`` is not specified (:issue:`14380`)
is not scalar and ``values`` is not specified (:issue:`14380`)

View File

@ -23,6 +23,16 @@ Performance Improvements
- Improved performance of ``.replace()`` (:issue:`12745`)
.. _whatsnew_0192.enhancements.other:
Other enhancements
^^^^^^^^^^^^^^^^^^
- ``pd.merge_asof()`` gained ``left_index``/``right_index`` and ``left_by``/``right_by`` arguments (:issue:`14253`)
- ``pd.merge_asof()`` can take multiple columns in ``by`` parameter and has specialized dtypes for better performance (:issue:`13936`)
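
A rough sketch of the new arguments; the frames and column names below are invented for illustration (``left_by``/``right_by`` work the same way when the grouping columns are named differently on each side, and ``left_index``/``right_index`` match on the index instead of a column)::

    import pandas as pd

    trades = pd.DataFrame({'time': pd.to_datetime(['2016-01-01 10:00:01',
                                                   '2016-01-01 10:00:03']),
                           'ticker': ['A', 'B'],
                           'price': [10.0, 20.0]})
    quotes = pd.DataFrame({'time': pd.to_datetime(['2016-01-01 10:00:00',
                                                   '2016-01-01 10:00:02']),
                           'ticker': ['A', 'B'],
                           'bid': [9.9, 19.9]})

    # 'by' restricts the asof match to rows sharing the same ticker.
    pd.merge_asof(trades, quotes, on='time', by='ticker')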
.. _whatsnew_0192.bug_fixes:
Bug Fixes
@ -82,11 +92,3 @@ Bug Fixes
- Bug in ``unstack()`` if called with a list of column(s) as an argument, regardless of the dtypes of all columns, they get coerced to ``object`` (:issue:`11847`)
.. _whatsnew_0192.enhancements.other:
Other enhancements
^^^^^^^^^^^^^^^^^^
- ``pd.merge_asof()`` can take multiple columns in ``by`` parameter and has specialized dtypes for better performance (:issue:`13936`)

View File

@ -52,6 +52,9 @@ Other enhancements
- ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`)
- Multiple offset aliases with decimal points are now supported (e.g. '0.5min' is parsed as '30s') (:issue:`8419`)
- New ``UnsortedIndexError`` (subclass of ``KeyError``) raised when indexing/slicing into an
unsorted MultiIndex (:issue:`11897`). This allows differentiation between errors due to lack
of sorting or an incorrect key. See :ref:`here <advanced.unsorted>`
@ -61,6 +64,8 @@ Other enhancements
- The ``usecols`` argument in ``pd.read_csv`` now accepts a callable function as a value (:issue:`14154`)
- ``pd.DataFrame.plot`` now prints a title above each subplot if ``subplots=True`` and ``title`` is a list of strings (:issue:`14753`)
- ``pd.Series.interpolate`` now supports timedelta as an index type with ``method='time'`` (:issue:`6424`)
- ``pandas.io.json.json_normalize()`` gained the option ``errors='ignore'|'raise'``; the default is ``errors='raise'`` which is backward compatible. (:issue:`14583`)
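
A small sketch of the new ``errors`` option; the records below are made up, and with ``errors='ignore'`` a meta key that is missing from some records becomes ``NaN`` instead of raising ``KeyError``::

    from pandas.io.json import json_normalize

    data = [{'id': 1, 'info': {'name': 'a'}, 'values': [{'x': 1}]},
            {'id': 2, 'values': [{'x': 2}]}]        # 'info' missing here

    # errors='raise' (the default) would fail on the second record.
    flat = json_normalize(data, record_path='values',
                          meta=['id', ['info', 'name']],
                          errors='ignore')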
.. _whatsnew_0200.api_breaking:
@ -111,6 +116,7 @@ Removal of prior version deprecations/changes
Performance Improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Improved performance of ``pd.wide_to_long()`` (:issue:`14779`)

View File

@ -3354,12 +3354,16 @@ class NDFrame(PandasObject):
return self._constructor(new_data).__finalize__(self)
def ffill(self, axis=None, inplace=False, limit=None, downcast=None):
"""Synonym for NDFrame.fillna(method='ffill')"""
"""
Synonym for :meth:`DataFrame.fillna(method='ffill') <DataFrame.fillna>`
"""
return self.fillna(method='ffill', axis=axis, inplace=inplace,
limit=limit, downcast=downcast)
def bfill(self, axis=None, inplace=False, limit=None, downcast=None):
"""Synonym for NDFrame.fillna(method='bfill')"""
"""
Synonym for :meth:`DataFrame.fillna(method='bfill') <DataFrame.fillna>`
"""
return self.fillna(method='bfill', axis=axis, inplace=inplace,
limit=limit, downcast=downcast)
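
As the reworked docstrings state, ``ffill`` and ``bfill`` are thin wrappers around ``fillna``; a minimal sketch of the equivalence (the frame is illustrative)::

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'x': [1.0, np.nan, np.nan, 4.0]})

    # ffill() forward-propagates the last valid observation ...
    assert df.ffill().equals(df.fillna(method='ffill'))
    # ... and bfill() likewise mirrors fillna(method='bfill').
    assert df.bfill().equals(df.fillna(method='bfill'))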
@ -5477,16 +5481,18 @@ class NDFrame(PandasObject):
cls.cummin = _make_cum_function(
cls, 'cummin', name, name2, axis_descr, "cumulative minimum",
lambda y, axis: np.minimum.accumulate(y, axis), np.inf, np.nan)
lambda y, axis: np.minimum.accumulate(y, axis), "min",
np.inf, np.nan)
cls.cumsum = _make_cum_function(
cls, 'cumsum', name, name2, axis_descr, "cumulative sum",
lambda y, axis: y.cumsum(axis), 0., np.nan)
lambda y, axis: y.cumsum(axis), "sum", 0., np.nan)
cls.cumprod = _make_cum_function(
cls, 'cumprod', name, name2, axis_descr, "cumulative product",
lambda y, axis: y.cumprod(axis), 1., np.nan)
lambda y, axis: y.cumprod(axis), "prod", 1., np.nan)
cls.cummax = _make_cum_function(
cls, 'cummax', name, name2, axis_descr, "cumulative max",
lambda y, axis: np.maximum.accumulate(y, axis), -np.inf, np.nan)
lambda y, axis: np.maximum.accumulate(y, axis), "max",
-np.inf, np.nan)
cls.sum = _make_stat_function(
cls, 'sum', name, name2, axis_descr,
@ -5674,7 +5680,15 @@ skipna : boolean, default True
Returns
-------
%(outname)s : %(name1)s\n"""
%(outname)s : %(name1)s\n
See also
--------
pandas.core.window.Expanding.%(accum_func_name)s : Similar functionality
but ignores ``NaN`` values.
"""
def _make_stat_function(cls, name, name1, name2, axis_descr, desc, f):
@ -5717,10 +5731,10 @@ def _make_stat_function_ddof(cls, name, name1, name2, axis_descr, desc, f):
return set_function_name(stat_func, name, cls)
def _make_cum_function(cls, name, name1, name2, axis_descr, desc, accum_func,
mask_a, mask_b):
def _make_cum_function(cls, name, name1, name2, axis_descr, desc,
accum_func, accum_func_name, mask_a, mask_b):
@Substitution(outname=name, desc=desc, name1=name1, name2=name2,
axis_descr=axis_descr)
axis_descr=axis_descr, accum_func_name=accum_func_name)
@Appender("Return {0} over requested axis.".format(desc) +
_cnum_doc)
def cum_func(self, axis=None, skipna=True, *args, **kwargs):
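
The new ``See also`` entry points at the expanding counterparts because they treat ``NaN`` differently from the cumulative functions; a short sketch of that difference::

    import numpy as np
    import pandas as pd

    s = pd.Series([1, 2, np.nan, 4])

    s.cumsum()           # 1.0, 3.0, NaN, 7.0 -- the NaN is propagated
    s.expanding().sum()  # 1.0, 3.0, 3.0, 7.0 -- the NaN is skipped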

View File

@ -3,6 +3,7 @@
from pandas.compat import range, zip
from pandas import compat
import itertools
import re
import numpy as np
@ -877,29 +878,55 @@ def lreshape(data, groups, dropna=True, label=None):
return DataFrame(mdata, columns=id_cols + pivot_cols)
def wide_to_long(df, stubnames, i, j):
"""
def wide_to_long(df, stubnames, i, j, sep="", suffix='\d+'):
r"""
Wide panel to long format. Less flexible but more user-friendly than melt.
With stubnames ['A', 'B'], this function expects to find one or more
groups of columns with format Asuffix1, Asuffix2,..., Bsuffix1, Bsuffix2,...
You specify what you want to call this suffix in the resulting long format
with `j` (for example `j='year'`).
Each row of these wide variables is assumed to be uniquely identified by
`i` (can be a single column name or a list of column names).
All remaining variables in the data frame are left intact.
Parameters
----------
df : DataFrame
The wide-format DataFrame
stubnames : list
A list of stub names. The wide format variables are assumed to
stubnames : str or list-like
The stub name(s). The wide format variables are assumed to
start with the stub names.
i : str
The name of the id variable.
i : str or list-like
Column(s) to use as id variable(s)
j : str
The name of the subobservation variable.
stubend : str
Regex to match for the end of the stubs.
The name of the subobservation variable, i.e. what you wish to name
the suffix in the long format.
sep : str, default ""
A character indicating the separation of the variable names
in the wide format, to be stripped from the names in the long format.
For example, if your column names are A-suffix1, A-suffix2, you
can strip the hyphen by specifying `sep='-'`
.. versionadded:: 0.20.0
suffix : str, default '\\d+'
A regular expression capturing the wanted suffixes. '\\d+' captures
numeric suffixes. Suffixes with no numbers could be specified with the
negated character class '\\D+'. You can also further disambiguate
suffixes, for example, if your wide variables are of the form
Aone, Btwo,..., and you have an unrelated column Arating, you can
ignore the last one by specifying `suffix='(one|two)'`
.. versionadded:: 0.20.0
Returns
-------
DataFrame
A DataFrame that contains each stub name as a variable as well as
variables for i and j.
A DataFrame that contains each stub name as a variable, with a new
index (i, j).
Examples
--------
@ -918,7 +945,7 @@ def wide_to_long(df, stubnames, i, j):
0 a d 2.5 3.2 -1.085631 0
1 b e 1.2 1.3 0.997345 1
2 c f 0.7 0.1 0.282978 2
>>> wide_to_long(df, ["A", "B"], i="id", j="year")
>>> pd.wide_to_long(df, ["A", "B"], i="id", j="year")
X A B
id year
0 1970 -1.085631 a 2.5
@ -928,38 +955,151 @@ def wide_to_long(df, stubnames, i, j):
1 1980 0.997345 e 1.3
2 1980 0.282978 f 0.1
With multiple id columns
>>> df = pd.DataFrame({
... 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
... 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
... 'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
... 'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
... })
>>> df
birth famid ht1 ht2
0 1 1 2.8 3.4
1 2 1 2.9 3.8
2 3 1 2.2 2.9
3 1 2 2.0 3.2
4 2 2 1.8 2.8
5 3 2 1.9 2.4
6 1 3 2.2 3.3
7 2 3 2.3 3.4
8 3 3 2.1 2.9
>>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age')
>>> l
ht
famid birth age
1 1 1 2.8
2 3.4
2 1 2.9
2 3.8
3 1 2.2
2 2.9
2 1 1 2.0
2 3.2
2 1 1.8
2 2.8
3 1 1.9
2 2.4
3 1 1 2.2
2 3.3
2 1 2.3
2 3.4
3 1 2.1
2 2.9
Going from long back to wide just takes some creative use of `unstack`
>>> w = l.reset_index().set_index(['famid', 'birth', 'age']).unstack()
>>> w.columns = pd.Index(w.columns).str.join('')
>>> w.reset_index()
famid birth ht1 ht2
0 1 1 2.8 3.4
1 1 2 2.9 3.8
2 1 3 2.2 2.9
3 2 1 2.0 3.2
4 2 2 1.8 2.8
5 2 3 1.9 2.4
6 3 1 2.2 3.3
7 3 2 2.3 3.4
8 3 3 2.1 2.9
Less wieldy column names are also handled
>>> df = pd.DataFrame({'A(quarterly)-2010': np.random.rand(3),
... 'A(quarterly)-2011': np.random.rand(3),
... 'B(quarterly)-2010': np.random.rand(3),
... 'B(quarterly)-2011': np.random.rand(3),
... 'X' : np.random.randint(3, size=3)})
>>> df['id'] = df.index
>>> df
A(quarterly)-2010 A(quarterly)-2011 B(quarterly)-2010 B(quarterly)-2011
0 0.531828 0.724455 0.322959 0.293714
1 0.634401 0.611024 0.361789 0.630976
2 0.849432 0.722443 0.228263 0.092105
\
X id
0 0 0
1 1 1
2 2 2
>>> pd.wide_to_long(df, ['A(quarterly)', 'B(quarterly)'],
i='id', j='year', sep='-')
X A(quarterly) B(quarterly)
id year
0 2010 0 0.531828 0.322959
1 2010 2 0.634401 0.361789
2 2010 2 0.849432 0.228263
0 2011 0 0.724455 0.293714
1 2011 2 0.611024 0.630976
2 2011 2 0.722443 0.092105
If we have many columns, we could also use a regex to find our
stubnames and pass that list on to wide_to_long
>>> stubnames = set([match[0] for match in
...                  df.columns.str.findall('[A-B]\(.*\)').values
...                  if match != [] ])
>>> list(stubnames)
['B(quarterly)', 'A(quarterly)']
Notes
-----
All extra variables are treated as extra id variables. This simply uses
All extra variables are left untouched. This simply uses
`pandas.melt` under the hood, but is hard-coded to "do the right thing"
in a typical case.
"""
def get_var_names(df, regex):
def get_var_names(df, stub, sep, suffix):
regex = "^{0}{1}{2}".format(re.escape(stub), re.escape(sep), suffix)
return df.filter(regex=regex).columns.tolist()
def melt_stub(df, stub, i, j):
varnames = get_var_names(df, "^" + stub)
newdf = melt(df, id_vars=i, value_vars=varnames, value_name=stub,
var_name=j)
newdf_j = newdf[j].str.replace(stub, "")
try:
newdf_j = newdf_j.astype(int)
except ValueError:
pass
newdf[j] = newdf_j
return newdf
def melt_stub(df, stub, i, j, value_vars, sep):
newdf = melt(df, id_vars=i, value_vars=value_vars,
value_name=stub.rstrip(sep), var_name=j)
newdf[j] = Categorical(newdf[j])
newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "")
id_vars = get_var_names(df, "^(?!%s)" % "|".join(stubnames))
if i not in id_vars:
id_vars += [i]
return newdf.set_index(i + [j])
newdf = melt_stub(df, stubnames[0], id_vars, j)
if any(map(lambda s: s in df.columns.tolist(), stubnames)):
raise ValueError("stubname can't be identical to a column name")
for stub in stubnames[1:]:
new = melt_stub(df, stub, id_vars, j)
newdf = newdf.merge(new, how="outer", on=id_vars + [j], copy=False)
return newdf.set_index([i, j])
if not is_list_like(stubnames):
stubnames = [stubnames]
else:
stubnames = list(stubnames)
if not is_list_like(i):
i = [i]
else:
i = list(i)
value_vars = list(map(lambda stub:
get_var_names(df, stub, sep, suffix), stubnames))
value_vars_flattened = [e for sublist in value_vars for e in sublist]
id_vars = list(set(df.columns.tolist()).difference(value_vars_flattened))
melted = []
for s, v in zip(stubnames, value_vars):
melted.append(melt_stub(df, s, i, j, v, sep))
melted = melted[0].join(melted[1:], how='outer')
if len(i) == 1:
new = df[id_vars].set_index(i).join(melted)
return new
new = df[id_vars].merge(melted.reset_index(), on=i).set_index(i + [j])
return new
def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
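
To make the new ``sep`` and ``suffix`` parameters concrete, a small sketch (the column names are invented; ``sep='-'`` strips the hyphen and ``suffix=r'\D+'`` allows non-numeric suffixes)::

    import pandas as pd

    df = pd.DataFrame({'ht-one': [2.8, 2.0],
                       'ht-two': [3.4, 3.2],
                       'famid': [1, 2]})

    # Produces a frame indexed by (famid, age) with a single 'ht' column.
    pd.wide_to_long(df, stubnames='ht', i='famid', j='age',
                    sep='-', suffix=r'\D+')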

View File

@ -1455,9 +1455,9 @@ class CSVFormatter(object):
f = self.path_or_buf
close = False
else:
f = _get_handle(self.path_or_buf, self.mode,
encoding=self.encoding,
compression=self.compression)
f, handles = _get_handle(self.path_or_buf, self.mode,
encoding=self.encoding,
compression=self.compression)
close = True
try:

View File

@ -1,11 +1,9 @@
"""Common IO api utilities"""
import sys
import os
import csv
import codecs
import mmap
import zipfile
from contextlib import contextmanager, closing
from pandas.compat import StringIO, BytesIO, string_types, text_type
@ -141,39 +139,6 @@ def _is_s3_url(url):
return False
def maybe_read_encoded_stream(reader, encoding=None, compression=None):
"""read an encoded stream from the reader and transform the bytes to
unicode if required based on the encoding
Parameters
----------
reader : a streamable file-like object
encoding : optional, the encoding to attempt to read
Returns
-------
a tuple of (a stream of decoded bytes, the encoding which was used)
"""
if compat.PY3 or encoding is not None: # pragma: no cover
if encoding:
errors = 'strict'
else:
errors = 'replace'
encoding = 'utf-8'
if compression == 'gzip':
reader = BytesIO(reader.read())
else:
reader = StringIO(reader.read().decode(encoding, errors))
else:
if compression == 'gzip':
reader = BytesIO(reader.read())
encoding = None
return reader, encoding
def _expand_user(filepath_or_buffer):
"""Return the argument with an initial component of ~ or ~user
replaced by that user's home directory.
@ -237,18 +202,14 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
"""
if _is_url(filepath_or_buffer):
req = _urlopen(str(filepath_or_buffer))
if compression == 'infer':
content_encoding = req.headers.get('Content-Encoding', None)
if content_encoding == 'gzip':
compression = 'gzip'
else:
compression = None
# cat on the compression to the tuple returned by the function
to_return = (list(maybe_read_encoded_stream(req, encoding,
compression)) +
[compression])
return tuple(to_return)
url = str(filepath_or_buffer)
req = _urlopen(url)
content_encoding = req.headers.get('Content-Encoding', None)
if content_encoding == 'gzip':
# Override compression based on Content-Encoding header
compression = 'gzip'
reader = BytesIO(req.read())
return reader, encoding, compression
if _is_s3_url(filepath_or_buffer):
from pandas.io.s3 import get_filepath_or_buffer
@ -276,64 +237,161 @@ def file_path_to_url(path):
return urljoin('file:', pathname2url(path))
# ZipFile is not a context manager for <= 2.6
# must be tuple index here since 2.6 doesn't use namedtuple for version_info
if sys.version_info[1] <= 6:
@contextmanager
def ZipFile(*args, **kwargs):
with closing(zipfile.ZipFile(*args, **kwargs)) as zf:
yield zf
else:
ZipFile = zipfile.ZipFile
_compression_to_extension = {
'gzip': '.gz',
'bz2': '.bz2',
'zip': '.zip',
'xz': '.xz',
}
def _get_handle(path, mode, encoding=None, compression=None, memory_map=False):
"""Gets file handle for given path and mode.
def _infer_compression(filepath_or_buffer, compression):
"""
if compression is not None:
if encoding is not None and not compat.PY3:
msg = 'encoding + compression not yet supported in Python 2'
Get file handle for given path/buffer and mode.
Parameters
----------
filepath_or_buf :
a path (str) or buffer
compression : str, or None
Returns
-------
string compression method, None
Raises
------
ValueError on invalid compression specified
If compression='infer', infer compression. If compression
"""
# No compression has been explicitly specified
if compression is None:
return None
# Cannot infer compression of a buffer. Hence assume no compression.
is_path = isinstance(filepath_or_buffer, compat.string_types)
if compression == 'infer' and not is_path:
return None
# Infer compression from the filename/URL extension
if compression == 'infer':
for compression, extension in _compression_to_extension.items():
if filepath_or_buffer.endswith(extension):
return compression
return None
# Compression has been specified. Check that it's valid
if compression in _compression_to_extension:
return compression
msg = 'Unrecognized compression type: {}'.format(compression)
valid = ['infer', None] + sorted(_compression_to_extension)
msg += '\nValid compression types are {}'.format(valid)
raise ValueError(msg)
def _get_handle(path_or_buf, mode, encoding=None, compression=None,
memory_map=False):
"""
Get file handle for given path/buffer and mode.
Parameters
----------
path_or_buf :
a path (str) or buffer
mode : str
mode to open path_or_buf with
encoding : str or None
compression : str or None
Supported compression protocols are gzip, bz2, zip, and xz
memory_map : boolean, default False
See parsers._parser_params for more information.
Returns
-------
f : file-like
A file-like object
handles : list of file-like objects
A list of file-like objects that were opened in this function.
"""
handles = list()
f = path_or_buf
is_path = isinstance(path_or_buf, compat.string_types)
if compression:
if compat.PY2 and not is_path and encoding:
msg = 'compression with encoding is not yet supported in Python 2'
raise ValueError(msg)
# GZ Compression
if compression == 'gzip':
import gzip
f = gzip.GzipFile(path, mode)
if is_path:
f = gzip.open(path_or_buf, mode)
else:
f = gzip.GzipFile(fileobj=path_or_buf)
# BZ Compression
elif compression == 'bz2':
import bz2
f = bz2.BZ2File(path, mode)
if is_path:
f = bz2.BZ2File(path_or_buf, mode)
elif compat.PY2:
# Python 2's bz2 module can't take file objects, so have to
# run through decompress manually
f = StringIO(bz2.decompress(path_or_buf.read()))
path_or_buf.close()
else:
f = bz2.BZ2File(path_or_buf)
# ZIP Compression
elif compression == 'zip':
import zipfile
zip_file = zipfile.ZipFile(path)
zip_file = zipfile.ZipFile(path_or_buf)
zip_names = zip_file.namelist()
if len(zip_names) == 1:
file_name = zip_names.pop()
f = zip_file.open(file_name)
f = zip_file.open(zip_names.pop())
elif len(zip_names) == 0:
raise ValueError('Zero files found in ZIP file {}'
.format(path))
.format(path_or_buf))
else:
raise ValueError('Multiple files found in ZIP file.'
' Only one file per ZIP :{}'
' Only one file per ZIP: {}'
.format(zip_names))
# XZ Compression
elif compression == 'xz':
lzma = compat.import_lzma()
f = lzma.LZMAFile(path, mode)
f = lzma.LZMAFile(path_or_buf, mode)
# Unrecognized Compression
else:
raise ValueError('Unrecognized compression type: %s' %
compression)
if compat.PY3:
from io import TextIOWrapper
f = TextIOWrapper(f, encoding=encoding)
return f
else:
if compat.PY3:
if encoding:
f = open(path, mode, encoding=encoding)
else:
f = open(path, mode, errors='replace')
msg = 'Unrecognized compression type: {}'.format(compression)
raise ValueError(msg)
handles.append(f)
elif is_path:
if compat.PY2:
# Python 2
f = open(path_or_buf, mode)
elif encoding:
# Python 3 and encoding
f = open(path_or_buf, mode, encoding=encoding)
else:
f = open(path, mode)
# Python 3 and no explicit encoding
f = open(path_or_buf, mode, errors='replace')
handles.append(f)
# in Python 3, convert BytesIO or fileobjects passed with an encoding
if compat.PY3 and (compression or isinstance(f, compat.BytesIO)):
from io import TextIOWrapper
f = TextIOWrapper(f, encoding=encoding)
handles.append(f)
if memory_map and hasattr(f, 'fileno'):
try:
@ -347,7 +405,7 @@ def _get_handle(path, mode, encoding=None, compression=None, memory_map=False):
# leave the file handler as is then
pass
return f
return f, handles
class MMapWrapper(BaseIterator):
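
To illustrate the behaviour of the new helper defined above (``_infer_compression`` is private, so this sketch is for exposition only)::

    import io
    from pandas.io.common import _infer_compression

    _infer_compression('data.csv.gz', compression='infer')   # -> 'gzip'
    _infer_compression('data.csv', compression='infer')      # -> None
    _infer_compression(io.StringIO(), compression='infer')   # buffer -> None

    try:
        _infer_compression('data.csv', compression='sfark')
    except ValueError as err:
        print(err)   # lists the valid compression types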

View File

@ -259,8 +259,10 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
exists = False
if exists:
with _get_handle(filepath_or_buffer, 'r', encoding=encoding) as fh:
json = fh.read()
fh, handles = _get_handle(filepath_or_buffer, 'r',
encoding=encoding)
json = fh.read()
fh.close()
else:
json = filepath_or_buffer
elif hasattr(filepath_or_buffer, 'read'):
@ -723,7 +725,9 @@ def nested_to_record(ds, prefix="", level=0):
def json_normalize(data, record_path=None, meta=None,
meta_prefix=None,
record_prefix=None):
record_prefix=None,
errors='raise'):
"""
"Normalize" semi-structured JSON data into a flat table
@ -740,6 +744,13 @@ def json_normalize(data, record_path=None, meta=None,
If True, prefix records with dotted (?) path, e.g. foo.bar.field if
path to records is ['foo', 'bar']
meta_prefix : string, default None
errors : {'raise', 'ignore'}, default 'raise'
* ignore : will ignore KeyError if keys listed in meta are not
always present
* raise : will raise KeyError if keys listed in meta are not
always present
.. versionadded:: 0.20.0
Returns
-------
@ -839,7 +850,16 @@ def json_normalize(data, record_path=None, meta=None,
if level + 1 > len(val):
meta_val = seen_meta[key]
else:
meta_val = _pull_field(obj, val[level:])
try:
meta_val = _pull_field(obj, val[level:])
except KeyError as e:
if errors == 'ignore':
meta_val = np.nan
else:
raise \
KeyError("Try running with "
"errors='ignore' as key "
"%s is not always present", e)
meta_vals[key].append(meta_val)
records.extend(recs)

View File

@ -27,12 +27,11 @@ from pandas.core.series import Series
from pandas.core.frame import DataFrame
from pandas.core.categorical import Categorical
from pandas.core.common import AbstractMethodError
from pandas.core.config import get_option
from pandas.io.date_converters import generic_parser
from pandas.io.common import (get_filepath_or_buffer, _validate_header_arg,
_get_handle, UnicodeReader, UTF8Recoder,
BaseIterator, ParserError, EmptyDataError,
ParserWarning, _NA_VALUES)
ParserWarning, _NA_VALUES, _infer_compression)
from pandas.tseries import tools
from pandas.util.decorators import Appender
@ -168,6 +167,10 @@ default False
* dict, e.g. {'foo' : [1, 3]} -> parse columns 1, 3 as date and call result
'foo'
If a column or index contains an unparseable date, the entire column or
index will be returned unaltered as an object data type. For non-standard
datetime parsing, use ``pd.to_datetime`` after ``pd.read_csv``
Note: A fast-path exists for iso8601-formatted dates.
infer_datetime_format : boolean, default False
If True and parse_dates is enabled, pandas will attempt to infer the format
@ -354,37 +357,17 @@ def _validate_nrows(nrows):
def _read(filepath_or_buffer, kwds):
"Generic reader of line files."
"""Generic reader of line files."""
encoding = kwds.get('encoding', None)
if encoding is not None:
encoding = re.sub('_', '-', encoding).lower()
kwds['encoding'] = encoding
# If the input could be a filename, check for a recognizable compression
# extension. If we're reading from a URL, the `get_filepath_or_buffer`
# will use header info to determine compression, so use what it finds in
# that case.
inferred_compression = kwds.get('compression')
if inferred_compression == 'infer':
if isinstance(filepath_or_buffer, compat.string_types):
if filepath_or_buffer.endswith('.gz'):
inferred_compression = 'gzip'
elif filepath_or_buffer.endswith('.bz2'):
inferred_compression = 'bz2'
elif filepath_or_buffer.endswith('.zip'):
inferred_compression = 'zip'
elif filepath_or_buffer.endswith('.xz'):
inferred_compression = 'xz'
else:
inferred_compression = None
else:
inferred_compression = None
compression = kwds.get('compression')
compression = _infer_compression(filepath_or_buffer, compression)
filepath_or_buffer, _, compression = get_filepath_or_buffer(
filepath_or_buffer, encoding,
compression=kwds.get('compression', None))
kwds['compression'] = (inferred_compression if compression == 'infer'
else compression)
filepath_or_buffer, encoding, compression)
kwds['compression'] = compression
if kwds.get('date_parser', None) is not None:
if isinstance(kwds['parse_dates'], bool):
@ -1771,70 +1754,6 @@ def count_empty_vals(vals):
return sum([1 for v in vals if v == '' or v is None])
def _wrap_compressed(f, compression, encoding=None):
"""wraps compressed fileobject in a decompressing fileobject
NOTE: For all files in Python 3.2 and for bzip'd files under all Python
versions, this means reading in the entire file and then re-wrapping it in
StringIO.
"""
compression = compression.lower()
encoding = encoding or get_option('display.encoding')
if compression == 'gzip':
import gzip
f = gzip.GzipFile(fileobj=f)
if compat.PY3:
from io import TextIOWrapper
f = TextIOWrapper(f)
return f
elif compression == 'bz2':
import bz2
if compat.PY3:
f = bz2.open(f, 'rt', encoding=encoding)
else:
# Python 2's bz2 module can't take file objects, so have to
# run through decompress manually
data = bz2.decompress(f.read())
f = StringIO(data)
return f
elif compression == 'zip':
import zipfile
zip_file = zipfile.ZipFile(f)
zip_names = zip_file.namelist()
if len(zip_names) == 1:
file_name = zip_names.pop()
f = zip_file.open(file_name)
return f
elif len(zip_names) == 0:
raise ValueError('Corrupted or zero files found in compressed '
'zip file %s', zip_file.filename)
else:
raise ValueError('Multiple files found in compressed '
'zip file %s', str(zip_names))
elif compression == 'xz':
lzma = compat.import_lzma()
f = lzma.LZMAFile(f)
if compat.PY3:
from io import TextIOWrapper
f = TextIOWrapper(f)
return f
else:
raise ValueError('do not recognize compression method %s'
% compression)
class PythonParser(ParserBase):
def __init__(self, f, **kwds):
@ -1890,20 +1809,10 @@ class PythonParser(ParserBase):
self.comment = kwds['comment']
self._comment_lines = []
if isinstance(f, compat.string_types):
f = _get_handle(f, 'r', encoding=self.encoding,
compression=self.compression,
memory_map=self.memory_map)
self.handles.append(f)
elif self.compression:
f = _wrap_compressed(f, self.compression, self.encoding)
self.handles.append(f)
# in Python 3, convert BytesIO or fileobjects passed with an encoding
elif compat.PY3 and isinstance(f, compat.BytesIO):
from io import TextIOWrapper
f = TextIOWrapper(f, encoding=self.encoding)
self.handles.append(f)
f, handles = _get_handle(f, 'r', encoding=self.encoding,
compression=self.compression,
memory_map=self.memory_map)
self.handles.extend(handles)
# Set self.data to something that can read lines.
if hasattr(f, 'readline'):
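
At the public level the refactor is behaviour-preserving: ``read_csv`` still infers compression from the file extension, now via the shared ``_infer_compression`` helper. A small sketch with an invented file name::

    import gzip
    import pandas as pd

    with gzip.open('example.csv.gz', 'wt') as fh:
        fh.write('a,b\n1,2\n3,4\n')

    # compression='infer' (the default) picks 'gzip' from the '.gz' suffix;
    # compression='gzip' would be equivalent here.
    df = pd.read_csv('example.csv.gz')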

View File

@ -99,9 +99,7 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
conn = boto.connect_s3(host=s3_host, anon=True)
b = conn.get_bucket(parsed_url.netloc, validate=False)
if compat.PY2 and (compression == 'gzip' or
(compression == 'infer' and
filepath_or_buffer.endswith(".gz"))):
if compat.PY2 and compression:
k = boto.s3.key.Key(b, parsed_url.path)
filepath_or_buffer = BytesIO(k.get_contents_as_string(
encoding=encoding))

View File

@ -225,6 +225,65 @@ class TestNestedToRecord(tm.TestCase):
self.assertEqual(result, expected)
def test_json_normalize_errors(self):
# GH14583: If meta keys are not always present
# a new option to set errors='ignore' has been implemented
i = {
"Trades": [{
"general": {
"tradeid": 100,
"trade_version": 1,
"stocks": [{
"symbol": "AAPL",
"name": "Apple",
"price": "0"
}, {
"symbol": "GOOG",
"name": "Google",
"price": "0"
}
]
}
}, {
"general": {
"tradeid": 100,
"stocks": [{
"symbol": "AAPL",
"name": "Apple",
"price": "0"
}, {
"symbol": "GOOG",
"name": "Google",
"price": "0"
}
]
}
}
]
}
j = json_normalize(data=i['Trades'],
record_path=[['general', 'stocks']],
meta=[['general', 'tradeid'],
['general', 'trade_version']],
errors='ignore')
expected = {'general.trade_version': {0: 1.0, 1: 1.0, 2: '', 3: ''},
'general.tradeid': {0: 100, 1: 100, 2: 100, 3: 100},
'name': {0: 'Apple', 1: 'Google', 2: 'Apple', 3: 'Google'},
'price': {0: '0', 1: '0', 2: '0', 3: '0'},
'symbol': {0: 'AAPL', 1: 'GOOG', 2: 'AAPL', 3: 'GOOG'}}
self.assertEqual(j.fillna('').to_dict(), expected)
self.assertRaises(KeyError,
json_normalize, data=i['Trades'],
record_path=[['general', 'stocks']],
meta=[['general', 'tradeid'],
['general', 'trade_version']],
errors='raise'
)
if __name__ == '__main__':
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb',
'--pdb-failure', '-s'], exit=False)

View File

@ -168,3 +168,8 @@ class CompressionTests(object):
tm.assert_frame_equal(expected, df)
inputs[3].close()
def test_invalid_compression(self):
msg = 'Unrecognized compression type: sfark'
with tm.assertRaisesRegexp(ValueError, msg):
self.read_csv('test_file.zip', compression='sfark')

View File

@ -7,6 +7,8 @@ and hence require a network connection to be read.
import os
import nose
import functools
from itertools import product
import pandas.util.testing as tm
from pandas import DataFrame
@ -14,24 +16,40 @@ from pandas import compat
from pandas.io.parsers import read_csv, read_table
class TestUrlGz(tm.TestCase):
class TestCompressedUrl(object):
def setUp(self):
dirpath = tm.get_data_path()
localtable = os.path.join(dirpath, 'salaries.csv')
self.local_table = read_table(localtable)
compression_to_extension = {
'gzip': '.gz',
'bz2': '.bz2',
'zip': '.zip',
'xz': '.xz',
}
@tm.network
def test_url_gz(self):
url = ('https://raw.github.com/pandas-dev/pandas/'
'master/pandas/io/tests/parser/data/salaries.csv.gz')
url_table = read_table(url, compression="gzip", engine="python")
tm.assert_frame_equal(url_table, self.local_table)
def __init__(self):
path = os.path.join(tm.get_data_path(), 'salaries.csv')
self.local_table = read_table(path)
self.base_url = ('https://github.com/pandas-dev/pandas/raw/master/'
'pandas/io/tests/parser/data/salaries.csv')
@tm.network
def test_url_gz_infer(self):
url = 'https://s3.amazonaws.com/pandas-test/salary.table.gz'
url_table = read_table(url, compression="infer", engine="python")
def test_compressed_urls(self):
"""Test reading compressed tables from URL."""
msg = ('Test reading {}-compressed tables from URL: '
'compression="{}", engine="{}"')
for compression, extension in self.compression_to_extension.items():
url = self.base_url + extension
# args is a (compression, engine) tuple
for args in product([compression, 'infer'], ['python']):
# test_fxn is a workaround for more descriptive nose reporting.
# See http://stackoverflow.com/a/37393684/4651668.
test_fxn = functools.partial(self.check_table)
test_fxn.description = msg.format(compression, *args)
yield (test_fxn, url) + args
def check_table(self, url, compression, engine):
if url.endswith('.xz'):
tm._skip_if_no_lzma()
url_table = read_table(url, compression=compression, engine=engine)
tm.assert_frame_equal(url_table, self.local_table)

View File

@ -138,6 +138,19 @@ date,time,prn,rxstatus
names=['datetime', 'prn']))
assert_frame_equal(df, df_correct)
def test_parse_date_column_with_empty_string(self):
# GH 6428
data = """case,opdate
7,10/18/2006
7,10/18/2008
621, """
result = read_csv(StringIO(data), parse_dates=['opdate'])
expected_data = [[7, '10/18/2006'],
[7, '10/18/2008'],
[621, ' ']]
expected = DataFrame(expected_data, columns=['case', 'opdate'])
assert_frame_equal(result, expected)
if __name__ == '__main__':
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
exit=False)

View File

@ -45,12 +45,12 @@ cdef bint PY2 = version_info[0] == 2
cdef int64_t NPY_NAT = util.get_nat()
cdef int US_RESO = frequencies.US_RESO
cdef int MS_RESO = frequencies.MS_RESO
cdef int S_RESO = frequencies.S_RESO
cdef int T_RESO = frequencies.T_RESO
cdef int H_RESO = frequencies.H_RESO
cdef int D_RESO = frequencies.D_RESO
cdef int RESO_US = frequencies.RESO_US
cdef int RESO_MS = frequencies.RESO_MS
cdef int RESO_SEC = frequencies.RESO_SEC
cdef int RESO_MIN = frequencies.RESO_MIN
cdef int RESO_HR = frequencies.RESO_HR
cdef int RESO_DAY = frequencies.RESO_DAY
cdef extern from "period_helper.h":
ctypedef struct date_info:
@ -516,7 +516,7 @@ cpdef resolution(ndarray[int64_t] stamps, tz=None):
cdef:
Py_ssize_t i, n = len(stamps)
pandas_datetimestruct dts
int reso = D_RESO, curr_reso
int reso = RESO_DAY, curr_reso
if tz is not None:
tz = maybe_get_tz(tz)