Merge master branch into GH13936

commit 77eb47baa4

@ -1,5 +1,5 @@
from .pandas_vb_common import *
|
||||
from pandas.core.reshape import melt
|
||||
from pandas.core.reshape import melt, wide_to_long
|
||||
|
||||
|
||||
class melt_dataframe(object):
|
||||
|
@ -74,3 +74,25 @@ class unstack_sparse_keyspace(object):
|
|||
|
||||
def time_unstack_sparse_keyspace(self):
|
||||
self.idf.unstack()
|
||||
|
||||
|
||||
class wide_to_long_big(object):
|
||||
goal_time = 0.2
|
||||
|
||||
def setup(self):
|
||||
vars = 'ABCD'
|
||||
nyrs = 20
|
||||
nidvars = 20
|
||||
N = 5000
|
||||
yrvars = []
|
||||
for var in vars:
|
||||
for yr in range(1, nyrs + 1):
|
||||
yrvars.append(var + str(yr))
|
||||
|
||||
self.df = pd.DataFrame(np.random.randn(N, nidvars + len(yrvars)),
|
||||
columns=list(range(nidvars)) + yrvars)
|
||||
self.vars = vars
|
||||
|
||||
def time_wide_to_long_big(self):
|
||||
self.df['id'] = self.df.index
|
||||
wide_to_long(self.df, list(self.vars), i='id', j='year')
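For readers unfamiliar with the API being benchmarked, here is a minimal sketch of the reshape that ``time_wide_to_long_big`` exercises (the tiny frame and column names below are illustrative, not part of the benchmark):

    import pandas as pd

    # two stubs ('A', 'B') with year suffixes, plus an id column
    df = pd.DataFrame({'A1970': [1.0, 2.0], 'A1980': [3.0, 4.0],
                       'B1970': [5.0, 6.0], 'B1980': [7.0, 8.0]})
    df['id'] = df.index

    # stacks the suffixed columns into a long frame indexed by (id, year)
    pd.wide_to_long(df, ['A', 'B'], i='id', j='year')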
@ -157,6 +157,7 @@ Data manipulations
|
|||
concat
|
||||
get_dummies
|
||||
factorize
|
||||
wide_to_long
|
||||
|
||||
Top-level missing data
|
||||
~~~~~~~~~~~~~~~~~~~~~~
@ -486,7 +486,9 @@ standard deviation 1), very concisely:
|
|||
xs_stand.std(1)
|
||||
|
||||
Note that methods like :meth:`~DataFrame.cumsum` and :meth:`~DataFrame.cumprod`
|
||||
preserve the location of NA values:
|
||||
preserve the location of ``NaN`` values. This is somewhat different from
|
||||
:meth:`~DataFrame.expanding` and :meth:`~DataFrame.rolling`.
|
||||
For more details please see :ref:`this note <stats.moments.expanding.note>`.
|
||||
|
||||
.. ipython:: python
@ -691,6 +691,8 @@ Method Summary
|
|||
:meth:`~Expanding.cov`, Unbiased covariance (binary)
|
||||
:meth:`~Expanding.corr`, Correlation (binary)
|
||||
|
||||
.. currentmodule:: pandas
|
||||
|
||||
Aside from not having a ``window`` parameter, these functions have the same
|
||||
interfaces as their ``.rolling`` counterparts. Like above, the parameters they
|
||||
all accept are:
|
||||
|
@ -700,18 +702,37 @@ all accept are:
|
|||
``min_periods`` non-null data points have been seen.
|
||||
- ``center``: boolean, whether to set the labels at the center (default is False)
|
||||
|
||||
.. _stats.moments.expanding.note:
|
||||
.. note::
|
||||
|
||||
The output of the ``.rolling`` and ``.expanding`` methods does not return a
|
||||
``NaN`` if there are at least ``min_periods`` non-null values in the current
|
||||
window. This differs from ``cumsum``, ``cumprod``, ``cummax``, and
|
||||
``cummin``, which return ``NaN`` in the output wherever a ``NaN`` is
|
||||
encountered in the input.
|
||||
window. For example,
|
||||
|
||||
.. ipython:: python
|
||||
|
||||
sn = pd.Series([1, 2, np.nan, 3, np.nan, 4])
|
||||
sn
|
||||
sn.rolling(2).max()
|
||||
sn.rolling(2, min_periods=1).max()
|
||||
|
||||
In case of expanding functions, this differs from :meth:`~DataFrame.cumsum`,
|
||||
:meth:`~DataFrame.cumprod`, :meth:`~DataFrame.cummax`,
|
||||
and :meth:`~DataFrame.cummin`, which return ``NaN`` in the output wherever
|
||||
a ``NaN`` is encountered in the input. In order to match the output of ``cumsum``
|
||||
with ``expanding``, use :meth:`~DataFrame.fillna`:
|
||||
|
||||
.. ipython:: python
|
||||
|
||||
sn.expanding().sum()
|
||||
sn.cumsum()
|
||||
sn.cumsum().fillna(method='ffill')
|
||||
|
||||
|
||||
An expanding window statistic will be more stable (and less responsive) than
|
||||
its rolling window counterpart as the increasing window size decreases the
|
||||
relative impact of an individual data point. As an example, here is the
|
||||
:meth:`~Expanding.mean` output for the previous time series dataset:
|
||||
:meth:`~core.window.Expanding.mean` output for the previous time series dataset:
|
||||
|
||||
.. ipython:: python
|
||||
:suppress:
|
||||
|
@ -731,13 +752,14 @@ relative impact of an individual data point. As an example, here is the
|
|||
Exponentially Weighted Windows
|
||||
------------------------------
|
||||
|
||||
.. currentmodule:: pandas.core.window
|
||||
|
||||
A related set of functions are exponentially weighted versions of several of
|
||||
the above statistics. A similar interface to ``.rolling`` and ``.expanding`` is accessed
|
||||
thru the ``.ewm`` method to receive an :class:`~pandas.core.window.EWM` object.
|
||||
through the ``.ewm`` method to receive an :class:`~EWM` object.
|
||||
A number of expanding EW (exponentially weighted)
|
||||
methods are provided:
|
||||
|
||||
.. currentmodule:: pandas.core.window
|
||||
|
||||
.. csv-table::
|
||||
:header: "Function", "Description"
@ -867,6 +867,12 @@ data columns:
|
|||
index_col=0) #index is the nominal column
|
||||
df
|
||||
|
||||
.. note::
|
||||
If a column or index contains an unparseable date, the entire column or
|
||||
index will be returned unaltered as an object data type. For non-standard
|
||||
datetime parsing, use :func:`to_datetime` after ``pd.read_csv``.
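A short illustration of the workflow this note recommends (the file name, column name, and date format below are hypothetical):

    import pandas as pd

    # unparseable dates are left as object dtype by read_csv ...
    df = pd.read_csv('data.csv')

    # ... and can be converted afterwards with an explicit format
    df['date'] = pd.to_datetime(df['date'], format='%d%b%Y')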
|
||||
|
||||
|
||||
.. note::
|
||||
read_csv has a fast_path for parsing datetime strings in iso8601 format,
|
||||
e.g "2000-01-01T00:01:02+00:00" and similar variations. If you can arrange
@ -18,6 +18,8 @@ What's New
|
|||
|
||||
These are new features and improvements of note in each release.
|
||||
|
||||
.. include:: whatsnew/v0.20.0.txt
|
||||
|
||||
.. include:: whatsnew/v0.19.2.txt
|
||||
|
||||
.. include:: whatsnew/v0.19.1.txt
@ -58,4 +58,4 @@ Bug Fixes
|
|||
- Bug in ``df.groupby`` causing an ``AttributeError`` when grouping a single index frame by a column and the index level (:issue:`14327`)
|
||||
- Bug in ``df.groupby`` where ``TypeError`` raised when ``pd.Grouper(key=...)`` is passed in a list (:issue:`14334`)
|
||||
- Bug in ``pd.pivot_table`` may raise ``TypeError`` or ``ValueError`` when ``index`` or ``columns``
|
||||
is not scalar and ``values`` is not specified (:issue:`14380`)
|
||||
is not scalar and ``values`` is not specified (:issue:`14380`)
@ -23,6 +23,16 @@ Performance Improvements
|
|||
|
||||
- Improved performance of ``.replace()`` (:issue:`12745`)
|
||||
|
||||
.. _whatsnew_0192.enhancements.other:
|
||||
|
||||
Other enhancements
|
||||
^^^^^^^^^^^^^^^^^^
|
||||
|
||||
- ``pd.merge_asof()`` gained ``left_index``/``right_index`` and ``left_by``/``right_by`` arguments (:issue:`14253`)
|
||||
- ``pd.merge_asof()`` can take multiple columns in the ``by`` parameter and has specialized dtypes for better performance (:issue:`13936`)
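A brief sketch of the multi-column ``by`` matching described in this entry (the frames and column names are illustrative):

    import pandas as pd

    trades = pd.DataFrame({'time': pd.to_datetime(['2016-05-25 13:30:00.023',
                                                   '2016-05-25 13:30:00.038']),
                           'ticker': ['MSFT', 'MSFT'],
                           'exch': ['NASDAQ', 'ARCA'],
                           'price': [51.95, 51.95]})
    quotes = pd.DataFrame({'time': pd.to_datetime(['2016-05-25 13:30:00.023',
                                                   '2016-05-25 13:30:00.030']),
                           'ticker': ['MSFT', 'MSFT'],
                           'exch': ['NASDAQ', 'ARCA'],
                           'bid': [51.95, 51.97]})

    # match each trade to the most recent quote for the same ticker *and* exchange
    pd.merge_asof(trades, quotes, on='time', by=['ticker', 'exch'])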
|
||||
|
||||
|
||||
|
||||
.. _whatsnew_0192.bug_fixes:
|
||||
|
||||
Bug Fixes
|
||||
|
@ -82,11 +92,3 @@ Bug Fixes
|
|||
- Bug in ``unstack()`` if called with a list of column(s) as an argument, regardless of the dtypes of all columns, they get coerced to ``object`` (:issue:`11847`)
|
||||
|
||||
|
||||
.. _whatsnew_0192.enhancements.other:
|
||||
|
||||
Other enhancements
|
||||
^^^^^^^^^^^^^^^^^^
|
||||
|
||||
- ``pd.merge_asof()`` can take multiple columns in the ``by`` parameter and has specialized dtypes for better performance (:issue:`13936`)
@ -52,6 +52,9 @@ Other enhancements
|
|||
|
||||
- ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`)
|
||||
|
||||
|
||||
- Multiple offset aliases with decimal points are now supported (e.g. '0.5min' is parsed as '30s') (:issue:`8419`)
|
||||
|
||||
- New ``UnsortedIndexError`` (subclass of ``KeyError``) raised when indexing/slicing into an
|
||||
unsorted MultiIndex (:issue:`11897`). This allows differentiation between errors due to lack
|
||||
of sorting or an incorrect key. See :ref:`here <advanced.unsorted>`
|
||||
|
@ -61,6 +64,8 @@ Other enhancements
|
|||
- The ``usecols`` argument in ``pd.read_csv`` now accepts a callable function as a value (:issue:`14154`)
|
||||
- ``pd.DataFrame.plot`` now prints a title above each subplot if ``subplots=True`` and ``title`` is a list of strings (:issue:`14753`)
|
||||
- ``pd.Series.interpolate`` now supports timedelta as an index type with ``method='time'`` (:issue:`6424`)
|
||||
- ``pandas.io.json.json_normalize()`` gained the option ``errors='ignore'|'raise'``; the default is ``errors='raise'`` which is backward compatible. (:issue:`14583`)
|
||||
|
||||
|
||||
.. _whatsnew_0200.api_breaking:
|
||||
|
||||
|
@ -111,6 +116,7 @@ Removal of prior version deprecations/changes
|
|||
Performance Improvements
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
- Improved performance of ``pd.wide_to_long()`` (:issue:`14779`)
@ -3354,12 +3354,16 @@ class NDFrame(PandasObject):
|
|||
return self._constructor(new_data).__finalize__(self)
|
||||
|
||||
def ffill(self, axis=None, inplace=False, limit=None, downcast=None):
|
||||
"""Synonym for NDFrame.fillna(method='ffill')"""
|
||||
"""
|
||||
Synonym for :meth:`DataFrame.fillna(method='ffill') <DataFrame.fillna>`
|
||||
"""
|
||||
return self.fillna(method='ffill', axis=axis, inplace=inplace,
|
||||
limit=limit, downcast=downcast)
|
||||
|
||||
def bfill(self, axis=None, inplace=False, limit=None, downcast=None):
|
||||
"""Synonym for NDFrame.fillna(method='bfill')"""
|
||||
"""
|
||||
Synonym for :meth:`DataFrame.fillna(method='bfill') <DataFrame.fillna>`
|
||||
"""
|
||||
return self.fillna(method='bfill', axis=axis, inplace=inplace,
|
||||
limit=limit, downcast=downcast)
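A quick sketch of the equivalence these docstrings now spell out (the frame is illustrative):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'a': [1.0, np.nan, 3.0]})

    # ffill()/bfill() are thin wrappers around fillna(method=...)
    assert df.ffill().equals(df.fillna(method='ffill'))
    assert df.bfill().equals(df.fillna(method='bfill'))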
|
||||
|
||||
|
@ -5477,16 +5481,18 @@ class NDFrame(PandasObject):
|
|||
|
||||
cls.cummin = _make_cum_function(
|
||||
cls, 'cummin', name, name2, axis_descr, "cumulative minimum",
|
||||
lambda y, axis: np.minimum.accumulate(y, axis), np.inf, np.nan)
|
||||
lambda y, axis: np.minimum.accumulate(y, axis), "min",
|
||||
np.inf, np.nan)
|
||||
cls.cumsum = _make_cum_function(
|
||||
cls, 'cumsum', name, name2, axis_descr, "cumulative sum",
|
||||
lambda y, axis: y.cumsum(axis), 0., np.nan)
|
||||
lambda y, axis: y.cumsum(axis), "sum", 0., np.nan)
|
||||
cls.cumprod = _make_cum_function(
|
||||
cls, 'cumprod', name, name2, axis_descr, "cumulative product",
|
||||
lambda y, axis: y.cumprod(axis), 1., np.nan)
|
||||
lambda y, axis: y.cumprod(axis), "prod", 1., np.nan)
|
||||
cls.cummax = _make_cum_function(
|
||||
cls, 'cummax', name, name2, axis_descr, "cumulative max",
|
||||
lambda y, axis: np.maximum.accumulate(y, axis), -np.inf, np.nan)
|
||||
lambda y, axis: np.maximum.accumulate(y, axis), "max",
|
||||
-np.inf, np.nan)
|
||||
|
||||
cls.sum = _make_stat_function(
|
||||
cls, 'sum', name, name2, axis_descr,
|
||||
|
@ -5674,7 +5680,15 @@ skipna : boolean, default True
|
|||
|
||||
Returns
|
||||
-------
|
||||
%(outname)s : %(name1)s\n"""
|
||||
%(outname)s : %(name1)s\n
|
||||
|
||||
|
||||
See also
|
||||
--------
|
||||
pandas.core.window.Expanding.%(accum_func_name)s : Similar functionality
|
||||
but ignores ``NaN`` values.
|
||||
|
||||
"""
|
||||
|
||||
|
||||
def _make_stat_function(cls, name, name1, name2, axis_descr, desc, f):
|
||||
|
@ -5717,10 +5731,10 @@ def _make_stat_function_ddof(cls, name, name1, name2, axis_descr, desc, f):
|
|||
return set_function_name(stat_func, name, cls)
|
||||
|
||||
|
||||
def _make_cum_function(cls, name, name1, name2, axis_descr, desc, accum_func,
|
||||
mask_a, mask_b):
|
||||
def _make_cum_function(cls, name, name1, name2, axis_descr, desc,
|
||||
accum_func, accum_func_name, mask_a, mask_b):
|
||||
@Substitution(outname=name, desc=desc, name1=name1, name2=name2,
|
||||
axis_descr=axis_descr)
|
||||
axis_descr=axis_descr, accum_func_name=accum_func_name)
|
||||
@Appender("Return {0} over requested axis.".format(desc) +
|
||||
_cnum_doc)
|
||||
def cum_func(self, axis=None, skipna=True, *args, **kwargs):
@ -3,6 +3,7 @@
|
|||
from pandas.compat import range, zip
|
||||
from pandas import compat
|
||||
import itertools
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
@ -877,29 +878,55 @@ def lreshape(data, groups, dropna=True, label=None):
|
|||
return DataFrame(mdata, columns=id_cols + pivot_cols)
|
||||
|
||||
|
||||
def wide_to_long(df, stubnames, i, j):
|
||||
"""
|
||||
def wide_to_long(df, stubnames, i, j, sep="", suffix='\d+'):
|
||||
r"""
|
||||
Wide panel to long format. Less flexible but more user-friendly than melt.
|
||||
|
||||
With stubnames ['A', 'B'], this function expects to find one or more
|
||||
groups of columns with format Asuffix1, Asuffix2,..., Bsuffix1, Bsuffix2,...
|
||||
You specify what you want to call this suffix in the resulting long format
|
||||
with `j` (for example `j='year'`)
|
||||
|
||||
Each row of these wide variables are assumed to be uniquely identified by
|
||||
`i` (can be a single column name or a list of column names)
|
||||
|
||||
All remaining variables in the data frame are left intact.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
df : DataFrame
|
||||
The wide-format DataFrame
|
||||
stubnames : list
|
||||
A list of stub names. The wide format variables are assumed to
|
||||
stubnames : str or list-like
|
||||
The stub name(s). The wide format variables are assumed to
|
||||
start with the stub names.
|
||||
i : str
|
||||
The name of the id variable.
|
||||
i : str or list-like
|
||||
Column(s) to use as id variable(s)
|
||||
j : str
|
||||
The name of the subobservation variable.
|
||||
stubend : str
|
||||
Regex to match for the end of the stubs.
|
||||
The name of the subobservation variable. What you wish to name your
|
||||
suffix in the long format.
|
||||
sep : str, default ""
|
||||
A character indicating the separation of the variable names
|
||||
in the wide format, to be stripped from the names in the long format.
|
||||
For example, if your column names are A-suffix1, A-suffix2, you
|
||||
can strip the hyphen by specifying `sep='-'`
|
||||
|
||||
.. versionadded:: 0.20.0
|
||||
|
||||
suffix : str, default '\\d+'
|
||||
A regular expression capturing the wanted suffixes. '\\d+' captures
|
||||
numeric suffixes. Suffixes with no numbers could be specified with the
|
||||
negated character class '\\D+'. You can also further disambiguate
|
||||
suffixes, for example, if your wide variables are of the form
|
||||
Aone, Btwo,.., and you have an unrelated column Arating, you can
|
||||
ignore the last one by specifying `suffix='(!?one|two)'`
|
||||
|
||||
.. versionadded:: 0.20.0
|
||||
|
||||
Returns
|
||||
-------
|
||||
DataFrame
|
||||
A DataFrame that contains each stub name as a variable as well as
|
||||
variables for i and j.
|
||||
A DataFrame that contains each stub name as a variable, with new index
|
||||
(i, j)
|
||||
|
||||
Examples
|
||||
--------
|
||||
|
@ -918,7 +945,7 @@ def wide_to_long(df, stubnames, i, j):
|
|||
0 a d 2.5 3.2 -1.085631 0
|
||||
1 b e 1.2 1.3 0.997345 1
|
||||
2 c f 0.7 0.1 0.282978 2
|
||||
>>> wide_to_long(df, ["A", "B"], i="id", j="year")
|
||||
>>> pd.wide_to_long(df, ["A", "B"], i="id", j="year")
|
||||
X A B
|
||||
id year
|
||||
0 1970 -1.085631 a 2.5
|
||||
|
@ -928,38 +955,151 @@ def wide_to_long(df, stubnames, i, j):
|
|||
1 1980 0.997345 e 1.3
|
||||
2 1980 0.282978 f 0.1
|
||||
|
||||
With multiple id columns
|
||||
|
||||
>>> df = pd.DataFrame({
|
||||
... 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
|
||||
... 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
|
||||
... 'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
|
||||
... 'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
|
||||
... })
|
||||
>>> df
|
||||
birth famid ht1 ht2
|
||||
0 1 1 2.8 3.4
|
||||
1 2 1 2.9 3.8
|
||||
2 3 1 2.2 2.9
|
||||
3 1 2 2.0 3.2
|
||||
4 2 2 1.8 2.8
|
||||
5 3 2 1.9 2.4
|
||||
6 1 3 2.2 3.3
|
||||
7 2 3 2.3 3.4
|
||||
8 3 3 2.1 2.9
|
||||
>>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age')
|
||||
>>> l
|
||||
ht
|
||||
famid birth age
|
||||
1 1 1 2.8
|
||||
2 3.4
|
||||
2 1 2.9
|
||||
2 3.8
|
||||
3 1 2.2
|
||||
2 2.9
|
||||
2 1 1 2.0
|
||||
2 3.2
|
||||
2 1 1.8
|
||||
2 2.8
|
||||
3 1 1.9
|
||||
2 2.4
|
||||
3 1 1 2.2
|
||||
2 3.3
|
||||
2 1 2.3
|
||||
2 3.4
|
||||
3 1 2.1
|
||||
2 2.9
|
||||
|
||||
Going from long back to wide just takes some creative use of `unstack`
|
||||
|
||||
>>> w = l.reset_index().set_index(['famid', 'birth', 'age']).unstack()
|
||||
>>> w.columns = pd.Index(w.columns).str.join('')
|
||||
>>> w.reset_index()
|
||||
famid birth ht1 ht2
|
||||
0 1 1 2.8 3.4
|
||||
1 1 2 2.9 3.8
|
||||
2 1 3 2.2 2.9
|
||||
3 2 1 2.0 3.2
|
||||
4 2 2 1.8 2.8
|
||||
5 2 3 1.9 2.4
|
||||
6 3 1 2.2 3.3
|
||||
7 3 2 2.3 3.4
|
||||
8 3 3 2.1 2.9
|
||||
|
||||
Less wieldy column names are also handled
|
||||
|
||||
>>> df = pd.DataFrame({'A(quarterly)-2010': np.random.rand(3),
|
||||
... 'A(quarterly)-2011': np.random.rand(3),
|
||||
... 'B(quarterly)-2010': np.random.rand(3),
|
||||
... 'B(quarterly)-2011': np.random.rand(3),
|
||||
... 'X' : np.random.randint(3, size=3)})
|
||||
>>> df['id'] = df.index
|
||||
>>> df
|
||||
A(quarterly)-2010 A(quarterly)-2011 B(quarterly)-2010 B(quarterly)-2011
|
||||
0 0.531828 0.724455 0.322959 0.293714
|
||||
1 0.634401 0.611024 0.361789 0.630976
|
||||
2 0.849432 0.722443 0.228263 0.092105
|
||||
\
|
||||
X id
|
||||
0 0 0
|
||||
1 1 1
|
||||
2 2 2
|
||||
>>> pd.wide_to_long(df, ['A(quarterly)', 'B(quarterly)'],
|
||||
i='id', j='year', sep='-')
|
||||
X A(quarterly) B(quarterly)
|
||||
id year
|
||||
0 2010 0 0.531828 0.322959
|
||||
1 2010 2 0.634401 0.361789
|
||||
2 2010 2 0.849432 0.228263
|
||||
0 2011 0 0.724455 0.293714
|
||||
1 2011 2 0.611024 0.630976
|
||||
2 2011 2 0.722443 0.092105
|
||||
|
||||
If we have many columns, we could also use a regex to find our
|
||||
stubnames and pass that list on to wide_to_long
|
||||
|
||||
>>> stubnames = set([match[0] for match in
|
||||
df.columns.str.findall('[A-B]\(.*\)').values
|
||||
if match != [] ])
|
||||
>>> list(stubnames)
|
||||
['B(quarterly)', 'A(quarterly)']
|
||||
|
||||
Notes
|
||||
-----
|
||||
All extra variables are treated as extra id variables. This simply uses
|
||||
All extra variables are left untouched. This simply uses
|
||||
`pandas.melt` under the hood, but is hard-coded to "do the right thing"
|
||||
in a typical case.
|
||||
"""
|
||||
|
||||
def get_var_names(df, regex):
|
||||
def get_var_names(df, stub, sep, suffix):
|
||||
regex = "^{0}{1}{2}".format(re.escape(stub), re.escape(sep), suffix)
|
||||
return df.filter(regex=regex).columns.tolist()
|
||||
|
||||
def melt_stub(df, stub, i, j):
|
||||
varnames = get_var_names(df, "^" + stub)
|
||||
newdf = melt(df, id_vars=i, value_vars=varnames, value_name=stub,
|
||||
var_name=j)
|
||||
newdf_j = newdf[j].str.replace(stub, "")
|
||||
try:
|
||||
newdf_j = newdf_j.astype(int)
|
||||
except ValueError:
|
||||
pass
|
||||
newdf[j] = newdf_j
|
||||
return newdf
|
||||
def melt_stub(df, stub, i, j, value_vars, sep):
|
||||
newdf = melt(df, id_vars=i, value_vars=value_vars,
|
||||
value_name=stub.rstrip(sep), var_name=j)
|
||||
newdf[j] = Categorical(newdf[j])
|
||||
newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "")
|
||||
|
||||
id_vars = get_var_names(df, "^(?!%s)" % "|".join(stubnames))
|
||||
if i not in id_vars:
|
||||
id_vars += [i]
|
||||
return newdf.set_index(i + [j])
|
||||
|
||||
newdf = melt_stub(df, stubnames[0], id_vars, j)
|
||||
if any(map(lambda s: s in df.columns.tolist(), stubnames)):
|
||||
raise ValueError("stubname can't be identical to a column name")
|
||||
|
||||
for stub in stubnames[1:]:
|
||||
new = melt_stub(df, stub, id_vars, j)
|
||||
newdf = newdf.merge(new, how="outer", on=id_vars + [j], copy=False)
|
||||
return newdf.set_index([i, j])
|
||||
if not is_list_like(stubnames):
|
||||
stubnames = [stubnames]
|
||||
else:
|
||||
stubnames = list(stubnames)
|
||||
|
||||
if not is_list_like(i):
|
||||
i = [i]
|
||||
else:
|
||||
i = list(i)
|
||||
|
||||
value_vars = list(map(lambda stub:
|
||||
get_var_names(df, stub, sep, suffix), stubnames))
|
||||
|
||||
value_vars_flattened = [e for sublist in value_vars for e in sublist]
|
||||
id_vars = list(set(df.columns.tolist()).difference(value_vars_flattened))
|
||||
|
||||
melted = []
|
||||
for s, v in zip(stubnames, value_vars):
|
||||
melted.append(melt_stub(df, s, i, j, v, sep))
|
||||
melted = melted[0].join(melted[1:], how='outer')
|
||||
|
||||
if len(i) == 1:
|
||||
new = df[id_vars].set_index(i).join(melted)
|
||||
return new
|
||||
|
||||
new = df[id_vars].merge(melted.reset_index(), on=i).set_index(i + [j])
|
||||
|
||||
return new
|
||||
|
||||
|
||||
def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
@ -1455,9 +1455,9 @@ class CSVFormatter(object):
|
|||
f = self.path_or_buf
|
||||
close = False
|
||||
else:
|
||||
f = _get_handle(self.path_or_buf, self.mode,
|
||||
encoding=self.encoding,
|
||||
compression=self.compression)
|
||||
f, handles = _get_handle(self.path_or_buf, self.mode,
|
||||
encoding=self.encoding,
|
||||
compression=self.compression)
|
||||
close = True
|
||||
|
||||
try:
@ -1,11 +1,9 @@
|
|||
"""Common IO api utilities"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import csv
|
||||
import codecs
|
||||
import mmap
|
||||
import zipfile
|
||||
from contextlib import contextmanager, closing
|
||||
|
||||
from pandas.compat import StringIO, BytesIO, string_types, text_type
|
||||
|
@ -141,39 +139,6 @@ def _is_s3_url(url):
|
|||
return False
|
||||
|
||||
|
||||
def maybe_read_encoded_stream(reader, encoding=None, compression=None):
|
||||
"""read an encoded stream from the reader and transform the bytes to
|
||||
unicode if required based on the encoding
|
||||
|
||||
Parameters
|
||||
----------
|
||||
reader : a streamable file-like object
|
||||
encoding : optional, the encoding to attempt to read
|
||||
|
||||
Returns
|
||||
-------
|
||||
a tuple of (a stream of decoded bytes, the encoding which was used)
|
||||
|
||||
"""
|
||||
|
||||
if compat.PY3 or encoding is not None: # pragma: no cover
|
||||
if encoding:
|
||||
errors = 'strict'
|
||||
else:
|
||||
errors = 'replace'
|
||||
encoding = 'utf-8'
|
||||
|
||||
if compression == 'gzip':
|
||||
reader = BytesIO(reader.read())
|
||||
else:
|
||||
reader = StringIO(reader.read().decode(encoding, errors))
|
||||
else:
|
||||
if compression == 'gzip':
|
||||
reader = BytesIO(reader.read())
|
||||
encoding = None
|
||||
return reader, encoding
|
||||
|
||||
|
||||
def _expand_user(filepath_or_buffer):
|
||||
"""Return the argument with an initial component of ~ or ~user
|
||||
replaced by that user's home directory.
|
||||
|
@ -237,18 +202,14 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
|
|||
"""
|
||||
|
||||
if _is_url(filepath_or_buffer):
|
||||
req = _urlopen(str(filepath_or_buffer))
|
||||
if compression == 'infer':
|
||||
content_encoding = req.headers.get('Content-Encoding', None)
|
||||
if content_encoding == 'gzip':
|
||||
compression = 'gzip'
|
||||
else:
|
||||
compression = None
|
||||
# cat on the compression to the tuple returned by the function
|
||||
to_return = (list(maybe_read_encoded_stream(req, encoding,
|
||||
compression)) +
|
||||
[compression])
|
||||
return tuple(to_return)
|
||||
url = str(filepath_or_buffer)
|
||||
req = _urlopen(url)
|
||||
content_encoding = req.headers.get('Content-Encoding', None)
|
||||
if content_encoding == 'gzip':
|
||||
# Override compression based on Content-Encoding header
|
||||
compression = 'gzip'
|
||||
reader = BytesIO(req.read())
|
||||
return reader, encoding, compression
|
||||
|
||||
if _is_s3_url(filepath_or_buffer):
|
||||
from pandas.io.s3 import get_filepath_or_buffer
|
||||
|
@ -276,64 +237,161 @@ def file_path_to_url(path):
|
|||
return urljoin('file:', pathname2url(path))
|
||||
|
||||
|
||||
# ZipFile is not a context manager for <= 2.6
|
||||
# must be tuple index here since 2.6 doesn't use namedtuple for version_info
|
||||
if sys.version_info[1] <= 6:
|
||||
@contextmanager
|
||||
def ZipFile(*args, **kwargs):
|
||||
with closing(zipfile.ZipFile(*args, **kwargs)) as zf:
|
||||
yield zf
|
||||
else:
|
||||
ZipFile = zipfile.ZipFile
|
||||
_compression_to_extension = {
|
||||
'gzip': '.gz',
|
||||
'bz2': '.bz2',
|
||||
'zip': '.zip',
|
||||
'xz': '.xz',
|
||||
}
|
||||
|
||||
|
||||
def _get_handle(path, mode, encoding=None, compression=None, memory_map=False):
|
||||
"""Gets file handle for given path and mode.
|
||||
def _infer_compression(filepath_or_buffer, compression):
|
||||
"""
|
||||
if compression is not None:
|
||||
if encoding is not None and not compat.PY3:
|
||||
msg = 'encoding + compression not yet supported in Python 2'
|
||||
Get file handle for given path/buffer and mode.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filepath_or_buf :
|
||||
a path (str) or buffer
|
||||
compression : str, or None
|
||||
|
||||
Returns
|
||||
-------
|
||||
string compression method, None
|
||||
|
||||
Raises
|
||||
------
|
||||
ValueError on invalid compression specified
|
||||
|
||||
If compression='infer', the compression method is inferred from the filename extension; otherwise the given compression method is validated.
|
||||
"""
|
||||
|
||||
# No compression has been explicitly specified
|
||||
if compression is None:
|
||||
return None
|
||||
|
||||
# Cannot infer compression of a buffer. Hence assume no compression.
|
||||
is_path = isinstance(filepath_or_buffer, compat.string_types)
|
||||
if compression == 'infer' and not is_path:
|
||||
return None
|
||||
|
||||
# Infer compression from the filename/URL extension
|
||||
if compression == 'infer':
|
||||
for compression, extension in _compression_to_extension.items():
|
||||
if filepath_or_buffer.endswith(extension):
|
||||
return compression
|
||||
return None
|
||||
|
||||
# Compression has been specified. Check that it's valid
|
||||
if compression in _compression_to_extension:
|
||||
return compression
|
||||
|
||||
msg = 'Unrecognized compression type: {}'.format(compression)
|
||||
valid = ['infer', None] + sorted(_compression_to_extension)
|
||||
msg += '\nValid compression types are {}'.format(valid)
|
||||
raise ValueError(msg)
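A small usage sketch of the new helper (``_infer_compression`` is private; the import path reflects this module as of the change, and the file names are illustrative):

    from pandas.io.common import _infer_compression

    _infer_compression('data.csv.gz', 'infer')  # -> 'gzip', inferred from the extension
    _infer_compression('data.csv', 'infer')     # -> None, no recognizable extension
    _infer_compression('data.csv', 'bz2')       # -> 'bz2', explicit and valid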
|
||||
|
||||
|
||||
def _get_handle(path_or_buf, mode, encoding=None, compression=None,
|
||||
memory_map=False):
|
||||
"""
|
||||
Get file handle for given path/buffer and mode.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path_or_buf :
|
||||
a path (str) or buffer
|
||||
mode : str
|
||||
mode to open path_or_buf with
|
||||
encoding : str or None
|
||||
compression : str or None
|
||||
Supported compression protocols are gzip, bz2, zip, and xz
|
||||
memory_map : boolean, default False
|
||||
See parsers._parser_params for more information.
|
||||
|
||||
Returns
|
||||
-------
|
||||
f : file-like
|
||||
A file-like object
|
||||
handles : list of file-like objects
|
||||
A list of file-like objects that were opened in this function.
|
||||
"""
|
||||
|
||||
handles = list()
|
||||
f = path_or_buf
|
||||
is_path = isinstance(path_or_buf, compat.string_types)
|
||||
|
||||
if compression:
|
||||
|
||||
if compat.PY2 and not is_path and encoding:
|
||||
msg = 'compression with encoding is not yet supported in Python 2'
|
||||
raise ValueError(msg)
|
||||
|
||||
# GZ Compression
|
||||
if compression == 'gzip':
|
||||
import gzip
|
||||
f = gzip.GzipFile(path, mode)
|
||||
if is_path:
|
||||
f = gzip.open(path_or_buf, mode)
|
||||
else:
|
||||
f = gzip.GzipFile(fileobj=path_or_buf)
|
||||
|
||||
# BZ Compression
|
||||
elif compression == 'bz2':
|
||||
import bz2
|
||||
f = bz2.BZ2File(path, mode)
|
||||
if is_path:
|
||||
f = bz2.BZ2File(path_or_buf, mode)
|
||||
elif compat.PY2:
|
||||
# Python 2's bz2 module can't take file objects, so have to
|
||||
# run through decompress manually
|
||||
f = StringIO(bz2.decompress(path_or_buf.read()))
|
||||
path_or_buf.close()
|
||||
else:
|
||||
f = bz2.BZ2File(path_or_buf)
|
||||
|
||||
# ZIP Compression
|
||||
elif compression == 'zip':
|
||||
import zipfile
|
||||
zip_file = zipfile.ZipFile(path)
|
||||
zip_file = zipfile.ZipFile(path_or_buf)
|
||||
zip_names = zip_file.namelist()
|
||||
|
||||
if len(zip_names) == 1:
|
||||
file_name = zip_names.pop()
|
||||
f = zip_file.open(file_name)
|
||||
f = zip_file.open(zip_names.pop())
|
||||
elif len(zip_names) == 0:
|
||||
raise ValueError('Zero files found in ZIP file {}'
|
||||
.format(path))
|
||||
.format(path_or_buf))
|
||||
else:
|
||||
raise ValueError('Multiple files found in ZIP file.'
|
||||
' Only one file per ZIP :{}'
|
||||
' Only one file per ZIP: {}'
|
||||
.format(zip_names))
|
||||
|
||||
# XZ Compression
|
||||
elif compression == 'xz':
|
||||
lzma = compat.import_lzma()
|
||||
f = lzma.LZMAFile(path, mode)
|
||||
f = lzma.LZMAFile(path_or_buf, mode)
|
||||
|
||||
# Unrecognized Compression
|
||||
else:
|
||||
raise ValueError('Unrecognized compression type: %s' %
|
||||
compression)
|
||||
if compat.PY3:
|
||||
from io import TextIOWrapper
|
||||
f = TextIOWrapper(f, encoding=encoding)
|
||||
return f
|
||||
else:
|
||||
if compat.PY3:
|
||||
if encoding:
|
||||
f = open(path, mode, encoding=encoding)
|
||||
else:
|
||||
f = open(path, mode, errors='replace')
|
||||
msg = 'Unrecognized compression type: {}'.format(compression)
|
||||
raise ValueError(msg)
|
||||
|
||||
handles.append(f)
|
||||
|
||||
elif is_path:
|
||||
if compat.PY2:
|
||||
# Python 2
|
||||
f = open(path_or_buf, mode)
|
||||
elif encoding:
|
||||
# Python 3 and encoding
|
||||
f = open(path_or_buf, mode, encoding=encoding)
|
||||
else:
|
||||
f = open(path, mode)
|
||||
# Python 3 and no explicit encoding
|
||||
f = open(path_or_buf, mode, errors='replace')
|
||||
handles.append(f)
|
||||
|
||||
# in Python 3, convert BytesIO or fileobjects passed with an encoding
|
||||
if compat.PY3 and (compression or isinstance(f, compat.BytesIO)):
|
||||
from io import TextIOWrapper
|
||||
f = TextIOWrapper(f, encoding=encoding)
|
||||
handles.append(f)
|
||||
|
||||
if memory_map and hasattr(f, 'fileno'):
|
||||
try:
|
||||
|
@ -347,7 +405,7 @@ def _get_handle(path, mode, encoding=None, compression=None, memory_map=False):
|
|||
# leave the file handler as is then
|
||||
pass
|
||||
|
||||
return f
|
||||
return f, handles
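A sketch of the updated calling convention, since ``_get_handle`` now returns the handle plus every file object it opened (the function is private and the file name is illustrative):

    from pandas.io.common import _get_handle

    f, handles = _get_handle('data.csv.gz', 'r', encoding='utf-8',
                             compression='gzip')
    try:
        data = f.read()
    finally:
        # callers are responsible for closing everything opened here,
        # as the read_json and parser call sites below now do
        for h in handles:
            h.close()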
|
||||
|
||||
|
||||
class MMapWrapper(BaseIterator):
@ -259,8 +259,10 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
|
|||
exists = False
|
||||
|
||||
if exists:
|
||||
with _get_handle(filepath_or_buffer, 'r', encoding=encoding) as fh:
|
||||
json = fh.read()
|
||||
fh, handles = _get_handle(filepath_or_buffer, 'r',
|
||||
encoding=encoding)
|
||||
json = fh.read()
|
||||
fh.close()
|
||||
else:
|
||||
json = filepath_or_buffer
|
||||
elif hasattr(filepath_or_buffer, 'read'):
|
||||
|
@ -723,7 +725,9 @@ def nested_to_record(ds, prefix="", level=0):
|
|||
|
||||
def json_normalize(data, record_path=None, meta=None,
|
||||
meta_prefix=None,
|
||||
record_prefix=None):
|
||||
record_prefix=None,
|
||||
errors='raise'):
|
||||
|
||||
"""
|
||||
"Normalize" semi-structured JSON data into a flat table
|
||||
|
||||
|
@ -740,6 +744,13 @@ def json_normalize(data, record_path=None, meta=None,
|
|||
If True, prefix records with dotted (?) path, e.g. foo.bar.field if
|
||||
path to records is ['foo', 'bar']
|
||||
meta_prefix : string, default None
|
||||
errors : {'raise', 'ignore'}, default 'raise'
|
||||
* ignore : will ignore KeyError if keys listed in meta are not
|
||||
always present
|
||||
* raise : will raise KeyError if keys listed in meta are not
|
||||
always present
|
||||
|
||||
.. versionadded:: 0.20.0
|
||||
|
||||
Returns
|
||||
-------
|
||||
|
@ -839,7 +850,16 @@ def json_normalize(data, record_path=None, meta=None,
|
|||
if level + 1 > len(val):
|
||||
meta_val = seen_meta[key]
|
||||
else:
|
||||
meta_val = _pull_field(obj, val[level:])
|
||||
try:
|
||||
meta_val = _pull_field(obj, val[level:])
|
||||
except KeyError as e:
|
||||
if errors == 'ignore':
|
||||
meta_val = np.nan
|
||||
else:
|
||||
raise \
|
||||
KeyError("Try running with "
|
||||
"errors='ignore' as key "
|
||||
"%s is not always present", e)
|
||||
meta_vals[key].append(meta_val)
|
||||
|
||||
records.extend(recs)
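A condensed example of the new ``errors`` option (the data is illustrative; a fuller case appears in the test added further below):

    from pandas.io.json import json_normalize

    data = [{'state': 'FL', 'info': {'governor': 'Rick Scott'},
             'counties': [{'name': 'Dade', 'population': 12345}]},
            {'state': 'OH',                     # no 'info' key here
             'counties': [{'name': 'Summit', 'population': 1234}]}]

    # the missing meta key becomes NaN instead of raising a KeyError
    json_normalize(data, record_path='counties',
                   meta=['state', ['info', 'governor']],
                   errors='ignore')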
@ -27,12 +27,11 @@ from pandas.core.series import Series
|
|||
from pandas.core.frame import DataFrame
|
||||
from pandas.core.categorical import Categorical
|
||||
from pandas.core.common import AbstractMethodError
|
||||
from pandas.core.config import get_option
|
||||
from pandas.io.date_converters import generic_parser
|
||||
from pandas.io.common import (get_filepath_or_buffer, _validate_header_arg,
|
||||
_get_handle, UnicodeReader, UTF8Recoder,
|
||||
BaseIterator, ParserError, EmptyDataError,
|
||||
ParserWarning, _NA_VALUES)
|
||||
ParserWarning, _NA_VALUES, _infer_compression)
|
||||
from pandas.tseries import tools
|
||||
|
||||
from pandas.util.decorators import Appender
|
||||
|
@ -168,6 +167,10 @@ default False
|
|||
* dict, e.g. {'foo' : [1, 3]} -> parse columns 1, 3 as date and call result
|
||||
'foo'
|
||||
|
||||
If a column or index contains an unparseable date, the entire column or
|
||||
index will be returned unaltered as an object data type. For non-standard
|
||||
datetime parsing, use ``pd.to_datetime`` after ``pd.read_csv``
|
||||
|
||||
Note: A fast-path exists for iso8601-formatted dates.
|
||||
infer_datetime_format : boolean, default False
|
||||
If True and parse_dates is enabled, pandas will attempt to infer the format
|
||||
|
@ -354,37 +357,17 @@ def _validate_nrows(nrows):
|
|||
|
||||
|
||||
def _read(filepath_or_buffer, kwds):
|
||||
"Generic reader of line files."
|
||||
"""Generic reader of line files."""
|
||||
encoding = kwds.get('encoding', None)
|
||||
if encoding is not None:
|
||||
encoding = re.sub('_', '-', encoding).lower()
|
||||
kwds['encoding'] = encoding
|
||||
|
||||
# If the input could be a filename, check for a recognizable compression
|
||||
# extension. If we're reading from a URL, the `get_filepath_or_buffer`
|
||||
# will use header info to determine compression, so use what it finds in
|
||||
# that case.
|
||||
inferred_compression = kwds.get('compression')
|
||||
if inferred_compression == 'infer':
|
||||
if isinstance(filepath_or_buffer, compat.string_types):
|
||||
if filepath_or_buffer.endswith('.gz'):
|
||||
inferred_compression = 'gzip'
|
||||
elif filepath_or_buffer.endswith('.bz2'):
|
||||
inferred_compression = 'bz2'
|
||||
elif filepath_or_buffer.endswith('.zip'):
|
||||
inferred_compression = 'zip'
|
||||
elif filepath_or_buffer.endswith('.xz'):
|
||||
inferred_compression = 'xz'
|
||||
else:
|
||||
inferred_compression = None
|
||||
else:
|
||||
inferred_compression = None
|
||||
|
||||
compression = kwds.get('compression')
|
||||
compression = _infer_compression(filepath_or_buffer, compression)
|
||||
filepath_or_buffer, _, compression = get_filepath_or_buffer(
|
||||
filepath_or_buffer, encoding,
|
||||
compression=kwds.get('compression', None))
|
||||
kwds['compression'] = (inferred_compression if compression == 'infer'
|
||||
else compression)
|
||||
filepath_or_buffer, encoding, compression)
|
||||
kwds['compression'] = compression
|
||||
|
||||
if kwds.get('date_parser', None) is not None:
|
||||
if isinstance(kwds['parse_dates'], bool):
|
||||
|
@ -1771,70 +1754,6 @@ def count_empty_vals(vals):
|
|||
return sum([1 for v in vals if v == '' or v is None])
|
||||
|
||||
|
||||
def _wrap_compressed(f, compression, encoding=None):
|
||||
"""wraps compressed fileobject in a decompressing fileobject
|
||||
NOTE: For all files in Python 3.2 and for bzip'd files under all Python
|
||||
versions, this means reading in the entire file and then re-wrapping it in
|
||||
StringIO.
|
||||
"""
|
||||
compression = compression.lower()
|
||||
encoding = encoding or get_option('display.encoding')
|
||||
|
||||
if compression == 'gzip':
|
||||
import gzip
|
||||
|
||||
f = gzip.GzipFile(fileobj=f)
|
||||
if compat.PY3:
|
||||
from io import TextIOWrapper
|
||||
|
||||
f = TextIOWrapper(f)
|
||||
return f
|
||||
elif compression == 'bz2':
|
||||
import bz2
|
||||
|
||||
if compat.PY3:
|
||||
f = bz2.open(f, 'rt', encoding=encoding)
|
||||
else:
|
||||
# Python 2's bz2 module can't take file objects, so have to
|
||||
# run through decompress manually
|
||||
data = bz2.decompress(f.read())
|
||||
f = StringIO(data)
|
||||
return f
|
||||
elif compression == 'zip':
|
||||
import zipfile
|
||||
zip_file = zipfile.ZipFile(f)
|
||||
zip_names = zip_file.namelist()
|
||||
|
||||
if len(zip_names) == 1:
|
||||
file_name = zip_names.pop()
|
||||
f = zip_file.open(file_name)
|
||||
return f
|
||||
|
||||
elif len(zip_names) == 0:
|
||||
raise ValueError('Corrupted or zero files found in compressed '
|
||||
'zip file %s', zip_file.filename)
|
||||
|
||||
else:
|
||||
raise ValueError('Multiple files found in compressed '
|
||||
'zip file %s', str(zip_names))
|
||||
|
||||
elif compression == 'xz':
|
||||
|
||||
lzma = compat.import_lzma()
|
||||
f = lzma.LZMAFile(f)
|
||||
|
||||
if compat.PY3:
|
||||
from io import TextIOWrapper
|
||||
|
||||
f = TextIOWrapper(f)
|
||||
|
||||
return f
|
||||
|
||||
else:
|
||||
raise ValueError('do not recognize compression method %s'
|
||||
% compression)
|
||||
|
||||
|
||||
class PythonParser(ParserBase):
|
||||
|
||||
def __init__(self, f, **kwds):
|
||||
|
@ -1890,20 +1809,10 @@ class PythonParser(ParserBase):
|
|||
self.comment = kwds['comment']
|
||||
self._comment_lines = []
|
||||
|
||||
if isinstance(f, compat.string_types):
|
||||
f = _get_handle(f, 'r', encoding=self.encoding,
|
||||
compression=self.compression,
|
||||
memory_map=self.memory_map)
|
||||
self.handles.append(f)
|
||||
elif self.compression:
|
||||
f = _wrap_compressed(f, self.compression, self.encoding)
|
||||
self.handles.append(f)
|
||||
# in Python 3, convert BytesIO or fileobjects passed with an encoding
|
||||
elif compat.PY3 and isinstance(f, compat.BytesIO):
|
||||
from io import TextIOWrapper
|
||||
|
||||
f = TextIOWrapper(f, encoding=self.encoding)
|
||||
self.handles.append(f)
|
||||
f, handles = _get_handle(f, 'r', encoding=self.encoding,
|
||||
compression=self.compression,
|
||||
memory_map=self.memory_map)
|
||||
self.handles.extend(handles)
|
||||
|
||||
# Set self.data to something that can read lines.
|
||||
if hasattr(f, 'readline'):
@ -99,9 +99,7 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
|
|||
conn = boto.connect_s3(host=s3_host, anon=True)
|
||||
|
||||
b = conn.get_bucket(parsed_url.netloc, validate=False)
|
||||
if compat.PY2 and (compression == 'gzip' or
|
||||
(compression == 'infer' and
|
||||
filepath_or_buffer.endswith(".gz"))):
|
||||
if compat.PY2 and compression:
|
||||
k = boto.s3.key.Key(b, parsed_url.path)
|
||||
filepath_or_buffer = BytesIO(k.get_contents_as_string(
|
||||
encoding=encoding))
@ -225,6 +225,65 @@ class TestNestedToRecord(tm.TestCase):
|
|||
|
||||
self.assertEqual(result, expected)
|
||||
|
||||
def test_json_normalize_errors(self):
|
||||
# GH14583: If meta keys are not always present
|
||||
# a new option to set errors='ignore' has been implemented
|
||||
i = {
|
||||
"Trades": [{
|
||||
"general": {
|
||||
"tradeid": 100,
|
||||
"trade_version": 1,
|
||||
"stocks": [{
|
||||
|
||||
"symbol": "AAPL",
|
||||
"name": "Apple",
|
||||
"price": "0"
|
||||
}, {
|
||||
"symbol": "GOOG",
|
||||
"name": "Google",
|
||||
"price": "0"
|
||||
}
|
||||
]
|
||||
}
|
||||
}, {
|
||||
"general": {
|
||||
"tradeid": 100,
|
||||
"stocks": [{
|
||||
"symbol": "AAPL",
|
||||
"name": "Apple",
|
||||
"price": "0"
|
||||
}, {
|
||||
"symbol": "GOOG",
|
||||
"name": "Google",
|
||||
"price": "0"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
j = json_normalize(data=i['Trades'],
|
||||
record_path=[['general', 'stocks']],
|
||||
meta=[['general', 'tradeid'],
|
||||
['general', 'trade_version']],
|
||||
errors='ignore')
|
||||
expected = {'general.trade_version': {0: 1.0, 1: 1.0, 2: '', 3: ''},
|
||||
'general.tradeid': {0: 100, 1: 100, 2: 100, 3: 100},
|
||||
'name': {0: 'Apple', 1: 'Google', 2: 'Apple', 3: 'Google'},
|
||||
'price': {0: '0', 1: '0', 2: '0', 3: '0'},
|
||||
'symbol': {0: 'AAPL', 1: 'GOOG', 2: 'AAPL', 3: 'GOOG'}}
|
||||
|
||||
self.assertEqual(j.fillna('').to_dict(), expected)
|
||||
|
||||
self.assertRaises(KeyError,
|
||||
json_normalize, data=i['Trades'],
|
||||
record_path=[['general', 'stocks']],
|
||||
meta=[['general', 'tradeid'],
|
||||
['general', 'trade_version']],
|
||||
errors='raise'
|
||||
)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb',
|
||||
'--pdb-failure', '-s'], exit=False)
@ -168,3 +168,8 @@ class CompressionTests(object):
|
|||
tm.assert_frame_equal(expected, df)
|
||||
|
||||
inputs[3].close()
|
||||
|
||||
def test_invalid_compression(self):
|
||||
msg = 'Unrecognized compression type: sfark'
|
||||
with tm.assertRaisesRegexp(ValueError, msg):
|
||||
self.read_csv('test_file.zip', compression='sfark')
@ -7,6 +7,8 @@ and hence require a network connection to be read.
|
|||
|
||||
import os
|
||||
import nose
|
||||
import functools
|
||||
from itertools import product
|
||||
|
||||
import pandas.util.testing as tm
|
||||
from pandas import DataFrame
|
||||
|
@ -14,24 +16,40 @@ from pandas import compat
|
|||
from pandas.io.parsers import read_csv, read_table
|
||||
|
||||
|
||||
class TestUrlGz(tm.TestCase):
|
||||
class TestCompressedUrl(object):
|
||||
|
||||
def setUp(self):
|
||||
dirpath = tm.get_data_path()
|
||||
localtable = os.path.join(dirpath, 'salaries.csv')
|
||||
self.local_table = read_table(localtable)
|
||||
compression_to_extension = {
|
||||
'gzip': '.gz',
|
||||
'bz2': '.bz2',
|
||||
'zip': '.zip',
|
||||
'xz': '.xz',
|
||||
}
|
||||
|
||||
@tm.network
|
||||
def test_url_gz(self):
|
||||
url = ('https://raw.github.com/pandas-dev/pandas/'
|
||||
'master/pandas/io/tests/parser/data/salaries.csv.gz')
|
||||
url_table = read_table(url, compression="gzip", engine="python")
|
||||
tm.assert_frame_equal(url_table, self.local_table)
|
||||
def __init__(self):
|
||||
path = os.path.join(tm.get_data_path(), 'salaries.csv')
|
||||
self.local_table = read_table(path)
|
||||
self.base_url = ('https://github.com/pandas-dev/pandas/raw/master/'
|
||||
'pandas/io/tests/parser/data/salaries.csv')
|
||||
|
||||
@tm.network
|
||||
def test_url_gz_infer(self):
|
||||
url = 'https://s3.amazonaws.com/pandas-test/salary.table.gz'
|
||||
url_table = read_table(url, compression="infer", engine="python")
|
||||
def test_compressed_urls(self):
|
||||
"""Test reading compressed tables from URL."""
|
||||
msg = ('Test reading {}-compressed tables from URL: '
|
||||
'compression="{}", engine="{}"')
|
||||
|
||||
for compression, extension in self.compression_to_extension.items():
|
||||
url = self.base_url + extension
|
||||
# args is a (compression, engine) tuple
|
||||
for args in product([compression, 'infer'], ['python']):
|
||||
# test_fxn is a workaround for more descriptive nose reporting.
|
||||
# See http://stackoverflow.com/a/37393684/4651668.
|
||||
test_fxn = functools.partial(self.check_table)
|
||||
test_fxn.description = msg.format(compression, *args)
|
||||
yield (test_fxn, url) + args
|
||||
|
||||
def check_table(self, url, compression, engine):
|
||||
if url.endswith('.xz'):
|
||||
tm._skip_if_no_lzma()
|
||||
url_table = read_table(url, compression=compression, engine=engine)
|
||||
tm.assert_frame_equal(url_table, self.local_table)
@ -138,6 +138,19 @@ date,time,prn,rxstatus
|
|||
names=['datetime', 'prn']))
|
||||
assert_frame_equal(df, df_correct)
|
||||
|
||||
def test_parse_date_column_with_empty_string(self):
|
||||
# GH 6428
|
||||
data = """case,opdate
|
||||
7,10/18/2006
|
||||
7,10/18/2008
|
||||
621, """
|
||||
result = read_csv(StringIO(data), parse_dates=['opdate'])
|
||||
expected_data = [[7, '10/18/2006'],
|
||||
[7, '10/18/2008'],
|
||||
[621, ' ']]
|
||||
expected = DataFrame(expected_data, columns=['case', 'opdate'])
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
if __name__ == '__main__':
|
||||
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
|
||||
exit=False)
@ -45,12 +45,12 @@ cdef bint PY2 = version_info[0] == 2
|
|||
|
||||
cdef int64_t NPY_NAT = util.get_nat()
|
||||
|
||||
cdef int US_RESO = frequencies.US_RESO
|
||||
cdef int MS_RESO = frequencies.MS_RESO
|
||||
cdef int S_RESO = frequencies.S_RESO
|
||||
cdef int T_RESO = frequencies.T_RESO
|
||||
cdef int H_RESO = frequencies.H_RESO
|
||||
cdef int D_RESO = frequencies.D_RESO
|
||||
cdef int RESO_US = frequencies.RESO_US
|
||||
cdef int RESO_MS = frequencies.RESO_MS
|
||||
cdef int RESO_SEC = frequencies.RESO_SEC
|
||||
cdef int RESO_MIN = frequencies.RESO_MIN
|
||||
cdef int RESO_HR = frequencies.RESO_HR
|
||||
cdef int RESO_DAY = frequencies.RESO_DAY
|
||||
|
||||
cdef extern from "period_helper.h":
|
||||
ctypedef struct date_info:
|
||||
|
@ -516,7 +516,7 @@ cpdef resolution(ndarray[int64_t] stamps, tz=None):
|
|||
cdef:
|
||||
Py_ssize_t i, n = len(stamps)
|
||||
pandas_datetimestruct dts
|
||||
int reso = D_RESO, curr_reso
|
||||
int reso = RESO_DAY, curr_reso
|
||||
|
||||
if tz is not None:
|
||||
tz = maybe_get_tz(tz)