BUG: Improve error message for multi-char sep and quotes in Python engine (#14582)

If there is a field count mismatch, check whether
a multi-char sep was used in conjunction with quotes.
Currently, quotes are not respected in that setup,
which can result in improper line breaks.

Closes gh-13374.
This commit is contained in:
gfyoung 2016-11-25 16:21:03 -05:00 committed by Joris Van den Bossche
parent b1d95990b4
commit d8e427bda0
3 changed files with 23 additions and 0 deletions

View File

@ -30,6 +30,7 @@ Bug Fixes
- Compat with ``dateutil==2.6.0``; segfault reported in the testing suite (:issue:`14621`)
- Allow ``nanoseconds`` in ``Timestamp.replace`` as a kwarg (:issue:`14621`)
- Bug in ``pd.read_csv`` where reading files fails, if the number of headers is equal to the number of lines in the file (:issue:`14515`)
- Bug in ``pd.read_csv`` for the Python engine in which an unhelpful error message was being raised when multi-char delimiters were not being respected with quotes (:issue:`14582`)

View File

@ -2515,6 +2515,11 @@ class PythonParser(ParserBase):
msg = ('Expected %d fields in line %d, saw %d' %
(col_len, row_num + 1, zip_len))
if len(self.delimiter) > 1 and self.quoting != csv.QUOTE_NONE:
# see gh-13374
reason = ('Error could possibly be due to quotes being '
'ignored when a multi-char delimiter is used.')
msg += '. ' + reason
raise ValueError(msg)
if self.usecols:

View File

@ -7,6 +7,7 @@ these tests out of this module as soon as the C parser can accept further
arguments when parsing.
"""
import csv
import sys
import nose
@ -204,3 +205,19 @@ x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""
sep=sep, names=['a', 'b'],
encoding=encoding)
tm.assert_frame_equal(result, expected)
def test_multi_char_sep_quotes(self):
    """Check the multi-char delimiter hint in the ValueError message.

    Reads data containing a quoted field together with a two-character
    separator and expects a ValueError whose message matches the hint
    about quotes being ignored. With ``quoting=csv.QUOTE_NONE`` the
    message is expected not to match the hint.
    """
    # see gh-13374
    csv_text = 'a,,b\n1,,a\n2,,"2,,b"'
    hint = 'ignored when a multi-char delimiter is used'
    read_kwargs = dict(sep=',,')

    with tm.assertRaisesRegexp(ValueError, hint):
        self.read_csv(StringIO(csv_text), **read_kwargs)

    # The inner context manager is expected to find no match for the
    # hint, so it raises an AssertionError that the outer one must catch.
    read_kwargs['quoting'] = csv.QUOTE_NONE
    with tm.assertRaises(AssertionError):
        with tm.assertRaisesRegexp(ValueError, hint):
            self.read_csv(StringIO(csv_text), **read_kwargs)