BUG: Corrects stopping logic when nrows argument is supplied (#7626)
closes #7626

Subsets of tabular files with different "shapes" will now load when a valid skiprows/nrows is given as an argument.

Conditions for error:
1) There are different "shapes" within a tabular data file, i.e. different numbers of columns.
2) A "narrower" set of columns is followed by a "wider" (more columns) one, and the narrower set is laid out such that the end of a 262144-byte block occurs within it.

Issue summary: The C engine for parsing files reads in 262144 bytes at a time. Previously, the "start_lines" variable in tokenizer.c/tokenize_bytes() was incorrectly set to the first line of the current chunk rather than the overall first row requested. This led to incorrect logic on when to stop reading once nrows is supplied by the user. The miscount always happened, but it only caused a crash when a wider set of columns followed in the file; in other cases the extra rows were read in and then harmlessly discarded. This pull request always uses the first requested row for the comparison, so only nrows rows are parsed when the argument is supplied.

Author: Jeff Carey <jeff.carey@gmail.com>

Closes #14747 from jeffcarey/fix/7626 and squashes the following commits:

cac1bac [Jeff Carey] Removed duplicative test
6f1965a [Jeff Carey] BUG: Corrects stopping logic when nrows argument is supplied (Fixes #7626)
parent 53bf1b27c7
commit 4378f82967
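To make the stopping condition concrete, here is a minimal, self-contained sketch. It is NOT the actual pandas tokenizer.c code; parser_sketch_t and reached_line_limit are made-up names used only for illustration. It shows why the comparison has to use the row count recorded when the caller's request started, not the first row of the current 262144-byte chunk: if start_lines were re-captured at the top of every chunk (the old behaviour), rows parsed in earlier chunks would not count toward nrows and the parser would run past the limit.

/* Illustrative sketch only -- not the pandas source. */
#include <stddef.h>
#include <stdio.h>

typedef struct {
    size_t lines;   /* total rows tokenized so far, across all chunks */
} parser_sketch_t;

/* Returns nonzero once line_limit (nrows) rows have been tokenized since
 * start_lines.  start_lines must be captured once per user request, before
 * the first chunk is read. */
static int reached_line_limit(const parser_sketch_t *self,
                              size_t line_limit,
                              size_t start_lines) {
    return line_limit > 0 && self->lines >= start_lines + line_limit;
}

int main(void) {
    parser_sketch_t p = {0};
    size_t start_lines = p.lines;   /* captured once, before any chunk */
    size_t nrows = 1010;

    /* pretend the first chunk yielded 1005 rows and the second 50 more */
    p.lines += 1005;
    printf("after chunk 1: stop=%d\n", reached_line_limit(&p, nrows, start_lines));
    p.lines += 50;
    printf("after chunk 2: stop=%d\n", reached_line_limit(&p, nrows, start_lines));
    return 0;
}

With start_lines fixed at the value it had when the request began, the check trips as soon as the running row count reaches nrows, regardless of how many chunk boundaries were crossed along the way.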
@@ -70,6 +70,7 @@ Bug Fixes
 - Bug in ``pd.read_csv()`` in which the ``dtype`` parameter was not being respected for empty data (:issue:`14712`)
+- Bug in ``pd.read_csv()`` in which the ``nrows`` parameter was not being respected for large input when using the C engine for parsing (:issue:`7626`)
@@ -371,3 +371,20 @@ No,No,No"""
         result = self.read_csv(StringIO(data), names=names)
         tm.assert_frame_equal(result, expected)
+
+    def test_read_nrows_large(self):
+        # gh-7626 - Read only nrows of data in for large inputs (>262144b)
+        header_narrow = '\t'.join(['COL_HEADER_' + str(i)
+                                   for i in range(10)]) + '\n'
+        data_narrow = '\t'.join(['somedatasomedatasomedata1'
+                                 for i in range(10)]) + '\n'
+        header_wide = '\t'.join(['COL_HEADER_' + str(i)
+                                 for i in range(15)]) + '\n'
+        data_wide = '\t'.join(['somedatasomedatasomedata2'
+                               for i in range(15)]) + '\n'
+        test_input = (header_narrow + data_narrow * 1050 +
+                      header_wide + data_wide * 2)
+
+        df = self.read_csv(StringIO(test_input), sep='\t', nrows=1010)
+
+        self.assertTrue(df.size == 1010 * 10)
@@ -726,16 +726,14 @@ int skip_this_line(parser_t *self, int64_t rownum) {
     }
 }
 
-int tokenize_bytes(parser_t *self, size_t line_limit)
+int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines)
 {
-    int i, slen, start_lines;
+    int i, slen;
     long maxstreamsize;
     char c;
     char *stream;
     char *buf = self->data + self->datapos;
 
-    start_lines = self->lines;
-
     if (make_stream_space(self, self->datalen - self->datapos) < 0) {
         self->error_msg = "out of memory";
         return -1;
@@ -1384,7 +1382,7 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all) {
         TRACE(("_tokenize_helper: Trying to process %d bytes, datalen=%d, datapos= %d\n",
                self->datalen - self->datapos, self->datalen, self->datapos));
 
-        status = tokenize_bytes(self, nrows);
+        status = tokenize_bytes(self, nrows, start_lines);
 
         if (status < 0) {
             // XXX