Skip to content

ENH: Accept callable for skiprows in read_csv #15059

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions doc/source/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,16 @@ skipinitialspace : boolean, default ``False``
skiprows : list-like or integer or callable, default ``None``
Line numbers to skip (0-indexed) or number of lines to skip (int) at the start
of the file.

If callable, the callable function will be evaluated against the row
indices, returning True if the row should be skipped and False otherwise:

.. ipython:: python

data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3'
pd.read_csv(StringIO(data))
pd.read_csv(StringIO(data), skiprows=lambda x: x % 2 != 0)

skipfooter : int, default ``0``
Number of lines at bottom of file to skip (unsupported with engine='c').
skip_footer : int, default ``0``
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ Other enhancements
- ``pd.qcut`` has gained the ``duplicates='raise'|'drop'`` option to control whether to raise on duplicated edges (:issue:`7751`)
- ``Series`` provides a ``to_excel`` method to output Excel files (:issue:`8825`)
- The ``usecols`` argument in ``pd.read_csv`` now accepts a callable function as a value (:issue:`14154`)
- The ``skiprows`` argument in ``pd.read_csv`` now accepts a callable function as a value (:issue:`10882`)
- ``pd.DataFrame.plot`` now prints a title above each subplot if ``subplots=True`` and ``title`` is a list of strings (:issue:`14753`)
- ``pd.Series.interpolate`` now supports timedelta as an index type with ``method='time'`` (:issue:`6424`)
- ``pandas.io.json.json_normalize()`` gained the option ``errors='ignore'|'raise'``; the default is ``errors='raise'`` which is backward compatible. (:issue:`14583`)
Expand Down
28 changes: 20 additions & 8 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,9 +132,13 @@
Values to consider as False
skipinitialspace : boolean, default False
Skip spaces after delimiter.
skiprows : list-like or integer, default None
skiprows : list-like or integer or callable, default None
Line numbers to skip (0-indexed) or number of lines to skip (int)
at the start of the file
at the start of the file.

If callable, the callable function will be evaluated against the row
indices, returning True if the row should be skipped and False otherwise.
An example of a valid callable argument would be ``lambda x: x in [0, 2]``.
skipfooter : int, default 0
Number of lines at bottom of file to skip (Unsupported with engine='c')
skip_footer : int, default 0
Expand Down Expand Up @@ -930,7 +934,10 @@ def _clean_options(self, options, engine):
if engine != 'c':
if is_integer(skiprows):
skiprows = lrange(skiprows)
skiprows = set() if skiprows is None else set(skiprows)
if skiprows is None:
skiprows = set()
elif not callable(skiprows):
skiprows = set(skiprows)

# put stuff back
result['names'] = names
Expand Down Expand Up @@ -1851,6 +1858,11 @@ def __init__(self, f, **kwds):
self.memory_map = kwds['memory_map']
self.skiprows = kwds['skiprows']

if callable(self.skiprows):
self.skipfunc = self.skiprows
else:
self.skipfunc = lambda x: x in self.skiprows

self.skipfooter = kwds['skipfooter']
self.delimiter = kwds['delimiter']

Expand Down Expand Up @@ -2006,7 +2018,7 @@ class MyDialect(csv.Dialect):
# attempt to sniff the delimiter
if sniff_sep:
line = f.readline()
while self.pos in self.skiprows:
while self.skipfunc(self.pos):
self.pos += 1
line = f.readline()

Expand Down Expand Up @@ -2414,7 +2426,7 @@ def _empty(self, line):

def _next_line(self):
if isinstance(self.data, list):
while self.pos in self.skiprows:
while self.skipfunc(self.pos):
self.pos += 1

while True:
Expand All @@ -2433,7 +2445,7 @@ def _next_line(self):
except IndexError:
raise StopIteration
else:
while self.pos in self.skiprows:
while self.skipfunc(self.pos):
self.pos += 1
next(self.data)

Expand Down Expand Up @@ -2685,7 +2697,7 @@ def _get_lines(self, rows=None):
# Check for stop rows. n.b.: self.skiprows is a set or a callable.
if self.skiprows:
new_rows = [row for i, row in enumerate(new_rows)
if i + self.pos not in self.skiprows]
if not self.skipfunc(i + self.pos)]

lines.extend(new_rows)
self.pos = new_pos
Expand Down Expand Up @@ -2713,7 +2725,7 @@ def _get_lines(self, rows=None):
except StopIteration:
if self.skiprows:
new_rows = [row for i, row in enumerate(new_rows)
if self.pos + i not in self.skiprows]
if not self.skipfunc(i + self.pos)]
lines.extend(new_rows)
if len(lines) == 0:
raise
Expand Down
25 changes: 25 additions & 0 deletions pandas/io/tests/parser/skiprows.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import pandas.util.testing as tm

from pandas import DataFrame
from pandas.io.common import EmptyDataError
from pandas.compat import StringIO, range, lrange


Expand Down Expand Up @@ -198,3 +199,27 @@ def test_skiprows_infield_quote(self):

df = self.read_csv(StringIO(data), skiprows=2)
tm.assert_frame_equal(df, expected)

def test_skiprows_callable(self):
data = 'a\n1\n2\n3\n4\n5'

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add tests with bad functions (one that raises and one that returns a non-bool)? In particular, I want to make sure the error propagates from the C engine.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch. Done.

skiprows = lambda x: x % 2 == 0
expected = DataFrame({'1': [3, 5]})
df = self.read_csv(StringIO(data), skiprows=skiprows)
tm.assert_frame_equal(df, expected)

expected = DataFrame({'foo': [3, 5]})
df = self.read_csv(StringIO(data), skiprows=skiprows,
header=0, names=['foo'])
tm.assert_frame_equal(df, expected)

skiprows = lambda x: True
msg = "No columns to parse from file"
with tm.assertRaisesRegexp(EmptyDataError, msg):
self.read_csv(StringIO(data), skiprows=skiprows)

# This is a bad callable and should raise.
msg = "by zero"
skiprows = lambda x: 1 / 0
with tm.assertRaisesRegexp(ZeroDivisionError, msg):
self.read_csv(StringIO(data), skiprows=skiprows)
26 changes: 22 additions & 4 deletions pandas/parser.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,7 @@ cdef extern from "parser/tokenizer.h":
int header_end # header row end

void *skipset
PyObject *skipfunc
int64_t skip_first_N_rows
int skipfooter
double (*converter)(const char *, char **, char, char, char, int) nogil
Expand Down Expand Up @@ -606,9 +607,11 @@ cdef class TextReader:
cdef _make_skiprow_set(self):
if isinstance(self.skiprows, (int, np.integer)):
parser_set_skipfirstnrows(self.parser, self.skiprows)
else:
elif not callable(self.skiprows):
for i in self.skiprows:
parser_add_skiprow(self.parser, i)
else:
self.parser.skipfunc = <PyObject *> self.skiprows

cdef _setup_parser_source(self, source):
cdef:
Expand Down Expand Up @@ -2115,18 +2118,33 @@ cdef kh_float64_t* kset_float64_from_list(values) except NULL:
cdef raise_parser_error(object base, parser_t *parser):
cdef:
object old_exc
object exc_type
PyObject *type
PyObject *value
PyObject *traceback

if PyErr_Occurred():
PyErr_Fetch(&type, &value, &traceback);
Py_XDECREF(type)
PyErr_Fetch(&type, &value, &traceback)
Py_XDECREF(traceback)

if value != NULL:
old_exc = <object> value
Py_XDECREF(value)
raise old_exc

# PyErr_Fetch only returned the error message in *value,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why was this not necessary before? This seems like a lot of hoop-jumping.

Copy link
Member Author

@gfyoung gfyoung Jan 14, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think because we got lucky. This should have been caught before and processed accordingly. It's annoying, but that's what happens unfortunately when the Python C API can't promise you anything.

# so the Exception class must be extracted from *type.
if isinstance(old_exc, compat.string_types):
if type != NULL:
exc_type = <object> type
else:
exc_type = ParserError

Py_XDECREF(type)
raise exc_type(old_exc)
else:
Py_XDECREF(type)
raise old_exc

message = '%s. C error: ' % base
if parser.error_msg != NULL:
if PY3:
Expand Down
30 changes: 28 additions & 2 deletions pandas/src/parser/tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ void parser_set_default_options(parser_t *self) {
self->thousands = '\0';

self->skipset = NULL;
self->skipfunc = NULL;
self->skip_first_N_rows = -1;
self->skip_footer = 0;
}
Expand Down Expand Up @@ -679,7 +680,27 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) {
}

int skip_this_line(parser_t *self, int64_t rownum) {
if (self->skipset != NULL) {
int should_skip;
PyObject *result;
PyGILState_STATE state;

if (self->skipfunc != NULL) {
state = PyGILState_Ensure();
result = PyObject_CallFunction(self->skipfunc, "i", rownum);

// Error occurred. It will be processed
// and caught at the Cython level.
if (result == NULL) {
should_skip = -1;
} else {
should_skip = PyObject_IsTrue(result);
}

Py_XDECREF(result);
PyGILState_Release(state);

return should_skip;
} else if (self->skipset != NULL) {
return (kh_get_int64((kh_int64_t *)self->skipset, self->file_lines) !=
((kh_int64_t *)self->skipset)->n_buckets);
} else {
Expand All @@ -689,6 +710,7 @@ int skip_this_line(parser_t *self, int64_t rownum) {

int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) {
int i, slen;
int should_skip;
long maxstreamsize;
char c;
char *stream;
Expand Down Expand Up @@ -818,7 +840,11 @@ int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) {

case START_RECORD:
// start of record
if (skip_this_line(self, self->file_lines)) {
should_skip = skip_this_line(self, self->file_lines);

if (should_skip == -1) {
goto parsingerror;
} else if (should_skip) {
if (IS_QUOTE(c)) {
self->state = IN_QUOTED_FIELD_IN_SKIP_LINE;
} else {
Expand Down
1 change: 1 addition & 0 deletions pandas/src/parser/tokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,7 @@ typedef struct parser_t {
int header_end; // header row end

void *skipset;
PyObject *skipfunc;
int64_t skip_first_N_rows;
int skip_footer;
double (*converter)(const char *, char **, char, char, char, int);
Expand Down