diff --git a/doc/source/io.rst b/doc/source/io.rst index 32891e371a489..249cfaf62878f 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -92,7 +92,8 @@ They can take a number of arguments: - ``dialect``: string or :class:`python:csv.Dialect` instance to expose more ways to specify the file format - ``dtype``: A data type name or a dict of column name to data type. If not - specified, data types will be inferred. + specified, data types will be inferred. (Unsupported with + ``engine='python'``) - ``header``: row number(s) to use as the column names, and the start of the data. Defaults to 0 if no ``names`` passed, otherwise ``None``. Explicitly pass ``header=0`` to be able to replace existing names. The header can be @@ -154,6 +155,7 @@ They can take a number of arguments: pieces. Will cause an ``TextFileReader`` object to be returned. More on this below in the section on :ref:`iterating and chunking ` - ``skip_footer``: number of lines to skip at bottom of file (default 0) + (Unsupported with ``engine='c'``) - ``converters``: a dictionary of functions for converting values in certain columns, where keys are either integers or column labels - ``encoding``: a string representing the encoding to use for decoding @@ -275,6 +277,11 @@ individual columns: df = pd.read_csv(StringIO(data), dtype={'b': object, 'c': np.float64}) df.dtypes +.. note:: + The ``dtype`` option is currently only supported by the C engine. + Specifying ``dtype`` with ``engine`` other than 'c' raises a + ``ValueError``. + .. _io.headers: Handling column names @@ -1029,6 +1036,22 @@ Specifying ``iterator=True`` will also return the ``TextFileReader`` object: os.remove('tmp.sv') os.remove('tmp2.sv') +Specifying the parser engine +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Under the hood pandas uses a fast and efficient parser implemented in C as well +as a python implementation which is currently more feature-complete. 
Where +possible pandas uses the C parser (specified as ``engine='c'``), but may fall +back to python if C-unsupported options are specified. Currently, C-unsupported +options include: + +- ``sep`` other than a single character (e.g. regex separators) +- ``skip_footer`` +- ``sep=None`` with ``delim_whitespace=False`` + +Specifying any of the above options will produce a ``ParserWarning`` unless the +python engine is selected explicitly using ``engine='python'``. + .. _io.store_in_csv: Writing to CSV format diff --git a/doc/source/release.rst b/doc/source/release.rst index 271daa1623a4b..b00d68688b02b 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -176,6 +176,8 @@ API Changes - ``.quantile`` on a ``datetime[ns]`` series now returns ``Timestamp`` instead of ``np.datetime64`` objects (:issue:`6810`) - change ``AssertionError`` to ``TypeError`` for invalid types passed to ``concat`` (:issue:`6583`) +- Add :class:`~pandas.io.parsers.ParserWarning` class for fallback and option + validation warnings in :func:`read_csv`/:func:`read_table` (:issue:`6607`) Deprecations ~~~~~~~~~~~~ @@ -280,6 +282,9 @@ Improvements to existing features - Added ``how`` option to rolling-moment functions to dictate how to handle resampling; :func:``rolling_max`` defaults to max, :func:``rolling_min`` defaults to min, and all others default to mean (:issue:`6297`) - ``pd.stats.moments.rolling_var`` now uses Welford's method for increased numerical stability (:issue:`6817`) +- Translate ``sep='\s+'`` to ``delim_whitespace=True`` in + :func:`read_csv`/:func:`read_table` if no other C-unsupported options + specified (:issue:`6607`) .. 
_release.bug_fixes-0.14.0: @@ -402,6 +407,17 @@ Bug Fixes - Bug in `DataFrame.plot` and `Series.plot` legend behave inconsistently when plotting to the same axes repeatedly (:issue:`6678`) - Internal tests for patching ``__finalize__`` / bug in merge not finalizing (:issue:`6923`, :issue:`6927`) - accept ``TextFileReader`` in ``concat``, which was affecting a common user idiom (:issue:`6583`) +- Raise :class:`ValueError` when ``sep`` specified with + ``delim_whitespace=True`` in :func:`read_csv`/:func:`read_table` + (:issue:`6607`) +- Raise :class:`ValueError` when ``engine='c'`` specified with unsupported + options (:issue:`6607`) +- Raise :class:`ValueError` when fallback to python parser causes options to be + ignored (:issue:`6607`) +- Produce :class:`~pandas.io.parsers.ParserWarning` on fallback to python + parser when no options are ignored (:issue:`6607`) +- Bug in C parser with leading whitespace (:issue:`3374`) +- Bug in C parser with ``delim_whitespace=True`` and ``\r``-delimited lines pandas 0.13.1 ------------- diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index b45b8929e7af3..b439ca5c61aeb 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -6,6 +6,7 @@ from pandas import compat import re import csv +import warnings import numpy as np @@ -24,6 +25,8 @@ import pandas.tslib as tslib import pandas.parser as _parser +class ParserWarning(Warning): + pass _parser_params = """Also supports optionally iterating or breaking of the file into chunks. @@ -50,6 +53,7 @@ One-character string used to escape delimiter when quoting is QUOTE_NONE. dtype : Type name or dict of column -> type Data type for data or columns. E.g. 
{'a': np.float64, 'b': np.int32} + (Unsupported with engine='python') compression : {'gzip', 'bz2', None}, default None For on-the-fly decompression of on-disk data dialect : string or csv.Dialect instance, default None @@ -113,7 +117,7 @@ chunksize : int, default None Return TextFileReader object for iteration skipfooter : int, default 0 - Number of line at bottom of file to skip + Number of lines at bottom of file to skip (Unsupported with engine='c') converters : dict. optional Dict of functions for converting values in certain columns. Keys can either be integers or column labels @@ -125,24 +129,24 @@ Encoding to use for UTF when reading/writing (ex. 'utf-8') squeeze : boolean, default False If the parsed data only contains one column then return a Series -na_filter: boolean, default True +na_filter : boolean, default True Detect missing value markers (empty strings and the value of na_values). In data without any NAs, passing na_filter=False can improve the performance of reading a large file usecols : array-like Return a subset of the columns. Results in much faster parsing time and lower memory usage. -mangle_dupe_cols: boolean, default True +mangle_dupe_cols : boolean, default True Duplicate columns will be specified as 'X.0'...'X.N', rather than 'X'...'X' -tupleize_cols: boolean, default False +tupleize_cols : boolean, default False Leave a list of tuples on columns as is (default is to convert to a Multi Index on the columns) -error_bad_lines: boolean, default True +error_bad_lines : boolean, default True Lines with too many fields (e.g. a csv line with too many commas) will by default cause an exception to be raised, and no DataFrame will be returned. If False, then these "bad lines" will dropped from the DataFrame that is - returned. (Only valid with C parser). -warn_bad_lines: boolean, default True + returned. 
(Only valid with C parser) +warn_bad_lines : boolean, default True If error_bad_lines is False, and warn_bad_lines is True, a warning for each "bad line" will be output. (Only valid with C parser). infer_datetime_format : boolean, default False @@ -154,25 +158,30 @@ result : DataFrame or TextParser """ -_csv_sep = """sep : string, default ',' +_csv_params = """sep : string, default ',' Delimiter to use. If sep is None, will try to automatically determine this. Regular expressions are accepted. -""" +engine : {'c', 'python'} + Parser engine to use. The C engine is faster while the python engine is + currently more feature-complete.""" -_table_sep = """sep : string, default \\t (tab-stop) - Delimiter to use. Regular expressions are accepted.""" +_table_params = """sep : string, default \\t (tab-stop) + Delimiter to use. Regular expressions are accepted. +engine : {'c', 'python'} + Parser engine to use. The C engine is faster while the python engine is + currently more feature-complete.""" _read_csv_doc = """ Read CSV (comma-separated) file into DataFrame %s -""" % (_parser_params % _csv_sep) +""" % (_parser_params % _csv_params) _read_table_doc = """ Read general delimited file into DataFrame %s -""" % (_parser_params % _table_sep) +""" % (_parser_params % _table_params) _fwf_widths = """\ colspecs : list of pairs (int, int) or 'infer'. 
optional @@ -297,6 +306,8 @@ def _read(filepath_or_buffer, kwds): def _make_parser_function(name, sep=','): + default_sep = sep + def parser_f(filepath_or_buffer, sep=sep, dialect=None, @@ -325,7 +336,7 @@ def parser_f(filepath_or_buffer, dtype=None, usecols=None, - engine='c', + engine=None, delim_whitespace=False, as_recarray=False, na_filter=True, @@ -362,10 +373,21 @@ def parser_f(filepath_or_buffer, if delimiter is None: delimiter = sep + if delim_whitespace and delimiter is not default_sep: + raise ValueError("Specified a delimiter with both sep and"\ + " delim_whitespace=True; you can only specify one.") + + if engine is not None: + engine_specified = True + else: + engine = 'c' + engine_specified = False + kwds = dict(delimiter=delimiter, engine=engine, dialect=dialect, compression=compression, + engine_specified=engine_specified, doublequote=doublequote, escapechar=escapechar, @@ -468,10 +490,18 @@ class TextFileReader(object): """ - def __init__(self, f, engine='python', **kwds): + def __init__(self, f, engine=None, **kwds): self.f = f + if engine is not None: + engine_specified = True + else: + engine = 'python' + engine_specified = False + + self._engine_specified = kwds.get('engine_specified', engine_specified) + if kwds.get('dialect') is not None: dialect = kwds['dialect'] kwds['delimiter'] = dialect.delimiter @@ -530,21 +560,36 @@ def _get_options_with_defaults(self, engine): def _clean_options(self, options, engine): result = options.copy() + engine_specified = self._engine_specified + fallback_reason = None + sep = options['delimiter'] delim_whitespace = options['delim_whitespace'] + # C engine not supported yet + if engine == 'c': + if options['skip_footer'] > 0: + fallback_reason = "the 'c' engine does not support"\ + " skip_footer" + engine = 'python' + if sep is None and not delim_whitespace: if engine == 'c': + fallback_reason = "the 'c' engine does not support"\ + " sep=None with delim_whitespace=False" engine = 'python' elif sep is not None 
and len(sep) > 1: - # wait until regex engine integrated - if engine not in ('python', 'python-fwf'): + if engine == 'c' and sep == '\s+': + result['delim_whitespace'] = True + del result['delimiter'] + elif engine not in ('python', 'python-fwf'): + # wait until regex engine integrated + fallback_reason = "the 'c' engine does not support"\ + " regex separators" engine = 'python' - # C engine not supported yet - if engine == 'c': - if options['skip_footer'] > 0: - engine = 'python' + if fallback_reason and engine_specified: + raise ValueError(fallback_reason) if engine == 'c': for arg in _c_unsupported: @@ -552,8 +597,23 @@ def _clean_options(self, options, engine): if 'python' in engine: for arg in _python_unsupported: + if fallback_reason and result[arg] != _c_parser_defaults[arg]: + msg = ("Falling back to the 'python' engine because" + " {reason}, but this causes {option!r} to be" + " ignored as it is not supported by the 'python'" + " engine.").format(reason=fallback_reason, option=arg) + if arg == 'dtype': + msg += " (Note the 'converters' option provides"\ + " similar functionality.)" + raise ValueError(msg) del result[arg] + if fallback_reason: + warnings.warn(("Falling back to the 'python' engine because" + " {0}; you can avoid this warning by specifying" + " engine='python'.").format(fallback_reason), + ParserWarning) + index_col = options['index_col'] names = options['names'] converters = options['converters'] diff --git a/pandas/io/tests/test_cparser.py b/pandas/io/tests/test_cparser.py index 6cfe4bea01045..6204a441b347d 100644 --- a/pandas/io/tests/test_cparser.py +++ b/pandas/io/tests/test_cparser.py @@ -323,6 +323,9 @@ def _test(text, **kwargs): data = 'A B C\r 2 3\r4 5 6' _test(data, delim_whitespace=True) + data = 'A B C\r2 3\r4 5 6' + _test(data, delim_whitespace=True) + def test_empty_field_eof(self): data = 'a,b,c\n1,2,3\n4,,' diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 612840e82e3ff..872e719eaa630 100644 
--- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -35,7 +35,7 @@ from numpy.testing.decorators import slow from numpy.testing import assert_array_equal -from pandas.parser import OverflowError +from pandas.parser import OverflowError, CParserError class ParserTests(object): @@ -390,19 +390,26 @@ def test_multiple_date_col_timestamp_parse(self): self.assertEqual(result['0_1'][0], ex_val) def test_single_line(self): - # sniff separator - buf = StringIO() - sys.stdout = buf + # GH 6607 + # Test currently only valid with python engine because sep=None and + # delim_whitespace=False. Temporarily copied to TestPythonParser. + # Test for ValueError with other engines: - # printing warning message when engine == 'c' for now + with tm.assertRaisesRegexp(ValueError, + 'sep=None with delim_whitespace=False'): + # sniff separator + buf = StringIO() + sys.stdout = buf - try: - # it works! - df = self.read_csv(StringIO('1,2'), names=['a', 'b'], - header=None, sep=None) - tm.assert_frame_equal(DataFrame({'a': [1], 'b': [2]}), df) - finally: - sys.stdout = sys.__stdout__ + # printing warning message when engine == 'c' for now + + try: + # it works! + df = self.read_csv(StringIO('1,2'), names=['a', 'b'], + header=None, sep=None) + tm.assert_frame_equal(DataFrame({'a': [1], 'b': [2]}), df) + finally: + sys.stdout = sys.__stdout__ def test_multiple_date_cols_with_header(self): data = """\ @@ -534,11 +541,17 @@ def test_malformed(self): footer """ + # GH 6607 + # Test currently only valid with python engine because + # skip_footer != 0. Temporarily copied to TestPythonParser. 
+ # Test for ValueError with other engines: + try: - df = self.read_table( - StringIO(data), sep=',', header=1, comment='#', - skip_footer=1) - self.assert_(False) + with tm.assertRaisesRegexp(ValueError, 'skip_footer'): #XXX + df = self.read_table( + StringIO(data), sep=',', header=1, comment='#', + skip_footer=1) + self.assert_(False) except Exception as inst: self.assertIn('Expected 3 fields in line 4, saw 5', str(inst)) @@ -599,48 +612,55 @@ def test_malformed(self): self.assertIn('Expected 3 fields in line 6, saw 5', str(inst)) def test_passing_dtype(self): + # GH 6607 + # Passing dtype is currently only supported by the C engine. + # Temporarily copied to TestCParser*. + # Test for ValueError with other engines: - df = DataFrame(np.random.rand(5,2),columns=list('AB'),index=['1A','1B','1C','1D','1E']) + with tm.assertRaisesRegexp(ValueError, + "The 'dtype' option is not supported"): - with tm.ensure_clean('__passing_str_as_dtype__.csv') as path: - df.to_csv(path) + df = DataFrame(np.random.rand(5,2),columns=list('AB'),index=['1A','1B','1C','1D','1E']) - # GH 3795 - # passing 'str' as the dtype - result = pd.read_csv(path, dtype=str, index_col=0) - tm.assert_series_equal(result.dtypes,Series({ 'A' : 'object', 'B' : 'object' })) + with tm.ensure_clean('__passing_str_as_dtype__.csv') as path: + df.to_csv(path) - # we expect all object columns, so need to convert to test for equivalence - result = result.astype(float) - tm.assert_frame_equal(result,df) + # GH 3795 + # passing 'str' as the dtype + result = self.read_csv(path, dtype=str, index_col=0) + tm.assert_series_equal(result.dtypes,Series({ 'A' : 'object', 'B' : 'object' })) - # invalid dtype - self.assertRaises(TypeError, pd.read_csv, path, dtype={'A' : 'foo', 'B' : 'float64' }, - index_col=0) + # we expect all object columns, so need to convert to test for equivalence + result = result.astype(float) + tm.assert_frame_equal(result,df) - # valid but we don't support it (date) - self.assertRaises(TypeError, 
pd.read_csv, path, dtype={'A' : 'datetime64', 'B' : 'float64' }, - index_col=0) - self.assertRaises(TypeError, pd.read_csv, path, dtype={'A' : 'datetime64', 'B' : 'float64' }, - index_col=0, parse_dates=['B']) + # invalid dtype + self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'foo', 'B' : 'float64' }, + index_col=0) - # valid but we don't support it - self.assertRaises(TypeError, pd.read_csv, path, dtype={'A' : 'timedelta64', 'B' : 'float64' }, - index_col=0) + # valid but we don't support it (date) + self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'datetime64', 'B' : 'float64' }, + index_col=0) + self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'datetime64', 'B' : 'float64' }, + index_col=0, parse_dates=['B']) + + # valid but we don't support it + self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'timedelta64', 'B' : 'float64' }, + index_col=0) - def test_quoting(self): - bad_line_small = """printer\tresult\tvariant_name + def test_quoting(self): + bad_line_small = """printer\tresult\tvariant_name Klosterdruckerei\tKlosterdruckerei (1611-1804)\tMuller, Jacob Klosterdruckerei\tKlosterdruckerei (1611-1804)\tMuller, Jakob Klosterdruckerei\tKlosterdruckerei (1609-1805)\t"Furststiftische Hofdruckerei, (1609-1805)\tGaller, Alois Klosterdruckerei\tKlosterdruckerei (1609-1805)\tHochfurstliche Buchhandlung """ - self.assertRaises(Exception, self.read_table, StringIO(bad_line_small), - sep='\t') + self.assertRaises(Exception, self.read_table, StringIO(bad_line_small), + sep='\t') - good_line_small = bad_line_small + '"' - df = self.read_table(StringIO(good_line_small), sep='\t') - self.assertEqual(len(df), 3) + good_line_small = bad_line_small + '"' + df = self.read_table(StringIO(good_line_small), sep='\t') + self.assertEqual(len(df), 3) def test_non_string_na_values(self): # GH3611, na_values that are not a string are an issue @@ -1165,58 +1185,64 @@ def test_read_text_list(self): tm.assert_frame_equal(chunk, df) 
def test_iterator(self): - reader = self.read_csv(StringIO(self.data1), index_col=0, - iterator=True) - df = self.read_csv(StringIO(self.data1), index_col=0) + # GH 6607 + # Test currently only valid with python engine because + # skip_footer != 0. Temporarily copied to TestPythonParser. + # Test for ValueError with other engines: - chunk = reader.read(3) - tm.assert_frame_equal(chunk, df[:3]) + with tm.assertRaisesRegexp(ValueError, 'skip_footer'): + reader = self.read_csv(StringIO(self.data1), index_col=0, + iterator=True) + df = self.read_csv(StringIO(self.data1), index_col=0) - last_chunk = reader.read(5) - tm.assert_frame_equal(last_chunk, df[3:]) + chunk = reader.read(3) + tm.assert_frame_equal(chunk, df[:3]) - # pass list - lines = list(csv.reader(StringIO(self.data1))) - parser = TextParser(lines, index_col=0, chunksize=2) + last_chunk = reader.read(5) + tm.assert_frame_equal(last_chunk, df[3:]) - df = self.read_csv(StringIO(self.data1), index_col=0) + # pass list + lines = list(csv.reader(StringIO(self.data1))) + parser = TextParser(lines, index_col=0, chunksize=2) - chunks = list(parser) - tm.assert_frame_equal(chunks[0], df[:2]) - tm.assert_frame_equal(chunks[1], df[2:4]) - tm.assert_frame_equal(chunks[2], df[4:]) + df = self.read_csv(StringIO(self.data1), index_col=0) - # pass skiprows - parser = TextParser(lines, index_col=0, chunksize=2, skiprows=[1]) - chunks = list(parser) - tm.assert_frame_equal(chunks[0], df[1:3]) + chunks = list(parser) + tm.assert_frame_equal(chunks[0], df[:2]) + tm.assert_frame_equal(chunks[1], df[2:4]) + tm.assert_frame_equal(chunks[2], df[4:]) - # test bad parameter (skip_footer) - reader = self.read_csv(StringIO(self.data1), index_col=0, - iterator=True, skip_footer=True) - self.assertRaises(ValueError, reader.read, 3) + # pass skiprows + parser = TextParser(lines, index_col=0, chunksize=2, skiprows=[1]) + chunks = list(parser) + tm.assert_frame_equal(chunks[0], df[1:3]) - treader = self.read_table(StringIO(self.data1), 
sep=',', index_col=0, - iterator=True) - tm.assert_isinstance(treader, TextFileReader) + # test bad parameter (skip_footer) + reader = self.read_csv(StringIO(self.data1), index_col=0, + iterator=True, skip_footer=True) + self.assertRaises(ValueError, reader.read, 3) - # stopping iteration when on chunksize is specified, GH 3967 - data = """A,B,C + treader = self.read_table(StringIO(self.data1), sep=',', index_col=0, + iterator=True) + tm.assert_isinstance(treader, TextFileReader) + + # stopping iteration when on chunksize is specified, GH 3967 + data = """A,B,C foo,1,2,3 bar,4,5,6 baz,7,8,9 """ - reader = self.read_csv(StringIO(data), iterator=True) - result = list(reader) - expected = DataFrame(dict(A = [1,4,7], B = [2,5,8], C = [3,6,9]), index=['foo','bar','baz']) - tm.assert_frame_equal(result[0], expected) + reader = self.read_csv(StringIO(data), iterator=True) + result = list(reader) + expected = DataFrame(dict(A = [1,4,7], B = [2,5,8], C = [3,6,9]), index=['foo','bar','baz']) + tm.assert_frame_equal(result[0], expected) - # chunksize = 1 - reader = self.read_csv(StringIO(data), chunksize=1) - result = list(reader) - expected = DataFrame(dict(A = [1,4,7], B = [2,5,8], C = [3,6,9]), index=['foo','bar','baz']) - self.assertEqual(len(result), 3) - tm.assert_frame_equal(pd.concat(result), expected) + # chunksize = 1 + reader = self.read_csv(StringIO(data), chunksize=1) + result = list(reader) + expected = DataFrame(dict(A = [1,4,7], B = [2,5,8], C = [3,6,9]), index=['foo','bar','baz']) + self.assertEqual(len(result), 3) + tm.assert_frame_equal(pd.concat(result), expected) def test_header_not_first_line(self): data = """got,to,ignore,this,line @@ -1447,28 +1473,34 @@ def test_multi_index_parse_dates(self): (datetime, np.datetime64, Timestamp)) def test_skip_footer(self): - data = """A,B,C + # GH 6607 + # Test currently only valid with python engine because + # skip_footer != 0. Temporarily copied to TestPythonParser. 
+ # Test for ValueError with other engines: + + with tm.assertRaisesRegexp(ValueError, 'skip_footer'): + data = """A,B,C 1,2,3 4,5,6 7,8,9 want to skip this also also skip this """ - result = self.read_csv(StringIO(data), skip_footer=2) - no_footer = '\n'.join(data.split('\n')[:-3]) - expected = self.read_csv(StringIO(no_footer)) + result = self.read_csv(StringIO(data), skip_footer=2) + no_footer = '\n'.join(data.split('\n')[:-3]) + expected = self.read_csv(StringIO(no_footer)) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) - result = self.read_csv(StringIO(data), nrows=3) - tm.assert_frame_equal(result, expected) + result = self.read_csv(StringIO(data), nrows=3) + tm.assert_frame_equal(result, expected) - # skipfooter alias - result = read_csv(StringIO(data), skipfooter=2) - no_footer = '\n'.join(data.split('\n')[:-3]) - expected = read_csv(StringIO(no_footer)) + # skipfooter alias + result = read_csv(StringIO(data), skipfooter=2) + no_footer = '\n'.join(data.split('\n')[:-3]) + expected = read_csv(StringIO(no_footer)) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_no_unnamed_index(self): data = """ id c0 c1 c2 @@ -1536,15 +1568,21 @@ def test_converter_return_string_bug(self): self.assertEqual(df2['Number1'].dtype, float) def test_read_table_buglet_4x_multiindex(self): - text = """ A B C D E + # GH 6607 + # Parsing multiindex columns currently causes an error in the C parser. + # Temporarily copied to TestPythonParser. + # Here test that CParserError is raised: + + with tm.assertRaises(CParserError): + text = """ A B C D E one two three four a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640 a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744 x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838""" - # it works! - df = self.read_table(StringIO(text), sep='\s+') - self.assertEquals(df.index.names, ('one', 'two', 'three', 'four')) + # it works! 
+ df = self.read_table(StringIO(text), sep='\s+') + self.assertEquals(df.index.names, ('one', 'two', 'three', 'four')) def test_read_csv_parse_simple_list(self): text = """foo @@ -1987,21 +2025,21 @@ def test_usecols_index_col_conflict(self): expected = DataFrame({'Price': [100, 101]}, index=[datetime(2013, 5, 11), datetime(2013, 5, 12)]) expected.index.name = 'Time' - df = pd.read_csv(StringIO(data), usecols=['Time', 'Price'], parse_dates=True, index_col=0) + df = self.read_csv(StringIO(data), usecols=['Time', 'Price'], parse_dates=True, index_col=0) tm.assert_frame_equal(expected, df) - df = pd.read_csv(StringIO(data), usecols=['Time', 'Price'], parse_dates=True, index_col='Time') + df = self.read_csv(StringIO(data), usecols=['Time', 'Price'], parse_dates=True, index_col='Time') tm.assert_frame_equal(expected, df) - df = pd.read_csv(StringIO(data), usecols=[1, 2], parse_dates=True, index_col='Time') + df = self.read_csv(StringIO(data), usecols=[1, 2], parse_dates=True, index_col='Time') tm.assert_frame_equal(expected, df) - df = pd.read_csv(StringIO(data), usecols=[1, 2], parse_dates=True, index_col=0) + df = self.read_csv(StringIO(data), usecols=[1, 2], parse_dates=True, index_col=0) tm.assert_frame_equal(expected, df) expected = DataFrame({'P3': [1, 1], 'Price': (100, 101), 'P2': (10, 11)}) expected = expected.set_index(['Price', 'P2']) - df = pd.read_csv(StringIO(data), usecols=['Price', 'P2', 'P3'], parse_dates=True, index_col=['Price', 'P2']) + df = self.read_csv(StringIO(data), usecols=['Price', 'P2', 'P3'], parse_dates=True, index_col=['Price', 'P2']) tm.assert_frame_equal(expected, df) def test_chunks_have_consistent_numerical_type(self): @@ -2096,6 +2134,14 @@ def test_catch_too_many_names(self): 10,11,12\n""" tm.assertRaises(Exception, read_csv, StringIO(data), header=0, names=['a', 'b', 'c', 'd']) + def test_ignore_leading_whitespace(self): + # GH 6607, GH 3374 + data = ' a b c\n 1 2 3\n 4 5 6\n 7 8 9' + result = self.read_table(StringIO(data), 
sep='\s+') + expected = DataFrame({'a':[1,4,7], 'b':[2,5,8], 'c': [3,6,9]}) + tm.assert_frame_equal(result, expected) + + class TestPythonParser(ParserTests, tm.TestCase): def test_negative_skipfooter_raises(self): @@ -2411,6 +2457,252 @@ def test_iteration_open_handle(self): expected = Series(['DDD', 'EEE', 'FFF', 'GGG']) tm.assert_series_equal(result, expected) + def test_iterator(self): + # GH 6607 + # This is a copy which should eventually be merged into ParserTests + # when the issue with the C parser is fixed + + reader = self.read_csv(StringIO(self.data1), index_col=0, + iterator=True) + df = self.read_csv(StringIO(self.data1), index_col=0) + + chunk = reader.read(3) + tm.assert_frame_equal(chunk, df[:3]) + + last_chunk = reader.read(5) + tm.assert_frame_equal(last_chunk, df[3:]) + + # pass list + lines = list(csv.reader(StringIO(self.data1))) + parser = TextParser(lines, index_col=0, chunksize=2) + + df = self.read_csv(StringIO(self.data1), index_col=0) + + chunks = list(parser) + tm.assert_frame_equal(chunks[0], df[:2]) + tm.assert_frame_equal(chunks[1], df[2:4]) + tm.assert_frame_equal(chunks[2], df[4:]) + + # pass skiprows + parser = TextParser(lines, index_col=0, chunksize=2, skiprows=[1]) + chunks = list(parser) + tm.assert_frame_equal(chunks[0], df[1:3]) + + # test bad parameter (skip_footer) + reader = self.read_csv(StringIO(self.data1), index_col=0, + iterator=True, skip_footer=True) + self.assertRaises(ValueError, reader.read, 3) + + treader = self.read_table(StringIO(self.data1), sep=',', index_col=0, + iterator=True) + tm.assert_isinstance(treader, TextFileReader) + + # stopping iteration when on chunksize is specified, GH 3967 + data = """A,B,C +foo,1,2,3 +bar,4,5,6 +baz,7,8,9 +""" + reader = self.read_csv(StringIO(data), iterator=True) + result = list(reader) + expected = DataFrame(dict(A = [1,4,7], B = [2,5,8], C = [3,6,9]), index=['foo','bar','baz']) + tm.assert_frame_equal(result[0], expected) + + # chunksize = 1 + reader = 
self.read_csv(StringIO(data), chunksize=1) + result = list(reader) + expected = DataFrame(dict(A = [1,4,7], B = [2,5,8], C = [3,6,9]), index=['foo','bar','baz']) + self.assertEqual(len(result), 3) + tm.assert_frame_equal(pd.concat(result), expected) + + def test_single_line(self): + # GH 6607 + # This is a copy which should eventually be merged into ParserTests + # when the issue with the C parser is fixed + + # sniff separator + buf = StringIO() + sys.stdout = buf + + # printing warning message when engine == 'c' for now + + try: + # it works! + df = self.read_csv(StringIO('1,2'), names=['a', 'b'], + header=None, sep=None) + tm.assert_frame_equal(DataFrame({'a': [1], 'b': [2]}), df) + finally: + sys.stdout = sys.__stdout__ + + def test_malformed(self): + # GH 6607 + # This is a copy which should eventually be merged into ParserTests + # when the issue with the C parser is fixed + + # all + data = """ignore +A,B,C +1,2,3 # comment +1,2,3,4,5 +2,3,4 +""" + + try: + df = self.read_table( + StringIO(data), sep=',', header=1, comment='#') + self.assert_(False) + except Exception as inst: + self.assertIn('Expected 3 fields in line 4, saw 5', str(inst)) + + # skip_footer + data = """ignore +A,B,C +1,2,3 # comment +1,2,3,4,5 +2,3,4 +footer +""" + + try: + df = self.read_table( + StringIO(data), sep=',', header=1, comment='#', + skip_footer=1) + self.assert_(False) + except Exception as inst: + self.assertIn('Expected 3 fields in line 4, saw 5', str(inst)) + + # first chunk + data = """ignore +A,B,C +skip +1,2,3 +3,5,10 # comment +1,2,3,4,5 +2,3,4 +""" + try: + it = self.read_table(StringIO(data), sep=',', + header=1, comment='#', iterator=True, chunksize=1, + skiprows=[2]) + df = it.read(5) + self.assert_(False) + except Exception as inst: + self.assertIn('Expected 3 fields in line 6, saw 5', str(inst)) + + # middle chunk + data = """ignore +A,B,C +skip +1,2,3 +3,5,10 # comment +1,2,3,4,5 +2,3,4 +""" + try: + it = self.read_table(StringIO(data), sep=',', header=1, + 
comment='#', iterator=True, chunksize=1, + skiprows=[2]) + df = it.read(1) + it.read(2) + self.assert_(False) + except Exception as inst: + self.assertIn('Expected 3 fields in line 6, saw 5', str(inst)) + + # last chunk + data = """ignore +A,B,C +skip +1,2,3 +3,5,10 # comment +1,2,3,4,5 +2,3,4 +""" + try: + it = self.read_table(StringIO(data), sep=',', + header=1, comment='#', iterator=True, chunksize=1, + skiprows=[2]) + df = it.read(1) + it.read() + self.assert_(False) + except Exception as inst: + self.assertIn('Expected 3 fields in line 6, saw 5', str(inst)) + + def test_skip_footer(self): + # GH 6607 + # This is a copy which should eventually be merged into ParserTests + # when the issue with the C parser is fixed + + data = """A,B,C +1,2,3 +4,5,6 +7,8,9 +want to skip this +also also skip this +""" + result = self.read_csv(StringIO(data), skip_footer=2) + no_footer = '\n'.join(data.split('\n')[:-3]) + expected = self.read_csv(StringIO(no_footer)) + + tm.assert_frame_equal(result, expected) + + result = self.read_csv(StringIO(data), nrows=3) + tm.assert_frame_equal(result, expected) + + # skipfooter alias + result = self.read_csv(StringIO(data), skipfooter=2) + no_footer = '\n'.join(data.split('\n')[:-3]) + expected = self.read_csv(StringIO(no_footer)) + + tm.assert_frame_equal(result, expected) + + def test_decompression_regex_sep(self): + # GH 6607 + # This is a copy which should eventually be moved to ParserTests + # when the issue with the C parser is fixed + + try: + import gzip + import bz2 + except ImportError: + raise nose.SkipTest('need gzip and bz2 to run') + + data = open(self.csv1, 'rb').read() + data = data.replace(b',', b'::') + expected = self.read_csv(self.csv1) + + with tm.ensure_clean() as path: + tmp = gzip.GzipFile(path, mode='wb') + tmp.write(data) + tmp.close() + + result = self.read_csv(path, sep='::', compression='gzip') + tm.assert_frame_equal(result, expected) + + with tm.ensure_clean() as path: + tmp = bz2.BZ2File(path, mode='wb') + 
tmp.write(data) + tmp.close() + + result = self.read_csv(path, sep='::', compression='bz2') + tm.assert_frame_equal(result, expected) + + self.assertRaises(ValueError, self.read_csv, + path, compression='bz3') + + def test_read_table_buglet_4x_multiindex(self): + # GH 6607 + # This is a copy which should eventually be merged into ParserTests + # when the issue with multiindex columns is fixed in the C parser. + + text = """ A B C D E +one two three four +a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640 +a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744 +x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838""" + + # it works! + df = self.read_table(StringIO(text), sep='\s+') + self.assertEquals(df.index.names, ('one', 'two', 'three', 'four')) class TestFwfColspaceSniffing(tm.TestCase): def test_full_file(self): @@ -2545,12 +2837,58 @@ def test_compact_ints(self): def test_parse_dates_empty_string(self): # #2263 s = StringIO("Date, test\n2012-01-01, 1\n,2") - result = pd.read_csv(s, parse_dates=["Date"], na_filter=False) + result = self.read_csv(s, parse_dates=["Date"], na_filter=False) self.assertTrue(result['Date'].isnull()[1]) def test_usecols(self): raise nose.SkipTest("Usecols is not supported in C High Memory engine.") + def test_passing_dtype(self): + # GH 6607 + # This is a copy which should eventually be merged into ParserTests + # when the dtype argument is supported by all engines. 
+ + df = DataFrame(np.random.rand(5,2),columns=list('AB'),index=['1A','1B','1C','1D','1E']) + + with tm.ensure_clean('__passing_str_as_dtype__.csv') as path: + df.to_csv(path) + + # GH 3795 + # passing 'str' as the dtype + result = self.read_csv(path, dtype=str, index_col=0) + tm.assert_series_equal(result.dtypes,Series({ 'A' : 'object', 'B' : 'object' })) + + # we expect all object columns, so need to convert to test for equivalence + result = result.astype(float) + tm.assert_frame_equal(result,df) + + # invalid dtype + self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'foo', 'B' : 'float64' }, + index_col=0) + + # valid but we don't support it (date) + self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'datetime64', 'B' : 'float64' }, + index_col=0) + self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'datetime64', 'B' : 'float64' }, + index_col=0, parse_dates=['B']) + + # valid but we don't support it + self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'timedelta64', 'B' : 'float64' }, + index_col=0) + + def test_fallback_to_python(self): + # GH 6607 + data = 'a b c\n1 2 3' + + # specify C engine with unsupported options (raise) + with tm.assertRaisesRegexp(ValueError, 'does not support'): + self.read_table(StringIO(data), engine='c', sep=None, + delim_whitespace=False) + with tm.assertRaisesRegexp(ValueError, 'does not support'): + self.read_table(StringIO(data), engine='c', sep='\s') + with tm.assertRaisesRegexp(ValueError, 'does not support'): + self.read_table(StringIO(data), engine='c', skip_footer=1) + class TestCParserLowMemory(ParserTests, tm.TestCase): @@ -2706,16 +3044,24 @@ def test_decompression_regex_sep(self): tmp.write(data) tmp.close() - result = self.read_csv(path, sep='::', compression='gzip') - tm.assert_frame_equal(result, expected) + # GH 6607 + # Test currently only valid with the python engine because of + # regex sep. Temporarily copied to TestPythonParser. 
+ # Here test for ValueError when passing regex sep: + + with tm.assertRaisesRegexp(ValueError, 'regex sep'): #XXX + result = self.read_csv(path, sep='::', compression='gzip') + tm.assert_frame_equal(result, expected) with tm.ensure_clean() as path: tmp = bz2.BZ2File(path, mode='wb') tmp.write(data) tmp.close() - result = self.read_csv(path, sep='::', compression='bz2') - tm.assert_frame_equal(result, expected) + # GH 6607 + with tm.assertRaisesRegexp(ValueError, 'regex sep'): #XXX + result = self.read_csv(path, sep='::', compression='bz2') + tm.assert_frame_equal(result, expected) self.assertRaises(ValueError, self.read_csv, path, compression='bz3') @@ -2879,6 +3225,95 @@ def test_invalid_c_parser_opts_with_not_c_parser(self): engine)): read_csv(StringIO(data), engine=engine, **kwargs) + def test_passing_dtype(self): + # GH 6607 + # This is a copy which should eventually be merged into ParserTests + # when the dtype argument is supported by all engines. + + df = DataFrame(np.random.rand(5,2),columns=list('AB'),index=['1A','1B','1C','1D','1E']) + + with tm.ensure_clean('__passing_str_as_dtype__.csv') as path: + df.to_csv(path) + + # GH 3795 + # passing 'str' as the dtype + result = self.read_csv(path, dtype=str, index_col=0) + tm.assert_series_equal(result.dtypes,Series({ 'A' : 'object', 'B' : 'object' })) + + # we expect all object columns, so need to convert to test for equivalence + result = result.astype(float) + tm.assert_frame_equal(result,df) + + # invalid dtype + self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'foo', 'B' : 'float64' }, + index_col=0) + + # valid but we don't support it (date) + self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'datetime64', 'B' : 'float64' }, + index_col=0) + self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'datetime64', 'B' : 'float64' }, + index_col=0, parse_dates=['B']) + + # valid but we don't support it + self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 
'timedelta64', 'B' : 'float64' }, + index_col=0) + + def test_fallback_to_python(self): + # GH 6607 + data = 'a b c\n1 2 3' + + # specify C engine with C-unsupported options (raise) + with tm.assertRaisesRegexp(ValueError, 'does not support'): + self.read_table(StringIO(data), engine='c', sep=None, + delim_whitespace=False) + with tm.assertRaisesRegexp(ValueError, 'does not support'): + self.read_table(StringIO(data), engine='c', sep='\s') + with tm.assertRaisesRegexp(ValueError, 'does not support'): + self.read_table(StringIO(data), engine='c', skip_footer=1) + + def test_raise_on_sep_with_delim_whitespace(self): + # GH 6607 + data = 'a b c\n1 2 3' + with tm.assertRaisesRegexp(ValueError, 'you can only specify one'): + self.read_table(StringIO(data), sep='\s', delim_whitespace=True) + + +class TestMiscellaneous(tm.TestCase): + + # for tests that don't fit into any of the other classes, e.g. those that + # compare results for different engines or test the behavior when 'engine' + # is not passed + + def test_compare_whitespace_regex(self): + # GH 6607 + data = ' a b c\n1 2 3 \n4 5 6\n 7 8 9' + result_c = pd.read_table(StringIO(data), sep='\s+', engine='c') + result_py = pd.read_table(StringIO(data), sep='\s+', engine='python') + tm.assert_frame_equal(result_c, result_py) + + def test_fallback_to_python(self): + # GH 6607 + data = 'a b c\n1 2 3' + + # specify C-unsupported options with python-unsupported option + # (options will be ignored on fallback, raise) + with tm.assertRaisesRegexp(ValueError, 'Falling back'): + pd.read_table(StringIO(data), sep=None, + delim_whitespace=False, dtype={'a': float}) + with tm.assertRaisesRegexp(ValueError, 'Falling back'): + pd.read_table(StringIO(data), sep='\s', dtype={'a': float}) + with tm.assertRaisesRegexp(ValueError, 'Falling back'): + pd.read_table(StringIO(data), skip_footer=1, dtype={'a': float}) + + # specify C-unsupported options without python-unsupported options + with 
tm.assert_produces_warning(parsers.ParserWarning): + pd.read_table(StringIO(data), sep=None, delim_whitespace=False) + with tm.assert_produces_warning(parsers.ParserWarning): + pd.read_table(StringIO(data), sep='\s') + with tm.assert_produces_warning(parsers.ParserWarning): + pd.read_table(StringIO(data), skip_footer=1) + + class TestParseSQL(tm.TestCase): def test_convert_sql_column_floats(self): diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index da991ec23c373..f3da2175092e7 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -1162,7 +1162,6 @@ int tokenize_whitespace(parser_t *self, size_t line_limit) self->state = EAT_CRNL; break; } else if (IS_WHITESPACE(c)) { - END_FIELD(); self->state = EAT_WHITESPACE; break; } else { @@ -1319,10 +1318,14 @@ int tokenize_whitespace(parser_t *self, size_t line_limit) /* self->state = START_RECORD; */ } else if (IS_WHITESPACE(c)){ // Handle \r-delimited files - END_LINE_AND_FIELD_STATE(EAT_WHITESPACE); + END_LINE_STATE(EAT_WHITESPACE); } else { - PUSH_CHAR(c); - END_LINE_STATE(IN_FIELD); + /* XXX + * first character of a new record--need to back up and reread + * to handle properly... + */ + i--; buf--; /* back up one character (HACK!) 
*/ + END_LINE_STATE(START_RECORD); } break; diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index a9e48c62f9693..63355e6ef4a30 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -13501,7 +13501,7 @@ def check_query_with_nested_strings(self, parser, engine): 6 "page 3 load" 2/1/2014 1:02:01 6 "page 3 exit" 2/1/2014 1:02:31 """ - df = pd.read_csv(StringIO(raw), sep=r'\s{2,}', + df = pd.read_csv(StringIO(raw), sep=r'\s{2,}', engine='python', parse_dates=['timestamp']) expected = df[df.event == '"page 1 load"'] res = df.query("""'"page 1 load"' in event""", parser=parser, diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index ac420ee5d78cd..1eb43237c3185 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -462,7 +462,7 @@ def test_xs_level_multiple(self): a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744 x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838""" - df = read_table(StringIO(text), sep='\s+') + df = read_table(StringIO(text), sep='\s+', engine='python') result = df.xs(('a', 4), level=['one', 'four']) expected = df.xs('a').xs(4, level='four') @@ -495,7 +495,7 @@ def test_xs_level0(self): a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744 x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838""" - df = read_table(StringIO(text), sep='\s+') + df = read_table(StringIO(text), sep='\s+', engine='python') result = df.xs('a', level=0) expected = df.xs('a')