diff --git a/doc/source/v0.14.1.txt b/doc/source/v0.14.1.txt
index 3160b35386fa2..7a9b9ddaf19f4 100644
--- a/doc/source/v0.14.1.txt
+++ b/doc/source/v0.14.1.txt
@@ -136,6 +136,9 @@ Enhancements
 
 - ``Period`` and ``PeriodIndex`` can contain ``NaT`` in its values (:issue:`7485`)
 
+- ``read_csv`` and ``read_table`` can now read index columns from the first
+  line after the header when using the C engine (:issue:`6893`)
+
 .. _whatsnew_0141.performance:
 
 Performance
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 22fe3ef16e34d..12e5820953b2e 100644
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -1079,6 +1079,10 @@ def __init__(self, src, **kwds):
 
         self.orig_names = self.names
 
+        # index_col may be specified on line after the header
+        if self.index_col is None:
+            self.index_col = self._reader.index_col
+
         if not self._has_complex_date_col:
             if (self._reader.leading_cols == 0 and
                     _is_index_col(self.index_col)):
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
index c02a3172f4adc..cc7d844cedd32 100644
--- a/pandas/io/tests/test_parsers.py
+++ b/pandas/io/tests/test_parsers.py
@@ -1568,21 +1568,22 @@ def test_converter_return_string_bug(self):
         self.assertEqual(df2['Number1'].dtype, float)
 
     def test_read_table_buglet_4x_multiindex(self):
-        # GH 6607
-        # Parsing multi-level index currently causes an error in the C parser.
-        # Temporarily copied to TestPythonParser.
-        # Here test that CParserError is raised:
-
-        with tm.assertRaises(CParserError):
-            text = """ A B C D E
+        text = """ A B C D E
 one two three four
 a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
 a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
 x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""
 
-            # it works!
-            df = self.read_table(StringIO(text), sep='\s+')
-            self.assertEqual(df.index.names, ('one', 'two', 'three', 'four'))
+        # it works!
+        df = self.read_table(StringIO(text), sep='\s+')
+        self.assertEqual(df.index.names, ('one', 'two', 'three', 'four'))
+
+        # GH 6893
+        data = ' A B C\na b c\n1 3 7 0 3 6\n3 1 4 1 5 9'
+        expected = DataFrame.from_records([(1,3,7,0,3,6), (3,1,4,1,5,9)],
+                                          columns=list('abcABC'), index=list('abc'))
+        actual = self.read_table(StringIO(data), sep='\s+')
+        tm.assert_frame_equal(actual, expected)
 
     def test_read_csv_parse_simple_list(self):
         text = """foo
@@ -2713,28 +2714,6 @@ def test_decompression_regex_sep(self):
             self.assertRaises(ValueError, self.read_csv,
                               path, compression='bz3')
 
-    def test_read_table_buglet_4x_multiindex(self):
-        # GH 6607
-        # This is a copy which should eventually be merged into ParserTests
-        # when the issue with multi-level index is fixed in the C parser.
-
-        text = """ A B C D E
-one two three four
-a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
-a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
-x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""
-
-        # it works!
-        df = self.read_table(StringIO(text), sep='\s+')
-        self.assertEqual(df.index.names, ('one', 'two', 'three', 'four'))
-
-        # GH 6893
-        data = ' A B C\na b c\n1 3 7 0 3 6\n3 1 4 1 5 9'
-        expected = DataFrame.from_records([(1,3,7,0,3,6), (3,1,4,1,5,9)],
-                                          columns=list('abcABC'), index=list('abc'))
-        actual = self.read_table(StringIO(data), sep='\s+')
-        tm.assert_frame_equal(actual, expected)
-
 class TestFwfColspaceSniffing(tm.TestCase):
     def test_full_file(self):
         # File with all values
diff --git a/pandas/parser.pyx b/pandas/parser.pyx
index f303298e88273..8e80862cb19fb 100644
--- a/pandas/parser.pyx
+++ b/pandas/parser.pyx
@@ -574,6 +574,17 @@ cdef class TextReader:
             raise IOError('Expected file path name or file-like object,'
                           ' got %s type' % type(source))
 
+    cdef _word2name(self, word, char *errors):
+        if self.c_encoding == NULL and not PY3:
+            name = PyBytes_FromString(word)
+        else:
+            if self.c_encoding == NULL or self.c_encoding == b'utf-8':
+                name = PyUnicode_FromString(word)
+            else:
+                name = PyUnicode_Decode(word, strlen(word),
+                                        self.c_encoding, errors)
+        return name
+
     cdef _get_header(self):
         # header is now a list of lists, so field_count should use header[0]
 
@@ -612,16 +623,7 @@ cdef class TextReader:
                 counts = {}
                 unnamed_count = 0
                 for i in range(field_count):
-                    word = self.parser.words[start + i]
-
-                    if self.c_encoding == NULL and not PY3:
-                        name = PyBytes_FromString(word)
-                    else:
-                        if self.c_encoding == NULL or self.c_encoding == b'utf-8':
-                            name = PyUnicode_FromString(word)
-                        else:
-                            name = PyUnicode_Decode(word, strlen(word),
-                                                    self.c_encoding, errors)
+                    name = self._word2name(self.parser.words[start + i], errors)
 
                     if name == '':
                         if self.has_mi_columns:
@@ -685,13 +687,56 @@ cdef class TextReader:
         else:  # not self.has_usecols:
 
             field_count = self.parser.line_fields[data_line]
+            passed_count = len(header[0])
+
+            # #6893: look for index columns on first line after header
+
+            # hack: temporarily set expected_fields to prevent parser from
+            # raising if it sees extra columns
+            ex_fields = self.parser.expected_fields
+            self.parser.expected_fields = field_count
+
+            datapos = self.parser.datapos  # save position
+            self._tokenize_rows(1)
+            self.parser.expected_fields = ex_fields  # restore expected_fields
+
+            if self.parser.lines == data_line + 2:
+                field_count_next = self.parser.line_fields[data_line + 1]
+
+                if field_count_next > field_count:
+                    # found extra columns in the second row after the header
+                    # check whether previous row contains index columns
+                    start = self.parser.line_start[data_line]
+
+                    line = [self._word2name(self.parser.words[start + i], errors)
+                            for i in range(self.parser.line_fields[data_line])]
+
+                    # remove trailing empty fields
+                    while not line[-1]:
+                        line.pop()
+
+                    if passed_count + len(line) == field_count_next:
+                        for h in header:
+                            for c in reversed(line):
+                                h.insert(0, c)
+
+                        field_count = field_count_next
+                        passed_count = field_count
+                        self.index_col = line
+                        self.parser_start += 1
+
+                    else:
+                        # hack: didn't find index columns, back up a line and
+                        # let the parser code handle this...
+                        self.parser.datapos = datapos
+                        self.parser.lines -= 1
+                        self.parser.file_lines -= 1
+                        self.parser.line_fields[self.parser.lines] = 0
 
             # #2981
             if self.names is not None:
                 field_count = max(field_count, len(self.names))
 
-            passed_count = len(header[0])
-
             # if passed_count > field_count:
             #     raise CParserError('Column names have %d fields, '
             #                        'data has %d fields'
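
Note (not part of the patch): the snippet below is a minimal usage sketch of the behaviour this change enables, mirroring the new GH 6893 test case. It uses plain read_csv with sep='\s+' in place of the test's read_table helper (the two are equivalent here) and assumes a pandas build that includes this change.

from io import StringIO

import pandas as pd

# Data in the layout exercised by the new test: the header names only the
# three data columns, and the line right after it names the index columns.
data = (
    " A B C\n"
    "a b c\n"
    "1 3 7 0 3 6\n"
    "3 1 4 1 5 9\n"
)

df = pd.read_csv(StringIO(data), sep=r"\s+")

print(df.index.names)    # the three index levels are named 'a', 'b' and 'c'
print(list(df.columns))  # ['A', 'B', 'C']

Per the expected frame constructed in the new test, the result has columns 'A', 'B', 'C' and a three-level index whose names come from the line directly after the header.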
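Note (not part of the patch): the helper below restates, in plain Python, the heuristic the new block in TextReader._get_header applies. The name detect_index_names and its list-of-strings arguments are invented for illustration; the real code works on the C tokenizer's word buffers and line counters rather than Python lists.

def detect_index_names(header_names, first_row, second_row):
    """Decide whether `first_row` holds index-column names (GH 6893 heuristic).

    header_names : fields parsed from the header line
    first_row    : fields of the first line after the header
    second_row   : fields of the second line after the header

    Returns (full_header, index_names); index_names is None when the first
    row is treated as ordinary data.
    """
    # Drop trailing empty fields, mirroring the patch's
    # `while not line[-1]: line.pop()` loop.
    candidate = list(first_row)
    while candidate and not candidate[-1]:
        candidate.pop()

    # The row after the header is read as index names only when the *next*
    # row is wider than the header and the widths line up exactly.
    if (len(second_row) > len(header_names) and
            len(header_names) + len(candidate) == len(second_row)):
        # Prepend the names, like the `h.insert(0, c)` loop over each
        # header level in the patch.
        return candidate + list(header_names), candidate

    # Otherwise back off: the line is ordinary data with no named index.
    return list(header_names), None


# Mirrors the data in the new test: ' A B C\na b c\n1 3 7 0 3 6\n...'
print(detect_index_names(['A', 'B', 'C'],
                         ['a', 'b', 'c'],
                         ['1', '3', '7', '0', '3', '6']))
# -> (['a', 'b', 'c', 'A', 'B', 'C'], ['a', 'b', 'c'])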