diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 42e756635e739..63267ebfe7cbf 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -235,6 +235,7 @@ Other enhancements - :func:`read_excel` can now use ``openpyxl`` to read Excel files via the ``engine='openpyxl'`` argument. This will become the default in a future release (:issue:`11499`) - :func:`pandas.io.excel.read_excel` supports reading OpenDocument tables. Specify ``engine='odf'`` to enable. Consult the :ref:`IO User Guide ` for more details (:issue:`9070`) - :class:`Interval`, :class:`IntervalIndex`, and :class:`~arrays.IntervalArray` have gained an :attr:`~Interval.is_empty` attribute denoting if the given interval(s) are empty (:issue:`27219`) +- :class:`DataFrame` now treats lists of ``typing.NamedTuple`` equivalently to lists of namedtuples. The behavior of the latter has changed in this release; please see the relevant section in "Breaking Changes". .. _whatsnew_0250.api_breaking: @@ -803,6 +804,55 @@ order of the resulting DataFrame has changed compared to previous pandas verison pd.DataFrame(data) +DataFrame constructor treats list of namedtuple/dict in the same way +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Previously, only the first element in the list was checked and if it was a +namedtuple, the field names of that single tuple were used as the column names. +Subsequent tuples were assumed to be of the same type, and their values were +looked up by position. As a consequence, if subsequent tuples of different types +were included, any additional fields were ignored, and if similarly named fields +appeared in a different order, alignment was not performed. + +This behavior has now changed so that a list of namedtuples is treated much as a list of +dicts is, i.e. as a "list of records". 
+ +Additionally, this change implies a change in the semantics of the `columns` +argument to :class:`DataFrame` when passing a list of namedtuples. Previously, +`columns` had "rename" semantics; now it has the same "lookup" semantics as a +list of records, meaning that any name given in `columns` which doesn't appear +as a key in the record will be assigned a NaN value. + +Due to this change, constructing frames from a list +of namedtuples is roughly 50% slower. + +.. ipython:: python + + from collections import namedtuple + Foo = namedtuple("Foo", list("ab")) + tuples = [Foo(1, 3), Foo(2, 4)] + +*Previous Behavior*: + +Previously, the names given in `columns` renamed the namedtuple fields by position, + +.. code-block:: python + + In [1]: pd.DataFrame(tuples, columns=['y', 'z']) + Out[1]: + y z + 0 1 3 + 1 2 4 + +*New Behavior*: + +The names given in `columns` are now looked up among the namedtuple field names, +so a name that matches no field (``'Q'`` below) yields a column of NaN. + +.. ipython:: python + + pd.DataFrame(tuples, columns=['Q', 'a']) + .. 
_whatsnew_0250.api_breaking.deps: Increased minimum versions for dependencies diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 27ee685acfde7..9d1a07ef34371 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1,4 +1,4 @@ -from collections import abc +from collections import abc, OrderedDict from decimal import Decimal from fractions import Fraction from numbers import Number @@ -312,28 +312,53 @@ def fast_unique_multiple_list_gen(object gen, bint sort=True): @cython.wraparound(False) @cython.boundscheck(False) -def dicts_to_array(dicts: list, columns: list): +def dicts_to_array(dicts: list, _columns : list): cdef: - Py_ssize_t i, j, k, n - ndarray[object, ndim=2] result - dict row + Py_ssize_t i, j, n + object result, columns + object row object col, onan = np.nan + dict d, nt_lookup - k = len(columns) n = len(dicts) + have_columns = len(_columns) > 0 + columns = OrderedDict.fromkeys(list(_columns or [])) + result = OrderedDict((k, np.full(n, np.nan, dtype='O')) for k in _columns) - result = np.empty((n, k), dtype='O') - + nt_lookup = {} for i in range(n): row = dicts[i] - for j in range(k): - col = columns[j] - if col in row: - result[i, j] = row[col] - else: - result[i, j] = onan + if hasattr(row, 'keys'): + d = row + for k in d: + v = d[k] + if k not in columns: + if have_columns: + continue + columns[k] = None + result[k] = np.full(n, np.nan, dtype='O') + result[k][i] = v + elif hasattr(row, "_fields"): + if type(row) not in nt_lookup: + l = [] + for j, k in enumerate(row._fields): + if k in columns or not have_columns: + # include this field in result + l.append((k, j)) + # create an array to store it + if k not in columns: + columns[k] = None + result[k] = np.full(n, np.nan, dtype='O') + # save (column_name, index) pairs + nt_lookup[type(row)] = l + + for k, j in nt_lookup[type(row)]: + result[k][i] = row[j] + else: + msg = "'%s' at row %d is not a valid record type" + raise ValueError(msg % (type(row), i)) - return result + return 
list(columns), list(result.values()) def fast_zip(list ndarrays): diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 9d622d92e0979..ca6caa6053a03 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -9,6 +9,7 @@ from pandas._libs.tslibs import NaT, Timedelta, Timestamp, iNaT from pandas._libs.tslibs.fields import get_timedelta_field from pandas._libs.tslibs.timedeltas import ( + Components, array_to_timedelta64, parse_timedelta_unit, precision_from_unit, @@ -901,7 +902,9 @@ def components(self): def f(x): if isna(x): - return [np.nan] * len(columns) + return Components( + np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan + ) return x.components else: diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py index 461b5cc6232cd..843d372ee7103 100644 --- a/pandas/core/dtypes/inference.py +++ b/pandas/core/dtypes/inference.py @@ -3,7 +3,7 @@ from collections import abc from numbers import Number import re -from typing import Pattern +from typing import NamedTuple, Pattern import numpy as np @@ -380,7 +380,9 @@ def is_named_tuple(obj): False """ - return isinstance(obj, tuple) and hasattr(obj, "_fields") + return isinstance(obj, NamedTuple) or ( + isinstance(obj, tuple) and hasattr(obj, "_fields") + ) def is_hashable(obj): diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c15f4ad8e1900..5b7bea1fb2e0c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -65,7 +65,6 @@ is_integer_dtype, is_iterator, is_list_like, - is_named_tuple, is_nested_list_like, is_object_dtype, is_scalar, @@ -444,8 +443,6 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, copy=False): data = list(data) if len(data) > 0: if is_list_like(data[0]) and getattr(data[0], "ndim", 1) == 1: - if is_named_tuple(data[0]) and columns is None: - columns = data[0]._fields arrays, columns = to_arrays(data, columns, dtype=dtype) columns = ensure_index(columns) diff --git 
a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index c437f686bd17b..cb2853023a600 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -34,6 +34,7 @@ is_integer_dtype, is_iterator, is_list_like, + is_named_tuple, is_object_dtype, pandas_dtype, ) @@ -460,12 +461,12 @@ def to_arrays(data, columns, coerce_float=False, dtype=None): if columns is not None: return [[]] * len(columns), columns return [], [] # columns if columns is not None else [] - if isinstance(data[0], (list, tuple)): - return _list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype) - elif isinstance(data[0], abc.Mapping): - return _list_of_dict_to_arrays( + if isinstance(data[0], abc.Mapping) or is_named_tuple(data[0]): + return _list_of_records_to_arrays( data, columns, coerce_float=coerce_float, dtype=dtype ) + elif isinstance(data[0], (list, tuple)): + return _list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype) elif isinstance(data[0], ABCSeries): return _list_of_series_to_arrays( data, columns, coerce_float=coerce_float, dtype=dtype @@ -535,8 +536,8 @@ def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None): return values.T, columns -def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None): - """Convert list of dicts to numpy arrays +def _list_of_records_to_arrays(data, columns, coerce_float=False, dtype=None): + """Convert list of OrderedDict to numpy array if `columns` is not passed, column names are inferred from the records - for OrderedDict and (on Python>=3.6) dicts, the column names match @@ -556,17 +557,19 @@ def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None): tuple arrays, columns """ - if columns is None: - gen = (list(x.keys()) for x in data) - types = (dict, OrderedDict) if PY36 else OrderedDict - sort = not any(isinstance(d, types) for d in data) + if not PY36 and columns is None: + gen = (list(x.keys() if 
hasattr(x, "keys") else x._fields) for x in data) + sort = not any(isinstance(d, OrderedDict) or is_named_tuple(d) for d in data) columns = lib.fast_unique_multiple_list_gen(gen, sort=sort) + else: + columns = list(columns) if columns is not None else [] # assure that they are of the base dict class and not of derived # classes - data = [(type(d) is dict) and d or dict(d) for d in data] - - content = list(lib.dicts_to_array(data, list(columns)).T) + data = [ + ((type(d) is dict) and d) or (is_named_tuple(d) and d) or dict(d) for d in data + ] + columns, content = lib.dicts_to_array(data, columns) return _convert_object_array( content, columns, dtype=dtype, coerce_float=coerce_float ) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 29e46ac70c943..3fded92c7467b 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1336,8 +1336,36 @@ def test_constructor_list_of_namedtuples(self): tm.assert_frame_equal(result, expected) # with columns - expected = DataFrame({"y": [1, 2], "z": [3, 4]}) - result = DataFrame(tuples, columns=["y", "z"]) + # namedtuples now behave like records, so columns + # act like lookups, not rename + expected = DataFrame({"a": [1, 2], "x": [np.nan, np.nan]}) + result = DataFrame(tuples, columns=["a", "x"]) + tm.assert_frame_equal(result, expected) + + # new-style NamedTuple + # NOTE: Enable after py3.5 support is dropped + # from typing import NamedTuple + # class named_tuple3(NamedTuple): + # a: int + # b: int + # named_tuple3 = namedtuple("named_tuple3", list("ab")) + # tuples = [named_tuple3(1, 3), named_tuple3(2, 4)] + # expected = DataFrame({"a": [1, 2], "b": [3, 4]}) + # result = DataFrame(tuples) + # tm.assert_frame_equal(result, expected) + + expected = DataFrame({"a": [1, 2], "b": [3, 4]}) + result = DataFrame(tuples, columns=["a", "b"]) + tm.assert_frame_equal(result, expected) + + # hetero columns + named_tuple1 = namedtuple("Pandas", 
list("ab")) + named_tuple2 = namedtuple("sandaP", list("yabx")) + tuples = [named_tuple1(1, 2), named_tuple2(3, 4, 5, 6)] + result = DataFrame(tuples) + expected = pd.DataFrame( + {"a": [1, 4], "b": [2, 5], "y": [np.nan, 3.0], "x": [np.nan, 6.0]} + ) tm.assert_frame_equal(result, expected) def test_constructor_list_of_dict_order(self):