From 80a09141b298572cd544a2db1e2a2831be197447 Mon Sep 17 00:00:00 2001 From: phofl Date: Thu, 16 Dec 2021 19:59:47 +0100 Subject: [PATCH 1/7] BUG: to_csv casting datetimes in categorical to int --- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/core/internals/blocks.py | 4 ++++ pandas/tests/io/formats/test_to_csv.py | 6 ++++++ 3 files changed, 11 insertions(+) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 372f991d96a22..ff5404e88162f 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -749,6 +749,7 @@ I/O - Bug in :func:`read_csv` raising ``ValueError`` when names was longer than header but equal to data rows for ``engine="python"`` (:issue:`38453`) - Bug in :class:`ExcelWriter`, where ``engine_kwargs`` were not passed through to all engines (:issue:`43442`) - Bug in :func:`read_csv` raising ``ValueError`` when ``parse_dates`` was used with ``MultiIndex`` columns (:issue:`8991`) +- Bug in :func:`to_csv` converting datetimes in categorical :class:`Series` to integers (:issue:`40754`) - Bug in :func:`read_csv` converting columns to numeric after date parsing failed (:issue:`11019`) - Bug in :func:`read_csv` not replacing ``NaN`` values with ``np.nan`` before attempting date conversion (:issue:`26203`) - Bug in :func:`read_csv` raising ``AttributeError`` when attempting to read a .csv file and infer index column dtype from an nullable integer type (:issue:`44079`) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 85aa61142dd39..8352692bcc8fe 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2105,6 +2105,10 @@ def to_native_types( **kwargs, ) -> np.ndarray: """convert to our native types format""" + if isinstance(values, Categorical): + # GH#40754 Convert categorical datetimes to datetime array + values = np.asarray(values) + values = ensure_wrapped_if_datetimelike(values) if isinstance(values, (DatetimeArray, TimedeltaArray)): diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 059fd96db43ad..8e61dd4f86ec7 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -282,6 +282,12 @@ def test_to_csv_date_format(self): df_sec_grouped = df_sec.groupby([pd.Grouper(key="A", freq="1h"), "B"]) assert df_sec_grouped.mean().to_csv(date_format="%Y-%m-%d") == expected_ymd_sec + def test_to_csv_date_format_in_categorical(self): + # GH#40754 + ser = pd.Series(pd.to_datetime(["2021-03-27"], format="%Y-%m-%d")) + ser = ser.astype("category") + assert ser.to_csv(index=False) == "0\n2021-03-27\n" + def test_to_csv_multi_index(self): # see gh-6618 df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1], [2]])) From 84ecc33ed82d9280b2f5d782704655b2711cdbea Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 17 Dec 2021 00:10:07 +0100 Subject: [PATCH 2/7] Move fix --- pandas/core/arrays/categorical.py | 8 +++++++- pandas/core/internals/blocks.py | 4 ---- pandas/tests/arrays/categorical/test_dtypes.py | 9 +++++++++ pandas/tests/io/formats/test_to_csv.py | 3 ++- 4 files changed, 18 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 67dc6ade25254..480330a0e00e0 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -108,6 +108,7 @@ ) import pandas.core.common as com from pandas.core.construction import ( + ensure_wrapped_if_datetimelike, extract_array, sanitize_array, ) @@ -532,7 +533,12 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: else: # GH8628 (PERF): astype category codes instead of astyping array try: - new_cats = np.asarray(self.categories) + if is_datetime64_dtype(self.categories): + values = ensure_wrapped_if_datetimelike(np.asarray(self.categories)) + new_cats = values._format_native_types() + else: + new_cats = np.asarray(self.categories) + new_cats = new_cats.astype(dtype=dtype, copy=copy) fill_value = lib.item_from_zerodim(np.array(np.nan).astype(dtype)) except ( diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 8352692bcc8fe..85aa61142dd39 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2105,10 +2105,6 @@ def to_native_types( **kwargs, ) -> np.ndarray: """convert to our native types format""" - if isinstance(values, Categorical): - # GH#40754 Convert categorical datetimes to datetime array - values = np.asarray(values) - values = ensure_wrapped_if_datetimelike(values) if isinstance(values, (DatetimeArray, TimedeltaArray)): diff --git a/pandas/tests/arrays/categorical/test_dtypes.py b/pandas/tests/arrays/categorical/test_dtypes.py index 209891ba8f043..511cd1d2c69fb 100644 --- a/pandas/tests/arrays/categorical/test_dtypes.py +++ b/pandas/tests/arrays/categorical/test_dtypes.py @@ -7,8 +7,10 @@ Categorical, CategoricalIndex, Index, + NaT, Series, Timestamp, + to_datetime, ) import pandas._testing as tm @@ -176,6 +178,13 @@ def test_astype_category(self, dtype_ordered, cat_ordered): expected = cat tm.assert_categorical_equal(result, expected) + def test_astype_object_datetime_categories(self): + # GH#40754 + cat = Categorical(to_datetime(["2021-03-27", NaT], format="%Y-%m-%d")) + result = cat.astype(object) + expected = np.array(["2021-03-27", np.nan], dtype="object") + tm.assert_numpy_array_equal(result, expected) + def test_iter_python_types(self): # GH-19909 cat = Categorical([1, 2]) diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 8e61dd4f86ec7..ad186056b6250 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -286,7 +286,8 @@ def test_to_csv_date_format_in_categorical(self): # GH#40754 ser = pd.Series(pd.to_datetime(["2021-03-27"], format="%Y-%m-%d")) ser = ser.astype("category") - assert ser.to_csv(index=False) == "0\n2021-03-27\n" + expected = tm.convert_rows_list_to_csv_str(["0", "2021-03-27"]) + assert ser.to_csv(index=False) == expected def test_to_csv_multi_index(self): # see gh-6618 From 187db9db2b77ab34e4519565d8bd5153258c3e3f Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 17 Dec 2021 11:08:04 +0100 Subject: [PATCH 3/7] Fix both cases --- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/core/arrays/categorical.py | 12 ++++++------ pandas/core/internals/blocks.py | 10 ++++++++++ pandas/tests/arrays/categorical/test_dtypes.py | 4 ++-- pandas/tests/io/formats/test_to_csv.py | 12 ++++++++++-- 5 files changed, 29 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index ca8453205e481..e5be1b1a23a16 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -607,6 +607,7 @@ Categorical ^^^^^^^^^^^ - Bug in setting dtype-incompatible values into a :class:`Categorical` (or ``Series`` or ``DataFrame`` backed by ``Categorical``) raising ``ValueError`` instead of ``TypeError`` (:issue:`41919`) - Bug in :meth:`Categorical.searchsorted` when passing a dtype-incompatible value raising ``KeyError`` instead of ``TypeError`` (:issue:`41919`) +- Bug in :meth:`Categorical.astype` casting datetimes to int for dtype ``object`` (:issue:`44930`) - Bug in :meth:`Series.where` with ``CategoricalDtype`` when passing a dtype-incompatible value raising ``ValueError`` instead of ``TypeError`` (:issue:`41919`) - Bug in :meth:`Categorical.fillna` when passing a dtype-incompatible value raising ``ValueError`` instead of ``TypeError`` (:issue:`41919`) - Bug in :meth:`Categorical.fillna` with a tuple-like category raising ``ValueError`` instead of ``TypeError`` when filling with a non-category tuple (:issue:`41919`) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index db2283e18af19..dd19da5834809 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -539,13 +539,13 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: else: # GH8628 (PERF): astype category codes instead of astyping array - try: - if is_datetime64_dtype(self.categories): - values = ensure_wrapped_if_datetimelike(np.asarray(self.categories)) - new_cats = values._format_native_types() - else: - new_cats = np.asarray(self.categories) + if is_datetime64_dtype(self.categories): + new_cats = ensure_wrapped_if_datetimelike(self.categories._values) + # new_cats = values._format_native_types() + else: + new_cats = np.asarray(self.categories) + try: new_cats = new_cats.astype(dtype=dtype, copy=copy) fill_value = lib.item_from_zerodim(np.array(np.nan).astype(dtype)) except ( diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 7056a34c73008..bf66fa05dcd27 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -46,6 +46,7 @@ soft_convert_objects, ) from pandas.core.dtypes.common import ( + ensure_platform_int, is_1d_only_ea_dtype, is_1d_only_ea_obj, is_dtype_equal, @@ -88,6 +89,7 @@ replace_regex, should_use_regex, ) +from pandas.core.array_algos.take import take_nd from pandas.core.array_algos.transforms import shift from pandas.core.arrays import ( Categorical, @@ -2105,6 +2107,14 @@ def to_native_types( **kwargs, ) -> np.ndarray: """convert to our native types format""" + if isinstance(values, Categorical): + # GH#40754 Convert categorical datetimes to datetime array + values = take_nd( + values.categories._values, + ensure_platform_int(values._codes), + fill_value=na_rep, + ) + values = ensure_wrapped_if_datetimelike(values) if isinstance(values, (DatetimeArray, TimedeltaArray)): diff --git a/pandas/tests/arrays/categorical/test_dtypes.py b/pandas/tests/arrays/categorical/test_dtypes.py index 511cd1d2c69fb..27e73387263a2 100644 --- a/pandas/tests/arrays/categorical/test_dtypes.py +++ b/pandas/tests/arrays/categorical/test_dtypes.py @@ -180,9 +180,9 @@ def test_astype_category(self, dtype_ordered, cat_ordered): def test_astype_object_datetime_categories(self): # GH#40754 - cat = Categorical(to_datetime(["2021-03-27", NaT], format="%Y-%m-%d")) + cat = Categorical(to_datetime(["2021-03-27", NaT])) result = cat.astype(object) - expected = np.array(["2021-03-27", np.nan], dtype="object") + expected = np.array([Timestamp("2021-03-27 00:00:00"), np.nan], dtype="object") tm.assert_numpy_array_equal(result, expected) def test_iter_python_types(self): diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index f25b246111f42..bf17132d1b9c2 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -284,11 +284,19 @@ def test_to_csv_date_format(self): def test_to_csv_date_format_in_categorical(self): # GH#40754 - ser = pd.Series(pd.to_datetime(["2021-03-27"], format="%Y-%m-%d")) + ser = pd.Series(pd.to_datetime(["2021-03-27", pd.NaT], format="%Y-%m-%d")) ser = ser.astype("category") - expected = tm.convert_rows_list_to_csv_str(["0", "2021-03-27"]) + expected = tm.convert_rows_list_to_csv_str(["0", "2021-03-27", '""']) assert ser.to_csv(index=False) == expected + ser = pd.Series( + pd.date_range( + start="2021-03-27", freq="D", periods=1, tz="Europe/Berlin" + ).append(pd.DatetimeIndex([pd.NaT])) + ) + ser = ser.astype("category") + assert ser.to_csv(index=False, date_format="%Y-%m-%d") == expected + def test_to_csv_multi_index(self): # see gh-6618 df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1], [2]])) From bc2cd3de61947fe02d67c5ecb6d79e0152afc025 Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 17 Dec 2021 12:56:01 +0100 Subject: [PATCH 4/7] Remove array manager skip --- pandas/tests/series/indexing/test_where.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/pandas/tests/series/indexing/test_where.py b/pandas/tests/series/indexing/test_where.py index bf4bee203a3a1..b8dbe58b95d4a 100644 --- a/pandas/tests/series/indexing/test_where.py +++ b/pandas/tests/series/indexing/test_where.py @@ -1,8 +1,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - from pandas.core.dtypes.common import is_integer import pandas as pd @@ -444,11 +442,6 @@ def test_where_datetimelike_categorical(request, tz_naive_fixture, using_array_m # GH#37682 tz = tz_naive_fixture - if using_array_manager and tz is None: - # TODO(ArrayManager) DataFrame.values not yet correctly returning datetime array - # for categorical with datetime categories - td.mark_array_manager_not_yet_implemented(request) - dr = date_range("2001-01-01", periods=3, tz=tz)._with_freq(None) lvals = pd.DatetimeIndex([dr[0], dr[1], pd.NaT]) rvals = pd.Categorical([dr[0], pd.NaT, dr[2]]) From 7d61d4213edbeab986833a997f8df99edad13d41 Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 17 Dec 2021 23:01:50 +0100 Subject: [PATCH 5/7] Remove comment --- pandas/core/arrays/categorical.py | 1 - pandas/tests/series/indexing/test_where.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index dd19da5834809..abd35f72aaa05 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -541,7 +541,6 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: # GH8628 (PERF): astype category codes instead of astyping array if is_datetime64_dtype(self.categories): new_cats = ensure_wrapped_if_datetimelike(self.categories._values) - # new_cats = values._format_native_types() else: new_cats = np.asarray(self.categories) diff --git a/pandas/tests/series/indexing/test_where.py b/pandas/tests/series/indexing/test_where.py index b8dbe58b95d4a..9ed04885bd9e1 100644 --- a/pandas/tests/series/indexing/test_where.py +++ b/pandas/tests/series/indexing/test_where.py @@ -438,7 +438,7 @@ def test_where_categorical(frame_or_series): tm.assert_equal(exp, res) -def test_where_datetimelike_categorical(request, tz_naive_fixture, using_array_manager): +def test_where_datetimelike_categorical(request, tz_naive_fixture): # GH#37682 tz = tz_naive_fixture From fc5d27356a46d75aa2bc377cc107c510e9a3a04b Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 21 Dec 2021 18:08:09 +0100 Subject: [PATCH 6/7] Add test --- pandas/tests/arrays/categorical/test_dtypes.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/tests/arrays/categorical/test_dtypes.py b/pandas/tests/arrays/categorical/test_dtypes.py index 27e73387263a2..8733bfccd9f9d 100644 --- a/pandas/tests/arrays/categorical/test_dtypes.py +++ b/pandas/tests/arrays/categorical/test_dtypes.py @@ -185,6 +185,13 @@ def test_astype_object_datetime_categories(self): expected = np.array([Timestamp("2021-03-27 00:00:00"), np.nan], dtype="object") tm.assert_numpy_array_equal(result, expected) + def test_astype_object_timestamp_categories(self): + # GH#18024 + cat = Categorical([Timestamp("2014-01-01")]) + result = cat.astype(object) + expected = np.array([Timestamp("2014-01-01 00:00:00")], dtype="object") + tm.assert_numpy_array_equal(result, expected) + def test_iter_python_types(self): # GH-19909 cat = Categorical([1, 2]) From d776d9275116adec4626f7e2ff77fd285feb0d06 Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 21 Dec 2021 18:09:43 +0100 Subject: [PATCH 7/7] Adjust whatsnew --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 981e06d4faaae..7d117d6822071 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -613,7 +613,7 @@ Categorical ^^^^^^^^^^^ - Bug in setting dtype-incompatible values into a :class:`Categorical` (or ``Series`` or ``DataFrame`` backed by ``Categorical``) raising ``ValueError`` instead of ``TypeError`` (:issue:`41919`) - Bug in :meth:`Categorical.searchsorted` when passing a dtype-incompatible value raising ``KeyError`` instead of ``TypeError`` (:issue:`41919`) -- Bug in :meth:`Categorical.astype` casting datetimes to int for dtype ``object`` (:issue:`44930`) +- Bug in :meth:`Categorical.astype` casting datetimes and :class:`Timestamp` to int for dtype ``object`` (:issue:`44930`) - Bug in :meth:`Series.where` with ``CategoricalDtype`` when passing a dtype-incompatible value raising ``ValueError`` instead of ``TypeError`` (:issue:`41919`) - Bug in :meth:`Categorical.fillna` when passing a dtype-incompatible value raising ``ValueError`` instead of ``TypeError`` (:issue:`41919`) - Bug in :meth:`Categorical.fillna` with a tuple-like category raising ``ValueError`` instead of ``TypeError`` when filling with a non-category tuple (:issue:`41919`)