diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 2592be9c4a350..7d117d6822071 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -613,6 +613,7 @@ Categorical ^^^^^^^^^^^ - Bug in setting dtype-incompatible values into a :class:`Categorical` (or ``Series`` or ``DataFrame`` backed by ``Categorical``) raising ``ValueError`` instead of ``TypeError`` (:issue:`41919`) - Bug in :meth:`Categorical.searchsorted` when passing a dtype-incompatible value raising ``KeyError`` instead of ``TypeError`` (:issue:`41919`) +- Bug in :meth:`Categorical.astype` casting datetimes and :class:`Timestamp` to int for dtype ``object`` (:issue:`44930`) - Bug in :meth:`Series.where` with ``CategoricalDtype`` when passing a dtype-incompatible value raising ``ValueError`` instead of ``TypeError`` (:issue:`41919`) - Bug in :meth:`Categorical.fillna` when passing a dtype-incompatible value raising ``ValueError`` instead of ``TypeError`` (:issue:`41919`) - Bug in :meth:`Categorical.fillna` with a tuple-like category raising ``ValueError`` instead of ``TypeError`` when filling with a non-category tuple (:issue:`41919`) @@ -760,6 +761,7 @@ I/O - Bug in :class:`ExcelWriter`, where ``engine_kwargs`` were not passed through to all engines (:issue:`43442`) - Bug in :func:`read_csv` raising ``ValueError`` when ``parse_dates`` was used with ``MultiIndex`` columns (:issue:`8991`) - Bug in :func:`read_csv` not raising an ``ValueError`` when ``\n`` was specified as ``delimiter`` or ``sep`` which conflicts with ``lineterminator`` (:issue:`43528`) +- Bug in :func:`to_csv` converting datetimes in categorical :class:`Series` to integers (:issue:`40754`) - Bug in :func:`read_csv` converting columns to numeric after date parsing failed (:issue:`11019`) - Bug in :func:`read_csv` not replacing ``NaN`` values with ``np.nan`` before attempting date conversion (:issue:`26203`) - Bug in :func:`read_csv` raising ``AttributeError`` when attempting to read a .csv file and infer index column dtype from an nullable integer type (:issue:`44079`) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index f9d066f1e694d..abd35f72aaa05 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -108,6 +108,7 @@ ) import pandas.core.common as com from pandas.core.construction import ( + ensure_wrapped_if_datetimelike, extract_array, sanitize_array, ) @@ -538,8 +539,12 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: else: # GH8628 (PERF): astype category codes instead of astyping array - try: + if is_datetime64_dtype(self.categories): + new_cats = ensure_wrapped_if_datetimelike(self.categories._values) + else: new_cats = np.asarray(self.categories) + + try: new_cats = new_cats.astype(dtype=dtype, copy=copy) fill_value = lib.item_from_zerodim(np.array(np.nan).astype(dtype)) except ( diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index abbebcefc7a87..ba4a8ac202e36 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -46,6 +46,7 @@ soft_convert_objects, ) from pandas.core.dtypes.common import ( + ensure_platform_int, is_1d_only_ea_dtype, is_1d_only_ea_obj, is_dtype_equal, @@ -88,6 +89,7 @@ replace_regex, should_use_regex, ) +from pandas.core.array_algos.take import take_nd from pandas.core.array_algos.transforms import shift from pandas.core.arrays import ( Categorical, @@ -2087,6 +2089,14 @@ def to_native_types( **kwargs, ) -> np.ndarray: """convert to our native types format""" + if isinstance(values, Categorical): + # GH#40754 Convert categorical datetimes to datetime array + values = take_nd( + values.categories._values, + ensure_platform_int(values._codes), + fill_value=na_rep, + ) + values = ensure_wrapped_if_datetimelike(values) if isinstance(values, (DatetimeArray, TimedeltaArray)): diff --git a/pandas/tests/arrays/categorical/test_dtypes.py b/pandas/tests/arrays/categorical/test_dtypes.py index 209891ba8f043..8733bfccd9f9d 100644 --- a/pandas/tests/arrays/categorical/test_dtypes.py +++ b/pandas/tests/arrays/categorical/test_dtypes.py @@ -7,8 +7,10 @@ Categorical, CategoricalIndex, Index, + NaT, Series, Timestamp, + to_datetime, ) import pandas._testing as tm @@ -176,6 +178,20 @@ def test_astype_category(self, dtype_ordered, cat_ordered): expected = cat tm.assert_categorical_equal(result, expected) + def test_astype_object_datetime_categories(self): + # GH#40754 + cat = Categorical(to_datetime(["2021-03-27", NaT])) + result = cat.astype(object) + expected = np.array([Timestamp("2021-03-27 00:00:00"), np.nan], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + def test_astype_object_timestamp_categories(self): + # GH#18024 + cat = Categorical([Timestamp("2014-01-01")]) + result = cat.astype(object) + expected = np.array([Timestamp("2014-01-01 00:00:00")], dtype="object") + tm.assert_numpy_array_equal(result, expected) + def test_iter_python_types(self): # GH-19909 cat = Categorical([1, 2]) diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 8815423d95d65..bf17132d1b9c2 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -282,6 +282,21 @@ def test_to_csv_date_format(self): df_sec_grouped = df_sec.groupby([pd.Grouper(key="A", freq="1h"), "B"]) assert df_sec_grouped.mean().to_csv(date_format="%Y-%m-%d") == expected_ymd_sec + def test_to_csv_date_format_in_categorical(self): + # GH#40754 + ser = pd.Series(pd.to_datetime(["2021-03-27", pd.NaT], format="%Y-%m-%d")) + ser = ser.astype("category") + expected = tm.convert_rows_list_to_csv_str(["0", "2021-03-27", '""']) + assert ser.to_csv(index=False) == expected + + ser = pd.Series( + pd.date_range( + start="2021-03-27", freq="D", periods=1, tz="Europe/Berlin" + ).append(pd.DatetimeIndex([pd.NaT])) + ) + ser = ser.astype("category") + assert ser.to_csv(index=False, date_format="%Y-%m-%d") == expected + def test_to_csv_multi_index(self): # see gh-6618 df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1], [2]])) diff --git a/pandas/tests/series/indexing/test_where.py b/pandas/tests/series/indexing/test_where.py index bf4bee203a3a1..9ed04885bd9e1 100644 --- a/pandas/tests/series/indexing/test_where.py +++ b/pandas/tests/series/indexing/test_where.py @@ -1,8 +1,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - from pandas.core.dtypes.common import is_integer import pandas as pd @@ -440,15 +438,10 @@ def test_where_categorical(frame_or_series): tm.assert_equal(exp, res) -def test_where_datetimelike_categorical(request, tz_naive_fixture, using_array_manager): +def test_where_datetimelike_categorical(request, tz_naive_fixture): # GH#37682 tz = tz_naive_fixture - if using_array_manager and tz is None: - # TODO(ArrayManager) DataFrame.values not yet correctly returning datetime array - # for categorical with datetime categories - td.mark_array_manager_not_yet_implemented(request) - dr = date_range("2001-01-01", periods=3, tz=tz)._with_freq(None) lvals = pd.DatetimeIndex([dr[0], dr[1], pd.NaT]) rvals = pd.Categorical([dr[0], pd.NaT, dr[2]])