From 00feb1465551cf97b0e34de7927bf45ecaf8d696 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 15 Sep 2021 18:51:56 -0700 Subject: [PATCH 1/2] REF: implement make_na_array --- pandas/core/internals/concat.py | 76 ++++++++++++++++++--------------- 1 file changed, 42 insertions(+), 34 deletions(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 69b01f0b26be3..ddd0ba93e8ef6 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -10,7 +10,10 @@ import numpy as np -from pandas._libs import internals as libinternals +from pandas._libs import ( + NaT, + internals as libinternals, +) from pandas._typing import ( ArrayLike, DtypeObj, @@ -387,41 +390,12 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike: if upcasted_na is None and not self.is_na: # No upcasting is necessary - fill_value = self.block.fill_value values = self.block.get_values() + + elif self.is_na: + return make_na_array(empty_dtype, self.shape) + else: - fill_value = upcasted_na - - if self.is_na: - - if is_datetime64tz_dtype(empty_dtype): - i8values = np.full(self.shape, fill_value.value) - return DatetimeArray(i8values, dtype=empty_dtype) - - elif is_1d_only_ea_dtype(empty_dtype): - empty_dtype = cast(ExtensionDtype, empty_dtype) - cls = empty_dtype.construct_array_type() - - missing_arr = cls._from_sequence([], dtype=empty_dtype) - ncols, nrows = self.shape - assert ncols == 1, ncols - empty_arr = -1 * np.ones((nrows,), dtype=np.intp) - return missing_arr.take( - empty_arr, allow_fill=True, fill_value=fill_value - ) - elif isinstance(empty_dtype, ExtensionDtype): - # TODO: no tests get here, a handful would if we disabled - # the dt64tz special-case above (which is faster) - cls = empty_dtype.construct_array_type() - missing_arr = cls._empty(shape=self.shape, dtype=empty_dtype) - missing_arr[:] = fill_value - return missing_arr - else: - # NB: we should never get here with empty_dtype integer or bool; - # if we did, the missing_arr.fill would cast to gibberish - missing_arr = np.empty(self.shape, dtype=empty_dtype) - missing_arr.fill(fill_value) - return missing_arr if (not self.indexers) and (not self.block._can_consolidate): # preserve these for validation in concat_compat @@ -449,6 +423,40 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike: return values +def make_na_array(dtype: DtypeObj, shape: Shape) -> ArrayLike: + """ + Construct an np.ndarray or ExtensionArray of the given dtype and shape + holding all-NA values. + """ + if is_datetime64tz_dtype(dtype): + # NaT here is analogous to dtype.na_value below + i8values = np.full(shape, NaT.value) + return DatetimeArray(i8values, dtype=dtype) + + elif is_1d_only_ea_dtype(dtype): + dtype = cast(ExtensionDtype, dtype) + cls = dtype.construct_array_type() + + missing_arr = cls._from_sequence([], dtype=dtype) + nrows = shape[-1] + taker = -1 * np.ones((nrows,), dtype=np.intp) + return missing_arr.take(taker, allow_fill=True, fill_value=dtype.na_value) + elif isinstance(dtype, ExtensionDtype): + # TODO: no tests get here, a handful would if we disabled + # the dt64tz special-case above (which is faster) + cls = dtype.construct_array_type() + missing_arr = cls._empty(shape=shape, dtype=dtype) + missing_arr[:] = dtype.na_value + return missing_arr + else: + # NB: we should never get here with dtype integer or bool; + # if we did, the missing_arr.fill would cast to gibberish + missing_arr = np.empty(shape, dtype=dtype) + fill_value = _dtype_to_na_value(dtype) + missing_arr.fill(fill_value) + return missing_arr + + def _concatenate_join_units( join_units: list[JoinUnit], concat_axis: int, copy: bool ) -> ArrayLike: From 5e76eeec1f8b5c91a9e01e87ae57156650e5dc55 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 16 Sep 2021 08:46:08 -0700 Subject: [PATCH 2/2] REF: simplify get_reindexed_values --- pandas/core/internals/concat.py | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index ddd0ba93e8ef6..2927416e9214e 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -385,14 +385,10 @@ def is_na(self) -> bool: return True return False - def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike: + def get_reindexed_values(self, empty_dtype: DtypeObj) -> ArrayLike: values: ArrayLike - if upcasted_na is None and not self.is_na: - # No upcasting is necessary - values = self.block.get_values() - - elif self.is_na: + if self.is_na: return make_na_array(empty_dtype, self.shape) else: @@ -401,14 +397,9 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike: # preserve these for validation in concat_compat return self.block.values - if self.block.is_bool: - # External code requested filling/upcasting, bool values must - # be upcasted to object to avoid being upcasted to numeric. - values = self.block.astype(np.object_).values - else: - # No dtype upcasting is done here, it will be performed during - # concatenation itself. - values = self.block.values + # No dtype upcasting is done here, it will be performed during + # concatenation itself. + values = self.block.values if not self.indexers: # If there's no indexing to be done, we want to signal outside @@ -469,12 +460,7 @@ def _concatenate_join_units( empty_dtype = _get_empty_dtype(join_units) - upcasted_na = _dtype_to_na_value(empty_dtype) - - to_concat = [ - ju.get_reindexed_values(empty_dtype=empty_dtype, upcasted_na=upcasted_na) - for ju in join_units - ] + to_concat = [ju.get_reindexed_values(empty_dtype=empty_dtype) for ju in join_units] if len(to_concat) == 1: # Only one block, nothing to concatenate.