diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index f6f956832eebe..93993fd0a0cab 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -63,11 +63,12 @@ def get_dtype_kinds(l): return typs -def _get_series_result_type(result): +def _get_series_result_type(result, objs=None): """ return appropriate class of Series concat input is either dict or array-like """ + # concat Series with axis 1 if isinstance(result, dict): # concat Series with axis 1 if all(is_sparse(c) for c in compat.itervalues(result)): @@ -77,13 +78,12 @@ def _get_series_result_type(result): from pandas.core.frame import DataFrame return DataFrame - elif is_sparse(result): - # concat Series with axis 1 + # otherwise it is a SingleBlockManager (axis = 0) + if result._block.is_sparse: from pandas.core.sparse.api import SparseSeries return SparseSeries else: - from pandas.core.series import Series - return Series + return objs[0]._constructor def _get_frame_result_type(result, objs): diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 689f5521e1ccb..f4f231be570c2 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -102,6 +102,7 @@ class Block(PandasObject): _validate_ndim = True _ftype = 'dense' _holder = None + _concatenator = staticmethod(np.concatenate) def __init__(self, values, placement, ndim=None, fastpath=False): if ndim is None: @@ -314,6 +315,15 @@ def ftype(self): def merge(self, other): return _merge_blocks([self, other]) + def concat_same_type(self, to_concat, placement=None): + """ + Concatenate list of single blocks of the same type. + """ + values = self._concatenator([blk.values for blk in to_concat], + axis=self.ndim - 1) + return self.make_block_same_class( + values, placement=placement or slice(0, len(values), 1)) + def reindex_axis(self, indexer, method=None, axis=1, fill_value=None, limit=None, mask_info=None): """ @@ -2309,6 +2319,7 @@ class CategoricalBlock(NonConsolidatableMixIn, ObjectBlock): _verify_integrity = True _can_hold_na = True _holder = Categorical + _concatenator = staticmethod(_concat._concat_categorical) def __init__(self, values, placement, fastpath=False, **kwargs): @@ -2432,6 +2443,17 @@ def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs): # we are expected to return a 2-d ndarray return values.reshape(1, len(values)) + def concat_same_type(self, to_concat, placement=None): + """ + Concatenate list of single blocks of the same type. + """ + values = self._concatenator([blk.values for blk in to_concat], + axis=self.ndim - 1) + # not using self.make_block_same_class as values can be object dtype + return make_block( + values, placement=placement or slice(0, len(values), 1), + ndim=self.ndim) + class DatetimeBlock(DatetimeLikeBlockMixin, Block): __slots__ = () @@ -2571,6 +2593,7 @@ class DatetimeTZBlock(NonConsolidatableMixIn, DatetimeBlock): """ implement a datetime64 block with a tz attribute """ __slots__ = () _holder = DatetimeIndex + _concatenator = staticmethod(_concat._concat_datetime) is_datetimetz = True def __init__(self, values, placement, ndim=2, **kwargs): @@ -2711,6 +2734,16 @@ def shift(self, periods, axis=0, mgr=None): return [self.make_block_same_class(new_values, placement=self.mgr_locs)] + def concat_same_type(self, to_concat, placement=None): + """ + Concatenate list of single blocks of the same type. + """ + values = self._concatenator([blk.values for blk in to_concat], + axis=self.ndim - 1) + # not using self.make_block_same_class as values can be non-tz dtype + return make_block( + values, placement=placement or slice(0, len(values), 1)) + class SparseBlock(NonConsolidatableMixIn, Block): """ implement as a list of sparse arrays of the same dtype """ @@ -2721,6 +2754,7 @@ class SparseBlock(NonConsolidatableMixIn, Block): _can_hold_na = True _ftype = 'sparse' _holder = SparseArray + _concatenator = staticmethod(_concat._concat_sparse) @property def shape(self): @@ -4517,6 +4551,45 @@ def fast_xs(self, loc): """ return self._block.values[loc] + def concat(self, to_concat, new_axis): + """ + Concatenate a list of SingleBlockManagers into a single + SingleBlockManager. + + Used for pd.concat of Series objects with axis=0. + + Parameters + ---------- + to_concat : list of SingleBlockManagers + new_axis : Index of the result + + Returns + ------- + SingleBlockManager + + """ + non_empties = [x for x in to_concat if len(x) > 0] + + # check if all series are of the same block type: + if len(non_empties) > 0: + blocks = [obj.blocks[0] for obj in non_empties] + + if all([type(b) is type(blocks[0]) for b in blocks[1:]]): # noqa + new_block = blocks[0].concat_same_type(blocks) + else: + values = [x.values for x in blocks] + values = _concat._concat_compat(values) + new_block = make_block( + values, placement=slice(0, len(values), 1)) + else: + values = [x._block.values for x in to_concat] + values = _concat._concat_compat(values) + new_block = make_block( + values, placement=slice(0, len(values), 1)) + + mgr = SingleBlockManager(new_block, new_axis) + return mgr + def construction_error(tot_items, block_shape, axes, e=None): """ raise a helpful message about our construction """ @@ -5105,13 +5178,42 @@ def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy): [get_mgr_concatenation_plan(mgr, indexers) for mgr, indexers in mgrs_indexers], concat_axis) - blocks = [make_block( - concatenate_join_units(join_units, concat_axis, copy=copy), - placement=placement) for placement, join_units in concat_plan] + blocks = [] + + for placement, join_units in concat_plan: + + if is_uniform_join_units(join_units): + b = join_units[0].block.concat_same_type( + [ju.block for ju in join_units], placement=placement) + else: + b = make_block( + concatenate_join_units(join_units, concat_axis, copy=copy), + placement=placement) + blocks.append(b) return BlockManager(blocks, axes) +def is_uniform_join_units(join_units): + """ + Check if the join units consist of blocks of uniform type that can + be concatenated using Block.concat_same_type instead of the generic + concatenate_join_units (which uses `_concat._concat_compat`). + + """ + return ( + # all blocks need to have the same type + all([type(ju.block) is type(join_units[0].block) for ju in join_units]) and # noqa + # no blocks that would get missing values (can lead to type upcasts) + all([not ju.is_na for ju in join_units]) and + # no blocks with indexers (as then the dimensions do not fit) + all([not ju.indexers for ju in join_units]) and + # disregard Panels + all([ju.block.ndim <= 2 for ju in join_units]) and + # only use this path when there is something to concatenate + len(join_units) > 1) + + def get_empty_dtype_and_na(join_units): """ Return dtype and N/A values to use when concatenating specified units. diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 4040c65136617..c54763f8ebde1 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -362,20 +362,12 @@ def get_result(self): # stack blocks if self.axis == 0: - # concat Series with length to keep dtype as much - non_empties = [x for x in self.objs if len(x) > 0] - if len(non_empties) > 0: - values = [x._values for x in non_empties] - else: - values = [x._values for x in self.objs] - new_data = _concat._concat_compat(values) - name = com._consensus_name_attr(self.objs) - cons = _concat._get_series_result_type(new_data) - return (cons(new_data, index=self.new_axes[0], - name=name, dtype=new_data.dtype) - .__finalize__(self, method='concat')) + mgr = self.objs[0]._data.concat([x._data for x in self.objs], + self.new_axes) + cons = _concat._get_series_result_type(mgr, self.objs) + return cons(mgr, name=name).__finalize__(self, method='concat') # combine as columns in a frame else: diff --git a/pandas/tests/internals/test_external_block.py b/pandas/tests/internals/test_external_block.py index cccde76c3e1d9..d98b293ed8daa 100644 --- a/pandas/tests/internals/test_external_block.py +++ b/pandas/tests/internals/test_external_block.py @@ -4,14 +4,26 @@ import numpy as np import pandas as pd -from pandas.core.internals import Block, BlockManager, SingleBlockManager +from pandas.core.internals import ( + Block, BlockManager, SingleBlockManager, NonConsolidatableMixIn) -class CustomBlock(Block): +class CustomBlock(NonConsolidatableMixIn, Block): + + _holder = np.ndarray def formatting_values(self): return np.array(["Val: {}".format(i) for i in self.values]) + def concat_same_type(self, to_concat, placement=None): + """ + Always concatenate disregarding self.ndim as the values are + always 1D in this custom Block + """ + values = np.concatenate([blk.values for blk in to_concat]) + return self.make_block_same_class( + values, placement=placement or slice(0, len(values), 1)) + def test_custom_repr(): values = np.arange(3, dtype='int64') @@ -23,7 +35,30 @@ def test_custom_repr(): assert repr(s) == '0 Val: 0\n1 Val: 1\n2 Val: 2\ndtype: int64' # dataframe - block = CustomBlock(values.reshape(1, -1), placement=slice(0, 1)) + block = CustomBlock(values, placement=slice(0, 1)) blk_mgr = BlockManager([block], [['col'], range(3)]) df = pd.DataFrame(blk_mgr) assert repr(df) == ' col\n0 Val: 0\n1 Val: 1\n2 Val: 2' + + +def test_concat_series(): + # GH17728 + values = np.arange(3, dtype='int64') + block = CustomBlock(values, placement=slice(0, 3)) + s = pd.Series(block, pd.RangeIndex(3), fastpath=True) + + res = pd.concat([s, s]) + assert isinstance(res._data.blocks[0], CustomBlock) + + +def test_concat_dataframe(): + # GH17728 + df = pd.DataFrame({'a': [1, 2, 3]}) + blocks = df._data.blocks + values = np.arange(3, dtype='int64') + custom_block = CustomBlock(values, placement=slice(1, 2)) + blocks = blocks + (custom_block, ) + block_manager = BlockManager(blocks, [pd.Index(['a', 'b']), df.index]) + df = pd.DataFrame(block_manager) + res = pd.concat([df, df]) + assert isinstance(res._data.blocks[1], CustomBlock)