diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 743d45e1fa400..a8fcd6d03847c 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -103,6 +103,24 @@ def __repr__(self) -> str: def _is_boolean(self) -> bool: return True + def __from_arrow__(self, array): + """Construct BooleanArray from passed pyarrow Array/ChunkedArray""" + import pyarrow + + if isinstance(array, pyarrow.Array): + chunks = [array] + else: + # pyarrow.ChunkedArray + chunks = array.chunks + + results = [] + for arr in chunks: + # TODO should optimize this without going through object array + bool_arr = BooleanArray._from_sequence(np.array(arr)) + results.append(bool_arr) + + return BooleanArray._concat_same_type(results) + def coerce_to_array(values, mask=None, copy: bool = False): """ diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 3bad7f0162f44..0d30aa06cd466 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -86,7 +86,7 @@ def __from_arrow__(self, array): results = [] for arr in chunks: - # using _from_sequence to ensure None is convered to np.nan + # using _from_sequence to ensure None is converted to NA str_arr = StringArray._from_sequence(np.array(arr)) results.append(str_arr) @@ -208,7 +208,10 @@ def __arrow_array__(self, type=None): if type is None: type = pa.string() - return pa.array(self._ndarray, type=type, from_pandas=True) + + values = self._ndarray.copy() + values[self.isna()] = None + return pa.array(values, type=type, from_pandas=True) def _values_for_factorize(self): arr = self._ndarray.copy() diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 0544ee4002890..c3f342f16a0bf 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -235,5 +235,5 @@ def test_arrow_roundtrip(): result = table.to_pandas() assert isinstance(result["a"].dtype, pd.StringDtype) 
tm.assert_frame_equal(result, df) - # ensure the missing value is represented by NaN and not None - assert np.isnan(result.loc[2, "a"]) + # ensure the missing value is represented by NA and not np.nan or None + assert result.loc[2, "a"] is pd.NA diff --git a/pandas/tests/arrays/test_boolean.py b/pandas/tests/arrays/test_boolean.py index 90bcd66987e0d..abec4b42c0ffb 100644 --- a/pandas/tests/arrays/test_boolean.py +++ b/pandas/tests/arrays/test_boolean.py @@ -757,12 +757,29 @@ def test_any_all(values, exp_any, exp_all, exp_any_noskip, exp_all_noskip): # result = arr[mask] -@pytest.mark.skip(reason="broken test") @td.skip_if_no("pyarrow", min_version="0.15.0") def test_arrow_array(data): # protocol added in 0.15.0 import pyarrow as pa arr = pa.array(data) - expected = pa.array(np.array(data, dtype=object), type=pa.bool_(), from_pandas=True) + + # TODO use to_numpy(na_value=None) here + data_object = np.array(data, dtype=object) + data_object[data.isna()] = None + expected = pa.array(data_object, type=pa.bool_(), from_pandas=True) assert arr.equals(expected) + + +@td.skip_if_no("pyarrow", min_version="0.15.1.dev") +def test_arrow_roundtrip(): + # roundtrip possible from arrow 1.0.0 + import pyarrow as pa + + data = pd.array([True, False, None], dtype="boolean") + df = pd.DataFrame({"a": data}) + table = pa.table(df) + assert table.field("a").type == "bool" + result = table.to_pandas() + assert isinstance(result["a"].dtype, pd.BooleanDtype) + tm.assert_frame_equal(result, df) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 0b8677d6b1415..fc3d55e110d69 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -525,7 +525,6 @@ def test_write_with_schema(self, pa): out_df = df.astype(bool) check_round_trip(df, pa, write_kwargs={"schema": schema}, expected=out_df) - @pytest.mark.skip(reason="broken test") @td.skip_if_no("pyarrow", min_version="0.15.0") def test_additional_extension_arrays(self, pa): # test 
additional ExtensionArrays that are supported through the