diff --git a/doc/source/text.rst b/doc/source/text.rst index 2b6459b581c1e..1e620acb1f88a 100644 --- a/doc/source/text.rst +++ b/doc/source/text.rst @@ -218,7 +218,8 @@ Extract first match in each subject (extract) ``DataFrame``, depending on the subject and regular expression pattern (same behavior as pre-0.18.0). When ``expand=True`` it always returns a ``DataFrame``, which is more consistent and less - confusing from the perspective of a user. + confusing from the perspective of a user. ``expand=True`` is the + default since version 0.23.0. The ``extract`` method accepts a `regular expression `__ with at least one diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 69965f44d87a8..0ac27a2f23386 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -296,6 +296,53 @@ Build Changes - Building from source now explicitly requires ``setuptools`` in ``setup.py`` (:issue:`18113`) - Updated conda recipe to be in compliance with conda-build 3.0+ (:issue:`18002`) +Extraction of matching patterns from strings +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +By default, extracting matching patterns from strings with :func:`str.extract` used to return a +``Series`` if a single group was being extracted (a ``DataFrame`` if more than one group was +extracted). As of Pandas 0.23.0 :func:`str.extract` always returns a ``DataFrame``, unless +``expand`` is set to ``False`` (:issue:`11386`). + +Also, ``None`` was an accepted value for the ``expand`` parameter (which was equivalent to +``False``), but now raises a ``ValueError``. + +Previous Behavior: + +.. code-block:: ipython + + In [1]: s = pd.Series(['number 10', '12 eggs']) + + In [2]: extracted = s.str.extract('.*(\d\d).*') + + In [3]: extracted + Out [3]: + 0 10 + 1 12 + dtype: object + + In [4]: type(extracted) + Out [4]: + pandas.core.series.Series + +New Behavior: + +.. 
ipython:: python + + s = pd.Series(['number 10', '12 eggs']) + extracted = s.str.extract('.*(\d\d).*') + extracted + type(extracted) + +To restore previous behavior, simply set ``expand`` to ``False``: + +.. ipython:: python + + s = pd.Series(['number 10', '12 eggs']) + extracted = s.str.extract('.*(\d\d).*', expand=False) + extracted + type(extracted) + .. _whatsnew_0230.api: Other API Changes diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 12c7feb5f2b15..b1c1ede66236c 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -598,7 +598,7 @@ def _str_extract_frame(arr, pat, flags=0): dtype=object) -def str_extract(arr, pat, flags=0, expand=None): +def str_extract(arr, pat, flags=0, expand=True): r""" For each subject string in the Series, extract groups from the first match of regular expression pat. @@ -610,7 +610,7 @@ def str_extract(arr, pat, flags=0, expand=None): flags : int, default 0 (no flags) re module flags, e.g. re.IGNORECASE - expand : bool, default False + expand : bool, default True * If True, return DataFrame. * If False, return Series/Index/DataFrame. 
@@ -676,15 +676,6 @@ def str_extract(arr, pat, flags=0, expand=None): dtype: object """ - if expand is None: - warnings.warn( - "currently extract(expand=None) " + - "means expand=False (return Index/Series/DataFrame) " + - "but in a future version of pandas this will be changed " + - "to expand=True (return DataFrame)", - FutureWarning, - stacklevel=3) - expand = False if not isinstance(expand, bool): raise ValueError("expand must be True or False") if expand: @@ -1739,7 +1730,7 @@ def translate(self, table, deletechars=None): findall = _pat_wrapper(str_findall, flags=True) @copy(str_extract) - def extract(self, pat, flags=0, expand=None): + def extract(self, pat, flags=0, expand=True): return str_extract(self, pat, flags=flags, expand=expand) @copy(str_extractall) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 973fe74429551..178c5ff655b04 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -612,13 +612,16 @@ def test_match(self): def test_extract_expand_None(self): values = Series(['fooBAD__barBAD', NA, 'foo']) - with tm.assert_produces_warning(FutureWarning): + with tm.assert_raises_regex(ValueError, + 'expand must be True or False'): values.str.extract('.*(BAD[_]+).*(BAD)', expand=None) def test_extract_expand_unspecified(self): values = Series(['fooBAD__barBAD', NA, 'foo']) - with tm.assert_produces_warning(FutureWarning): - values.str.extract('.*(BAD[_]+).*(BAD)') + result_unspecified = values.str.extract('.*(BAD[_]+).*') + assert isinstance(result_unspecified, DataFrame) + result_true = values.str.extract('.*(BAD[_]+).*', expand=True) + tm.assert_frame_equal(result_unspecified, result_true) def test_extract_expand_False(self): # Contains tests like those in test_match and some others.