diff --git a/doc/whats_new/v0.11.rst b/doc/whats_new/v0.11.rst index 8b3b54d73..b36d3a902 100644 --- a/doc/whats_new/v0.11.rst +++ b/doc/whats_new/v0.11.rst @@ -28,6 +28,10 @@ Bug fixes in the multiclass case as well. :pr:`1015` by :user:`Guillaume Lemaitre <glemaitre>`. +- Fix a bug in :class:`~imblearn.over_sampling.BorderlineSMOTE` version 2 where samples + should be generated from the whole dataset and not only from the minority class. + :pr:`1023` by :user:`Guillaume Lemaitre <glemaitre>`. + Version 0.11.0 ============== diff --git a/imblearn/over_sampling/_smote/filter.py b/imblearn/over_sampling/_smote/filter.py index b02fc7be2..878c48fbc 100644 --- a/imblearn/over_sampling/_smote/filter.py +++ b/imblearn/over_sampling/_smote/filter.py @@ -95,6 +95,11 @@ class BorderlineSMOTE(BaseSMOTE): nn_m_ : estimator object Validated m-nearest neighbours created from the `m_neighbors` parameter. + in_danger_indices : dict of ndarray + Dictionary containing the indices of the samples considered in danger that + are used to generate new synthetic samples. The keys correspond to the class + labels. + n_features_in_ : int Number of features in the input dataset. 
@@ -201,6 +206,7 @@ def _fit_resample(self, X, y): X_resampled = X.copy() y_resampled = y.copy() + self.in_danger_indices = {} for class_sample, n_samples in self.sampling_strategy_.items(): if n_samples == 0: continue @@ -208,67 +214,34 @@ def _fit_resample(self, X, y): X_class = _safe_indexing(X, target_class_indices) self.nn_m_.fit(X) - danger_index = self._in_danger_noise( + mask_danger = self._in_danger_noise( self.nn_m_, X_class, class_sample, y, kind="danger" ) - if not any(danger_index): + if not any(mask_danger): continue + X_danger = _safe_indexing(X_class, mask_danger) + self.in_danger_indices[class_sample] = target_class_indices[mask_danger] - self.nn_k_.fit(X_class) - nns = self.nn_k_.kneighbors( - _safe_indexing(X_class, danger_index), return_distance=False - )[:, 1:] - - # divergence between borderline-1 and borderline-2 if self.kind == "borderline-1": - # Create synthetic samples for borderline points. - X_new, y_new = self._make_samples( - _safe_indexing(X_class, danger_index), - y.dtype, - class_sample, - X_class, - nns, - n_samples, - ) - if sparse.issparse(X_new): - X_resampled = sparse.vstack([X_resampled, X_new]) - else: - X_resampled = np.vstack((X_resampled, X_new)) - y_resampled = np.hstack((y_resampled, y_new)) - - elif self.kind == "borderline-2": - random_state = check_random_state(self.random_state) - fractions = random_state.beta(10, 10) - - # only minority - X_new_1, y_new_1 = self._make_samples( - _safe_indexing(X_class, danger_index), - y.dtype, - class_sample, - X_class, - nns, - int(fractions * (n_samples + 1)), - step_size=1.0, - ) - - # we use a one-vs-rest policy to handle the multiclass in which - # new samples will be created considering not only the majority - # class but all over classes. 
- X_new_2, y_new_2 = self._make_samples( - _safe_indexing(X_class, danger_index), - y.dtype, - class_sample, - _safe_indexing(X, np.flatnonzero(y != class_sample)), - nns, - int((1 - fractions) * n_samples), - step_size=0.5, - ) - - if sparse.issparse(X_resampled): - X_resampled = sparse.vstack([X_resampled, X_new_1, X_new_2]) - else: - X_resampled = np.vstack((X_resampled, X_new_1, X_new_2)) - y_resampled = np.hstack((y_resampled, y_new_1, y_new_2)) + X_to_sample_from = X_class # consider the positive class only + else: # self.kind == "borderline-2" + X_to_sample_from = X # consider the whole dataset + + self.nn_k_.fit(X_to_sample_from) + nns = self.nn_k_.kneighbors(X_danger, return_distance=False)[:, 1:] + X_new, y_new = self._make_samples( + X_danger, + y.dtype, + class_sample, + X_to_sample_from, + nns, + n_samples, + ) + if sparse.issparse(X_new): + X_resampled = sparse.vstack([X_resampled, X_new]) + else: + X_resampled = np.vstack((X_resampled, X_new)) + y_resampled = np.hstack((y_resampled, y_new)) return X_resampled, y_resampled diff --git a/imblearn/over_sampling/_smote/tests/test_borderline_smote.py b/imblearn/over_sampling/_smote/tests/test_borderline_smote.py index 7519fcaab..0d85c4dfe 100644 --- a/imblearn/over_sampling/_smote/tests/test_borderline_smote.py +++ b/imblearn/over_sampling/_smote/tests/test_borderline_smote.py @@ -1,53 +1,110 @@ -import numpy as np +from collections import Counter + import pytest -from sklearn.neighbors import NearestNeighbors +from sklearn.datasets import make_classification +from sklearn.linear_model import LogisticRegression from sklearn.utils._testing import assert_allclose, assert_array_equal from imblearn.over_sampling import BorderlineSMOTE -@pytest.fixture -def data(): - X = np.array( - [ - [0.11622591, -0.0317206], - [0.77481731, 0.60935141], - [1.25192108, -0.22367336], - [0.53366841, -0.30312976], - [1.52091956, -0.49283504], - [-0.28162401, -2.10400981], - [0.83680821, 1.72827342], - [0.3084254, 0.33299982], - 
[0.70472253, -0.73309052], - [0.28893132, -0.38761769], - [1.15514042, 0.0129463], - [0.88407872, 0.35454207], - [1.31301027, -0.92648734], - [-1.11515198, -0.93689695], - [-0.18410027, -0.45194484], - [0.9281014, 0.53085498], - [-0.14374509, 0.27370049], - [-0.41635887, -0.38299653], - [0.08711622, 0.93259929], - [1.70580611, -0.11219234], - ] +@pytest.mark.parametrize("kind", ["borderline-1", "borderline-2"]) +def test_borderline_smote_no_in_danger_samples(kind): + """Check that the algorithm behaves properly even on a dataset without any sample + in danger. + """ + X, y = make_classification( + n_samples=500, + n_features=2, + n_informative=2, + n_redundant=0, + n_repeated=0, + n_clusters_per_class=1, + n_classes=3, + weights=[0.1, 0.2, 0.7], + class_sep=1.5, + random_state=1, ) - y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0]) - return X, y + smote = BorderlineSMOTE(kind=kind, m_neighbors=3, k_neighbors=5, random_state=0) + X_res, y_res = smote.fit_resample(X, y) + assert_allclose(X, X_res) + assert_allclose(y, y_res) + assert not smote.in_danger_indices -@pytest.mark.parametrize("kind", ["borderline-1", "borderline-2"]) -def test_borderline_smote(kind, data): - bsmote = BorderlineSMOTE(kind=kind, random_state=42) - bsmote_nn = BorderlineSMOTE( - kind=kind, - random_state=42, - k_neighbors=NearestNeighbors(n_neighbors=6), - m_neighbors=NearestNeighbors(n_neighbors=11), + +def test_borderline_smote_kind(): + """Check the behaviour of the `kind` parameter. + + In short, "borderline-2" generates samples closer to the decision boundary than + "borderline-1". We generate an example where a logistic regression will perform + worse on "borderline-2" than on "borderline-1". 
+ """ + X, y = make_classification( + n_samples=500, + n_features=2, + n_informative=2, + n_redundant=0, + n_repeated=0, + n_clusters_per_class=1, + n_classes=3, + weights=[0.1, 0.2, 0.7], + class_sep=1.0, + random_state=1, + ) + smote = BorderlineSMOTE( + kind="borderline-1", m_neighbors=9, k_neighbors=5, random_state=0 ) + X_res_borderline_1, y_res_borderline_1 = smote.fit_resample(X, y) + smote.set_params(kind="borderline-2") + X_res_borderline_2, y_res_borderline_2 = smote.fit_resample(X, y) - X_res_1, y_res_1 = bsmote.fit_resample(*data) - X_res_2, y_res_2 = bsmote_nn.fit_resample(*data) + score_borderline_1 = ( + LogisticRegression() + .fit(X_res_borderline_1, y_res_borderline_1) + .score(X_res_borderline_1, y_res_borderline_1) + ) + score_borderline_2 = ( + LogisticRegression() + .fit(X_res_borderline_2, y_res_borderline_2) + .score(X_res_borderline_2, y_res_borderline_2) + ) + assert score_borderline_1 > score_borderline_2 + + +def test_borderline_smote_in_danger(): + X, y = make_classification( + n_samples=500, + n_features=2, + n_informative=2, + n_redundant=0, + n_repeated=0, + n_clusters_per_class=1, + n_classes=3, + weights=[0.1, 0.2, 0.7], + class_sep=0.8, + random_state=1, + ) + smote = BorderlineSMOTE( + kind="borderline-1", + m_neighbors=9, + k_neighbors=5, + random_state=0, + ) + _, y_res_1 = smote.fit_resample(X, y) + in_danger_indices_borderline_1 = smote.in_danger_indices + smote.set_params(kind="borderline-2") + _, y_res_2 = smote.fit_resample(X, y) + in_danger_indices_borderline_2 = smote.in_danger_indices - assert_allclose(X_res_1, X_res_2) - assert_array_equal(y_res_1, y_res_2) + for key1, key2 in zip( + in_danger_indices_borderline_1, in_danger_indices_borderline_2 + ): + assert_array_equal( + in_danger_indices_borderline_1[key1], in_danger_indices_borderline_2[key2] + ) + assert len(in_danger_indices_borderline_1) == len(in_danger_indices_borderline_2) + counter = Counter(y_res_1) + assert counter[0] == counter[1] == counter[2] + counter 
= Counter(y_res_2) + assert counter[0] == counter[1] == counter[2]