Skip to content

FIX BorderlineSMOTE-2 use the full dataset to generate new samples #1023

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jul 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions doc/whats_new/v0.11.rst
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ Bug fixes
in the multiclass case as well.
:pr:`1015` by :user:`Guillaume Lemaitre <glemaitre>`.

- Fix a bug in :class:`~imblearn.over_sampling.BorderlineSMOTE` version 2 where samples
should be generated from the whole dataset and not only from the minority class.
:pr:`1023` by :user:`Guillaume Lemaitre <glemaitre>`.

Version 0.11.0
==============

Expand Down
85 changes: 29 additions & 56 deletions imblearn/over_sampling/_smote/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,11 @@ class BorderlineSMOTE(BaseSMOTE):
nn_m_ : estimator object
Validated m-nearest neighbours created from the `m_neighbors` parameter.

in_danger_indices : dict of ndarray
Dictionary containing the indices of the samples considered in danger that
are used to generate new synthetic samples. The keys correspond to the class
labels.

n_features_in_ : int
Number of features in the input dataset.

Expand Down Expand Up @@ -201,74 +206,42 @@ def _fit_resample(self, X, y):
X_resampled = X.copy()
y_resampled = y.copy()

self.in_danger_indices = {}
for class_sample, n_samples in self.sampling_strategy_.items():
if n_samples == 0:
continue
target_class_indices = np.flatnonzero(y == class_sample)
X_class = _safe_indexing(X, target_class_indices)

self.nn_m_.fit(X)
danger_index = self._in_danger_noise(
mask_danger = self._in_danger_noise(
self.nn_m_, X_class, class_sample, y, kind="danger"
)
if not any(danger_index):
if not any(mask_danger):
continue
X_danger = _safe_indexing(X_class, mask_danger)
self.in_danger_indices[class_sample] = target_class_indices[mask_danger]

self.nn_k_.fit(X_class)
nns = self.nn_k_.kneighbors(
_safe_indexing(X_class, danger_index), return_distance=False
)[:, 1:]

# divergence between borderline-1 and borderline-2
if self.kind == "borderline-1":
# Create synthetic samples for borderline points.
X_new, y_new = self._make_samples(
_safe_indexing(X_class, danger_index),
y.dtype,
class_sample,
X_class,
nns,
n_samples,
)
if sparse.issparse(X_new):
X_resampled = sparse.vstack([X_resampled, X_new])
else:
X_resampled = np.vstack((X_resampled, X_new))
y_resampled = np.hstack((y_resampled, y_new))

elif self.kind == "borderline-2":
random_state = check_random_state(self.random_state)
fractions = random_state.beta(10, 10)

# only minority
X_new_1, y_new_1 = self._make_samples(
_safe_indexing(X_class, danger_index),
y.dtype,
class_sample,
X_class,
nns,
int(fractions * (n_samples + 1)),
step_size=1.0,
)

# we use a one-vs-rest policy to handle the multiclass in which
# new samples will be created considering not only the majority
class but all other classes.
X_new_2, y_new_2 = self._make_samples(
_safe_indexing(X_class, danger_index),
y.dtype,
class_sample,
_safe_indexing(X, np.flatnonzero(y != class_sample)),
nns,
int((1 - fractions) * n_samples),
step_size=0.5,
)

if sparse.issparse(X_resampled):
X_resampled = sparse.vstack([X_resampled, X_new_1, X_new_2])
else:
X_resampled = np.vstack((X_resampled, X_new_1, X_new_2))
y_resampled = np.hstack((y_resampled, y_new_1, y_new_2))
X_to_sample_from = X_class # consider the positive class only
else: # self.kind == "borderline-2"
X_to_sample_from = X # consider the whole dataset

self.nn_k_.fit(X_to_sample_from)
nns = self.nn_k_.kneighbors(X_danger, return_distance=False)[:, 1:]
X_new, y_new = self._make_samples(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This implementation does not fully reflect the description of Borderline-SMOTE2 in the paper. The paper says that, when creating samples by interpolating between a minority-class template and a neighbour from the majority class, the difference is multiplied by a factor between 0 and 0.5 (instead of 0 and 1) to ensure the synthetic data stays closer to the minority class.

If I understand this code correctly, we are multiplying everything by a factor between 0 and 1. Please correct me if I am wrong.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, you are right. I forgot to look at the next page of the article. I will try to propose a fix.

X_danger,
y.dtype,
class_sample,
X_to_sample_from,
nns,
n_samples,
)
if sparse.issparse(X_new):
X_resampled = sparse.vstack([X_resampled, X_new])
else:
X_resampled = np.vstack((X_resampled, X_new))
y_resampled = np.hstack((y_resampled, y_new))

return X_resampled, y_resampled

Expand Down
139 changes: 98 additions & 41 deletions imblearn/over_sampling/_smote/tests/test_borderline_smote.py
Original file line number Diff line number Diff line change
@@ -1,53 +1,110 @@
import numpy as np
from collections import Counter

import pytest
from sklearn.neighbors import NearestNeighbors
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.utils._testing import assert_allclose, assert_array_equal

from imblearn.over_sampling import BorderlineSMOTE


@pytest.fixture
def data():
X = np.array(
[
[0.11622591, -0.0317206],
[0.77481731, 0.60935141],
[1.25192108, -0.22367336],
[0.53366841, -0.30312976],
[1.52091956, -0.49283504],
[-0.28162401, -2.10400981],
[0.83680821, 1.72827342],
[0.3084254, 0.33299982],
[0.70472253, -0.73309052],
[0.28893132, -0.38761769],
[1.15514042, 0.0129463],
[0.88407872, 0.35454207],
[1.31301027, -0.92648734],
[-1.11515198, -0.93689695],
[-0.18410027, -0.45194484],
[0.9281014, 0.53085498],
[-0.14374509, 0.27370049],
[-0.41635887, -0.38299653],
[0.08711622, 0.93259929],
[1.70580611, -0.11219234],
]
@pytest.mark.parametrize("kind", ["borderline-1", "borderline-2"])
def test_borderline_smote_no_in_danger_samples(kind):
"""Check that the algorithm behaves properly even on a dataset without any
sample in danger.
"""
X, y = make_classification(
n_samples=500,
n_features=2,
n_informative=2,
n_redundant=0,
n_repeated=0,
n_clusters_per_class=1,
n_classes=3,
weights=[0.1, 0.2, 0.7],
class_sep=1.5,
random_state=1,
)
y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0])
return X, y
smote = BorderlineSMOTE(kind=kind, m_neighbors=3, k_neighbors=5, random_state=0)
X_res, y_res = smote.fit_resample(X, y)

assert_allclose(X, X_res)
assert_allclose(y, y_res)
assert not smote.in_danger_indices

@pytest.mark.parametrize("kind", ["borderline-1", "borderline-2"])
def test_borderline_smote(kind, data):
bsmote = BorderlineSMOTE(kind=kind, random_state=42)
bsmote_nn = BorderlineSMOTE(
kind=kind,
random_state=42,
k_neighbors=NearestNeighbors(n_neighbors=6),
m_neighbors=NearestNeighbors(n_neighbors=11),

def test_borderline_smote_kind():
"""Check the behaviour of the `kind` parameter.

In short, "borderline-2" generates samples closer to the decision boundary than
"borderline-1". We generate an example where a logistic regression will perform
worse on "borderline-2" than on "borderline-1".
"""
X, y = make_classification(
n_samples=500,
n_features=2,
n_informative=2,
n_redundant=0,
n_repeated=0,
n_clusters_per_class=1,
n_classes=3,
weights=[0.1, 0.2, 0.7],
class_sep=1.0,
random_state=1,
)
smote = BorderlineSMOTE(
kind="borderline-1", m_neighbors=9, k_neighbors=5, random_state=0
)
X_res_borderline_1, y_res_borderline_1 = smote.fit_resample(X, y)
smote.set_params(kind="borderline-2")
X_res_borderline_2, y_res_borderline_2 = smote.fit_resample(X, y)

X_res_1, y_res_1 = bsmote.fit_resample(*data)
X_res_2, y_res_2 = bsmote_nn.fit_resample(*data)
score_borderline_1 = (
LogisticRegression()
.fit(X_res_borderline_1, y_res_borderline_1)
.score(X_res_borderline_1, y_res_borderline_1)
)
score_borderline_2 = (
LogisticRegression()
.fit(X_res_borderline_2, y_res_borderline_2)
.score(X_res_borderline_2, y_res_borderline_2)
)
assert score_borderline_1 > score_borderline_2


def test_borderline_smote_in_danger():
X, y = make_classification(
n_samples=500,
n_features=2,
n_informative=2,
n_redundant=0,
n_repeated=0,
n_clusters_per_class=1,
n_classes=3,
weights=[0.1, 0.2, 0.7],
class_sep=0.8,
random_state=1,
)
smote = BorderlineSMOTE(
kind="borderline-1",
m_neighbors=9,
k_neighbors=5,
random_state=0,
)
_, y_res_1 = smote.fit_resample(X, y)
in_danger_indices_borderline_1 = smote.in_danger_indices
smote.set_params(kind="borderline-2")
_, y_res_2 = smote.fit_resample(X, y)
in_danger_indices_borderline_2 = smote.in_danger_indices

assert_allclose(X_res_1, X_res_2)
assert_array_equal(y_res_1, y_res_2)
for key1, key2 in zip(
in_danger_indices_borderline_1, in_danger_indices_borderline_2
):
assert_array_equal(
in_danger_indices_borderline_1[key1], in_danger_indices_borderline_2[key2]
)
assert len(in_danger_indices_borderline_1) == len(in_danger_indices_borderline_2)
counter = Counter(y_res_1)
assert counter[0] == counter[1] == counter[2]
counter = Counter(y_res_2)
assert counter[0] == counter[1] == counter[2]