Skip to content

Commit 2859cb0

Browse files
authored
FIX BorderlineSMOTE-2 use the full dataset to generate new sample (#1023)
1 parent d431b9d commit 2859cb0

File tree

3 files changed

+131
-97
lines changed

3 files changed

+131
-97
lines changed

doc/whats_new/v0.11.rst

+4
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,10 @@ Bug fixes
2828
in the multiclass case as well.
2929
:pr:`1015` by :user:`Guillaume Lemaitre <glemaitre>`.
3030

31+
- Fix a bug in :class:`~imblearn.over_sampling.BorderlineSMOTE` version 2 where samples
32+
should be generated from the whole dataset and not only from the minority class.
33+
:pr:`1023` by :user:`Guillaume Lemaitre <glemaitre>`.
34+
3135
Version 0.11.0
3236
==============
3337

imblearn/over_sampling/_smote/filter.py

+29-56
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,11 @@ class BorderlineSMOTE(BaseSMOTE):
9595
nn_m_ : estimator object
9696
Validated m-nearest neighbours created from the `m_neighbors` parameter.
9797
98+
in_danger_indices : dict of ndarray
99+
Dictionary containing the indices of the samples considered in danger that
100+
are used to generate new synthetic samples. The keys correspond to the class
101+
label.
102+
98103
n_features_in_ : int
99104
Number of features in the input dataset.
100105
@@ -201,74 +206,42 @@ def _fit_resample(self, X, y):
201206
X_resampled = X.copy()
202207
y_resampled = y.copy()
203208

209+
self.in_danger_indices = {}
204210
for class_sample, n_samples in self.sampling_strategy_.items():
205211
if n_samples == 0:
206212
continue
207213
target_class_indices = np.flatnonzero(y == class_sample)
208214
X_class = _safe_indexing(X, target_class_indices)
209215

210216
self.nn_m_.fit(X)
211-
danger_index = self._in_danger_noise(
217+
mask_danger = self._in_danger_noise(
212218
self.nn_m_, X_class, class_sample, y, kind="danger"
213219
)
214-
if not any(danger_index):
220+
if not any(mask_danger):
215221
continue
222+
X_danger = _safe_indexing(X_class, mask_danger)
223+
self.in_danger_indices[class_sample] = target_class_indices[mask_danger]
216224

217-
self.nn_k_.fit(X_class)
218-
nns = self.nn_k_.kneighbors(
219-
_safe_indexing(X_class, danger_index), return_distance=False
220-
)[:, 1:]
221-
222-
# divergence between borderline-1 and borderline-2
223225
if self.kind == "borderline-1":
224-
# Create synthetic samples for borderline points.
225-
X_new, y_new = self._make_samples(
226-
_safe_indexing(X_class, danger_index),
227-
y.dtype,
228-
class_sample,
229-
X_class,
230-
nns,
231-
n_samples,
232-
)
233-
if sparse.issparse(X_new):
234-
X_resampled = sparse.vstack([X_resampled, X_new])
235-
else:
236-
X_resampled = np.vstack((X_resampled, X_new))
237-
y_resampled = np.hstack((y_resampled, y_new))
238-
239-
elif self.kind == "borderline-2":
240-
random_state = check_random_state(self.random_state)
241-
fractions = random_state.beta(10, 10)
242-
243-
# only minority
244-
X_new_1, y_new_1 = self._make_samples(
245-
_safe_indexing(X_class, danger_index),
246-
y.dtype,
247-
class_sample,
248-
X_class,
249-
nns,
250-
int(fractions * (n_samples + 1)),
251-
step_size=1.0,
252-
)
253-
254-
# we use a one-vs-rest policy to handle the multiclass in which
255-
# new samples will be created considering not only the majority
256-
# class but all other classes.
257-
X_new_2, y_new_2 = self._make_samples(
258-
_safe_indexing(X_class, danger_index),
259-
y.dtype,
260-
class_sample,
261-
_safe_indexing(X, np.flatnonzero(y != class_sample)),
262-
nns,
263-
int((1 - fractions) * n_samples),
264-
step_size=0.5,
265-
)
266-
267-
if sparse.issparse(X_resampled):
268-
X_resampled = sparse.vstack([X_resampled, X_new_1, X_new_2])
269-
else:
270-
X_resampled = np.vstack((X_resampled, X_new_1, X_new_2))
271-
y_resampled = np.hstack((y_resampled, y_new_1, y_new_2))
226+
X_to_sample_from = X_class # consider the positive class only
227+
else: # self.kind == "borderline-2"
228+
X_to_sample_from = X # consider the whole dataset
229+
230+
self.nn_k_.fit(X_to_sample_from)
231+
nns = self.nn_k_.kneighbors(X_danger, return_distance=False)[:, 1:]
232+
X_new, y_new = self._make_samples(
233+
X_danger,
234+
y.dtype,
235+
class_sample,
236+
X_to_sample_from,
237+
nns,
238+
n_samples,
239+
)
240+
if sparse.issparse(X_new):
241+
X_resampled = sparse.vstack([X_resampled, X_new])
242+
else:
243+
X_resampled = np.vstack((X_resampled, X_new))
244+
y_resampled = np.hstack((y_resampled, y_new))
272245

273246
return X_resampled, y_resampled
274247

Original file line numberDiff line numberDiff line change
@@ -1,53 +1,110 @@
1-
import numpy as np
1+
from collections import Counter
2+
23
import pytest
3-
from sklearn.neighbors import NearestNeighbors
4+
from sklearn.datasets import make_classification
5+
from sklearn.linear_model import LogisticRegression
46
from sklearn.utils._testing import assert_allclose, assert_array_equal
57

68
from imblearn.over_sampling import BorderlineSMOTE
79

810

9-
@pytest.fixture
10-
def data():
11-
X = np.array(
12-
[
13-
[0.11622591, -0.0317206],
14-
[0.77481731, 0.60935141],
15-
[1.25192108, -0.22367336],
16-
[0.53366841, -0.30312976],
17-
[1.52091956, -0.49283504],
18-
[-0.28162401, -2.10400981],
19-
[0.83680821, 1.72827342],
20-
[0.3084254, 0.33299982],
21-
[0.70472253, -0.73309052],
22-
[0.28893132, -0.38761769],
23-
[1.15514042, 0.0129463],
24-
[0.88407872, 0.35454207],
25-
[1.31301027, -0.92648734],
26-
[-1.11515198, -0.93689695],
27-
[-0.18410027, -0.45194484],
28-
[0.9281014, 0.53085498],
29-
[-0.14374509, 0.27370049],
30-
[-0.41635887, -0.38299653],
31-
[0.08711622, 0.93259929],
32-
[1.70580611, -0.11219234],
33-
]
11+
@pytest.mark.parametrize("kind", ["borderline-1", "borderline-2"])
12+
def test_borderline_smote_no_in_danger_samples(kind):
13+
"""Check that the algorithm behaves properly even on a dataset without any sample
14+
in danger.
15+
"""
16+
X, y = make_classification(
17+
n_samples=500,
18+
n_features=2,
19+
n_informative=2,
20+
n_redundant=0,
21+
n_repeated=0,
22+
n_clusters_per_class=1,
23+
n_classes=3,
24+
weights=[0.1, 0.2, 0.7],
25+
class_sep=1.5,
26+
random_state=1,
3427
)
35-
y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0])
36-
return X, y
28+
smote = BorderlineSMOTE(kind=kind, m_neighbors=3, k_neighbors=5, random_state=0)
29+
X_res, y_res = smote.fit_resample(X, y)
3730

31+
assert_allclose(X, X_res)
32+
assert_allclose(y, y_res)
33+
assert not smote.in_danger_indices
3834

39-
@pytest.mark.parametrize("kind", ["borderline-1", "borderline-2"])
40-
def test_borderline_smote(kind, data):
41-
bsmote = BorderlineSMOTE(kind=kind, random_state=42)
42-
bsmote_nn = BorderlineSMOTE(
43-
kind=kind,
44-
random_state=42,
45-
k_neighbors=NearestNeighbors(n_neighbors=6),
46-
m_neighbors=NearestNeighbors(n_neighbors=11),
35+
36+
def test_borderline_smote_kind():
37+
"""Check the behaviour of the `kind` parameter.
38+
39+
In short, "borderline-2" generates samples closer to the decision boundary than
40+
"borderline-1". We generate an example where a logistic regression will perform
41+
worse on "borderline-2" than on "borderline-1".
42+
"""
43+
X, y = make_classification(
44+
n_samples=500,
45+
n_features=2,
46+
n_informative=2,
47+
n_redundant=0,
48+
n_repeated=0,
49+
n_clusters_per_class=1,
50+
n_classes=3,
51+
weights=[0.1, 0.2, 0.7],
52+
class_sep=1.0,
53+
random_state=1,
54+
)
55+
smote = BorderlineSMOTE(
56+
kind="borderline-1", m_neighbors=9, k_neighbors=5, random_state=0
4757
)
58+
X_res_borderline_1, y_res_borderline_1 = smote.fit_resample(X, y)
59+
smote.set_params(kind="borderline-2")
60+
X_res_borderline_2, y_res_borderline_2 = smote.fit_resample(X, y)
4861

49-
X_res_1, y_res_1 = bsmote.fit_resample(*data)
50-
X_res_2, y_res_2 = bsmote_nn.fit_resample(*data)
62+
score_borderline_1 = (
63+
LogisticRegression()
64+
.fit(X_res_borderline_1, y_res_borderline_1)
65+
.score(X_res_borderline_1, y_res_borderline_1)
66+
)
67+
score_borderline_2 = (
68+
LogisticRegression()
69+
.fit(X_res_borderline_2, y_res_borderline_2)
70+
.score(X_res_borderline_2, y_res_borderline_2)
71+
)
72+
assert score_borderline_1 > score_borderline_2
73+
74+
75+
def test_borderline_smote_in_danger():
76+
X, y = make_classification(
77+
n_samples=500,
78+
n_features=2,
79+
n_informative=2,
80+
n_redundant=0,
81+
n_repeated=0,
82+
n_clusters_per_class=1,
83+
n_classes=3,
84+
weights=[0.1, 0.2, 0.7],
85+
class_sep=0.8,
86+
random_state=1,
87+
)
88+
smote = BorderlineSMOTE(
89+
kind="borderline-1",
90+
m_neighbors=9,
91+
k_neighbors=5,
92+
random_state=0,
93+
)
94+
_, y_res_1 = smote.fit_resample(X, y)
95+
in_danger_indices_borderline_1 = smote.in_danger_indices
96+
smote.set_params(kind="borderline-2")
97+
_, y_res_2 = smote.fit_resample(X, y)
98+
in_danger_indices_borderline_2 = smote.in_danger_indices
5199

52-
assert_allclose(X_res_1, X_res_2)
53-
assert_array_equal(y_res_1, y_res_2)
100+
for key1, key2 in zip(
101+
in_danger_indices_borderline_1, in_danger_indices_borderline_2
102+
):
103+
assert_array_equal(
104+
in_danger_indices_borderline_1[key1], in_danger_indices_borderline_2[key2]
105+
)
106+
assert len(in_danger_indices_borderline_1) == len(in_danger_indices_borderline_2)
107+
counter = Counter(y_res_1)
108+
assert counter[0] == counter[1] == counter[2]
109+
counter = Counter(y_res_2)
110+
assert counter[0] == counter[1] == counter[2]

0 commit comments

Comments
 (0)