-import numpy as np
+from collections import Counter
+
 import pytest
-from sklearn.neighbors import NearestNeighbors
+from sklearn.datasets import make_classification
+from sklearn.linear_model import LogisticRegression
 from sklearn.utils._testing import assert_allclose, assert_array_equal

 from imblearn.over_sampling import BorderlineSMOTE


-@pytest.fixture
-def data():
-    X = np.array(
-        [
-            [0.11622591, -0.0317206],
-            [0.77481731, 0.60935141],
-            [1.25192108, -0.22367336],
-            [0.53366841, -0.30312976],
-            [1.52091956, -0.49283504],
-            [-0.28162401, -2.10400981],
-            [0.83680821, 1.72827342],
-            [0.3084254, 0.33299982],
-            [0.70472253, -0.73309052],
-            [0.28893132, -0.38761769],
-            [1.15514042, 0.0129463],
-            [0.88407872, 0.35454207],
-            [1.31301027, -0.92648734],
-            [-1.11515198, -0.93689695],
-            [-0.18410027, -0.45194484],
-            [0.9281014, 0.53085498],
-            [-0.14374509, 0.27370049],
-            [-0.41635887, -0.38299653],
-            [0.08711622, 0.93259929],
-            [1.70580611, -0.11219234],
-        ]
+@pytest.mark.parametrize("kind", ["borderline-1", "borderline-2"])
+def test_borderline_smote_no_in_danger_samples(kind):
+    """Check that the algorithm behaves properly even on a dataset without any
+    sample in danger.
+    """
+    X, y = make_classification(
+        n_samples=500,
+        n_features=2,
+        n_informative=2,
+        n_redundant=0,
+        n_repeated=0,
+        n_clusters_per_class=1,
+        n_classes=3,
+        weights=[0.1, 0.2, 0.7],
+        class_sep=1.5,
+        random_state=1,
     )
-    y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0])
-    return X, y
+    smote = BorderlineSMOTE(kind=kind, m_neighbors=3, k_neighbors=5, random_state=0)
+    X_res, y_res = smote.fit_resample(X, y)

+    assert_allclose(X, X_res)
+    assert_allclose(y, y_res)
+    assert not smote.in_danger_indices

-@pytest.mark.parametrize("kind", ["borderline-1", "borderline-2"])
-def test_borderline_smote(kind, data):
-    bsmote = BorderlineSMOTE(kind=kind, random_state=42)
-    bsmote_nn = BorderlineSMOTE(
-        kind=kind,
-        random_state=42,
-        k_neighbors=NearestNeighbors(n_neighbors=6),
-        m_neighbors=NearestNeighbors(n_neighbors=11),
+
+def test_borderline_smote_kind():
+    """Check the behaviour of the `kind` parameter.
+
+    In short, "borderline-2" generates samples closer to the decision boundary than
+    "borderline-1". We generate an example where a logistic regression will perform
+    worse on "borderline-2" than on "borderline-1".
+    """
+    X, y = make_classification(
+        n_samples=500,
+        n_features=2,
+        n_informative=2,
+        n_redundant=0,
+        n_repeated=0,
+        n_clusters_per_class=1,
+        n_classes=3,
+        weights=[0.1, 0.2, 0.7],
+        class_sep=1.0,
+        random_state=1,
+    )
+    smote = BorderlineSMOTE(
+        kind="borderline-1", m_neighbors=9, k_neighbors=5, random_state=0
     )
+    X_res_borderline_1, y_res_borderline_1 = smote.fit_resample(X, y)
+    smote.set_params(kind="borderline-2")
+    X_res_borderline_2, y_res_borderline_2 = smote.fit_resample(X, y)

-    X_res_1, y_res_1 = bsmote.fit_resample(*data)
-    X_res_2, y_res_2 = bsmote_nn.fit_resample(*data)
+    score_borderline_1 = (
+        LogisticRegression()
+        .fit(X_res_borderline_1, y_res_borderline_1)
+        .score(X_res_borderline_1, y_res_borderline_1)
+    )
+    score_borderline_2 = (
+        LogisticRegression()
+        .fit(X_res_borderline_2, y_res_borderline_2)
+        .score(X_res_borderline_2, y_res_borderline_2)
+    )
+    assert score_borderline_1 > score_borderline_2
+
+
+def test_borderline_smote_in_danger():
+    """Check that both borderline kinds flag the same in-danger samples and that
+    resampling balances the three classes.
+    """
+    X, y = make_classification(
+        n_samples=500,
+        n_features=2,
+        n_informative=2,
+        n_redundant=0,
+        n_repeated=0,
+        n_clusters_per_class=1,
+        n_classes=3,
+        weights=[0.1, 0.2, 0.7],
+        class_sep=0.8,
+        random_state=1,
+    )
+    smote = BorderlineSMOTE(
+        kind="borderline-1",
+        m_neighbors=9,
+        k_neighbors=5,
+        random_state=0,
+    )
+    _, y_res_1 = smote.fit_resample(X, y)
+    in_danger_indices_borderline_1 = smote.in_danger_indices
+    smote.set_params(kind="borderline-2")
+    _, y_res_2 = smote.fit_resample(X, y)
+    in_danger_indices_borderline_2 = smote.in_danger_indices

-    assert_allclose(X_res_1, X_res_2)
-    assert_array_equal(y_res_1, y_res_2)
+    for key1, key2 in zip(
+        in_danger_indices_borderline_1, in_danger_indices_borderline_2
+    ):
+        assert_array_equal(
+            in_danger_indices_borderline_1[key1], in_danger_indices_borderline_2[key2]
+        )
+    assert len(in_danger_indices_borderline_1) == len(in_danger_indices_borderline_2)
+    counter = Counter(y_res_1)
+    assert counter[0] == counter[1] == counter[2]
+    counter = Counter(y_res_2)
+    assert counter[0] == counter[1] == counter[2]
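
A note on the "in danger" terminology these tests rely on: in the original Borderline-SMOTE paper (Han, Wang & Mao, 2005), a minority sample is "in danger" when at least half, but not all, of its m nearest neighbors belong to another class; samples whose neighbors are all foreign are treated as noise and skipped. Below is a minimal sketch of that criterion, independent of imbalanced-learn's internal implementation; the helper `in_danger_mask` and its signature are illustrative only, not part of the library.

import numpy as np
from sklearn.neighbors import NearestNeighbors


def in_danger_mask(X, y, target_class, m_neighbors=9):
    """Flag samples of ``target_class`` that are "in danger": at least half,
    but not all, of their m nearest neighbors belong to another class."""
    X, y = np.asarray(X), np.asarray(y)
    # Query m + 1 neighbors because each fitted sample is its own nearest neighbor.
    nn = NearestNeighbors(n_neighbors=m_neighbors + 1).fit(X)
    neighbors = nn.kneighbors(X[y == target_class], return_distance=False)[:, 1:]
    n_foreign = (y[neighbors] != target_class).sum(axis=1)
    # All-foreign neighborhoods count as "noise" rather than "danger" in the paper.
    return (n_foreign >= m_neighbors / 2) & (n_foreign < m_neighbors)

Under this criterion, a well-separated dataset such as the one built with class_sep=1.5 above yields no in-danger samples, which is exactly what test_borderline_smote_no_in_danger_samples asserts: the resampled data equals the input and in_danger_indices is empty.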
0 commit comments
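To see what the new assertions mean from the user's side, here is a quick usage sketch. The dict-like behaviour of the in_danger_indices attribute (iterable keys mapping to index arrays, falsy when empty) is inferred from the assertions above rather than from a documented API.

from collections import Counter

from sklearn.datasets import make_classification

from imblearn.over_sampling import BorderlineSMOTE

X, y = make_classification(
    n_samples=500,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_clusters_per_class=1,
    n_classes=3,
    weights=[0.1, 0.2, 0.7],
    class_sep=0.8,
    random_state=1,
)
smote = BorderlineSMOTE(kind="borderline-1", m_neighbors=9, k_neighbors=5, random_state=0)
X_res, y_res = smote.fit_resample(X, y)

# The three classes should be balanced after resampling.
print(Counter(y_res))
# Inspect which samples were flagged as borderline, per class label
# (mapping semantics inferred from the tests above).
for label in smote.in_danger_indices:
    print(label, len(smote.in_danger_indices[label]))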