diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 1699a0d88..d0ce77834 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -32,7 +32,7 @@ jobs: # Linux environment to test the latest available dependencies and MKL. pylatest_pip_openblas_pandas: DISTRIB: 'conda-pip-latest' - PYTHON_VERSION: '3.8' + PYTHON_VERSION: '3.9' COVERAGE: 'true' PANDAS_VERSION: '*' TEST_DOCSTRINGS: 'true' diff --git a/build_tools/azure/test_script.sh b/build_tools/azure/test_script.sh index 37793c529..6f3cf03cc 100755 --- a/build_tools/azure/test_script.sh +++ b/build_tools/azure/test_script.sh @@ -21,7 +21,7 @@ except ImportError: python -c "import multiprocessing as mp; print('%d CPUs' % mp.cpu_count())" pip list -TEST_CMD="python -m pytest --showlocals --durations=20 --junitxml=$JUNITXML" +TEST_CMD="python -m pytest -vsl --durations=20 --junitxml=$JUNITXML" if [[ "$COVERAGE" == "true" ]]; then export COVERAGE_PROCESS_START="$BUILD_SOURCESDIRECTORY/.coveragerc" diff --git a/doc/conf.py b/doc/conf.py index e404258e3..2531683fa 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -43,6 +43,9 @@ 'sphinx_gallery.gen_gallery', ] +# bibtex file +bibtex_bibfiles = ['bibtex/refs.bib'] + # this is needed for some reason... # see https://github.com/numpy/numpydoc/issues/69 numpydoc_show_class_members = False @@ -345,8 +348,8 @@ def patch_signature(subject, bound_method=False, follow_wrapped=True): # https://github.com/readthedocs/sphinx_rtd_theme/pull/747/files def setup(app): app.registry.documenters["class"] = PatchedClassDocumenter - app.add_javascript("js/copybutton.js") - app.add_stylesheet("basic.css") + app.add_js_file("js/copybutton.js") + app.add_css_file("basic.css") # app.connect('autodoc-process-docstring', generate_example_rst) diff --git a/doc/over_sampling.rst b/doc/over_sampling.rst index a154a62dc..e5948cbd6 100644 --- a/doc/over_sampling.rst +++ b/doc/over_sampling.rst @@ -60,7 +60,7 @@ In addition, :class:`RandomOverSampler` allows to sample heterogeneous data >>> import numpy as np >>> X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]], - ... dtype=np.object) + ... dtype=object) >>> y_hetero = np.array([0, 0, 1]) >>> X_resampled, y_resampled = ros.fit_resample(X_hetero, y_hetero) >>> print(X_resampled) diff --git a/doc/under_sampling.rst b/doc/under_sampling.rst index d00aab7ce..13798ad78 100644 --- a/doc/under_sampling.rst +++ b/doc/under_sampling.rst @@ -107,7 +107,7 @@ In addition, :class:`RandomUnderSampler` allows to sample heterogeneous data (e.g. containing some strings):: >>> X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]], - ... dtype=np.object) + ... dtype=object) >>> y_hetero = np.array([0, 0, 1]) >>> X_resampled, y_resampled = rus.fit_resample(X_hetero, y_hetero) >>> print(X_resampled) diff --git a/imblearn/over_sampling/tests/test_random_over_sampler.py b/imblearn/over_sampling/tests/test_random_over_sampler.py index ca58a8012..2acda02e5 100644 --- a/imblearn/over_sampling/tests/test_random_over_sampler.py +++ b/imblearn/over_sampling/tests/test_random_over_sampler.py @@ -115,7 +115,7 @@ def test_multiclass_fit_resample(): def test_random_over_sampling_heterogeneous_data(): X_hetero = np.array( - [["xxx", 1, 1.0], ["yyy", 2, 2.0], ["zzz", 3, 3.0]], dtype=np.object + [["xxx", 1, 1.0], ["yyy", 2, 2.0], ["zzz", 3, 3.0]], dtype=object ) y = np.array([0, 0, 1]) ros = RandomOverSampler(random_state=RND_SEED) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py index 945d31fec..2ca0b3354 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py @@ -101,7 +101,7 @@ def test_multiclass_fit_resample(): def test_random_under_sampling_heterogeneous_data(): X_hetero = np.array( - [["xxx", 1, 1.0], ["yyy", 2, 2.0], ["zzz", 3, 3.0]], dtype=np.object + [["xxx", 1, 1.0], ["yyy", 2, 2.0], ["zzz", 3, 3.0]], dtype=object ) y = np.array([0, 0, 1]) rus = RandomUnderSampler(random_state=RND_SEED) diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 729ceebea..6a3032ebf 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -25,8 +25,8 @@ from sklearn.cluster import KMeans from sklearn.exceptions import SkipTestWarning from sklearn.preprocessing import label_binarize -from sklearn.utils.estimator_checks import _mark_xfail_checks -from sklearn.utils.estimator_checks import _set_check_estimator_ids +from sklearn.utils.estimator_checks import _maybe_mark_xfail +from sklearn.utils.estimator_checks import _get_check_estimator_ids from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_raises_regex from sklearn.utils.multiclass import type_of_target @@ -44,7 +44,7 @@ def _set_checking_parameters(estimator): if name == "ClusterCentroids": estimator.set_params( voting="soft", - estimator=KMeans(random_state=0, algorithm="full"), + estimator=KMeans(random_state=0, algorithm="full", n_init=1), ) if name == "KMeansSMOTE": estimator.set_params(kmeans_estimator=12) @@ -117,21 +117,19 @@ def parametrize_with_checks(estimators): ... def test_sklearn_compatible_estimator(estimator, check): ... check(estimator) """ - names = (type(estimator).__name__ for estimator in estimators) + def checks_generator(): + for estimator in estimators: + name = type(estimator).__name__ + for check in _yield_all_checks(estimator): + check = partial(check, name) + yield _maybe_mark_xfail(estimator, check, pytest) - checks_generator = ((clone(estimator), partial(check, name)) - for name, estimator in zip(names, estimators) - for check in _yield_all_checks(estimator)) + return pytest.mark.parametrize("estimator, check", checks_generator(), + ids=_get_check_estimator_ids) - checks_with_marks = ( - _mark_xfail_checks(estimator, check, pytest) - for estimator, check in checks_generator) - return pytest.mark.parametrize("estimator, check", checks_with_marks, - ids=_set_check_estimator_ids) - - -def check_target_type(name, estimator): +def check_target_type(name, estimator_orig): + estimator = clone(estimator_orig) # should raise warning if the target is continuous (we cannot raise error) X = np.random.random((20, 2)) y = np.linspace(0, 1, 20) @@ -148,7 +146,8 @@ def check_target_type(name, estimator): ) -def check_samplers_one_label(name, sampler): +def check_samplers_one_label(name, sampler_orig): + sampler = clone(sampler_orig) error_string_fit = "Sampler can't balance when only one class is present." X = np.random.random((20, 2)) y = np.zeros(20) @@ -168,7 +167,8 @@ def check_samplers_one_label(name, sampler): raise AssertionError(error_string_fit) -def check_samplers_fit(name, sampler): +def check_samplers_fit(name, sampler_orig): + sampler = clone(sampler_orig) np.random.seed(42) # Make this test reproducible X = np.random.random((30, 2)) y = np.array([1] * 20 + [0] * 10) @@ -178,7 +178,8 @@ def check_samplers_fit(name, sampler): ), "No fitted attribute sampling_strategy_" -def check_samplers_fit_resample(name, sampler): +def check_samplers_fit_resample(name, sampler_orig): + sampler = clone(sampler_orig) X, y = make_classification( n_samples=1000, n_classes=3, @@ -213,7 +214,8 @@ def check_samplers_fit_resample(name, sampler): ) -def check_samplers_sampling_strategy_fit_resample(name, sampler): +def check_samplers_sampling_strategy_fit_resample(name, sampler_orig): + sampler = clone(sampler_orig) # in this test we will force all samplers to not change the class 1 X, y = make_classification( n_samples=1000, @@ -240,7 +242,8 @@ def check_samplers_sampling_strategy_fit_resample(name, sampler): assert Counter(y_res)[1] == expected_stat -def check_samplers_sparse(name, sampler): +def check_samplers_sparse(name, sampler_orig): + sampler = clone(sampler_orig) # check that sparse matrices can be passed through the sampler leading to # the same results than dense X, y = make_classification( @@ -252,14 +255,16 @@ def check_samplers_sparse(name, sampler): ) X_sparse = sparse.csr_matrix(X) X_res_sparse, y_res_sparse = sampler.fit_resample(X_sparse, y) + sampler = clone(sampler) X_res, y_res = sampler.fit_resample(X, y) assert sparse.issparse(X_res_sparse) - assert_allclose(X_res_sparse.A, X_res) + assert_allclose(X_res_sparse.A, X_res, rtol=1e-5) assert_allclose(y_res_sparse, y_res) -def check_samplers_pandas(name, sampler): +def check_samplers_pandas(name, sampler_orig): pd = pytest.importorskip("pandas") + sampler = clone(sampler_orig) # Check that the samplers handle pandas dataframe and pandas series X, y = make_classification( n_samples=1000, @@ -290,7 +295,8 @@ def check_samplers_pandas(name, sampler): assert_allclose(y_res_s.to_numpy(), y_res) -def check_samplers_list(name, sampler): +def check_samplers_list(name, sampler_orig): + sampler = clone(sampler_orig) # Check that the can samplers handle simple lists X, y = make_classification( n_samples=1000, @@ -312,7 +318,8 @@ def check_samplers_list(name, sampler): assert_allclose(y_res, y_res_list) -def check_samplers_multiclass_ova(name, sampler): +def check_samplers_multiclass_ova(name, sampler_orig): + sampler = clone(sampler_orig) # Check that multiclass target lead to the same results than OVA encoding X, y = make_classification( n_samples=1000, @@ -329,7 +336,8 @@ def check_samplers_multiclass_ova(name, sampler): assert_allclose(y_res, y_res_ova.argmax(axis=1)) -def check_samplers_2d_target(name, sampler): +def check_samplers_2d_target(name, sampler_orig): + sampler = clone(sampler_orig) X, y = make_classification( n_samples=100, n_classes=3, @@ -342,7 +350,8 @@ def check_samplers_2d_target(name, sampler): sampler.fit_resample(X, y) -def check_samplers_preserve_dtype(name, sampler): +def check_samplers_preserve_dtype(name, sampler_orig): + sampler = clone(sampler_orig) X, y = make_classification( n_samples=1000, n_classes=3, @@ -358,7 +367,8 @@ def check_samplers_preserve_dtype(name, sampler): assert y.dtype == y_res.dtype, "y dtype is not preserved" -def check_samplers_sample_indices(name, sampler): +def check_samplers_sample_indices(name, sampler_orig): + sampler = clone(sampler_orig) X, y = make_classification( n_samples=1000, n_classes=3, @@ -374,17 +384,21 @@ def check_samplers_sample_indices(name, sampler): assert not hasattr(sampler, "sample_indices_") -def check_classifier_on_multilabel_or_multioutput_targets(name, estimator): +def check_classifier_on_multilabel_or_multioutput_targets( + name, estimator_orig +): + estimator = clone(estimator_orig) X, y = make_multilabel_classification(n_samples=30) msg = "Multilabel and multioutput targets are not supported." with pytest.raises(ValueError, match=msg): estimator.fit(X, y) -def check_classifiers_with_encoded_labels(name, classifier): +def check_classifiers_with_encoded_labels(name, classifier_orig): # Non-regression test for #709 # https://github.com/scikit-learn-contrib/imbalanced-learn/issues/709 pytest.importorskip("pandas") + classifier = clone(classifier_orig) df, y = fetch_openml("iris", version=1, as_frame=True, return_X_y=True) df, y = make_imbalance( df, y, sampling_strategy={