From 60b0155ac573a8ad5994c74c49e05854281e2469 Mon Sep 17 00:00:00 2001
From: RektPunk <110188257+RektPunk@users.noreply.github.com>
Date: Mon, 23 Dec 2024 00:35:51 +0900
Subject: [PATCH] [python-package] Fix inconsistency in `predict()` output shape for 1-tree models (#6753)

---
 python-package/lightgbm/basic.py         |  2 +-
 tests/python_package_test/test_engine.py | 90 +++++++++++++++++++++++-
 2 files changed, 90 insertions(+), 2 deletions(-)

Reviewer notes (this free-text area sits between the "---" separator and the
first "diff --git" line; NOTE(review): git is expected to skip it when the
patch is applied -- confirm against the git-am documentation):

* python-package/lightgbm/basic.py, predict(): the dense (non-sparse) result
  was reshaped to (nrow, -1) only when preds.size != nrow. The condition now
  also holds whenever pred_leaf or pred_contrib is set, so those outputs go
  through the reshape branch even when preds.size == nrow. The added tests
  assert e.g. shape (n_samples, 1) for pred_leaf on a 1-round regression
  model.
* tests/python_package_test/test_engine.py: imports make_classification and
  adds
    - three tests that call refit() on a model trained with
      num_boost_round=1 (regression, binary, multiclass) and assert that the
      result is an lgb.Booster;
    - three tests that assert the shape of predict(),
      predict(pred_contrib=True) and predict(pred_leaf=True) for 1-round and
      2-round models (regression, binary, multiclass).
* Hunk bookkeeping: the second test_engine.py hunk adds 27 lines, which is
  why the third hunk starts at old line 3872 / new line 3899.

diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py
index e06290dc1c5f..7b152fd2b006 100644
--- a/python-package/lightgbm/basic.py
+++ b/python-package/lightgbm/basic.py
@@ -1248,7 +1248,7 @@ def predict(
         if pred_leaf:
             preds = preds.astype(np.int32)
         is_sparse = isinstance(preds, (list, scipy.sparse.spmatrix))
-        if not is_sparse and preds.size != nrow:
+        if not is_sparse and (preds.size != nrow or pred_leaf or pred_contrib):
             if preds.size % nrow == 0:
                 preds = preds.reshape(nrow, -1)
             else:
diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py
index a1797d1c1187..667cb86c1a14 100644
--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
@@ -15,7 +15,7 @@
 import psutil
 import pytest
 from scipy.sparse import csr_matrix, isspmatrix_csc, isspmatrix_csr
-from sklearn.datasets import load_svmlight_file, make_blobs, make_multilabel_classification
+from sklearn.datasets import load_svmlight_file, make_blobs, make_classification, make_multilabel_classification
 from sklearn.metrics import average_precision_score, log_loss, mean_absolute_error, mean_squared_error, roc_auc_score
 from sklearn.model_selection import GroupKFold, TimeSeriesSplit, train_test_split
 
@@ -2314,6 +2314,33 @@ def test_refit():
     assert err_pred > new_err_pred
 
 
+def test_refit_with_one_tree_regression():
+    X, y = make_synthetic_regression(n_samples=1_000, n_features=2)
+    lgb_train = lgb.Dataset(X, label=y)
+    params = {"objective": "regression", "verbosity": -1}
+    model = lgb.train(params, lgb_train, num_boost_round=1)
+    model_refit = model.refit(X, y)
+    assert isinstance(model_refit, lgb.Booster)
+
+
+def test_refit_with_one_tree_binary_classification():
+    X, y = load_breast_cancer(return_X_y=True)
+    lgb_train = lgb.Dataset(X, label=y)
+    params = {"objective": "binary", "verbosity": -1}
+    model = lgb.train(params, lgb_train, num_boost_round=1)
+    model_refit = model.refit(X, y)
+    assert isinstance(model_refit, lgb.Booster)
+
+
+def test_refit_with_one_tree_multiclass_classification():
+    X, y = load_iris(return_X_y=True)
+    lgb_train = lgb.Dataset(X, y)
+    params = {"objective": "multiclass", "num_class": 3, "verbose": -1}
+    model = lgb.train(params, lgb_train, num_boost_round=1)
+    model_refit = model.refit(X, y)
+    assert isinstance(model_refit, lgb.Booster)
+
+
 def test_refit_dataset_params(rng):
     # check refit accepts dataset_params
     X, y = load_breast_cancer(return_X_y=True)
@@ -3872,6 +3899,67 @@ def test_predict_stump(rng, use_init_score):
     np.testing.assert_allclose(preds_all, np.full_like(preds_all, fill_value=y_avg))
 
 
+def test_predict_regression_output_shape():
+    n_samples = 1_000
+    n_features = 4
+    X, y = make_synthetic_regression(n_samples=n_samples, n_features=n_features)
+    dtrain = lgb.Dataset(X, label=y)
+    params = {"objective": "regression", "verbosity": -1}
+
+    # 1-round model
+    bst = lgb.train(params, dtrain, num_boost_round=1)
+    assert bst.predict(X).shape == (n_samples,)
+    assert bst.predict(X, pred_contrib=True).shape == (n_samples, n_features + 1)
+    assert bst.predict(X, pred_leaf=True).shape == (n_samples, 1)
+
+    # 2-round model
+    bst = lgb.train(params, dtrain, num_boost_round=2)
+    assert bst.predict(X).shape == (n_samples,)
+    assert bst.predict(X, pred_contrib=True).shape == (n_samples, n_features + 1)
+    assert bst.predict(X, pred_leaf=True).shape == (n_samples, 2)
+
+
+def test_predict_binary_classification_output_shape():
+    n_samples = 1_000
+    n_features = 4
+    X, y = make_classification(n_samples=n_samples, n_features=n_features, n_classes=2)
+    dtrain = lgb.Dataset(X, label=y)
+    params = {"objective": "binary", "verbosity": -1}
+
+    # 1-round model
+    bst = lgb.train(params, dtrain, num_boost_round=1)
+    assert bst.predict(X).shape == (n_samples,)
+    assert bst.predict(X, pred_contrib=True).shape == (n_samples, n_features + 1)
+    assert bst.predict(X, pred_leaf=True).shape == (n_samples, 1)
+
+    # 2-round model
+    bst = lgb.train(params, dtrain, num_boost_round=2)
+    assert bst.predict(X).shape == (n_samples,)
+    assert bst.predict(X, pred_contrib=True).shape == (n_samples, n_features + 1)
+    assert bst.predict(X, pred_leaf=True).shape == (n_samples, 2)
+
+
+def test_predict_multiclass_classification_output_shape():
+    n_samples = 1_000
+    n_features = 10
+    n_classes = 3
+    X, y = make_classification(n_samples=n_samples, n_features=n_features, n_classes=n_classes, n_informative=6)
+    dtrain = lgb.Dataset(X, label=y)
+    params = {"objective": "multiclass", "verbosity": -1, "num_class": n_classes}
+
+    # 1-round model
+    bst = lgb.train(params, dtrain, num_boost_round=1)
+    assert bst.predict(X).shape == (n_samples, n_classes)
+    assert bst.predict(X, pred_contrib=True).shape == (n_samples, n_classes * (n_features + 1))
+    assert bst.predict(X, pred_leaf=True).shape == (n_samples, n_classes)
+
+    # 2-round model
+    bst = lgb.train(params, dtrain, num_boost_round=2)
+    assert bst.predict(X).shape == (n_samples, n_classes)
+    assert bst.predict(X, pred_contrib=True).shape == (n_samples, n_classes * (n_features + 1))
+    assert bst.predict(X, pred_leaf=True).shape == (n_samples, n_classes * 2)
+
+
 def test_average_precision_metric():
     # test against sklearn average precision metric
     X, y = load_breast_cancer(return_X_y=True)