
Commit 340ac3c

Enable auto early stopping

Parent: 6862162

5 files changed, +154 -62 lines


python-package/lightgbm/basic.py (+1, -3)

@@ -2538,14 +2538,12 @@ def set_categorical_feature(
         self : Dataset
             Dataset with set categorical features.
         """
-        if self.categorical_feature == categorical_feature:
+        if self.categorical_feature == categorical_feature or categorical_feature == 'auto':
             return self
         if self.data is not None:
             if self.categorical_feature is None:
                 self.categorical_feature = categorical_feature
                 return self._free_handle()
-            elif categorical_feature == 'auto':
-                return self
             else:
                 if self.categorical_feature != 'auto':
                     _log_warning('categorical_feature in Dataset is overridden.\n'
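
The net effect of this hunk: passing `categorical_feature='auto'` is now an unconditional no-op, where previously it only returned early when the Dataset already had data attached. A minimal sketch of the resulting behavior (synthetic data, illustrative only):

```python
import numpy as np
import lightgbm as lgb

ds = lgb.Dataset(np.random.rand(100, 3), label=np.random.rand(100),
                 categorical_feature=[0])
# With this change, 'auto' returns the same Dataset object immediately,
# leaving the previously set categorical features untouched.
assert ds.set_categorical_feature('auto') is ds
```

This keeps repeated 'auto' calls cheap and side-effect-free.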

python-package/lightgbm/dask.py (+3, -0)

@@ -1145,6 +1145,7 @@ def __init__(
         random_state: Optional[Union[int, np.random.RandomState]] = None,
         n_jobs: Optional[int] = None,
         importance_type: str = 'split',
+        validation_fraction: Optional[float] = 0.1,
         client: Optional[Client] = None,
         **kwargs: Any
     ):
@@ -1350,6 +1351,7 @@ def __init__(
         random_state: Optional[Union[int, np.random.RandomState]] = None,
         n_jobs: Optional[int] = None,
         importance_type: str = 'split',
+        validation_fraction: Optional[float] = 0.1,
         client: Optional[Client] = None,
         **kwargs: Any
     ):
@@ -1520,6 +1522,7 @@ def __init__(
         random_state: Optional[Union[int, np.random.RandomState]] = None,
         n_jobs: Optional[int] = None,
         importance_type: str = 'split',
+        validation_fraction: Optional[float] = 0.1,
         client: Optional[Client] = None,
         **kwargs: Any
     ):
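
All three hunks add the same keyword to the Dask estimator constructors, so the new parameter can be forwarded to the underlying sklearn estimators. A minimal usage sketch (assumes a local `distributed` cluster; data and sizes are illustrative):

```python
import dask.array as da
import lightgbm as lgb
from distributed import Client, LocalCluster

client = Client(LocalCluster(n_workers=2))

X = da.random.random((20_000, 10), chunks=(5_000, 10))
y = (da.random.random((20_000,), chunks=(5_000,)) > 0.5).astype(int)

# validation_fraction rides along with the other constructor arguments
clf = lgb.DaskLGBMClassifier(n_estimators=100, validation_fraction=0.2)
clf.fit(X, y)
print(clf.get_params()["validation_fraction"])  # 0.2
```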

python-package/lightgbm/engine.py (+23, -14)

@@ -455,11 +455,9 @@ def _make_n_folds(
     nfold: int,
     params: Dict[str, Any],
     seed: int,
-    fpreproc: Optional[_LGBM_PreprocFunction],
     stratified: bool,
     shuffle: bool,
-    eval_train_metric: bool
-) -> CVBooster:
+) -> Iterable[Tuple[np.ndarray, np.ndarray]]:
     """Make a n-fold list of Booster from random indices."""
     full_data = full_data.construct()
     num_data = full_data.num_data()
@@ -500,7 +498,16 @@ def _make_n_folds(
         test_id = [randidx[i: i + kstep] for i in range(0, num_data, kstep)]
         train_id = [np.concatenate([test_id[i] for i in range(nfold) if k != i]) for k in range(nfold)]
         folds = zip(train_id, test_id)
+    return folds

+
+def _make_cvbooster(
+    full_data: Dataset,
+    params: Dict[str, Any],
+    folds: Iterable[Tuple[np.ndarray, np.ndarray]],
+    fpreproc: Optional[_LGBM_PreprocFunction],
+    eval_train_metric: bool,
+) -> CVBooster:
     ret = CVBooster()
     for train_idx, test_idx in folds:
         train_set = full_data.subset(sorted(train_idx))
@@ -720,8 +727,10 @@ def cv(

     results = defaultdict(list)
     cvfolds = _make_n_folds(full_data=train_set, folds=folds, nfold=nfold,
-                            params=params, seed=seed, fpreproc=fpreproc,
-                            stratified=stratified, shuffle=shuffle,
+                            params=params, seed=seed,
+                            stratified=stratified, shuffle=shuffle)
+    cvbooster = _make_cvbooster(full_data=train_set, folds=cvfolds,
+                                params=params, fpreproc=fpreproc,
                                 eval_train_metric=eval_train_metric)

     # setup callbacks
@@ -752,34 +761,34 @@ def cv(

     for i in range(num_boost_round):
         for cb in callbacks_before_iter:
-            cb(callback.CallbackEnv(model=cvfolds,
+            cb(callback.CallbackEnv(model=cvbooster,
                                     params=params,
                                     iteration=i,
                                     begin_iteration=0,
                                     end_iteration=num_boost_round,
                                     evaluation_result_list=None))
-        cvfolds.update(fobj=fobj)  # type: ignore[call-arg]
-        res = _agg_cv_result(cvfolds.eval_valid(feval))  # type: ignore[call-arg]
+        cvbooster.update(fobj=fobj)  # type: ignore[call-arg]
+        res = _agg_cv_result(cvbooster.eval_valid(feval))  # type: ignore[call-arg]
         for _, key, mean, _, std in res:
             results[f'{key}-mean'].append(mean)
             results[f'{key}-stdv'].append(std)
         try:
             for cb in callbacks_after_iter:
-                cb(callback.CallbackEnv(model=cvfolds,
+                cb(callback.CallbackEnv(model=cvbooster,
                                         params=params,
                                         iteration=i,
                                         begin_iteration=0,
                                         end_iteration=num_boost_round,
                                         evaluation_result_list=res))
         except callback.EarlyStopException as earlyStopException:
-            cvfolds.best_iteration = earlyStopException.best_iteration + 1
-            for bst in cvfolds.boosters:
-                bst.best_iteration = cvfolds.best_iteration
+            cvbooster.best_iteration = earlyStopException.best_iteration + 1
+            for bst in cvbooster.boosters:
+                bst.best_iteration = cvbooster.best_iteration
             for k in results:
-                results[k] = results[k][:cvfolds.best_iteration]
+                results[k] = results[k][:cvbooster.best_iteration]
             break

     if return_cvbooster:
-        results['cvbooster'] = cvfolds  # type: ignore[assignment]
+        results['cvbooster'] = cvbooster  # type: ignore[assignment]

     return dict(results)
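
The refactor separates fold generation from booster construction: `_make_n_folds()` now returns an iterable of `(train_idx, test_idx)` pairs, `_make_cvbooster()` turns them into a `CVBooster`, and `cv()` chains the two. Splitting them is what lets the sklearn `fit()` path (next file) reuse the fold logic to carve off a single holdout split. A sketch of consuming the generator directly; `_make_n_folds` is a private helper, used here purely for illustration:

```python
import numpy as np
import lightgbm as lgb
from lightgbm.engine import _make_n_folds  # private helper, illustrative use only

X = np.random.rand(1_000, 5)
y = np.random.rand(1_000)
ds = lgb.Dataset(X, label=y)

folds = _make_n_folds(full_data=ds, folds=None, nfold=5,
                      params={"objective": "regression"}, seed=42,
                      stratified=False, shuffle=True)
# taking only the first pair yields a single ~80/20 holdout split,
# which is exactly how the sklearn fit() path below uses it
train_idx, val_idx = next(folds)
print(len(train_idx), len(val_idx))
```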

python-package/lightgbm/sklearn.py (+76, -39)

@@ -16,7 +16,7 @@
                      _LGBMCheckClassificationTargets, _LGBMCheckSampleWeight, _LGBMCheckXY, _LGBMClassifierBase,
                      _LGBMComputeSampleWeight, _LGBMCpuCount, _LGBMLabelEncoder, _LGBMModelBase, _LGBMRegressorBase,
                      dt_DataTable, pd_DataFrame)
-from .engine import train
+from .engine import _make_n_folds, train

 __all__ = [
     'LGBMClassifier',
@@ -412,6 +412,7 @@ def __init__(
         random_state: Optional[Union[int, np.random.RandomState]] = None,
         n_jobs: Optional[int] = None,
         importance_type: str = 'split',
+        validation_fraction: Optional[float] = 0.1,
         **kwargs
     ):
         r"""Construct a gradient boosting model.
@@ -491,6 +492,10 @@ def __init__(
             The type of feature importance to be filled into ``feature_importances_``.
             If 'split', result contains numbers of times the feature is used in a model.
             If 'gain', result contains total gains of splits which use the feature.
+        validation_fraction : float or None, optional (default=0.1)
+            Proportion of training data to set aside as
+            validation data for early stopping. If None, early stopping is done on
+            the training data. Only used if early stopping is performed.
         **kwargs
             Other parameters for the model.
             Check http://lightgbm.readthedocs.io/en/latest/Parameters.html for more parameters.
@@ -566,6 +571,7 @@ def __init__(
         self._n_features_in: int = -1
         self._classes: Optional[np.ndarray] = None
         self._n_classes: int = -1
+        self.validation_fraction = validation_fraction
         self.set_params(**kwargs)

     def _more_tags(self) -> Dict[str, Any]:
@@ -668,9 +674,24 @@ def _process_params(self, stage: str) -> Dict[str, Any]:
         params.pop('importance_type', None)
         params.pop('n_estimators', None)
         params.pop('class_weight', None)
+        params.pop("validation_fraction", None)

         if isinstance(params['random_state'], np.random.RandomState):
             params['random_state'] = params['random_state'].randint(np.iinfo(np.int32).max)
+
+        params = _choose_param_value(
+            main_param_name="early_stopping_round",
+            params=params,
+            default_value="auto",
+        )
+        if params["early_stopping_round"] == "auto":
+            params["early_stopping_round"] = 10 if hasattr(self, "n_rows_train") and self.n_rows_train > 10000 else None
+
+        if params["early_stopping_round"] is True:
+            params["early_stopping_round"] = 10
+        elif params["early_stopping_round"] is False:
+            params["early_stopping_round"] = None
+
         if self._n_classes > 2:
             for alias in _ConfigAliases.get('num_class'):
                 params.pop(alias, None)
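
Read together with the `fit()` hunks below, the resolution rule is: the `early_stopping_round` aliases now default to "auto", which becomes 10 rounds once the estimator knows it has more than 10,000 training rows and stays disabled otherwise; the boolean forms normalize to 10 and None. A standalone sketch of that rule (the helper name is hypothetical; the constants come from the diff above):

```python
from typing import Optional, Union

def resolve_early_stopping_round(value: Union[str, bool, int, None],
                                 n_rows_train: Optional[int]) -> Optional[int]:
    """Hypothetical helper mirroring the normalization in _process_params/fit."""
    if value == "auto":
        # enabled only once the training size is known and exceeds 10,000 rows
        return 10 if n_rows_train is not None and n_rows_train > 10_000 else None
    if value is True:
        return 10
    if value is False:
        return None
    return value  # explicit integer round counts pass through unchanged

assert resolve_early_stopping_round("auto", 50_000) == 10
assert resolve_early_stopping_round("auto", 5_000) is None
assert resolve_early_stopping_round(True, None) == 10
```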
@@ -765,7 +786,6 @@ def fit(
         params['metric'] = [params['metric']] if isinstance(params['metric'], (str, type(None))) else params['metric']
         params['metric'] = [e for e in eval_metrics_builtin if e not in params['metric']] + params['metric']
         params['metric'] = [metric for metric in params['metric'] if metric is not None]
-
         if not isinstance(X, (pd_DataFrame, dt_DataTable)):
             _X, _y = _LGBMCheckXY(X, y, accept_sparse=True, force_all_finite=False, ensure_min_samples=2)
             if sample_weight is not None:
@@ -789,44 +809,61 @@ def fit(
         train_set = Dataset(data=_X, label=_y, weight=sample_weight, group=group,
                             init_score=init_score, categorical_feature=categorical_feature,
                             params=params)
+        self._n_rows_train = _X.shape[0]
+        if params["early_stopping_round"] == "auto":
+            params["early_stopping_round"] = 10 if self.n_rows_train > 10000 else None
+        if params["early_stopping_round"] is not None and eval_set is None:
+            if self.validation_fraction is not None:
+                n_splits = max(int(np.ceil(1 / self.validation_fraction)), 2)
+                stratified = isinstance(self, LGBMClassifier)
+                cvfolds = _make_n_folds(full_data=train_set, folds=None, nfold=n_splits,
+                                        params=params, seed=self.random_state,
+                                        stratified=stratified, shuffle=True)
+                train_idx, val_idx = next(cvfolds)
+                valid_set = train_set.subset(sorted(val_idx))
+                train_set = train_set.subset(sorted(train_idx))
+            else:
+                valid_set = train_set
+            valid_set = valid_set.construct()
+            valid_sets = [valid_set]

-        valid_sets: List[Dataset] = []
-        if eval_set is not None:
-
-            def _get_meta_data(collection, name, i):
-                if collection is None:
-                    return None
-                elif isinstance(collection, list):
-                    return collection[i] if len(collection) > i else None
-                elif isinstance(collection, dict):
-                    return collection.get(i, None)
-                else:
-                    raise TypeError(f"{name} should be dict or list")
-
-            if isinstance(eval_set, tuple):
-                eval_set = [eval_set]
-            for i, valid_data in enumerate(eval_set):
-                # reduce cost for prediction training data
-                if valid_data[0] is X and valid_data[1] is y:
-                    valid_set = train_set
-                else:
-                    valid_weight = _get_meta_data(eval_sample_weight, 'eval_sample_weight', i)
-                    valid_class_weight = _get_meta_data(eval_class_weight, 'eval_class_weight', i)
-                    if valid_class_weight is not None:
-                        if isinstance(valid_class_weight, dict) and self._class_map is not None:
-                            valid_class_weight = {self._class_map[k]: v for k, v in valid_class_weight.items()}
-                        valid_class_sample_weight = _LGBMComputeSampleWeight(valid_class_weight, valid_data[1])
-                        if valid_weight is None or len(valid_weight) == 0:
-                            valid_weight = valid_class_sample_weight
-                        else:
-                            valid_weight = np.multiply(valid_weight, valid_class_sample_weight)
-                    valid_init_score = _get_meta_data(eval_init_score, 'eval_init_score', i)
-                    valid_group = _get_meta_data(eval_group, 'eval_group', i)
-                    valid_set = Dataset(data=valid_data[0], label=valid_data[1], weight=valid_weight,
-                                        group=valid_group, init_score=valid_init_score,
-                                        categorical_feature='auto', params=params)
-
-                valid_sets.append(valid_set)
+        else:
+            valid_sets: List[Dataset] = []
+            if eval_set is not None:
+                def _get_meta_data(collection, name, i):
+                    if collection is None:
+                        return None
+                    elif isinstance(collection, list):
+                        return collection[i] if len(collection) > i else None
+                    elif isinstance(collection, dict):
+                        return collection.get(i, None)
+                    else:
+                        raise TypeError(f"{name} should be dict or list")

+                if isinstance(eval_set, tuple):
+                    eval_set = [eval_set]
+                for i, valid_data in enumerate(eval_set):
+                    # reduce cost for prediction training data
+                    if valid_data[0] is X and valid_data[1] is y:
+                        valid_set = train_set
+                    else:
+                        valid_weight = _get_meta_data(eval_sample_weight, 'eval_sample_weight', i)
+                        valid_class_weight = _get_meta_data(eval_class_weight, 'eval_class_weight', i)
+                        if valid_class_weight is not None:
+                            if isinstance(valid_class_weight, dict) and self._class_map is not None:
+                                valid_class_weight = {self._class_map[k]: v for k, v in valid_class_weight.items()}
+                            valid_class_sample_weight = _LGBMComputeSampleWeight(valid_class_weight, valid_data[1])
+                            if valid_weight is None or len(valid_weight) == 0:
+                                valid_weight = valid_class_sample_weight
+                            else:
+                                valid_weight = np.multiply(valid_weight, valid_class_sample_weight)
+                        valid_init_score = _get_meta_data(eval_init_score, 'eval_init_score', i)
+                        valid_group = _get_meta_data(eval_group, 'eval_group', i)
+                        valid_set = Dataset(data=valid_data[0], label=valid_data[1], weight=valid_weight,
+                                            group=valid_group, init_score=valid_init_score,
+                                            categorical_feature='auto', params=params)

+                    valid_sets.append(valid_set)

         if isinstance(init_model, LGBMModel):
             init_model = init_model.booster_
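
End to end, `fit()` records the training size, resolves "auto", and, when early stopping is active but no `eval_set` was given, splits off a `validation_fraction` holdout via `_make_n_folds` (or validates on the training data itself when `validation_fraction=None`). A usage sketch of the behavior this commit enables (synthetic data; sizes chosen so the 10,000-row threshold is crossed):

```python
import numpy as np
import lightgbm as lgb

rng = np.random.default_rng(42)
X = rng.random((20_000, 10))
y = X[:, 0] + rng.normal(scale=0.1, size=20_000)

# >10,000 rows, so early_stopping_round resolves from "auto" to 10;
# no eval_set, so a 20% holdout is carved from the training data
reg = lgb.LGBMRegressor(n_estimators=1_000, validation_fraction=0.2, random_state=42)
reg.fit(X, y)
print(reg.best_iteration_)  # set by the early-stopping callback
```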

tests/python_package_test/test_sklearn.py (+51, -6)

@@ -257,6 +257,51 @@ def test_binary_classification_with_custom_objective():
     assert ret < 0.05


+@pytest.mark.parametrize('use_weight', [True, False])
+def test_binary_classification_with_auto_early_stopping(use_weight):
+
+    X, y = load_breast_cancer(return_X_y=True)
+    n_estimators = 1000
+    gbm = lgb.LGBMClassifier(
+        n_estimators=n_estimators, random_state=42, verbose=-1, early_stopping=True
+    )
+    weight = np.full_like(y, 2) if use_weight else None
+    gbm.fit(X, y, sample_weight=weight)
+    assert bool(gbm.best_iteration_)
+
+
+def test_regression_with_auto_early_stopping():
+    X, y = make_synthetic_regression()
+    n_estimators = 1000
+    gbm = lgb.LGBMRegressor(
+        n_estimators=n_estimators,
+        random_state=42,
+        early_stopping=True,
+        verbose=-1,
+    )
+    gbm.fit(X, y)
+    assert bool(gbm.best_iteration_)
+
+@pytest.mark.skipif(getenv('TASK', '') == 'cuda', reason='Skip due to differences in implementation details of CUDA version')
+def test_lambdarank_with_auto_early_stopping():
+    rank_example_dir = Path(__file__).absolute().parents[2] / 'examples' / 'lambdarank'
+    X_train, y_train = load_svmlight_file(str(rank_example_dir / 'rank.train'))
+    q_train = np.loadtxt(str(rank_example_dir / 'rank.train.query'))
+    gbm = lgb.LGBMRanker(
+        n_estimators=50, random_state=42, early_stopping=True
+    )
+    gbm.fit(
+        X_train,
+        y_train,
+        group=q_train,
+        eval_at=[1, 3],
+        callbacks=[
+            lgb.reset_parameter(learning_rate=lambda x: max(0.01, 0.1 - 0.01 * x))
+        ]
+    )
+    assert bool(gbm.best_iteration_)
+
+
 def test_dart():
     X, y = make_synthetic_regression()
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
@@ -609,21 +654,21 @@ def test_pandas_categorical():
     X[cat_cols_actual] = X[cat_cols_actual].astype('category')
     X_test[cat_cols_actual] = X_test[cat_cols_actual].astype('category')
     cat_values = [X[col].cat.categories.tolist() for col in cat_cols_to_store]
-    gbm0 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y)
+    gbm0 = lgb.sklearn.LGBMClassifier(n_estimators=10, random_state=42).fit(X, y)
     pred0 = gbm0.predict(X_test, raw_score=True)
     pred_prob = gbm0.predict_proba(X_test)[:, 1]
-    gbm1 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, pd.Series(y), categorical_feature=[0])
+    gbm1 = lgb.sklearn.LGBMClassifier(n_estimators=10, random_state=42).fit(X, pd.Series(y), categorical_feature=[0])
     pred1 = gbm1.predict(X_test, raw_score=True)
-    gbm2 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y, categorical_feature=['A'])
+    gbm2 = lgb.sklearn.LGBMClassifier(n_estimators=10, random_state=42).fit(X, y, categorical_feature=['A'])
     pred2 = gbm2.predict(X_test, raw_score=True)
-    gbm3 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y, categorical_feature=['A', 'B', 'C', 'D'])
+    gbm3 = lgb.sklearn.LGBMClassifier(n_estimators=10, random_state=42).fit(X, y, categorical_feature=['A', 'B', 'C', 'D'])
     pred3 = gbm3.predict(X_test, raw_score=True)
     gbm3.booster_.save_model('categorical.model')
     gbm4 = lgb.Booster(model_file='categorical.model')
     pred4 = gbm4.predict(X_test)
-    gbm5 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y, categorical_feature=['A', 'B', 'C', 'D', 'E'])
+    gbm5 = lgb.sklearn.LGBMClassifier(n_estimators=10, random_state=42).fit(X, y, categorical_feature=['A', 'B', 'C', 'D', 'E'])
     pred5 = gbm5.predict(X_test, raw_score=True)
-    gbm6 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y, categorical_feature=[])
+    gbm6 = lgb.sklearn.LGBMClassifier(n_estimators=10, random_state=42).fit(X, y, categorical_feature=[])
     pred6 = gbm6.predict(X_test, raw_score=True)
     with pytest.raises(AssertionError):
         np.testing.assert_allclose(pred0, pred1)
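
The new tests enable the feature with `early_stopping=True` passed through `**kwargs`; `early_stopping` is an accepted alias of `early_stopping_round`, so `_process_params` above normalizes `True` to 10 rounds. A sketch for running just the new tests from the repository root (assumes pytest and the test dependencies are installed):

```python
import pytest

# -k matches the three *_with_auto_early_stopping tests added above
pytest.main(["tests/python_package_test/test_sklearn.py", "-k", "auto_early_stopping", "-v"])
```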
