Skip to content

Commit 46f6088

Browse files
committed
xgboost: add clipping of loss values to the float32 limits
Internally, XGBoost converts the loss values to float32, and this conversion fails when their absolute values are exceedingly large. This commit adds a simple clipping operation that is applied before the loss values are passed to the XGBoost package.
1 parent 6778326 commit 46f6088

File tree

4 files changed

+57
-8
lines changed

4 files changed

+57
-8
lines changed

.pylintrc

+1
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ disable=C0103,C0301,R0801,W1514,W1505
1111

1212
[IMPORTS]
1313
ignored-modules=
14+
known-third-party=xgboost
1415

1516
[DESIGN]
1617
# min-public-methods=1

black_it/samplers/xgboost.py

+27
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
# along with this program. If not, see <http://www.gnu.org/licenses/>.
1616

1717
"""This module contains the implementation of the XGBoost sampling."""
18+
import warnings
1819
from typing import Optional, cast
1920

2021
import numpy as np
@@ -23,6 +24,10 @@
2324

2425
from black_it.samplers.surrogate import MLSurrogateSampler
2526

27+
MAX_FLOAT32 = np.finfo(np.float32).max
28+
MIN_FLOAT32 = np.finfo(np.float32).min
29+
EPS_FLOAT32 = np.finfo(np.float32).eps
30+
2631

2732
class XGBoostSampler(MLSurrogateSampler):
2833
"""This class implements xgboost sampling."""
@@ -95,9 +100,31 @@ def n_estimators(self) -> int:
95100
"""Get the number of estimators."""
96101
return self._n_estimators
97102

103+
@staticmethod
def _clip_losses(y: NDArray[np.float64]) -> NDArray[np.float64]:
    """Clip loss values that fall outside the float32 limits needed for XGBoost to work.

    Entries greater than or equal to MAX_FLOAT32 are replaced by
    MAX_FLOAT32 - EPS_FLOAT32, and entries less than or equal to MIN_FLOAT32
    are replaced by MIN_FLOAT32 + EPS_FLOAT32. All other entries (including
    NaN, for which both comparisons are false) are left untouched.

    Args:
        y: the array of loss values; it is modified in place when clipping
            is needed.

    Returns:
        The same array object that was passed in, after clipping.

    Warns:
        RuntimeWarning: only if at least one value had to be clipped.
    """
    too_large = y >= MAX_FLOAT32
    too_small = y <= MIN_FLOAT32

    # Test the boolean masks directly: the single-argument form of np.where
    # returns a tuple with one index array per dimension, so its length is
    # the number of dimensions and can never signal "no matches".
    if not (too_large.any() or too_small.any()):
        return y

    warnings.warn(
        "Found loss values out of float32 limits, clipping them for XGBoost.",
        RuntimeWarning,
    )
    # Assigning through an all-False mask is a no-op, so no extra guards are needed.
    y[too_large] = MAX_FLOAT32 - EPS_FLOAT32
    y[too_small] = MIN_FLOAT32 + EPS_FLOAT32

    return y
123+
98124
def fit(self, X: NDArray[np.float64], y: NDArray[np.float64]) -> None:
99125
"""Fit a xgboost surrogate model."""
100126
# prepare data
127+
y = self._clip_losses(y) # pylint: disable=W0212
101128
_ = xgb.DMatrix(data=X, label=y)
102129

103130
# train surrogate

setup.cfg

+1
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ force_grid_wrap=0
3535
use_parentheses=True
3636
line_length=88
3737
ensure_newline_before_comments=True
38+
known_third_party=xgboost
3839

3940
[black]
4041
exclude = "scripts/whitelists/"

tests/test_samplers/test_xgboost.py

+28-8
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,6 @@
1515
# along with this program. If not, see <http://www.gnu.org/licenses/>.
1616
"""This module contains tests for the xgboost sampler."""
1717
import numpy as np
18-
import pytest
19-
import xgboost as xgb
2018

2119
from black_it.calibrator import Calibrator
2220
from black_it.loss_functions.msm import MethodOfMomentsLoss
@@ -28,6 +26,10 @@
2826

2927
expected_params = np.array([[0.24, 0.26], [0.26, 0.02], [0.08, 0.24], [0.15, 0.15]])
3028

29+
MAX_FLOAT32 = np.finfo(np.float32).max
30+
MIN_FLOAT32 = np.finfo(np.float32).min
31+
EPS_FLOAT32 = np.finfo(np.float32).eps
32+
3133

3234
def test_xgboost_2d() -> None:
3335
"""Test the xgboost sampler, 2d."""
@@ -94,10 +96,28 @@ def test_clip_losses() -> None:
9496
random_state=0,
9597
)
9698

97-
# the calibration breaks due to losses exceeding the limits of float32
99+
# verify that the calibration does not break,
100+
# it would without the call to _clip_losses
101+
_, losses = cal.calibrate(1)
102+
103+
assert np.allclose(
104+
losses,
105+
np.array(
106+
[
107+
7.46098998e02,
108+
5.80544566e17,
109+
3.40282347e38,
110+
3.40282347e38,
111+
3.40282347e38,
112+
2.94273501e41,
113+
]
114+
),
115+
)
116+
117+
# verify that _clip_losses works as expected
118+
y = np.array([0.0, -1e40, 1e40])
119+
y2 = xgboost._clip_losses(y) # pylint: disable=W0212
98120

99-
with pytest.raises(
100-
xgb.core.XGBoostError,
101-
match=r"Label contains NaN, infinity or a value too large",
102-
):
103-
_, losses = cal.calibrate(1)
121+
assert (
122+
y2 == np.array([0.0, MIN_FLOAT32 + EPS_FLOAT32, MAX_FLOAT32 - EPS_FLOAT32])
123+
).all()

0 commit comments

Comments
 (0)