From 544c43a31ab9f4d2e7c0d515ac59a4b15b9c0a6a Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Thu, 22 Feb 2024 14:56:34 +0100 Subject: [PATCH] Fix benchmark for cancelled markets (#26) --- .../benchmark/benchmark.py | 4 + .../benchmark/utils.py | 42 +++++++-- tests/test_benchmark.py | 87 +++++++++++++++++++ 3 files changed, 125 insertions(+), 8 deletions(-) diff --git a/prediction_market_agent_tooling/benchmark/benchmark.py b/prediction_market_agent_tooling/benchmark/benchmark.py index 4bff73ac..9c9f1757 100644 --- a/prediction_market_agent_tooling/benchmark/benchmark.py +++ b/prediction_market_agent_tooling/benchmark/benchmark.py @@ -38,6 +38,10 @@ def __init__( self.registered_agents ): raise ValueError("Agents must have unique names") + if any(m.is_cancelled for m in markets): + raise ValueError( + "Cancelled markets shouldn't be used in the benchmark, please filter them out." + ) # Predictions self.cache_path = cache_path diff --git a/prediction_market_agent_tooling/benchmark/utils.py b/prediction_market_agent_tooling/benchmark/utils.py index a00eba4f..562d91ba 100644 --- a/prediction_market_agent_tooling/benchmark/utils.py +++ b/prediction_market_agent_tooling/benchmark/utils.py @@ -32,6 +32,12 @@ class MarketResolution(str, Enum): NO = "no" +class CancelableMarketResolution(str, Enum): + YES = "yes" + NO = "no" + CANCEL = "cancel" + + class Market(BaseModel): source: MarketSource question: str @@ -39,7 +45,7 @@ class Market(BaseModel): p_yes: float volume: float created_time: datetime - resolution: MarketResolution | None = None + resolution: CancelableMarketResolution | None = None outcomePrices: list[float] | None = None @validator("outcomePrices", pre=True) @@ -60,6 +66,10 @@ def _validate_created_time(cls, value: datetime) -> datetime: def is_resolved(self) -> bool: return self.resolution is not None + @property + def is_cancelled(self) -> bool: + return self.resolution == CancelableMarketResolution.CANCEL + @property def p_no(self) -> float: return 1 - self.p_yes @@ -77,11 +87,27 @@ def no_outcome_price(self) -> float: @property def probable_resolution(self) -> MarketResolution: return ( - self.resolution - if self.resolution is not None - else MarketResolution.YES - if self.p_yes > 0.5 - else MarketResolution.NO + MarketResolution.YES + if ( + ( + self.resolution is not None + and self.resolution == CancelableMarketResolution.YES + ) + or (self.resolution is None and self.p_yes > 0.5) + ) + else ( + MarketResolution.NO + if ( + ( + self.resolution is not None + and self.resolution == CancelableMarketResolution.NO + ) + or (self.resolution is None and self.p_yes <= 0.5) + ) + else should_not_happen( + f"Unknown resolution `{self.resolution}`, if it is `cancel`, you should first filter out cancelled markets." + ) + ) ) @@ -275,10 +301,10 @@ def get_polymarket_markets( continue resolution = ( - MarketResolution.YES + CancelableMarketResolution.YES if closed and m_json["outcomePrices"][0] == "1.0" else ( - MarketResolution.NO + CancelableMarketResolution.NO if closed and m_json["outcomePrices"][1] == "1.0" else ( should_not_happen() diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index 9d28c418..1244026c 100644 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -1,9 +1,13 @@ +import datetime import tempfile import pytest import prediction_market_agent_tooling.benchmark.benchmark as bm from prediction_market_agent_tooling.benchmark.utils import ( + CancelableMarketResolution, + Market, + MarketResolution, MarketSource, OutcomePrediction, get_markets, @@ -140,3 +144,86 @@ def test_benchmarker_cache(dummy_agent: DummyAgent) -> None: another_benchmark_prediction.outcome_prediction.p_yes == prediction.outcome_prediction.p_yes ) + + +def test_benchmarker_cancelled_markets() -> None: + markets = [ + Market( + source=MarketSource.MANIFOLD, + question="Will GNO go up?", + url="...", + p_yes=0.1, + volume=1, + created_time=datetime.datetime.now(), + resolution=CancelableMarketResolution.CANCEL, + ) + ] + with pytest.raises(ValueError) as e: + bm.Benchmarker( + markets=markets, + agents=[], + ) + assert ( + "Cancelled markets shouldn't be used in the benchmark, please filter them out." + in str(e) + ) + + +def test_market_probable_resolution() -> None: + with pytest.raises(ValueError) as e: + Market( + source=MarketSource.MANIFOLD, + question="Will GNO go up?", + url="...", + p_yes=0.1, + volume=1, + created_time=datetime.datetime.now(), + resolution=CancelableMarketResolution.CANCEL, + ).probable_resolution + assert "Unknown resolution" in str(e) + assert ( + Market( + source=MarketSource.MANIFOLD, + question="Will GNO go up?", + url="...", + p_yes=0.1, + volume=1, + created_time=datetime.datetime.now(), + resolution=CancelableMarketResolution.YES, + ).probable_resolution + == MarketResolution.YES + ) + assert ( + Market( + source=MarketSource.MANIFOLD, + question="Will GNO go up?", + url="...", + p_yes=0.1, + volume=1, + created_time=datetime.datetime.now(), + resolution=CancelableMarketResolution.NO, + ).probable_resolution + == MarketResolution.NO + ) + assert ( + Market( + source=MarketSource.MANIFOLD, + question="Will GNO go up?", + url="...", + p_yes=0.1, + volume=1, + created_time=datetime.datetime.now(), + ).probable_resolution + == MarketResolution.NO + ) + assert ( + Market( + source=MarketSource.MANIFOLD, + question="Will GNO go up?", + url="...", + p_yes=0.8, + volume=1, + created_time=datetime.datetime.now(), + ).probable_resolution + == MarketResolution.YES + )