From 8102a5881eb193678849609642fe6702352752bb Mon Sep 17 00:00:00 2001
From: John Spray
Date: Mon, 28 Oct 2024 10:27:16 +0000
Subject: [PATCH] tests: fail tests which write too much data

---
Reviewer notes (commentary only; text between the three-dash line above and
the first "diff --git" line is not part of the change).

What the hunks below do:
 * metrics.py: adds get_metric_sum(names, filter, absence_ok). It gathers the
   samples returned by query_all() for every name in `names`, adds up their
   `.value`, and raises RuntimeError when no sample matched, unless
   absence_ok=True, in which case the result is 0.0.
 * neon_fixtures.py, __exit__: before the environment is stopped it reads
   safekeeper_write_wal_bytes_sum from safekeepers[0] (if running) and, for
   every running pageserver, adds up pageserver_io_operations_bytes_total
   with operation="write" plus the get_page_at_lsn value of
   pageserver_smgr_query_started_global_count_total. `bytes_written` is the
   larger of the safekeeper and pageserver byte totals.
 * neon_fixtures.py, __exit__ (second hunk): when BUILD_TYPE is "debug", the
   exit raises RuntimeError if bytes_written > 128000000, or if more than
   12800 GetPage@LSN requests were counted (the message reports
   getpage_requests * 8192 bytes; 12800 * 8192 = 104857600). Otherwise the
   byte count is only logged.
 * neon_fixtures.py, stop(): adds one blank line, no functional change.

Points to confirm (assumptions, not established by the hunks):
 * get_metric_sum() is annotated `-> float`, so the `is not None` test on its
   result and `assert ps_bytes_written is not None` look redundant.
 * Only safekeepers[0] is sampled; presumably every safekeeper receives the
   same WAL, so one is representative. Verify for multi-safekeeper tests.
 * The metric reads run before self.env.stop() with no try/except visible in
   the hunk; if an HTTP call raises, the stop() call would be skipped.
   Confirm that this is acceptable.
 * Whitespace of this patch was reconstructed from the hunk line counts and
   from Python nesting; re-check indentation against the target files.

 test_runner/fixtures/metrics.py       | 23 ++++++++++++
 test_runner/fixtures/neon_fixtures.py | 51 +++++++++++++++++++++++++++
 2 files changed, 74 insertions(+)

diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py
index eb3d06b949594..ec9c02d4977c2 100644
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -85,6 +85,29 @@ def get_metrics_values(
 
         return result
 
+    def get_metric_sum(
+        self, names: list[str], filter: dict[str, str] | None = None, absence_ok: bool = False
+    ) -> float:
+        """
+        Fetch all metrics matching `names` and `filter`, and sum their values
+        """
+        metrics = self.get_metrics()
+        samples = []
+        for name in names:
+            samples.extend(metrics.query_all(name, filter=filter))
+
+        found = False
+        result = 0.0
+        for sample in samples:
+            result += sample.value
+            found = True
+
+        if not found and not absence_ok:
+            log.info(f"Metrics found: {metrics.metrics}")
+            raise RuntimeError(f"could not find any metrics matching {names}, {filter}")
+
+        return result
+
 
 def parse_metrics(text: str, name: str = "") -> Metrics:
     metrics = Metrics(name)
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 13ada1361e638..083519c3bf692 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -878,7 +878,44 @@ def __exit__(
         traceback: TracebackType | None,
     ):
         # Stop all the nodes.
+        bytes_written: int = 0
+        getpage_requests: int = 0
+
         if self.env:
+            log.info("Checking for lots of I/O in tests that shouldn't")
+
+            sk_bytes_written: float | None = 0
+            if self.env.safekeepers[0].running:
+                sk_bytes_written = (
+                    self.env.safekeepers[0]
+                    .http_client()
+                    .get_metric_value("safekeeper_write_wal_bytes_sum")
+                )
+            assert sk_bytes_written is not None
+
+            ps_bytes_written: float = 0
+            for pageserver in self.env.pageservers:
+                if pageserver.running:
+                    _tmp_bytes_written = pageserver.http_client().get_metric_sum(
+                        ["pageserver_io_operations_bytes_total"],
+                        {"operation": "write"},
+                        absence_ok=True,
+                    )
+                    if _tmp_bytes_written is not None:
+                        ps_bytes_written += int(_tmp_bytes_written)
+
+                    _tmp_getpage = pageserver.http_client().get_metric_value(
+                        "pageserver_smgr_query_started_global_count_total",
+                        {"smgr_query_type": "get_page_at_lsn"},
+                    )
+                    if _tmp_getpage is not None:
+                        getpage_requests += int(_tmp_getpage)
+            assert ps_bytes_written is not None
+
+            log.info(f"Bytes written: SK {sk_bytes_written}, PS {ps_bytes_written}")
+            log.info(f"GetPage@LSN requests: {getpage_requests}")
+            bytes_written = int(max(ps_bytes_written, sk_bytes_written))
+
             log.info("Cleaning up all storage and compute nodes")
             self.env.stop(
                 immediate=False,
@@ -939,6 +976,19 @@ def __exit__(
                 if cleanup_error is not None:
                     cleanup_error = e
 
+        if os.environ.get("BUILD_TYPE") == "debug" and bytes_written and bytes_written > 128000000:
+            raise RuntimeError(
+                f"This test wrote too much data in debug mode: {bytes_written} bytes"
+            )
+        else:
+            log.info(f"This test wrote {bytes_written} bytes")
+
+        # Fail tests that do more than 100MB of GetPage@LSN requests in debug mode
+        if os.environ.get("BUILD_TYPE") == "debug" and getpage_requests > 12800:
+            raise RuntimeError(
+                f"This test read too much data from pageservers in debug mode: {getpage_requests * 8192} bytes"
+            )
+
 
 class NeonEnv:
     """
@@ -1242,6 +1292,7 @@ def stop(self, immediate=False, ps_assert_metric_no_errors=False, fail_on_endpoi
         for sk in self.safekeepers:
             sk.stop(immediate=immediate)
 
+
         for pageserver in self.pageservers:
             if ps_assert_metric_no_errors:
                 try: