diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 4263bacce8d1..06fc80a60439 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -306,7 +306,7 @@ jobs: run_with_real_s3: true real_s3_bucket: neon-github-ci-tests real_s3_region: eu-central-1 - rerun_failed: true + rerun_failed: false pg_version: ${{ matrix.pg_version }} aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index c5295360c339..4fbb841d29ba 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -85,6 +85,29 @@ def get_metrics_values( return result + def get_metric_sum( + self, names: list[str], filter: dict[str, str] | None = None, absence_ok: bool = False + ) -> float: + """ + Fetch all metrics matching `names` and `filter`, and sum their values + """ + metrics = self.get_metrics() + samples = [] + for name in names: + samples.extend(metrics.query_all(name, filter=filter)) + + found = False + result = 0.0 + for sample in samples: + result += sample.value + found = True + + if not found and not absence_ok: + log.info(f"Metrics found: {metrics.metrics}") + raise RuntimeError(f"could not find any metrics matching {names}, {filter}") + + return result + def parse_metrics(text: str, name: str = "") -> Metrics: metrics = Metrics(name) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index e22e452a5281..865172db440e 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -882,7 +882,56 @@ def __exit__( traceback: TracebackType | None, ): # Stop all the nodes. + bytes_written: int = 0 + getpage_requests: int = 0 + if self.env: + log.info("Checking for lots of I/O in tests that shouldn't") + + sk_bytes_written: float = 0 + if self.env.safekeepers[0].running: + try: + _sk_bytes_written = ( + self.env.safekeepers[0] + .http_client() + .get_metric_value("safekeeper_write_wal_bytes_sum") + ) + except requests.exceptions.ConnectionError: + _sk_bytes_written = 0 + + if _sk_bytes_written is not None: + sk_bytes_written = int(_sk_bytes_written) + + ps_bytes_written: float = 0 + for pageserver in self.env.pageservers: + if pageserver.running: + try: + _tmp_bytes_written = pageserver.http_client().get_metric_sum( + ["pageserver_io_operations_bytes_total"], + {"operation": "write"}, + absence_ok=True, + ) + except requests.exceptions.ConnectionError: + _tmp_bytes_written = 0 + if _tmp_bytes_written is not None: + ps_bytes_written += int(_tmp_bytes_written) + + try: + _tmp_getpage = pageserver.http_client().get_metric_value( + "pageserver_smgr_query_started_global_count_total", + {"smgr_query_type": "get_page_at_lsn"}, + ) + except requests.exceptions.ConnectionError: + _tmp_getpage = 0 + + if _tmp_getpage is not None: + getpage_requests += int(_tmp_getpage) + assert ps_bytes_written is not None + + log.info(f"Bytes written: SK {sk_bytes_written}, PS {ps_bytes_written}") + log.info(f"GetPage@LSN requests: {getpage_requests}") + bytes_written = int(max(ps_bytes_written, sk_bytes_written)) + log.info("Cleaning up all storage and compute nodes") self.env.stop( immediate=False, @@ -943,6 +992,31 @@ def __exit__( if cleanup_error is not None: cleanup_error = e + if ( + os.environ.get("BUILD_TYPE") == "debug" + and bytes_written + and bytes_written > 512 * 1024 * 1024 + ): + raise RuntimeError( + f"This test wrote too much data in debug mode: {bytes_written} bytes" + ) + elif bytes_written > 1024 * 1024 * 1024: + raise RuntimeError( + f"This test wrote too much data in release mode: {bytes_written} bytes" + ) + else: + log.info(f"This test wrote {bytes_written} bytes") + + # Fail tests that do more than 100MB of GetPage@LSN requests in debug mode + if os.environ.get("BUILD_TYPE") == "debug" and getpage_requests > 12800: + raise RuntimeError( + f"This test read too much data from pageservers in debug mode: {getpage_requests * 8192} bytes" + ) + elif getpage_requests > 128000: + raise RuntimeError( + f"This test read too much data from pageservers in release mode: {getpage_requests * 8192} bytes" + ) + class NeonEnv: """ @@ -1246,6 +1320,7 @@ def stop(self, immediate=False, ps_assert_metric_no_errors=False, fail_on_endpoi for sk in self.safekeepers: sk.stop(immediate=immediate) + for pageserver in self.pageservers: if ps_assert_metric_no_errors: try: