Skip to content

Commit

Permalink
tests: fail tests which write too much data
Browse files Browse the repository at this point in the history
  • Loading branch information
jcsp committed Dec 19, 2024
1 parent a1b0558 commit 1bf20bf
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 0 deletions.
23 changes: 23 additions & 0 deletions test_runner/fixtures/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,29 @@ def get_metrics_values(

return result

def get_metric_sum(
    self, names: list[str], filter: dict[str, str] | None = None, absence_ok: bool = False
) -> float:
    """
    Add up the values of every sample that matches one of `names` and `filter`.

    The metrics are fetched once via `self.get_metrics()`, then queried once
    per entry in `names`; all returned samples contribute to the total.

    Returns the accumulated total, which is 0.0 when nothing matched and
    `absence_ok` is true. Raises RuntimeError when nothing matched and
    `absence_ok` is false.
    """
    metrics = self.get_metrics()

    # Gather matching samples across all requested metric names.
    matched = [
        sample for name in names for sample in metrics.query_all(name, filter=filter)
    ]

    if not matched and not absence_ok:
        # Log what was actually scraped, to help debug the missing metric.
        log.info(f"Metrics found: {metrics.metrics}")
        raise RuntimeError(f"could not find any metrics matching {names}, {filter}")

    # Accumulate left-to-right from 0.0 so the result is always a float.
    total = 0.0
    for sample in matched:
        total += sample.value
    return total


def parse_metrics(text: str, name: str = "") -> Metrics:
metrics = Metrics(name)
Expand Down
52 changes: 52 additions & 0 deletions test_runner/fixtures/neon_fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -881,7 +881,45 @@ def __exit__(
traceback: TracebackType | None,
):
# Stop all the nodes.
bytes_written: int = 0
getpage_requests: int = 0

if self.env:
log.info("Checking for lots of I/O in tests that shouldn't")

sk_bytes_written: float = 0
if self.env.safekeepers[0].running:
_sk_bytes_written = (
self.env.safekeepers[0]
.http_client()
.get_metric_value("safekeeper_write_wal_bytes_sum")
)
if _sk_bytes_written is not None:
sk_bytes_written = int(_sk_bytes_written)

ps_bytes_written: float = 0
for pageserver in self.env.pageservers:
if pageserver.running:
_tmp_bytes_written = pageserver.http_client().get_metric_sum(
["pageserver_io_operations_bytes_total"],
{"operation": "write"},
absence_ok=True,
)
if _tmp_bytes_written is not None:
ps_bytes_written += int(_tmp_bytes_written)

_tmp_getpage = pageserver.http_client().get_metric_value(
"pageserver_smgr_query_started_global_count_total",
{"smgr_query_type": "get_page_at_lsn"},
)
if _tmp_getpage is not None:
getpage_requests += int(_tmp_getpage)
assert ps_bytes_written is not None

log.info(f"Bytes written: SK {sk_bytes_written}, PS {ps_bytes_written}")
log.info(f"GetPage@LSN requests: {getpage_requests}")
bytes_written = int(max(ps_bytes_written, sk_bytes_written))

log.info("Cleaning up all storage and compute nodes")
self.env.stop(
immediate=False,
Expand Down Expand Up @@ -942,6 +980,19 @@ def __exit__(
if cleanup_error is not None:
cleanup_error = e

if os.environ.get("BUILD_TYPE") == "debug" and bytes_written and bytes_written > 128000000:
raise RuntimeError(
f"This test wrote too much data in debug mode: {bytes_written} bytes"
)
else:
log.info(f"This test wrote {bytes_written} bytes")

# Fail tests that do more than 100MB of GetPage@LSN requests in debug mode
if os.environ.get("BUILD_TYPE") == "debug" and getpage_requests > 12800:
raise RuntimeError(
f"This test read too much data from pageservers in debug mode: {getpage_requests * 8192} bytes"
)


class NeonEnv:
"""
Expand Down Expand Up @@ -1245,6 +1296,7 @@ def stop(self, immediate=False, ps_assert_metric_no_errors=False, fail_on_endpoi

for sk in self.safekeepers:
sk.stop(immediate=immediate)

for pageserver in self.pageservers:
if ps_assert_metric_no_errors:
try:
Expand Down

0 comments on commit 1bf20bf

Please sign in to comment.