Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

tests: fail tests which write too much data #9537

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/_build-and-test-locally.yml
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,7 @@ jobs:
run_with_real_s3: true
real_s3_bucket: neon-github-ci-tests
real_s3_region: eu-central-1
rerun_failed: true
rerun_failed: false
pg_version: ${{ matrix.pg_version }}
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
env:
Expand Down
23 changes: 23 additions & 0 deletions test_runner/fixtures/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,29 @@ def get_metrics_values(

return result

def get_metric_sum(
    self, names: list[str], filter: dict[str, str] | None = None, absence_ok: bool = False
) -> float:
    """
    Add up the values of every sample matching any of `names` and `filter`.

    Returns the total as a float. When nothing matches, the result is 0.0 if
    `absence_ok` is set; otherwise the fetched metrics are logged and a
    RuntimeError is raised.
    """
    metrics = self.get_metrics()

    # Gather the samples for each requested name, in the order the names were given.
    matched = [
        sample for name in names for sample in metrics.query_all(name, filter=filter)
    ]

    if not matched and not absence_ok:
        log.info(f"Metrics found: {metrics.metrics}")
        raise RuntimeError(f"could not find any metrics matching {names}, {filter}")

    # Plain left-to-right accumulation starting from 0.0, so the result is always a float.
    total = 0.0
    for sample in matched:
        total += sample.value
    return total


def parse_metrics(text: str, name: str = "") -> Metrics:
metrics = Metrics(name)
Expand Down
75 changes: 75 additions & 0 deletions test_runner/fixtures/neon_fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -882,7 +882,56 @@ def __exit__(
traceback: TracebackType | None,
):
# Stop all the nodes.
bytes_written: int = 0
getpage_requests: int = 0

if self.env:
log.info("Checking for lots of I/O in tests that shouldn't")

sk_bytes_written: float = 0
if self.env.safekeepers[0].running:
try:
_sk_bytes_written = (
self.env.safekeepers[0]
.http_client()
.get_metric_value("safekeeper_write_wal_bytes_sum")
)
except requests.exceptions.ConnectionError:
_sk_bytes_written = 0

if _sk_bytes_written is not None:
sk_bytes_written = int(_sk_bytes_written)

ps_bytes_written: float = 0
for pageserver in self.env.pageservers:
if pageserver.running:
try:
_tmp_bytes_written = pageserver.http_client().get_metric_sum(
["pageserver_io_operations_bytes_total"],
{"operation": "write"},
absence_ok=True,
)
except requests.exceptions.ConnectionError:
_tmp_bytes_written = 0
if _tmp_bytes_written is not None:
ps_bytes_written += int(_tmp_bytes_written)

try:
_tmp_getpage = pageserver.http_client().get_metric_value(
"pageserver_smgr_query_started_global_count_total",
{"smgr_query_type": "get_page_at_lsn"},
)
except requests.exceptions.ConnectionError:
_tmp_getpage = 0

if _tmp_getpage is not None:
getpage_requests += int(_tmp_getpage)
assert ps_bytes_written is not None

log.info(f"Bytes written: SK {sk_bytes_written}, PS {ps_bytes_written}")
log.info(f"GetPage@LSN requests: {getpage_requests}")
bytes_written = int(max(ps_bytes_written, sk_bytes_written))

log.info("Cleaning up all storage and compute nodes")
self.env.stop(
immediate=False,
Expand Down Expand Up @@ -943,6 +992,31 @@ def __exit__(
if cleanup_error is not None:
cleanup_error = e

if (
os.environ.get("BUILD_TYPE") == "debug"
and bytes_written
and bytes_written > 512 * 1024 * 1024
):
raise RuntimeError(
f"This test wrote too much data in debug mode: {bytes_written} bytes"
)
elif bytes_written > 1024 * 1024 * 1024:
raise RuntimeError(
f"This test wrote too much data in release mode: {bytes_written} bytes"
)
else:
log.info(f"This test wrote {bytes_written} bytes")

# Fail tests that issue more than 100 MiB of GetPage@LSN requests (12800 pages of 8192 bytes) in debug mode, or 1000 MiB (128000 pages) in release mode
if os.environ.get("BUILD_TYPE") == "debug" and getpage_requests > 12800:
raise RuntimeError(
f"This test read too much data from pageservers in debug mode: {getpage_requests * 8192} bytes"
)
elif getpage_requests > 128000:
raise RuntimeError(
f"This test read too much data from pageservers in release mode: {getpage_requests * 8192} bytes"
)


class NeonEnv:
"""
Expand Down Expand Up @@ -1246,6 +1320,7 @@ def stop(self, immediate=False, ps_assert_metric_no_errors=False, fail_on_endpoi

for sk in self.safekeepers:
sk.stop(immediate=immediate)

for pageserver in self.pageservers:
if ps_assert_metric_no_errors:
try:
Expand Down
Loading