Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Support multiple destinations for a healthcheck #2704

Open
wants to merge 1 commit into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/ci-lite.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ jobs:
with:
semantic_version: 18
extra_plugins: |
@semantic-release/exec
@semantic-release/exec@v6.0.3
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is fixed in: #2690 , but we need it for workflow to work properly

@semantic-release/git
semantic-release-helm
@google/semantic-release-replace-plugin@1.2.0
Expand Down Expand Up @@ -430,7 +430,7 @@ jobs:
with:
semantic_version: 18
extra_plugins: |
@semantic-release/exec
@semantic-release/exec@v6.0.3
@semantic-release/git
semantic-release-helm
@google/semantic-release-replace-plugin@1.2.0
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/ci-main.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ jobs:
with:
semantic_version: 18
extra_plugins: |
@semantic-release/exec
@semantic-release/exec@v6.0.3
@semantic-release/git
semantic-release-helm
@google/semantic-release-replace-plugin@1.2.0
Expand Down Expand Up @@ -456,7 +456,7 @@ jobs:
with:
semantic_version: 18
extra_plugins: |
@semantic-release/exec
@semantic-release/exec@v6.0.3
@semantic-release/git
semantic-release-helm
@google/semantic-release-replace-plugin@1.2.0
Expand Down
39 changes: 28 additions & 11 deletions package/sbin/healthcheck.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import logging
import os
import subprocess
import re

app = Flask(__name__)

Expand All @@ -14,11 +15,20 @@ def str_to_bool(value):
'yes'
}

def get_list_of_destinations():
Copy link
Contributor

@ikheifets-splunk ikheifets-splunk Feb 20, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just have 2 questions:

  • Are we planning to backport same logic from health check api to healthcheck
  • Are we planning to add same logic on entry point during startup (where we checking that default destination is healthy, probably we also need to check not only default)

P.S. It's just a questions, it doesn't mean that we need to do asap, we will just discuss it

found_destinations = []
regex = r"^SC4S_DEST_SPLUNK_HEC_(.*)_URL$"

for var_key, var_variable in os.environ.items():
if re.search(regex, var_key):
found_destinations.append(var_variable)
return set(found_destinations)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This could be imported from:

regex = r"^SC4S_DEST_SPLUNK_HEC_(.*)_URL$"

But a dot in "conf.d" is causing an import issue. Something to think about later I guess

class Config:
SC4S_DEST_SPLUNK_HEC_DEFAULT_URL = os.getenv('SC4S_DEST_SPLUNK_HEC_DEFAULT_URL')
HEALTHCHECK_PORT = int(os.getenv('SC4S_LISTEN_STATUS_PORT', '8080'))
CHECK_QUEUE_SIZE = str_to_bool(os.getenv('HEALTHCHECK_CHECK_QUEUE_SIZE', "false"))
MAX_QUEUE_SIZE = int(os.getenv('HEALTHCHECK_MAX_QUEUE_SIZE', '10000'))
DESTINATIONS = get_list_of_destinations()

logging.basicConfig(
format=f"%(asctime)s - healthcheck.py - %(levelname)s - %(message)s",
Expand Down Expand Up @@ -48,11 +58,11 @@ def check_syslog_ng_health() -> bool:
return False

def check_queue_size(
sc4s_dest_splunk_hec_default=Config.SC4S_DEST_SPLUNK_HEC_DEFAULT_URL,
sc4s_dest_splunk_hec_destinations=Config.DESTINATIONS,
max_queue_size=Config.MAX_QUEUE_SIZE
) -> bool:
"""Check syslog-ng queue size and compare it against the configured maximum limit."""
if not sc4s_dest_splunk_hec_default:
if not sc4s_dest_splunk_hec_destinations:
logger.error(
"SC4S_DEST_SPLUNK_HEC_DEFAULT_URL not configured. "
"Ensure the default HEC destination is set, or disable HEALTHCHECK_CHECK_QUEUE_SIZE."
Expand All @@ -71,15 +81,22 @@ def check_queue_size(
return False

stats = result.stdout.splitlines()
destination_stat = next(
(s for s in stats if ";queued;" in s and sc4s_dest_splunk_hec_default in s),
None
)
if not destination_stat:
logger.error("No matching queue stats found for the destination URL.")
return False

queue_size = int(destination_stat.split(";")[-1])
queue_sizes_all_destinations = []

for destination in sc4s_dest_splunk_hec_destinations:
destination_stat = next(
(s for s in stats if ";queued;" in s and destination in s),
None
)

if not destination_stat:
logger.error(f"No matching queue stats found for the destination URL {destination}.")
return False

queue_sizes_all_destinations.append(int(destination_stat.split(";")[-1]))

queue_size = max(queue_sizes_all_destinations)
if queue_size > max_queue_size:
logger.warning(
f"Queue size {queue_size} exceeds the maximum limit of {max_queue_size}."
Expand Down
94 changes: 86 additions & 8 deletions tests/test_healthcheck_unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
check_syslog_ng_health,
subprocess,
check_queue_size,
get_list_of_destinations,
)

# str_to_bool
Expand Down Expand Up @@ -52,9 +53,9 @@ def test_check_syslog_ng_health_exception(mock_run):
# check_queue_size
def test_check_queue_size_no_url():
"""
If sc4s_dest_splunk_hec_default is not set, check_queue_size should fail.
If sc4s_dest_splunk_hec_destinations is not set, check_queue_size should fail.
"""
assert check_queue_size(sc4s_dest_splunk_hec_default=None, max_queue_size=1000) is False
assert check_queue_size(sc4s_dest_splunk_hec_destinations=None, max_queue_size=1000) is False

@patch("subprocess.run")
def test_check_queue_size_stats_fail(mock_run):
Expand All @@ -63,7 +64,7 @@ def test_check_queue_size_stats_fail(mock_run):
"""
mock_run.return_value.returncode = 1
mock_run.return_value.stderr = "stats error"
assert check_queue_size(sc4s_dest_splunk_hec_default="http://example.com:8088", max_queue_size=1000) is False
assert check_queue_size(sc4s_dest_splunk_hec_destinations={"http://example.com:8088"}, max_queue_size=1000) is False

@patch("subprocess.run")
def test_check_queue_size_no_matching_stats(mock_run):
Expand All @@ -72,7 +73,7 @@ def test_check_queue_size_no_matching_stats(mock_run):
"""
mock_run.return_value.returncode = 0
mock_run.return_value.stdout = "some;other;stat;line\nanother;stat"
assert check_queue_size(sc4s_dest_splunk_hec_default="http://example.com:8088", max_queue_size=1000) is False
assert check_queue_size(sc4s_dest_splunk_hec_destinations={"http://example.com:8088"}, max_queue_size=1000) is False

@patch("subprocess.run")
def test_check_queue_size_exceeds_limit(mock_run):
Expand All @@ -84,7 +85,7 @@ def test_check_queue_size_exceeds_limit(mock_run):
"destination;queued;http://example.com:8088;2000\n"
"another;queued;http://other-url.com;1234"
)
assert check_queue_size(sc4s_dest_splunk_hec_default="http://example.com:8088", max_queue_size=1000) is False
assert check_queue_size(sc4s_dest_splunk_hec_destinations={"http://example.com:8088"}, max_queue_size=1000) is False

@patch("subprocess.run")
def test_check_queue_size_under_limit(mock_run):
Expand All @@ -96,7 +97,7 @@ def test_check_queue_size_under_limit(mock_run):
"destination;queued;http://example.com:8088;500\n"
"another;queued;http://other-url.com;1234"
)
assert check_queue_size(sc4s_dest_splunk_hec_default="http://example.com:8088", max_queue_size=1000) is True
assert check_queue_size(sc4s_dest_splunk_hec_destinations={"http://example.com:8088"}, max_queue_size=1000) is True

@patch("subprocess.run")
def test_check_queue_size_equals_limit(mock_run):
Expand All @@ -108,7 +109,62 @@ def test_check_queue_size_equals_limit(mock_run):
"destination;queued;http://example.com:8088;1000\n"
"another;queued;http://other-url.com;1234"
)
assert check_queue_size(sc4s_dest_splunk_hec_default="http://example.com:8088", max_queue_size=1000) is True
assert check_queue_size(sc4s_dest_splunk_hec_destinations={"http://example.com:8088"}, max_queue_size=1000) is True

@patch("subprocess.run")
def test_check_queue_size_multiple_destinations(mock_run):
"""
If queue size for all destinations is <= HEALTHCHECK_MAX_QUEUE_SIZE, check_queue_size should pass.
"""
mock_run.return_value.returncode = 0
mock_run.return_value.stdout = (
"destination;queued;http://example.com:8088;300\n"
"destination;queued;http://another.com:8088;500\n"
"another;queued;http://other-url.com;1234"
)
assert check_queue_size(sc4s_dest_splunk_hec_destinations={"http://example.com:8088", "http://another.com:8088"},
max_queue_size=1000) is True

@patch("subprocess.run")
def test_check_queue_size_multiple_destinations_over_limit(mock_run):
"""
If queue size for at least one destination is > HEALTHCHECK_MAX_QUEUE_SIZE, check_queue_size should fail.
"""
mock_run.return_value.returncode = 0
mock_run.return_value.stdout = (
"destination;queued;http://example.com:8088;1300\n"
"destination;queued;http://another.com:8088;500\n"
"another;queued;http://other-url.com;1234"
)
assert check_queue_size(sc4s_dest_splunk_hec_destinations={"http://example.com:8088", "http://another.com:8088"},
max_queue_size=1000) is False

@patch("subprocess.run")
def test_check_queue_size_multiple_destinations_all_over_limit(mock_run):
"""
If queue size for all destinations is > HEALTHCHECK_MAX_QUEUE_SIZE, check_queue_size should fail.
"""
mock_run.return_value.returncode = 0
mock_run.return_value.stdout = (
"destination;queued;http://example.com:8088;1300\n"
"destination;queued;http://another.com:8088;1500\n"
"another;queued;http://other-url.com;1234"
)
assert check_queue_size(sc4s_dest_splunk_hec_destinations={"http://example.com:8088", "http://another.com:8088"},
max_queue_size=1000) is False

@patch("subprocess.run")
def test_check_queue_size_multiple_incomplete_info(mock_run):
"""
If stats run successfully but do not contain stats for one of the desired destinations, it should fail.
"""
mock_run.return_value.returncode = 0
mock_run.return_value.stdout = (
"destination;queued;http://example.com:8088;300\n"
"another;queued;http://other-url.com;1234"
)
assert check_queue_size(sc4s_dest_splunk_hec_destinations={"http://example.com:8088", "http://another.com:8088"},
max_queue_size=1000) is False

@patch("subprocess.run", side_effect=Exception("some exception"))
def test_check_queue_size_exception(mock_run):
Expand Down Expand Up @@ -139,4 +195,26 @@ def test_health_endpoint_no_queue_check(mock_run, client):

response = client.get("/health")
assert response.status_code == 200
assert response.json["status"] == "healthy"
assert response.json["status"] == "healthy"

@patch.dict(
os.environ,
{
"SC4S_DEST_SPLUNK_HEC_DEFAULT_URL": "http://my_test_url:1234",
"SC4S_DEST_SPLUNK_HEC_OTHER_URL": "http://my_hec:1234",
"SOME_OTHER_URL": "http://my_url/test_url",
"SOME_OTHER_ENV_VARIABLE": "my_variable",
"SC4S_LISTEN_STATUS_PORT": "1234",
},
clear=True
)
def test_get_destinations():
"""
Check if get_list_of_destinations method parses and returns the expected
destinations from environment variables.
"""
destinations = get_list_of_destinations()

assert len(destinations) == 2
assert "http://my_test_url:1234" in destinations
assert "http://my_hec:1234" in destinations
Loading