Skip to content

Commit

Permalink
feat(backend): 故障自愈自动化注册 TencentBlueKing#6940
Browse files Browse the repository at this point in the history
  • Loading branch information
zhangzhw8 committed Sep 13, 2024
1 parent b06e8c2 commit 0118653
Show file tree
Hide file tree
Showing 5 changed files with 106 additions and 9 deletions.
15 changes: 15 additions & 0 deletions dbm-ui/backend/components/bkmonitorv3/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,21 @@ def __init__(self):
url="proxy_host_info/",
description=_("获取自定义上报的 proxy 主机信息"),
)
# Action-config ("处理套餐") endpoints, each registered through
# generate_data_api with its HTTP method and relative URL.

# GET search_action_config/ -- query action configs.
self.search_action_config = self.generate_data_api(
method="GET",
url="search_action_config/",
description=_("查询处理套餐"),
)
# POST save_action_config/ -- save an action config.
self.save_action_config = self.generate_data_api(
method="POST",
url="save_action_config/",
description=_("保存处理套餐"),
)
# POST edit_action_config/ -- edit an action config.
self.edit_action_config = self.generate_data_api(
method="POST",
url="edit_action_config/",
description=_("编辑处理套餐"),
)


# Module-level client instance, created once at import time; callers use this
# name instead of instantiating _BKMonitorV3Api themselves.
BKMonitorV3Api = _BKMonitorV3Api()
36 changes: 36 additions & 0 deletions dbm-ui/backend/db_monitor/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from django.conf import settings
from django.utils.translation import ugettext as _

from backend import env
from blue_krill.data_types.enum import EnumField, StructuredEnum

DB_MONITOR_TPLS_DIR = os.path.join(settings.BASE_DIR, "backend/db_monitor/tpls")
Expand Down Expand Up @@ -189,3 +190,38 @@ class NoticeWayEnum(str, StructuredEnum):
}

MONITOR_EVENTS = "monitor_events"

# Name under which the DBM autofix action config is registered and looked up.
AUTOFIX_ACTION_NAME = "dbm_autofix_http_callback"

# Fault self-healing (autofix) action template: an HTTP callback that POSTs
# the alarm's callback message and appointees to the DBM policy callback API.
AUTOFIX_ACTION_TEMPLATE = {
    "execute_config": {
        "template_detail": {
            "method": "POST",
            "url": f"{env.BK_SAAS_CALLBACK_URL}/apis/monitor/policy/callback/",
            "headers": [],
            "authorize": {
                "auth_config": {"token": env.BKMONITOR_BEARER_TOKEN},
                "auth_type": "bearer_token",
                # NOTE(review): presumably disables TLS certificate
                # verification for the callback request -- confirm intended.
                "insecure_skip_verify": True,
            },
            "body": {
                "data_type": "raw",
                "content_type": "json",
                # The {{...}} placeholders are left verbatim in the payload;
                # callback_message is deliberately unquoted in the JSON text.
                "content": '{"callback_message": {{alarm.callback_message}},"appointees": "{{alarm.appointees}}"}',
                "params": [],
            },
            "query_params": [],
            "need_poll": False,
            "notify_interval": 60,
            "failed_retry": {
                "is_enabled": True,
                "max_retry_times": 2,
                "retry_interval": 2,
                "timeout": 10,
            },
        },
        "timeout": 600,
    },
    "name": AUTOFIX_ACTION_NAME,
    "desc": "",
    "is_enabled": True,
    # plugin_id = 2 denotes the HTTP callback plugin
    "plugin_id": 2,
    "bk_biz_id": env.DBA_APP_BK_BIZ_ID,
}
16 changes: 10 additions & 6 deletions dbm-ui/backend/db_monitor/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

from typing import Optional

from backend import env
from backend.components import BKMonitorV3Api
from backend.db_monitor.constants import AUTOFIX_ACTION_NAME
from backend.db_monitor.exceptions import BkMonitorDeleteAlarmException, BkMonitorSaveAlarmException

logger = logging.getLogger("root")
Expand Down Expand Up @@ -99,9 +100,12 @@ def render_promql_sql(prom_sql, wheres):
return prom_sql


if __name__ == "__main__":
    # Manual smoke run: render a sample PromQL expression with a set of
    # conditions to modify or add, and print the rendered query.
    sample_query = """ioutil{cluster_type="a"}[1m] by cpu{appid="5"} by disk{db_module="2"}"""
    condition_overrides = {
        "appid": ["2", "3"],
        "cluster_domain": ["hello.2"],
        "db_module": ["1"],
    }
    rendered_query = render_promql_sql(sample_query, condition_overrides)
    print(rendered_query)
def get_dbm_autofix_action_id() -> Optional[int]:
    """Return the id of the DBM autofix action config, if one exists.

    Queries the action configs of the DBA business (``env.DBA_APP_BK_BIZ_ID``)
    and looks for the one named ``AUTOFIX_ACTION_NAME``.

    Returns:
        The ``id`` of the matching action config, or ``None`` when no action
        with that name exists. If several actions share the name, the last
        one in the response wins (same as the original loop).
    """
    actions = BKMonitorV3Api.search_action_config({"bk_biz_id": env.DBA_APP_BK_BIZ_ID})["data"]

    # `or []` tolerates an empty / null "data" payload instead of raising
    # TypeError while iterating.
    matched_ids = [action["id"] for action in actions or [] if action["name"] == AUTOFIX_ACTION_NAME]
    return matched_ids[-1] if matched_ids else None
27 changes: 25 additions & 2 deletions dbm-ui/backend/db_periodic_task/local_tasks/db_monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from backend.db_monitor.exceptions import BkMonitorSaveAlarmException
from backend.db_monitor.models import CollectInstance, DispatchGroup, MonitorPolicy, NoticeGroup
from backend.db_monitor.tasks import update_app_policy
from backend.db_monitor.utils import get_dbm_autofix_action_id
from backend.db_periodic_task.local_tasks.register import register_periodic_task
from backend.db_periodic_task.utils import TimeUnit, calculate_countdown

Expand Down Expand Up @@ -72,8 +73,10 @@ def update_dba_notice_group(dba_id: int):


@register_periodic_task(run_every=crontab(minute="*/5"))
def sync_plat_monitor_policy():
def sync_plat_monitor_policy(action_id=None):
"""同步平台告警策略"""
if action_id is None:
action_id = get_dbm_autofix_action_id()
skip_dir = "v1"
now = datetime.datetime.now(timezone.utc)
logger.warning("[sync_plat_monitor_policy] sync bkm alarm policy start: %s", now)
Expand Down Expand Up @@ -103,11 +106,31 @@ def sync_plat_monitor_policy():
continue

# patch template
template_dict["details"]["labels"] = list(set(template_dict["details"]["labels"]))
labels = list(set(template_dict["details"]["labels"]))
template_dict["details"]["labels"] = labels
template_dict["details"]["name"] = policy_name
template_dict["details"]["priority"] = TargetPriority.PLATFORM.value
# 平台策略仅开启基于分派通知
template_dict["details"]["notice"]["options"]["assign_mode"] = ["by_rule"]
# Attach the autofix action only when an action id is known and at least one
# label opts in through the NEED_AUTOFIX prefix.
if action_id is not None and any(label.startswith("NEED_AUTOFIX") for label in labels):
    # Must be a plain list of action dicts. The previous form ended with
    # `],` inside parentheses, which built a 1-tuple wrapping the list and
    # would serialize as a nested list ([[{...}]]).
    template_dict["details"]["actions"] = [
        {
            "config_id": action_id,
            "signal": ["abnormal"],
            "user_groups": [],
            "options": {
                "converge_config": {
                    "is_enabled": False,
                    "converge_func": "skip_when_success",
                    "timedelta": 60,
                    "count": 1,
                }
            },
        }
    ]

policy = MonitorPolicy(**template_dict)

Expand Down
21 changes: 20 additions & 1 deletion dbm-ui/backend/dbm_init/services.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
"""
import copy
import datetime
import json
import logging
Expand All @@ -31,6 +32,8 @@
from backend.core.storages.file_source import BkJobFileSourceManager
from backend.core.storages.storage import get_storage
from backend.db_meta.models import AppMonitorTopo
from backend.db_monitor.constants import AUTOFIX_ACTION_TEMPLATE
from backend.db_monitor.utils import get_dbm_autofix_action_id
from backend.db_services.cmdb.biz import get_or_create_cmdb_module_with_name, get_or_create_set_with_name
from backend.db_services.ipchooser.constants import DB_MANAGE_SET, DEFAULT_CLOUD, DIRTY_MODULE, RESOURCE_MODULE
from backend.dbm_init.constants import CC_APP_ABBR_ATTR, CC_HOST_DBM_ATTR
Expand Down Expand Up @@ -419,6 +422,20 @@ def init_custom_metric_and_event():
key=SystemSettingsEnum.BKM_DBM_REPORT.value,
)

@staticmethod
def auto_create_bkmonitor_action() -> int:
    """Create or refresh the DBM autofix action config.

    If an action named after the autofix template already exists, it is
    overwritten with a fresh copy of ``AUTOFIX_ACTION_TEMPLATE``; otherwise
    a new action is saved from the template.

    Returns:
        The id of the autofix action. After a fresh creation the id is
        looked up again, so callers receive the new id rather than the
        stale ``None`` from the first lookup.
        NOTE(review): the result can still be ``None`` if the second lookup
        does not find the action -- confirm whether callers should treat
        that as an error.
    """
    # Work on a copy so the shared module-level template is never mutated.
    action_config = copy.deepcopy(AUTOFIX_ACTION_TEMPLATE)
    action_id = get_dbm_autofix_action_id()

    if action_id is not None:
        action_config["id"] = action_id
        BKMonitorV3Api.edit_action_config(action_config)
        return action_id

    BKMonitorV3Api.save_action_config(action_config)
    # The action did not exist before this call; query again for its id.
    return get_dbm_autofix_action_id()

@staticmethod
def auto_create_bkmonitor_alarm() -> bool:
"""初始化bkmonitor配置"""
Expand All @@ -428,11 +445,13 @@ def auto_create_bkmonitor_alarm() -> bool:

logger.info("auto_create_bkmonitor_service")

action_id = Services.auto_create_bkmonitor_action()

# 加载采集策略
CollectInstance.sync_collect_strategy()

# 加载告警策略
sync_plat_monitor_policy()
sync_plat_monitor_policy(action_id)

return True

Expand Down

0 comments on commit 0118653

Please sign in to comment.