From 4515e547097070655aeb61b1df3703b7665192b4 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Mon, 14 Oct 2024 10:32:56 +0100 Subject: [PATCH 1/8] Add installed_extensions prometheus metric and add /metrics endpoint to compute_ctl to expose such metrics metric format example for extension pg_rag with versions 1.2.3 and 1.4.2 installed in 3 and 1 databases respectively: neon_extensions_installed{extension="pg_rag", version="1.2.3"} = 3 neon_extensions_installed{extension="pg_rag", version="1.4.2"} = 1 --- Cargo.lock | 3 ++ compute_tools/Cargo.toml | 3 ++ compute_tools/src/http/api.rs | 25 +++++++++++ compute_tools/src/http/openapi_spec.yaml | 15 +++++++ compute_tools/src/installed_extensions.rs | 31 ++++++++++++-- test_runner/fixtures/endpoint/http.py | 5 +++ .../regress/test_installed_extensions.py | 42 +++++++++++++++++++ 7 files changed, 121 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c5af247e8be4..79e5ea7caaef 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1288,12 +1288,15 @@ dependencies = [ "flate2", "futures", "hyper 0.14.30", + "metrics", "nix 0.27.1", "notify", "num_cpus", + "once_cell", "opentelemetry", "opentelemetry_sdk", "postgres", + "prometheus", "regex", "remote_storage", "reqwest 0.12.4", diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 91e0b9d5b87c..0bf4ed53d669 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -18,9 +18,11 @@ clap.workspace = true flate2.workspace = true futures.workspace = true hyper0 = { workspace = true, features = ["full"] } +metrics.workspace = true nix.workspace = true notify.workspace = true num_cpus.workspace = true +once_cell.workspace = true opentelemetry.workspace = true opentelemetry_sdk.workspace = true postgres.workspace = true @@ -39,6 +41,7 @@ tracing-subscriber.workspace = true tracing-utils.workspace = true thiserror.workspace = true url.workspace = true +prometheus.workspace = true compute_api.workspace = true utils.workspace = true diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index af35f71bf2d9..3677582c11ed 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -9,6 +9,7 @@ use crate::catalog::SchemaDumpError; use crate::catalog::{get_database_schema, get_dbs_and_roles}; use crate::compute::forward_termination_signal; use crate::compute::{ComputeNode, ComputeState, ParsedSpec}; +use crate::installed_extensions; use compute_api::requests::{ConfigurationRequest, ExtensionInstallRequest, SetRoleGrantsRequest}; use compute_api::responses::{ ComputeStatus, ComputeStatusResponse, ExtensionInstallResult, GenericAPIError, @@ -19,6 +20,8 @@ use anyhow::Result; use hyper::header::CONTENT_TYPE; use hyper::service::{make_service_fn, service_fn}; use hyper::{Body, Method, Request, Response, Server, StatusCode}; +use metrics::Encoder; +use metrics::TextEncoder; use tokio::task; use tracing::{debug, error, info, warn}; use tracing_utils::http::OtelName; @@ -65,6 +68,28 @@ async fn routes(req: Request, compute: &Arc) -> Response { + debug!("serving /metrics GET request"); + + let mut buffer = vec![]; + let metrics = installed_extensions::collect(); + let encoder = TextEncoder::new(); + encoder.encode(&metrics, &mut buffer).unwrap(); + + match Response::builder() + .status(StatusCode::OK) + .header(CONTENT_TYPE, encoder.format_type()) + .body(Body::from(buffer)) + { + Ok(response) => response, + Err(err) => { + let msg = format!("error handling /metrics request: {err}"); + error!(msg); + render_json_error(&msg, StatusCode::INTERNAL_SERVER_ERROR) + } + } + } // Collect Postgres current usage insights (&Method::GET, "/insights") => { info!("serving /insights GET request"); diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index 11eee6ccfd44..7b9a62c54569 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -37,6 +37,21 @@ paths: schema: $ref: "#/components/schemas/ComputeMetrics" + /metrics + get: + tags: + - Info + summary: Get compute node metrics in text format. + description: "" + operationId: getComputeMetrics + responses: + 200: + description: ComputeMetrics + content: + text/plain: + schema: + type: string + description: Metrics in text format. /insights: get: tags: diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs index 877f99bff715..6dd55855db29 100644 --- a/compute_tools/src/installed_extensions.rs +++ b/compute_tools/src/installed_extensions.rs @@ -1,4 +1,5 @@ use compute_api::responses::{InstalledExtension, InstalledExtensions}; +use metrics::proto::MetricFamily; use std::collections::HashMap; use std::collections::HashSet; use tracing::info; @@ -8,6 +9,10 @@ use anyhow::Result; use postgres::{Client, NoTls}; use tokio::task; +use metrics::core::Collector; +use metrics::{register_uint_gauge_vec, UIntGaugeVec}; +use once_cell::sync::Lazy; + /// We don't reuse get_existing_dbs() just for code clarity /// and to make database listing query here more explicit. /// @@ -59,6 +64,12 @@ pub async fn get_installed_extensions(connstr: Url) -> Result Result Result<()> { "[NEON_EXT_STAT] {}", serde_json::to_string(&result).expect("failed to serialize extensions list") ); - Ok(()) } + +static INSTALLED_EXTENSIONS: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "installed_extensions", + "Number of databases where the version of extension is installed", + &["extension_name", "version"] + ) + .expect("failed to define a metric") +}); + +pub fn collect() -> Vec { + INSTALLED_EXTENSIONS.collect() +} diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index ea8291c1e07f..74361307ce48 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -46,3 +46,8 @@ def set_role_grants(self, database: str, role: str, schema: str, privileges: lis ) res.raise_for_status() return res.json() + + def metrics(self): + res = self.get(f"http://localhost:{self.port}/metrics") + res.raise_for_status() + return res.text diff --git a/test_runner/regress/test_installed_extensions.py b/test_runner/regress/test_installed_extensions.py index 4700db85eedd..9ee60a94faf8 100644 --- a/test_runner/regress/test_installed_extensions.py +++ b/test_runner/regress/test_installed_extensions.py @@ -1,5 +1,7 @@ +import time from logging import info +from fixtures.metrics import parse_metrics from fixtures.neon_fixtures import NeonEnv @@ -85,3 +87,43 @@ def test_installed_extensions(neon_simple_env: NeonEnv): assert ext["n_databases"] == 2 ext["versions"].sort() assert ext["versions"] == ["1.2", "1.3"] + + # check that /metrics endpoint is available + # ensure that we see the metric before and after restart + res = client.metrics() + info("Metrics: %s", res) + m = parse_metrics(res) + neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.2"}) + assert len(neon_m) == 1 + for sample in neon_m: + assert sample.value == 2 + neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.3"}) + assert len(neon_m) == 1 + for sample in neon_m: + assert sample.value == 1 + + endpoint.stop() + endpoint.start() + + timeout = 5 + while timeout > 0: + try: + res = client.metrics() + timeout = -1 + except Exception as e: + info("failed to get metrics, assume they are not collected yet: %s", e) + time.sleep(1) + timeout -= 1 + continue + + info("After restart metrics: %s", res) + m = parse_metrics(res) + neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.2"}) + assert len(neon_m) == 1 + for sample in neon_m: + assert sample.value == 1 + + neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.3"}) + assert len(neon_m) == 1 + for sample in neon_m: + assert sample.value == 1 From b77c93bfe44e714cb1d6b978520ac5ab5101d0a9 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Fri, 1 Nov 2024 14:15:43 +0000 Subject: [PATCH 2/8] fix test --- test_runner/regress/test_installed_extensions.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/test_runner/regress/test_installed_extensions.py b/test_runner/regress/test_installed_extensions.py index 9ee60a94faf8..4cc96e853482 100644 --- a/test_runner/regress/test_installed_extensions.py +++ b/test_runner/regress/test_installed_extensions.py @@ -105,17 +105,26 @@ def test_installed_extensions(neon_simple_env: NeonEnv): endpoint.stop() endpoint.start() - timeout = 5 + timeout = 10 while timeout > 0: try: res = client.metrics() timeout = -1 + if len(parse_metrics(res).query_all("installed_extensions")) < 4: + # Assume that not all metrics that are collected yet + time.sleep(1) + timeout -= 1 + continue except Exception as e: info("failed to get metrics, assume they are not collected yet: %s", e) time.sleep(1) timeout -= 1 continue + assert ( + len(parse_metrics(res).query_all("installed_extensions")) >= 4 + ), "Not all metrics are collected" + info("After restart metrics: %s", res) m = parse_metrics(res) neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.2"}) From 58658e53f44cf71ed311e1cd546b65a107944b65 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Tue, 12 Nov 2024 15:44:33 +0100 Subject: [PATCH 3/8] Update test_runner/regress/test_installed_extensions.py Co-authored-by: Tristan Partin --- test_runner/regress/test_installed_extensions.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test_runner/regress/test_installed_extensions.py b/test_runner/regress/test_installed_extensions.py index 4cc96e853482..87a2d2f6ee3e 100644 --- a/test_runner/regress/test_installed_extensions.py +++ b/test_runner/regress/test_installed_extensions.py @@ -1,8 +1,11 @@ import time from logging import info +from typing import TYPE_CHECKING from fixtures.metrics import parse_metrics -from fixtures.neon_fixtures import NeonEnv + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv def test_installed_extensions(neon_simple_env: NeonEnv): From 6f41a24b436e924b6e5a63545fed770ad699a865 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Tue, 12 Nov 2024 15:53:06 +0100 Subject: [PATCH 4/8] fix review suggestions --- test_runner/regress/test_installed_extensions.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test_runner/regress/test_installed_extensions.py b/test_runner/regress/test_installed_extensions.py index 87a2d2f6ee3e..40070b6b601b 100644 --- a/test_runner/regress/test_installed_extensions.py +++ b/test_runner/regress/test_installed_extensions.py @@ -2,6 +2,7 @@ from logging import info from typing import TYPE_CHECKING +from fixtures.log_helper import log from fixtures.metrics import parse_metrics if TYPE_CHECKING: @@ -119,7 +120,7 @@ def test_installed_extensions(neon_simple_env: NeonEnv): timeout -= 1 continue except Exception as e: - info("failed to get metrics, assume they are not collected yet: %s", e) + log.exception("failed to get metrics, assume they are not collected yet: %s", e) time.sleep(1) timeout -= 1 continue From 62ca3116878cf045660a3eac05d64043cfe29964 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Tue, 12 Nov 2024 16:17:40 +0100 Subject: [PATCH 5/8] Update test_runner/fixtures/endpoint/http.py Co-authored-by: Tristan Partin --- test_runner/fixtures/endpoint/http.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index 74361307ce48..db3723b7cc9a 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -47,7 +47,7 @@ def set_role_grants(self, database: str, role: str, schema: str, privileges: lis res.raise_for_status() return res.json() - def metrics(self): + def metrics(self) -> str: res = self.get(f"http://localhost:{self.port}/metrics") res.raise_for_status() return res.text From 9022ccaff3a1c782b6978cca8314676aa652da08 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Tue, 12 Nov 2024 16:26:19 +0100 Subject: [PATCH 6/8] fix test logging --- test_runner/regress/test_installed_extensions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/regress/test_installed_extensions.py b/test_runner/regress/test_installed_extensions.py index 40070b6b601b..ef5352fc0ecf 100644 --- a/test_runner/regress/test_installed_extensions.py +++ b/test_runner/regress/test_installed_extensions.py @@ -120,7 +120,7 @@ def test_installed_extensions(neon_simple_env: NeonEnv): timeout -= 1 continue except Exception as e: - log.exception("failed to get metrics, assume they are not collected yet: %s", e) + log.exception("failed to get metrics, assume they are not collected yet") time.sleep(1) timeout -= 1 continue From 3b1ef53c793c70bc5404557aaf61d29c6ff4d006 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Tue, 12 Nov 2024 16:34:38 +0100 Subject: [PATCH 7/8] Update test_runner/regress/test_installed_extensions.py --- test_runner/regress/test_installed_extensions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/regress/test_installed_extensions.py b/test_runner/regress/test_installed_extensions.py index ef5352fc0ecf..8192e8db9dde 100644 --- a/test_runner/regress/test_installed_extensions.py +++ b/test_runner/regress/test_installed_extensions.py @@ -119,7 +119,7 @@ def test_installed_extensions(neon_simple_env: NeonEnv): time.sleep(1) timeout -= 1 continue - except Exception as e: + except Exception: log.exception("failed to get metrics, assume they are not collected yet") time.sleep(1) timeout -= 1 From ccedc1e2033d8672341a1f7cc4d78e029b245c25 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Tue, 12 Nov 2024 18:47:54 +0100 Subject: [PATCH 8/8] Update test_runner/regress/test_installed_extensions.py --- test_runner/regress/test_installed_extensions.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test_runner/regress/test_installed_extensions.py b/test_runner/regress/test_installed_extensions.py index 8192e8db9dde..54ce7c8340d1 100644 --- a/test_runner/regress/test_installed_extensions.py +++ b/test_runner/regress/test_installed_extensions.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import time from logging import info from typing import TYPE_CHECKING