From cd3aabd75bdb937cecba1f99b306913adfd4bd2a Mon Sep 17 00:00:00 2001
From: Sean Klein
Date: Thu, 7 Nov 2024 12:54:04 -0800
Subject: [PATCH 01/61] Add support bundle API skeleton to Nexus

---
 nexus/external-api/output/nexus_tags.txt   |   5 +
 nexus/external-api/src/lib.rs              |  59 +++++
 nexus/src/external_api/http_entrypoints.rs |  99 +++++++
 nexus/types/src/external_api/params.rs     |   1 +
 nexus/types/src/external_api/shared.rs     |  37 +++
 openapi/nexus.json                         | 291 ++++++++++++++++++++-
 6 files changed, 480 insertions(+), 12 deletions(-)

diff --git a/nexus/external-api/output/nexus_tags.txt b/nexus/external-api/output/nexus_tags.txt
index 2d64679ca02..6ac531cf61b 100644
--- a/nexus/external-api/output/nexus_tags.txt
+++ b/nexus/external-api/output/nexus_tags.txt
@@ -30,6 +30,11 @@ probe_create POST /experimental/v1/probes
 probe_delete DELETE /experimental/v1/probes/{probe}
 probe_list GET /experimental/v1/probes
 probe_view GET /experimental/v1/probes/{probe}
+support_bundle_create POST /experimental/v1/system/support-bundles
+support_bundle_delete DELETE /experimental/v1/system/support-bundles/{support_bundle}
+support_bundle_download GET /experimental/v1/system/support-bundles/{support_bundle}/download
+support_bundle_list GET /experimental/v1/system/support-bundles
+support_bundle_view GET /experimental/v1/system/support-bundles/{support_bundle}
 
 API operations found with tag "images"
 OPERATION ID METHOD URL PATH
diff --git a/nexus/external-api/src/lib.rs b/nexus/external-api/src/lib.rs
index b05366e0c56..d7f523a9006 100644
--- a/nexus/external-api/src/lib.rs
+++ b/nexus/external-api/src/lib.rs
@@ -2760,6 +2760,65 @@ pub trait NexusExternalApi {
         path_params: Path,
     ) -> Result;
 
+    // Support bundles (experimental)
+
+    /// List all support bundles
+    #[endpoint {
+        method = GET,
+        path = "/experimental/v1/system/support-bundles",
+        tags = ["hidden"], // system/support-bundles: only one tag is allowed
+    }]
+    async fn support_bundle_list(
+        rqctx: RequestContext<Self::Context>,
+        query_params: Query<PaginatedById>,
+    ) -> Result<HttpResponseOk<ResultsPage<shared::SupportBundleInfo>>, HttpError>;
+
+    /// View a single support bundle
+    #[endpoint {
+        method = GET,
+        path = "/experimental/v1/system/support-bundles/{support_bundle}",
+        tags = ["hidden"], // system/support-bundles: only one tag is allowed
+    }]
+    async fn support_bundle_view(
+        rqctx: RequestContext<Self::Context>,
+        path_params: Path<params::SupportBundlePath>,
+    ) -> Result<HttpResponseOk<shared::SupportBundleInfo>, HttpError>;
+
+    /// Download the contents of a single support bundle
+    #[endpoint {
+        method = GET,
+        path = "/experimental/v1/system/support-bundles/{support_bundle}/download",
+        tags = ["hidden"], // system/support-bundles: only one tag is allowed
+    }]
+    async fn support_bundle_download(
+        rqctx: RequestContext<Self::Context>,
+        path_params: Path<params::SupportBundlePath>,
+    ) -> Result<Response<Body>, HttpError>;
+
+    /// Create a new support bundle
+    #[endpoint {
+        method = POST,
+        path = "/experimental/v1/system/support-bundles",
+        tags = ["hidden"], // system/support-bundles: only one tag is allowed
+    }]
+    async fn support_bundle_create(
+        rqctx: RequestContext<Self::Context>,
+    ) -> Result<HttpResponseOk<shared::SupportBundleInfo>, HttpError>;
+
+    /// Delete an existing support bundle
+    ///
+    /// May also be used to cancel a support bundle which is currently being
+    /// collected, or to remove metadata for a support bundle that has failed.
+    #[endpoint {
+        method = DELETE,
+        path = "/experimental/v1/system/support-bundles/{support_bundle}",
+        tags = ["hidden"], // system/support-bundles: only one tag is allowed
+    }]
+    async fn support_bundle_delete(
+        rqctx: RequestContext<Self::Context>,
+        path_params: Path<params::SupportBundlePath>,
+    ) -> Result<HttpResponseOk<shared::SupportBundleInfo>, HttpError>;
+
     // Probes (experimental)
 
     /// List instrumentation probes
diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs
index df8f1aa24e5..655c56320e5 100644
--- a/nexus/src/external_api/http_entrypoints.rs
+++ b/nexus/src/external_api/http_entrypoints.rs
@@ -5999,6 +5999,105 @@ impl NexusExternalApi for NexusExternalApiImpl {
             .await
     }
 
+    async fn support_bundle_list(
+        rqctx: RequestContext<ApiContext>,
+        _query_params: Query<PaginatedById>,
+    ) -> Result<HttpResponseOk<ResultsPage<shared::SupportBundleInfo>>, HttpError> {
+        let apictx = rqctx.context();
+        let handler = async {
+            let nexus = &apictx.context.nexus;
+
+            let opctx =
+                crate::context::op_context_for_external_api(&rqctx).await?;
+
+            Err(nexus.unimplemented_todo(&opctx, crate::app::Unimpl::Public).await.into())
+        };
+        apictx
+            .context
+            .external_latencies
+            .instrument_dropshot_handler(&rqctx, handler)
+            .await
+    }
+
+    async fn support_bundle_view(
+        rqctx: RequestContext<ApiContext>,
+        _path_params: Path<params::SupportBundlePath>,
+    ) -> Result<HttpResponseOk<shared::SupportBundleInfo>, HttpError> {
+        let apictx = rqctx.context();
+        let handler = async {
+            let nexus = &apictx.context.nexus;
+
+            let opctx =
+                crate::context::op_context_for_external_api(&rqctx).await?;
+
+            Err(nexus.unimplemented_todo(&opctx, crate::app::Unimpl::Public).await.into())
+        };
+        apictx
+            .context
+            .external_latencies
+            .instrument_dropshot_handler(&rqctx, handler)
+            .await
+    }
+
+    async fn support_bundle_download(
+        rqctx: RequestContext<ApiContext>,
+        _path_params: Path<params::SupportBundlePath>,
+    ) -> Result<Response<Body>, HttpError> {
+        let apictx = rqctx.context();
+        let handler = async {
+            let nexus = &apictx.context.nexus;
+
+            let opctx =
+                crate::context::op_context_for_external_api(&rqctx).await?;
+
+            Err(nexus.unimplemented_todo(&opctx, crate::app::Unimpl::Public).await.into())
+        };
+        apictx
+            .context
+            .external_latencies
+            .instrument_dropshot_handler(&rqctx, handler)
+            .await
+    }
+
+    async fn support_bundle_create(
+        rqctx: RequestContext<ApiContext>,
+    ) -> Result<HttpResponseOk<shared::SupportBundleInfo>, HttpError> {
+        let apictx = rqctx.context();
+        let handler = async {
+            let nexus = &apictx.context.nexus;
+
+            let opctx =
+                crate::context::op_context_for_external_api(&rqctx).await?;
+
+            Err(nexus.unimplemented_todo(&opctx, crate::app::Unimpl::Public).await.into())
+        };
+        apictx
+            .context
+            .external_latencies
+            .instrument_dropshot_handler(&rqctx, handler)
+            .await
+    }
+
+    async fn support_bundle_delete(
+        rqctx: RequestContext<ApiContext>,
+        _path_params: Path<params::SupportBundlePath>,
+    ) -> Result<HttpResponseOk<shared::SupportBundleInfo>, HttpError> {
+        let apictx = rqctx.context();
+        let handler = async {
+            let nexus = &apictx.context.nexus;
+
+            let opctx =
+                crate::context::op_context_for_external_api(&rqctx).await?;
+
+            Err(nexus.unimplemented_todo(&opctx, crate::app::Unimpl::Public).await.into())
+        };
+        apictx
+            .context
+            .external_latencies
+            .instrument_dropshot_handler(&rqctx, handler)
+            .await
+    }
+
     async fn probe_list(
         rqctx: RequestContext,
         query_params: Query>,
diff --git a/nexus/types/src/external_api/params.rs b/nexus/types/src/external_api/params.rs
index 9af9d3be1e6..a0bd36ed3cf 100644
--- a/nexus/types/src/external_api/params.rs
+++ b/nexus/types/src/external_api/params.rs
@@ -89,6 +89,7 @@ path_param!(SshKeyPath, ssh_key, "SSH key");
 path_param!(AddressLotPath, address_lot, "address lot");
 path_param!(ProbePath, probe, "probe");
 path_param!(CertificatePath, certificate, "certificate");
+path_param!(SupportBundlePath, 
support_bundle, "support bundle"); id_path_param!(GroupPath, group_id, "group"); diff --git a/nexus/types/src/external_api/shared.rs b/nexus/types/src/external_api/shared.rs index 9bfa9c8358d..35367400e0b 100644 --- a/nexus/types/src/external_api/shared.rs +++ b/nexus/types/src/external_api/shared.rs @@ -6,6 +6,8 @@ use std::net::IpAddr; +use chrono::DateTime; +use chrono::Utc; use omicron_common::api::external::Name; use omicron_common::api::internal::shared::NetworkInterface; use parse_display::FromStr; @@ -414,6 +416,41 @@ mod test { } } +#[derive(Debug, Clone, Copy, JsonSchema, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum SupportBundleState { + /// Support Bundle still actively being collected. + /// + /// This is the initial state for a Support Bundle, and it will + /// automatically transition to either "Failed" or "Active". + /// + /// If a user no longer wants to access a Support Bundle, they can + /// request cancellation, which will transition to the "Cancelling" state. + Collecting, + + /// Support Bundle has been cancelled, and will be deleted shortly. + Cancelling, + + /// Support Bundle was not created successfully, or was created and has lost + /// backing storage. + /// + /// The record of the bundle still exists for readability, but the only + /// valid operation on these bundles is to delete them. + Failed, + + /// Support Bundle has been processed, and is ready for usage. + Active, +} + +#[derive(Debug, Clone, JsonSchema, Serialize, Deserialize)] +pub struct SupportBundleInfo { + // XXX: Strongly-typed Support Bundle UUID? + pub id: Uuid, + pub time_created: DateTime, + pub reason_for_creation: String, + pub state: SupportBundleState, +} + #[derive(Debug, Clone, JsonSchema, Serialize, Deserialize)] pub struct ProbeInfo { pub id: Uuid, diff --git a/openapi/nexus.json b/openapi/nexus.json index f12ff4730cf..6cf143ac49c 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -301,6 +301,195 @@ } } }, + "/experimental/v1/system/support-bundles": { + "get": { + "tags": [ + "hidden" + ], + "summary": "List all support bundles", + "operationId": "support_bundle_list", + "parameters": [ + { + "in": "query", + "name": "limit", + "description": "Maximum number of items returned by a single call", + "schema": { + "nullable": true, + "type": "integer", + "format": "uint32", + "minimum": 1 + } + }, + { + "in": "query", + "name": "page_token", + "description": "Token returned by previous call to retrieve the subsequent page", + "schema": { + "nullable": true, + "type": "string" + } + }, + { + "in": "query", + "name": "sort_by", + "schema": { + "$ref": "#/components/schemas/IdSortMode" + } + } + ], + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SupportBundleInfoResultsPage" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + }, + "x-dropshot-pagination": { + "required": [] + } + }, + "post": { + "tags": [ + "hidden" + ], + "summary": "Create a new support bundle", + "operationId": "support_bundle_create", + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SupportBundleInfo" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + 
"/experimental/v1/system/support-bundles/{support_bundle}": { + "get": { + "tags": [ + "hidden" + ], + "summary": "View a single support bundle", + "operationId": "support_bundle_view", + "parameters": [ + { + "in": "path", + "name": "support_bundle", + "description": "Name or ID of the support bundle", + "required": true, + "schema": { + "$ref": "#/components/schemas/NameOrId" + } + } + ], + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SupportBundleInfo" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + }, + "delete": { + "tags": [ + "hidden" + ], + "summary": "Delete an existing support bundle", + "description": "May also be used to cancel a support bundle which is currently being collected, or to remove metadata for a support bundle that has failed.", + "operationId": "support_bundle_delete", + "parameters": [ + { + "in": "path", + "name": "support_bundle", + "description": "Name or ID of the support bundle", + "required": true, + "schema": { + "$ref": "#/components/schemas/NameOrId" + } + } + ], + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SupportBundleInfo" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/experimental/v1/system/support-bundles/{support_bundle}/download": { + "get": { + "tags": [ + "hidden" + ], + "summary": "Download the contents of a single support bundle", + "operationId": "support_bundle_download", + "parameters": [ + { + "in": "path", + "name": "support_bundle", + "description": "Name or ID of the support bundle", + "required": true, + "schema": { + "$ref": "#/components/schemas/NameOrId" + } + } + ], + "responses": { + "default": { + "description": "", + "content": { + "*/*": { + "schema": {} + } + } + } + } + } + }, "/login/{silo_name}/saml/{provider_name}": { "post": { "tags": [ @@ -19990,6 +20179,84 @@ "items" ] }, + "SupportBundleInfo": { + "type": "object", + "properties": { + "id": { + "type": "string", + "format": "uuid" + }, + "reason_for_creation": { + "type": "string" + }, + "state": { + "$ref": "#/components/schemas/SupportBundleState" + }, + "time_created": { + "type": "string", + "format": "date-time" + } + }, + "required": [ + "id", + "reason_for_creation", + "state", + "time_created" + ] + }, + "SupportBundleInfoResultsPage": { + "description": "A single page of results", + "type": "object", + "properties": { + "items": { + "description": "list of items on this page of results", + "type": "array", + "items": { + "$ref": "#/components/schemas/SupportBundleInfo" + } + }, + "next_page": { + "nullable": true, + "description": "token used to fetch the next page of results (if any)", + "type": "string" + } + }, + "required": [ + "items" + ] + }, + "SupportBundleState": { + "oneOf": [ + { + "description": "Support Bundle still actively being collected.\n\nThis is the initial state for a Support Bundle, and it will automatically transition to either \"Failed\" or \"Active\".\n\nIf a user no longer wants to access a Support Bundle, they can request cancellation, which will transition to the \"Cancelling\" state.", + "type": "string", + "enum": [ + "collecting" + ] + }, + { + "description": "Support Bundle has been cancelled, and will be deleted shortly.", + "type": 
"string", + "enum": [ + "cancelling" + ] + }, + { + "description": "Support Bundle was not created successfully, or was created and has lost backing storage.\n\nThe record of the bundle still exists for readability, but the only valid operation on these bundles is to delete them.", + "type": "string", + "enum": [ + "failed" + ] + }, + { + "description": "Support Bundle has been processed, and is ready for usage.", + "type": "string", + "enum": [ + "active" + ] + } + ] + }, "Switch": { "description": "An operator's view of a Switch.", "type": "object", @@ -22440,6 +22707,18 @@ } ] }, + "IdSortMode": { + "description": "Supported set of sort modes for scanning by id only.\n\nCurrently, we only support scanning in ascending order.", + "oneOf": [ + { + "description": "sort in increasing order of \"id\"", + "type": "string", + "enum": [ + "id_ascending" + ] + } + ] + }, "DiskMetricName": { "type": "string", "enum": [ @@ -22459,18 +22738,6 @@ "descending" ] }, - "IdSortMode": { - "description": "Supported set of sort modes for scanning by id only.\n\nCurrently, we only support scanning in ascending order.", - "oneOf": [ - { - "description": "sort in increasing order of \"id\"", - "type": "string", - "enum": [ - "id_ascending" - ] - } - ] - }, "SystemMetricName": { "type": "string", "enum": [ From 3774ecbe97839af6fba78c7a55d8ec5c21eb4d62 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 7 Nov 2024 12:57:50 -0800 Subject: [PATCH 02/61] fmt --- nexus/src/external_api/http_entrypoints.rs | 28 +++++++++++++++++----- 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index 655c56320e5..3f22b958118 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -6002,7 +6002,8 @@ impl NexusExternalApi for NexusExternalApiImpl { async fn support_bundle_list( rqctx: RequestContext, _query_params: Query, - ) -> Result>, HttpError> { + ) -> Result>, HttpError> + { let apictx = rqctx.context(); let handler = async { let nexus = &apictx.context.nexus; @@ -6010,7 +6011,10 @@ impl NexusExternalApi for NexusExternalApiImpl { let opctx = crate::context::op_context_for_external_api(&rqctx).await?; - Err(nexus.unimplemented_todo(&opctx, crate::app::Unimpl::Public).await.into()) + Err(nexus + .unimplemented_todo(&opctx, crate::app::Unimpl::Public) + .await + .into()) }; apictx .context @@ -6030,7 +6034,10 @@ impl NexusExternalApi for NexusExternalApiImpl { let opctx = crate::context::op_context_for_external_api(&rqctx).await?; - Err(nexus.unimplemented_todo(&opctx, crate::app::Unimpl::Public).await.into()) + Err(nexus + .unimplemented_todo(&opctx, crate::app::Unimpl::Public) + .await + .into()) }; apictx .context @@ -6050,7 +6057,10 @@ impl NexusExternalApi for NexusExternalApiImpl { let opctx = crate::context::op_context_for_external_api(&rqctx).await?; - Err(nexus.unimplemented_todo(&opctx, crate::app::Unimpl::Public).await.into()) + Err(nexus + .unimplemented_todo(&opctx, crate::app::Unimpl::Public) + .await + .into()) }; apictx .context @@ -6069,7 +6079,10 @@ impl NexusExternalApi for NexusExternalApiImpl { let opctx = crate::context::op_context_for_external_api(&rqctx).await?; - Err(nexus.unimplemented_todo(&opctx, crate::app::Unimpl::Public).await.into()) + Err(nexus + .unimplemented_todo(&opctx, crate::app::Unimpl::Public) + .await + .into()) }; apictx .context @@ -6089,7 +6102,10 @@ impl NexusExternalApi for NexusExternalApiImpl { let opctx = 
crate::context::op_context_for_external_api(&rqctx).await?; - Err(nexus.unimplemented_todo(&opctx, crate::app::Unimpl::Public).await.into()) + Err(nexus + .unimplemented_todo(&opctx, crate::app::Unimpl::Public) + .await + .into()) }; apictx .context From 0b3fd29b97db396a3e937c39ad2d83c3ade22797 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 7 Nov 2024 14:54:57 -0800 Subject: [PATCH 03/61] EXPECTORATE --- nexus/reconfigurator/planning/src/system.rs | 6 +++--- nexus/tests/output/uncovered-authz-endpoints.txt | 5 +++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/nexus/reconfigurator/planning/src/system.rs b/nexus/reconfigurator/planning/src/system.rs index 66a53932757..28b8bcf2ae2 100644 --- a/nexus/reconfigurator/planning/src/system.rs +++ b/nexus/reconfigurator/planning/src/system.rs @@ -845,9 +845,9 @@ impl Sled { name: config.name.full_name(), available: ByteCount::from_gibibytes_u32(1), used: ByteCount::from_gibibytes_u32(0), - quota: config.quota, - reservation: config.reservation, - compression: config.compression.to_string(), + quota: config.inner.quota, + reservation: config.inner.reservation, + compression: config.inner.compression.to_string(), }); } diff --git a/nexus/tests/output/uncovered-authz-endpoints.txt b/nexus/tests/output/uncovered-authz-endpoints.txt index c5091c5a3bc..ec4790c071d 100644 --- a/nexus/tests/output/uncovered-authz-endpoints.txt +++ b/nexus/tests/output/uncovered-authz-endpoints.txt @@ -1,13 +1,18 @@ API endpoints with no coverage in authz tests: probe_delete (delete "/experimental/v1/probes/{probe}") +support_bundle_delete (delete "/experimental/v1/system/support-bundles/{support_bundle}") probe_list (get "/experimental/v1/probes") probe_view (get "/experimental/v1/probes/{probe}") +support_bundle_list (get "/experimental/v1/system/support-bundles") +support_bundle_view (get "/experimental/v1/system/support-bundles/{support_bundle}") +support_bundle_download (get "/experimental/v1/system/support-bundles/{support_bundle}/download") ping (get "/v1/ping") networking_switch_port_status (get "/v1/system/hardware/switch-port/{port}/status") device_auth_request (post "/device/auth") device_auth_confirm (post "/device/confirm") device_access_token (post "/device/token") probe_create (post "/experimental/v1/probes") +support_bundle_create (post "/experimental/v1/system/support-bundles") login_saml (post "/login/{silo_name}/saml/{provider_name}") login_local (post "/v1/login/{silo_name}/local") logout (post "/v1/logout") From ec897f77162dfe4884bf5acc480c019d7715e54e Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 8 Nov 2024 15:30:20 -0800 Subject: [PATCH 04/61] Add database schema for support bundles --- nexus/db-model/src/lib.rs | 2 + nexus/db-model/src/schema.rs | 14 +++++ nexus/db-model/src/support_bundle.rs | 77 ++++++++++++++++++++++++++++ schema/crdb/dbinit.sql | 28 ++++++++++ 4 files changed, 121 insertions(+) create mode 100644 nexus/db-model/src/support_bundle.rs diff --git a/nexus/db-model/src/lib.rs b/nexus/db-model/src/lib.rs index 9137b0b3c3a..546f53b1ac0 100644 --- a/nexus/db-model/src/lib.rs +++ b/nexus/db-model/src/lib.rs @@ -96,6 +96,7 @@ mod sled_state; mod sled_underlay_subnet_allocation; mod snapshot; mod ssh_key; +mod support_bundle; mod switch; mod tuf_repo; mod typed_uuid; @@ -203,6 +204,7 @@ pub use sled_state::*; pub use sled_underlay_subnet_allocation::*; pub use snapshot::*; pub use ssh_key::*; +pub use support_bundle::*; pub use switch::*; pub use switch_interface::*; pub use switch_port::*; diff --git 
a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index 8a97e436398..56f1e7e8acd 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -1362,6 +1362,20 @@ joinable!(tuf_repo_artifact -> tuf_repo (tuf_repo_id)); // Can't specify joinable for a composite primary key (tuf_repo_artifact -> // tuf_artifact). +table! { + support_bundle { + id -> Uuid, + time_created -> Timestamptz, + reason_for_creation -> Text, + reason_for_failure -> Nullable, + state -> crate::SupportBundleStateEnum, + zpool_id -> Uuid, + dataset_id -> Uuid, + + assigned_nexus -> Nullable, + } +} + /* hardware inventory */ table! { diff --git a/nexus/db-model/src/support_bundle.rs b/nexus/db-model/src/support_bundle.rs new file mode 100644 index 00000000000..56ebbf2e2f2 --- /dev/null +++ b/nexus/db-model/src/support_bundle.rs @@ -0,0 +1,77 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use super::impl_enum_type; +use crate::schema::support_bundle; +use crate::typed_uuid::DbTypedUuid; + +use chrono::{DateTime, Utc}; +use nexus_types::external_api::shared::SupportBundleInfo as SupportBundleView; +use nexus_types::external_api::shared::SupportBundleState as SupportBundleStateView; +use omicron_uuid_kinds::DatasetKind; +use omicron_uuid_kinds::ZpoolKind; +use serde::{Deserialize, Serialize}; +use uuid::Uuid; + +impl_enum_type!( + #[derive(SqlType, Debug, QueryId)] + #[diesel(postgres_type(name = "support_bundle_state", schema = "public"))] + pub struct SupportBundleStateEnum; + + #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq)] + #[diesel(sql_type = SupportBundleStateEnum)] + pub enum SupportBundleState; + + // Enum values + Collecting => b"collecting" + Cancelling => b"cancelling" + Failed => b"failed" + Active => b"active" +); + +impl From for SupportBundleStateView { + fn from(state: SupportBundleState) -> Self { + use SupportBundleState::*; + + match state { + Collecting => SupportBundleStateView::Collecting, + Cancelling => SupportBundleStateView::Cancelling, + Failed => SupportBundleStateView::Failed, + Active => SupportBundleStateView::Active, + } + } +} + +#[derive( + Queryable, + Insertable, + Debug, + Clone, + Selectable, + Deserialize, + Serialize, + PartialEq, +)] +#[diesel(table_name = support_bundle)] +pub struct SupportBundle { + id: Uuid, + time_created: DateTime, + reason_for_creation: String, + reason_for_failure: Option, + state: SupportBundleState, + zpool_id: DbTypedUuid, + dataset_id: DbTypedUuid, + assigned_nexus: Option, +} + +impl From for SupportBundleView { + fn from(bundle: SupportBundle) -> Self { + Self { + id: bundle.id, + time_created: bundle.time_created, + reason_for_creation: bundle.reason_for_creation, + state: bundle.state.into(), + } + } +} diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index b095c4c89a9..12883554630 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -2384,6 +2384,34 @@ CREATE TABLE IF NOT EXISTS omicron.public.tuf_repo_artifact ( /*******************************************************************/ +/* + * Support Bundles + */ + + +CREATE TYPE IF NOT EXISTS omicron.public.support_bundle_state AS ENUM ( + 'collecting', + 'cancelling', + 'failed', + 'active' +); + +CREATE TABLE IF NOT EXISTS omicron.public.support_bundle ( + id UUID NOT NULL, + time_created TIMESTAMPTZ NOT NULL, + reason_for_creation TEXT 
NOT NULL, + reason_for_failure TEXT, + state omicron.public.support_bundle_state NOT NULL, + zpool_id NOT NULL UUID, + dataset_id NOT NULL UUID, + + -- The Nexus which is in charge of collecting the support bundle, + -- and later managing its storage. + assigned_nexus UUID, +); + +/*******************************************************************/ + /* * DNS Propagation * From 38b6ea86de4452c0a831641a2c6bf8b7b5b59cff Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 8 Nov 2024 15:47:55 -0800 Subject: [PATCH 05/61] Add failure reason --- nexus/types/src/external_api/shared.rs | 1 + openapi/nexus.json | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/nexus/types/src/external_api/shared.rs b/nexus/types/src/external_api/shared.rs index 35367400e0b..2928fab46bc 100644 --- a/nexus/types/src/external_api/shared.rs +++ b/nexus/types/src/external_api/shared.rs @@ -448,6 +448,7 @@ pub struct SupportBundleInfo { pub id: Uuid, pub time_created: DateTime, pub reason_for_creation: String, + pub reason_for_failure: Option, pub state: SupportBundleState, } diff --git a/openapi/nexus.json b/openapi/nexus.json index 6cf143ac49c..1aaba49e9eb 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -20189,6 +20189,10 @@ "reason_for_creation": { "type": "string" }, + "reason_for_failure": { + "nullable": true, + "type": "string" + }, "state": { "$ref": "#/components/schemas/SupportBundleState" }, From 908f661a755cb7a1bf7ea4c159c0c1b830c1d809 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 8 Nov 2024 15:48:44 -0800 Subject: [PATCH 06/61] Merge reason for failure --- nexus/db-model/src/support_bundle.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/nexus/db-model/src/support_bundle.rs b/nexus/db-model/src/support_bundle.rs index 56ebbf2e2f2..aedcdbf760e 100644 --- a/nexus/db-model/src/support_bundle.rs +++ b/nexus/db-model/src/support_bundle.rs @@ -71,6 +71,7 @@ impl From for SupportBundleView { id: bundle.id, time_created: bundle.time_created, reason_for_creation: bundle.reason_for_creation, + reason_for_failure: bundle.reason_for_failure, state: bundle.state.into(), } } From aa3978f4cedac5ce787ce851f0c0c456e7f74021 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 12 Nov 2024 12:03:56 -0800 Subject: [PATCH 07/61] Fix SQL, moving forward on tests --- nexus/db-model/src/schema.rs | 1 + nexus/db-model/src/support_bundle.rs | 63 +- nexus/db-queries/src/db/datastore/dataset.rs | 25 +- nexus/db-queries/src/db/datastore/mod.rs | 30 + .../src/db/datastore/support_bundle.rs | 855 ++++++++++++++++++ nexus/types/src/external_api/shared.rs | 4 +- schema/crdb/dbinit.sql | 39 +- uuid-kinds/src/lib.rs | 1 + 8 files changed, 978 insertions(+), 40 deletions(-) create mode 100644 nexus/db-queries/src/db/datastore/support_bundle.rs diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index 56f1e7e8acd..34a29e7297e 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -2042,6 +2042,7 @@ allow_tables_to_appear_in_same_query!( console_session, sled, sled_resource, + support_bundle, router_route, vmm, volume, diff --git a/nexus/db-model/src/support_bundle.rs b/nexus/db-model/src/support_bundle.rs index aedcdbf760e..f2450473f54 100644 --- a/nexus/db-model/src/support_bundle.rs +++ b/nexus/db-model/src/support_bundle.rs @@ -10,9 +10,14 @@ use chrono::{DateTime, Utc}; use nexus_types::external_api::shared::SupportBundleInfo as SupportBundleView; use nexus_types::external_api::shared::SupportBundleState as SupportBundleStateView; use omicron_uuid_kinds::DatasetKind; 
+use omicron_uuid_kinds::DatasetUuid; +use omicron_uuid_kinds::OmicronZoneKind; +use omicron_uuid_kinds::OmicronZoneUuid; +use omicron_uuid_kinds::SupportBundleKind; +use omicron_uuid_kinds::SupportBundleUuid; use omicron_uuid_kinds::ZpoolKind; +use omicron_uuid_kinds::ZpoolUuid; use serde::{Deserialize, Serialize}; -use uuid::Uuid; impl_enum_type!( #[derive(SqlType, Debug, QueryId)] @@ -25,17 +30,35 @@ impl_enum_type!( // Enum values Collecting => b"collecting" + Collected => b"collected" Cancelling => b"cancelling" Failed => b"failed" Active => b"active" ); +impl SupportBundleState { + pub fn might_have_dataset_storage(&self) -> bool { + use SupportBundleState::*; + + match self { + Collecting => false, + Collected => true, + Cancelling => true, + Failed => false, + Active => true, + } + } +} + impl From for SupportBundleStateView { fn from(state: SupportBundleState) -> Self { use SupportBundleState::*; match state { Collecting => SupportBundleStateView::Collecting, + // The distinction between "collected" and "collecting" is an + // internal detail that doesn't need to be exposed through the API. + Collected => SupportBundleStateView::Collecting, Cancelling => SupportBundleStateView::Cancelling, Failed => SupportBundleStateView::Failed, Active => SupportBundleStateView::Active, @@ -55,20 +78,40 @@ impl From for SupportBundleStateView { )] #[diesel(table_name = support_bundle)] pub struct SupportBundle { - id: Uuid, - time_created: DateTime, - reason_for_creation: String, - reason_for_failure: Option, - state: SupportBundleState, - zpool_id: DbTypedUuid, - dataset_id: DbTypedUuid, - assigned_nexus: Option, + pub id: DbTypedUuid, + pub time_created: DateTime, + pub reason_for_creation: String, + pub reason_for_failure: Option, + pub state: SupportBundleState, + pub zpool_id: DbTypedUuid, + pub dataset_id: DbTypedUuid, + pub assigned_nexus: Option>, +} + +impl SupportBundle { + pub fn new( + reason_for_creation: &'static str, + zpool_id: ZpoolUuid, + dataset_id: DatasetUuid, + nexus_id: OmicronZoneUuid, + ) -> Self { + Self { + id: SupportBundleUuid::new_v4().into(), + time_created: Utc::now(), + reason_for_creation: reason_for_creation.to_string(), + reason_for_failure: None, + state: SupportBundleState::Collecting, + zpool_id: zpool_id.into(), + dataset_id: dataset_id.into(), + assigned_nexus: Some(nexus_id.into()), + } + } } impl From for SupportBundleView { fn from(bundle: SupportBundle) -> Self { Self { - id: bundle.id, + id: bundle.id.into(), time_created: bundle.time_created, reason_for_creation: bundle.reason_for_creation, reason_for_failure: bundle.reason_for_failure, diff --git a/nexus/db-queries/src/db/datastore/dataset.rs b/nexus/db-queries/src/db/datastore/dataset.rs index 996f105254c..e5a72c5fb33 100644 --- a/nexus/db-queries/src/db/datastore/dataset.rs +++ b/nexus/db-queries/src/db/datastore/dataset.rs @@ -347,14 +347,13 @@ impl DataStore { #[cfg(test)] mod test { use super::*; + use crate::db::datastore::test::bp_insert_and_make_target; use crate::db::pub_test_utils::TestDatabase; use nexus_db_model::Generation; use nexus_db_model::SledBaseboard; use nexus_db_model::SledSystemHardware; use nexus_db_model::SledUpdate; use nexus_reconfigurator_planning::blueprint_builder::BlueprintBuilder; - use nexus_types::deployment::Blueprint; - use nexus_types::deployment::BlueprintTarget; use omicron_common::api::internal::shared::DatasetKind as ApiDatasetKind; use omicron_test_utils::dev; use omicron_uuid_kinds::SledUuid; @@ -516,28 +515,6 @@ mod test { logctx.cleanup_successful(); } - 
async fn bp_insert_and_make_target( - opctx: &OpContext, - datastore: &DataStore, - bp: &Blueprint, - ) { - datastore - .blueprint_insert(opctx, bp) - .await - .expect("inserted blueprint"); - datastore - .blueprint_target_set_current( - opctx, - BlueprintTarget { - target_id: bp.id, - enabled: true, - time_made_target: Utc::now(), - }, - ) - .await - .expect("made blueprint the target"); - } - fn new_dataset_on(zpool_id: ZpoolUuid) -> Dataset { Dataset::new( Uuid::new_v4(), diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 5344ffc1d9a..c0b31ed2eff 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -93,6 +93,7 @@ mod sled; mod sled_instance; mod snapshot; mod ssh_key; +mod support_bundle; mod switch; mod switch_interface; mod switch_port; @@ -465,6 +466,8 @@ mod test { use nexus_db_fixed_data::silo::DEFAULT_SILO; use nexus_db_model::IpAttachState; use nexus_db_model::{to_db_typed_uuid, Generation}; + use nexus_types::deployment::Blueprint; + use nexus_types::deployment::BlueprintTarget; use nexus_types::external_api::params; use nexus_types::silo::DEFAULT_SILO_ID; use omicron_common::api::external::{ @@ -503,6 +506,33 @@ mod test { } } + /// Inserts a blueprint in the DB and forcibly makes it the target + /// + /// WARNING: This makes no attempts to validate the blueprint relative to + /// parents -- this is just a test-only helper to make testing + /// blueprint-specific checks easier. + pub async fn bp_insert_and_make_target( + opctx: &OpContext, + datastore: &DataStore, + bp: &Blueprint, + ) { + datastore + .blueprint_insert(opctx, bp) + .await + .expect("inserted blueprint"); + datastore + .blueprint_target_set_current( + opctx, + BlueprintTarget { + target_id: bp.id, + enabled: true, + time_made_target: Utc::now(), + }, + ) + .await + .expect("made blueprint the target"); + } + #[tokio::test] async fn test_project_creation() { let logctx = dev::test_setup_log("test_project_creation"); diff --git a/nexus/db-queries/src/db/datastore/support_bundle.rs b/nexus/db-queries/src/db/datastore/support_bundle.rs new file mode 100644 index 00000000000..d289127f4b7 --- /dev/null +++ b/nexus/db-queries/src/db/datastore/support_bundle.rs @@ -0,0 +1,855 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! [`DataStore`] methods on [`SupportBundle`]s. 
+ +use super::DataStore; +use crate::authz; +use crate::context::OpContext; +use crate::db; +use crate::db::error::public_error_from_diesel; +use crate::db::error::ErrorHandler; +use crate::db::model::Dataset; +use crate::db::model::DatasetKind; +use crate::db::model::SupportBundle; +use crate::db::model::SupportBundleState; +use crate::db::pagination::paginated; +use crate::transaction_retry::OptionalError; +use async_bb8_diesel::AsyncRunQueryDsl; +use diesel::prelude::*; +use futures::FutureExt; +use nexus_types::identity::Asset; +use omicron_common::api::external; +use omicron_common::api::external::CreateResult; +use omicron_common::api::external::DataPageParams; +use omicron_common::api::external::Error; +use omicron_common::api::external::ListResultVec; +use omicron_common::api::external::LookupResult; +use omicron_uuid_kinds::BlueprintUuid; +use omicron_uuid_kinds::DatasetUuid; +use omicron_uuid_kinds::GenericUuid; +use omicron_uuid_kinds::OmicronZoneUuid; +use omicron_uuid_kinds::SupportBundleUuid; +use omicron_uuid_kinds::ZpoolUuid; +use uuid::Uuid; + +const CANNOT_ALLOCATE_ERR_MSG: &'static str = +"Current policy limits support bundle creation to 'one per external disk', and \ + no disks are available. Either delete old support bundles or add additional \ + disks"; + +/// Provides a report on how many bundle were expunged, and why. +#[derive(Debug, Clone)] +pub struct SupportBundleExpungementReport { + pub bundles_failed_missing_datasets: usize, + pub bundles_cancelled_missing_nexus: usize, + pub bundles_failed_missing_nexus: usize, +} + +impl DataStore { + /// Creates a new support bundle. + /// + /// Requires that the UUID of the calling Nexus be supplied as input - + /// this particular Zone is responsible for the collection process. + /// + /// Note that really any functioning Nexus would work as the "assignee", + /// but it's clear that our instance will work, because we're currently + /// running. + pub async fn support_bundle_create( + &self, + opctx: &OpContext, + reason_for_creation: &'static str, + this_nexus_id: OmicronZoneUuid, + ) -> CreateResult { + opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; + let conn = self.pool_connection_authorized(opctx).await?; + + #[derive(Debug)] + enum SupportBundleError { + TooManyBundles, + } + + let err = OptionalError::new(); + self.transaction_retry_wrapper("support_bundle_create") + .transaction(&conn, |conn| { + let err = err.clone(); + + async move { + use db::schema::dataset::dsl as dataset_dsl; + use db::schema::support_bundle::dsl as support_bundle_dsl; + + // Observe all "non-deleted, debug datasets". + // + // Return the first one we find that doesn't already + // have a support bundle allocated to it. + let free_dataset = dataset_dsl::dataset + .filter(dataset_dsl::time_deleted.is_null()) + .filter(dataset_dsl::kind.eq(DatasetKind::Debug)) + .left_join(support_bundle_dsl::support_bundle.on( + dataset_dsl::id.eq(support_bundle_dsl::dataset_id), + )) + .filter(support_bundle_dsl::dataset_id.is_null()) + .select(Dataset::as_select()) + .first_async(&conn) + .await + .optional()?; + + let Some(dataset) = free_dataset else { + return Err( + err.bail(SupportBundleError::TooManyBundles) + ); + }; + + // We could check that "this_nexus_id" is not expunged, but + // we have some evidence that it is valid: this Nexus is + // currently running! + // + // Besides, we COULD be expunged immediately after inserting + // the SupportBundle. 
In this case, we'd fall back to the + // case of "clean up a bundle which is managed by an + // expunged Nexus" anyway. + + let bundle = SupportBundle::new( + reason_for_creation, + ZpoolUuid::from_untyped_uuid(dataset.pool_id), + DatasetUuid::from_untyped_uuid(dataset.id()), + this_nexus_id, + ); + + diesel::insert_into(support_bundle_dsl::support_bundle) + .values(bundle.clone()) + .execute_async(&conn) + .await?; + + Ok(bundle) + } + }) + .await + .map_err(|e| { + if let Some(err) = err.take() { + match err { + SupportBundleError::TooManyBundles => { + return external::Error::insufficient_capacity( + CANNOT_ALLOCATE_ERR_MSG, + "Support Bundle storage exhausted", + ); + } + } + } + public_error_from_diesel(e, ErrorHandler::Server) + }) + } + + /// Looks up a single support bundle + pub async fn support_bundle_get( + &self, + opctx: &OpContext, + id: SupportBundleUuid, + ) -> LookupResult { + opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; + use db::schema::support_bundle::dsl; + + let conn = self.pool_connection_authorized(opctx).await?; + dsl::support_bundle + .filter(dsl::id.eq(id.into_untyped_uuid())) + .select(SupportBundle::as_select()) + .first_async::(&*conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + + /// Lists one page of support bundles + pub async fn support_bundle_list( + &self, + opctx: &OpContext, + pagparams: &DataPageParams<'_, Uuid>, + ) -> ListResultVec { + opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; + use db::schema::support_bundle::dsl; + + let conn = self.pool_connection_authorized(opctx).await?; + paginated(dsl::support_bundle, dsl::id, pagparams) + .select(SupportBundle::as_select()) + .load_async(&*conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + + /// Marks support bundles as failed if their assigned Nexus or backing + /// dataset has been destroyed. 
+ pub async fn support_bundle_fail_expunged( + &self, + opctx: &OpContext, + blueprint: &nexus_types::deployment::Blueprint, + ) -> Result { + opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; + + // For this blueprint: The set of all expunged Nexus zones + let invalid_nexus_zones = blueprint + .all_omicron_zones( + nexus_types::deployment::BlueprintZoneFilter::Expunged, + ) + .filter_map(|(_sled, zone)| { + if matches!( + zone.zone_type, + nexus_types::deployment::BlueprintZoneType::Nexus(_) + ) { + Some(zone.id.into_untyped_uuid()) + } else { + None + } + }) + .collect::>(); + let valid_nexus_zones = blueprint + .all_omicron_zones( + nexus_types::deployment::BlueprintZoneFilter::ShouldBeRunning, + ) + .filter_map(|(_sled, zone)| { + if matches!( + zone.zone_type, + nexus_types::deployment::BlueprintZoneType::Nexus(_) + ) { + Some(zone.id.into_untyped_uuid()) + } else { + None + } + }) + .collect::>(); + + // For this blueprint: The set of expunged debug datasets + let invalid_datasets = blueprint + .all_omicron_datasets( + nexus_types::deployment::BlueprintDatasetFilter::Expunged, + ) + .filter_map(|dataset_config| { + if matches!( + dataset_config.kind, + omicron_common::api::internal::shared::DatasetKind::Debug + ) { + Some(dataset_config.id.into_untyped_uuid()) + } else { + None + } + }) + .collect::>(); + + let conn = self.pool_connection_authorized(opctx).await?; + + self.transaction_if_current_blueprint_is( + &conn, + "support_bundle_fail_expunged", + opctx, + BlueprintUuid::from_untyped_uuid(blueprint.id), + |conn| { + let invalid_nexus_zones = invalid_nexus_zones.clone(); + let valid_nexus_zones = valid_nexus_zones.clone(); + let invalid_datasets = invalid_datasets.clone(); + async move { + use db::schema::support_bundle::dsl; + + // Find all bundles on datasets that no longer exist, and + // mark them "failed". + let bundles_with_bad_datasets = dsl::support_bundle + .filter(dsl::dataset_id.eq_any(invalid_datasets)) + .select(SupportBundle::as_select()) + .load_async(&*conn) + .await?; + + let bundles_failed_missing_datasets = + diesel::update(dsl::support_bundle) + .filter(dsl::state.ne(SupportBundleState::Failed)) + .filter( + dsl::id.eq_any( + bundles_with_bad_datasets + .iter() + .map(|b| b.id) + .collect::>(), + ), + ) + .set(( + dsl::state.eq(SupportBundleState::Failed), + dsl::reason_for_failure + .eq("Allocated dataset no longer exists"), + )) + .execute_async(&*conn) + .await?; + + let Some(arbitrary_valid_nexus) = + valid_nexus_zones.get(0).cloned() + else { + return Err(external::Error::internal_error( + "No valid Nexuses", + ) + .into()); + }; + + // Find all bundles on nexuses that no longer exist. + // + // If the bundle might have storage: + // - Mark it cancelled and re-assign the managing Nexus + // - Otherwise: mark it failed + let bundles_with_bad_nexuses = dsl::support_bundle + .filter(dsl::assigned_nexus.eq_any(invalid_nexus_zones)) + .select(SupportBundle::as_select()) + .load_async(&*conn) + .await?; + let (needs_cancellation, needs_failure): (Vec<_>, Vec<_>) = + bundles_with_bad_nexuses.into_iter().partition( + |bundle| bundle.state.might_have_dataset_storage(), + ); + + // Mark these support bundles as cancelled, and assign then + // to a nexus that should still exist. + // + // This should lead to their storage being freed, if it + // exists. 
+ let bundles_cancelled_missing_nexus = diesel::update( + dsl::support_bundle, + ) + .filter(dsl::state.ne(SupportBundleState::Cancelling)) + .filter(dsl::state.ne(SupportBundleState::Failed)) + .filter( + dsl::id.eq_any( + needs_cancellation + .iter() + .map(|b| b.id) + .collect::>(), + ), + ) + .set(( + dsl::assigned_nexus.eq(arbitrary_valid_nexus), + dsl::state.eq(SupportBundleState::Cancelling), + dsl::reason_for_failure + .eq("Nexus managing this bundle no longer exists"), + )) + .execute_async(&*conn) + .await?; + + // Mark these support bundles as failed. + // + // If they don't have storage (e.g., they never got that + // far, or their underlying dataset was expunged) there + // isn't anything left to free. This means we can skip the + // "Cancelling" state and jump to "Failed". + let bundles_failed_missing_nexus = diesel::update( + dsl::support_bundle, + ) + .filter(dsl::state.ne(SupportBundleState::Cancelling)) + .filter(dsl::state.ne(SupportBundleState::Failed)) + .filter(dsl::id.eq_any( + needs_failure.iter().map(|b| b.id).collect::>(), + )) + .set(( + dsl::state.eq(SupportBundleState::Failed), + dsl::reason_for_failure + .eq("Nexus managing this bundle no longer exists"), + )) + .execute_async(&*conn) + .await?; + + Ok(SupportBundleExpungementReport { + bundles_failed_missing_datasets, + bundles_cancelled_missing_nexus, + bundles_failed_missing_nexus, + }) + } + .boxed() + }, + ) + .await + } + + /// Cancels a single support bundle. + /// + /// Note that this does not delete the support bundle record, but starts the + /// process by which it may later be deleted. + pub async fn support_bundle_cancel( + &self, + opctx: &OpContext, + id: SupportBundleUuid, + ) -> Result<(), Error> { + opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; + + use db::schema::support_bundle::dsl as support_bundle_dsl; + + let conn = self.pool_connection_authorized(opctx).await?; + diesel::update(support_bundle_dsl::support_bundle) + .filter(support_bundle_dsl::id.eq(id.into_untyped_uuid())) + .set(support_bundle_dsl::state.eq(SupportBundleState::Cancelling)) + .execute_async(&*conn) + .await + .map(|_rows_modified| ()) + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + + Ok(()) + } + + /// Deletes a support bundle. + /// + /// This should only be invoked after all storage for the support bundle has + /// been cleared. 
+ pub async fn support_bundle_delete( + &self, + opctx: &OpContext, + id: SupportBundleUuid, + ) -> Result<(), Error> { + opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; + + use db::schema::support_bundle::dsl as support_bundle_dsl; + + let conn = self.pool_connection_authorized(opctx).await?; + diesel::delete(support_bundle_dsl::support_bundle) + .filter(support_bundle_dsl::id.eq(id.into_untyped_uuid())) + .execute_async(&*conn) + .await + .map(|_rows_modified| ()) + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + + Ok(()) + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::db::datastore::test::bp_insert_and_make_target; + use crate::db::pub_test_utils::TestDatabase; + use nexus_db_model::Generation; + use nexus_db_model::SledBaseboard; + use nexus_db_model::SledSystemHardware; + use nexus_db_model::SledUpdate; + use nexus_db_model::Zpool; + use nexus_reconfigurator_planning::example::ExampleSystemBuilder; + use nexus_reconfigurator_planning::example::SimRngState; + use nexus_types::deployment::Blueprint; + use nexus_types::deployment::BlueprintDatasetDisposition; + use nexus_types::deployment::BlueprintDatasetFilter; + use nexus_types::deployment::BlueprintZoneFilter; + use nexus_types::deployment::BlueprintZoneType; + use omicron_common::api::internal::shared::DatasetKind::Debug as DebugDatasetKind; + use omicron_test_utils::dev; + use omicron_uuid_kinds::SledUuid; + use rand::Rng; + + // Pool/Dataset pairs, for debug datasets only. + struct TestPool { + pool: ZpoolUuid, + dataset: DatasetUuid, + } + + // Sleds and their pools, with a focus on debug datasets only. + struct TestSled { + sled: SledUuid, + pools: Vec, + } + + impl TestSled { + fn new_with_pool_count(pool_count: usize) -> Self { + Self { + sled: SledUuid::new_v4(), + pools: (0..pool_count) + .map(|_| TestPool { + pool: ZpoolUuid::new_v4(), + dataset: DatasetUuid::new_v4(), + }) + .collect(), + } + } + + fn new_from_blueprint(blueprint: &Blueprint) -> Vec { + let mut sleds = vec![]; + for (sled, datasets) in &blueprint.blueprint_datasets { + let pools = datasets + .datasets + .values() + .filter_map(|dataset| { + if !matches!(dataset.kind, DebugDatasetKind) + || !dataset + .disposition + .matches(BlueprintDatasetFilter::InService) + { + return None; + }; + + Some(TestPool { + pool: dataset.pool.id(), + dataset: dataset.id, + }) + }) + .collect(); + + sleds.push(TestSled { sled: *sled, pools }); + } + sleds + } + + async fn create_database_records( + &self, + datastore: &DataStore, + opctx: &OpContext, + ) { + let rack_id = Uuid::new_v4(); + let sled = SledUpdate::new( + *self.sled.as_untyped_uuid(), + "[::1]:0".parse().unwrap(), + SledBaseboard { + serial_number: format!( + "test-{}", + rand::thread_rng().gen::() + ), + part_number: "test-pn".to_string(), + revision: 0, + }, + SledSystemHardware { + is_scrimlet: false, + usable_hardware_threads: 128, + usable_physical_ram: (64 << 30).try_into().unwrap(), + reservoir_size: (16 << 30).try_into().unwrap(), + }, + rack_id, + Generation::new(), + ); + datastore.sled_upsert(sled).await.expect("failed to upsert sled"); + + // Create fake zpools that back our fake datasets. 
+ for pool in &self.pools { + let zpool = Zpool::new( + *pool.pool.as_untyped_uuid(), + *self.sled.as_untyped_uuid(), + Uuid::new_v4(), + ); + datastore + .zpool_insert(opctx, zpool) + .await + .expect("failed to upsert zpool"); + + let dataset = Dataset::new( + pool.dataset.into_untyped_uuid(), + pool.pool.into_untyped_uuid(), + None, + DebugDatasetKind, + ); + datastore + .dataset_upsert(dataset) + .await + .expect("failed to upsert dataset"); + } + } + } + + // Creates a fake sled with `pool_count` zpools, and a debug dataset on each + // zpool. + async fn create_sled_and_zpools( + datastore: &DataStore, + opctx: &OpContext, + pool_count: usize, + ) -> TestSled { + let sled = TestSled::new_with_pool_count(pool_count); + sled.create_database_records(&datastore, &opctx).await; + sled + } + + async fn support_bundle_create_expect_no_capacity( + datastore: &DataStore, + opctx: &OpContext, + this_nexus_id: OmicronZoneUuid, + ) { + let err = datastore + .support_bundle_create(&opctx, "for tests", this_nexus_id) + .await + .expect_err("Shouldn't provision bundle without datasets"); + let Error::InsufficientCapacity { message } = err else { + panic!("Unexpected error: {err:?} - we expected 'InsufficientCapacity'"); + }; + assert_eq!( + CANNOT_ALLOCATE_ERR_MSG, + message.external_message(), + "Unexpected error: {message:?}" + ); + } + + #[tokio::test] + async fn test_bundle_create_capacity_limits() { + let logctx = dev::test_setup_log("test_bundle_create_capacity_limits"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let this_nexus_id = OmicronZoneUuid::new_v4(); + + // No sleds, no datasets. Allocation should fail. + + support_bundle_create_expect_no_capacity( + &datastore, + &opctx, + this_nexus_id, + ) + .await; + + // Create a sled with a couple pools. Allocation should succeed. + + const POOL_COUNT: usize = 2; + let _test_sled = + create_sled_and_zpools(&datastore, &opctx, POOL_COUNT).await; + let mut bundles = vec![]; + for _ in 0..POOL_COUNT { + bundles.push( + datastore + .support_bundle_create( + &opctx, + "for the test", + this_nexus_id, + ) + .await + .expect("Should be able to create bundle"), + ); + } + + // If we try to allocate any more bundles, we'll run out of capacity. + + support_bundle_create_expect_no_capacity( + &datastore, + &opctx, + this_nexus_id, + ) + .await; + + // If we cancel a bundle, it isn't deleted (yet). + // This operation should signify that we can start to free up + // storage on the dataset, but that needs to happen outside the + // database. + // + // We should still expect to hit capacity limits. + + datastore + .support_bundle_cancel(&opctx, bundles[0].id.into()) + .await + .expect("Should be able to cancel this bundle"); + support_bundle_create_expect_no_capacity( + &datastore, + &opctx, + this_nexus_id, + ) + .await; + + // If we delete a bundle, it should be gone. This means we can + // re-allocate from that dataset which was just freed up. 
+ + datastore + .support_bundle_delete(&opctx, bundles[0].id.into()) + .await + .expect("Should be able to cancel this bundle"); + datastore + .support_bundle_create(&opctx, "for the test", this_nexus_id) + .await + .expect("Should be able to create bundle"); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_crud_operations() { + let logctx = dev::test_setup_log("test_crud_operations"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let test_sled = create_sled_and_zpools(&datastore, &opctx, 1).await; + let reason = "Bundle for test"; + let this_nexus_id = OmicronZoneUuid::new_v4(); + + // Create the bundle, then observe it through the "getter" APIs + + let mut bundle = datastore + .support_bundle_create(&opctx, reason, this_nexus_id) + .await + .expect("Should be able to create bundle"); + assert_eq!(bundle.reason_for_creation, reason); + assert_eq!(bundle.reason_for_failure, None); + assert_eq!(bundle.assigned_nexus, Some(this_nexus_id.into())); + assert_eq!(bundle.state, SupportBundleState::Collecting); + assert_eq!(bundle.zpool_id, test_sled.pools[0].pool.into()); + assert_eq!(bundle.dataset_id, test_sled.pools[0].dataset.into()); + + let observed_bundle = datastore + .support_bundle_get(&opctx, bundle.id.into()) + .await + .expect("Should be able to get bundle we just created"); + // Overwrite this column; it is modified slightly upon database insertion. + bundle.time_created = observed_bundle.time_created; + assert_eq!(bundle, observed_bundle); + + let pagparams = DataPageParams::max_page(); + let observed_bundles = datastore + .support_bundle_list(&opctx, &pagparams) + .await + .expect("Should be able to get bundle we just created"); + assert_eq!(1, observed_bundles.len()); + assert_eq!(bundle, observed_bundles[0]); + + // Cancel the bundle, observe the new state + + datastore + .support_bundle_cancel(&opctx, bundle.id.into()) + .await + .expect("Should be able to cancel our bundle"); + let observed_bundle = datastore + .support_bundle_get(&opctx, bundle.id.into()) + .await + .expect("Should be able to get bundle we just created"); + assert_eq!(SupportBundleState::Cancelling, observed_bundle.state); + + // Delete the bundle, observe that it's gone + + datastore + .support_bundle_delete(&opctx, bundle.id.into()) + .await + .expect("Should be able to cancel our bundle"); + let observed_bundles = datastore + .support_bundle_list(&opctx, &pagparams) + .await + .expect("Should be able to get bundle we just created"); + assert!(observed_bundles.is_empty()); + + db.terminate().await; + logctx.cleanup_successful(); + } + + fn get_nexuses_from_blueprint( + bp: &Blueprint, + filter: BlueprintZoneFilter, + ) -> Vec { + bp.blueprint_zones + .values() + .map(|zones_config| { + let mut nexus_zones = vec![]; + for zone in &zones_config.zones { + if matches!(zone.zone_type, BlueprintZoneType::Nexus(_)) + && zone.disposition.matches(filter) + { + nexus_zones.push(zone.id); + } + } + nexus_zones + }) + .flatten() + .collect() + } + + fn get_debug_datasets_from_blueprint( + bp: &Blueprint, + filter: BlueprintDatasetFilter, + ) -> Vec { + bp.blueprint_datasets + .values() + .map(|datasets_config| { + let mut debug_datasets = vec![]; + for dataset in datasets_config.datasets.values() { + if matches!(dataset.kind, DebugDatasetKind) + && dataset.disposition.matches(filter) + { + debug_datasets.push(dataset.id); + } + } + debug_datasets + }) + .flatten() + .collect() + } + + 
#[tokio::test] + async fn test_bundle_expungement() { + static TEST_NAME: &str = "test_bundle_expungement"; + let logctx = dev::test_setup_log(TEST_NAME); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let mut rng = SimRngState::from_seed(TEST_NAME); + let (_example, mut bp1) = ExampleSystemBuilder::new_with_rng( + &logctx.log, + rng.next_system_rng(), + ) + .build(); + + // Weirdly, the "ExampleSystemBuilder" blueprint has a parent blueprint, + // but which isn't exposed through the API. Since we're only able to see + // the blueprint it emits, that means we can't actually make it the + // target because "the parent blueprint is not the current target". + // + // Instead of dealing with that, we lie: claim this is the primordial + // blueprint, with no parent. + // + // Regardless, make this starter blueprint our target. + bp1.parent_blueprint_id = None; + bp_insert_and_make_target(&opctx, &datastore, &bp1).await; + + // Manually perform the equivalent of blueprint execution to populate + // database records. + let sleds = TestSled::new_from_blueprint(&bp1); + for sled in &sleds { + sled.create_database_records(&datastore, &opctx).await; + } + + // Extract Nexus and Dataset information from the generated blueprint. + let this_nexus_id = get_nexuses_from_blueprint( + &bp1, + BlueprintZoneFilter::ShouldBeRunning, + ) + .get(0) + .map(|id| *id) + .expect("There should be a Nexus in the example blueprint"); + let debug_datasets = get_debug_datasets_from_blueprint( + &bp1, + BlueprintDatasetFilter::InService, + ); + assert!(!debug_datasets.is_empty()); + + // When we create a bundle, it should exist on a dataset provisioned by + // the blueprint. + let bundle = datastore + .support_bundle_create(&opctx, "for the test", this_nexus_id) + .await + .expect("Should be able to create bundle"); + assert_eq!(bundle.assigned_nexus, Some(this_nexus_id.into())); + + assert!( + debug_datasets.contains(&DatasetUuid::from(bundle.dataset_id)), + "Bundle should have been allocated from a blueprint dataset" + ); + + // Expunge the bundle's dataset. + let bp2 = { + let mut bp2 = bp1.clone(); + bp2.id = Uuid::new_v4(); + bp2.parent_blueprint_id = Some(bp1.id); + + for datasets in bp2.blueprint_datasets.values_mut() { + for dataset in datasets.datasets.values_mut() { + if dataset.id == bundle.dataset_id.into() { + dataset.disposition = + BlueprintDatasetDisposition::Expunged; + } + } + } + bp2 + }; + bp_insert_and_make_target(&opctx, &datastore, &bp2).await; + + // TODO: Call this on bp1, observe no changes? + let report = datastore + .support_bundle_fail_expunged(&opctx, &bp2) + .await + .expect("Should have been able to mark bundle state as failed"); + + assert_eq!(1, report.bundles_failed_missing_datasets); + assert_eq!(0, report.bundles_cancelled_missing_nexus); + assert_eq!(0, report.bundles_failed_missing_nexus); + + // TODO: Another test, maybe? 
+ // TODO: Find the nexus where we allocated the bundle + // TODO: expunge it + + db.terminate().await; + logctx.cleanup_successful(); + } +} diff --git a/nexus/types/src/external_api/shared.rs b/nexus/types/src/external_api/shared.rs index 2928fab46bc..b53dd78cdba 100644 --- a/nexus/types/src/external_api/shared.rs +++ b/nexus/types/src/external_api/shared.rs @@ -10,6 +10,7 @@ use chrono::DateTime; use chrono::Utc; use omicron_common::api::external::Name; use omicron_common::api::internal::shared::NetworkInterface; +use omicron_uuid_kinds::SupportBundleUuid; use parse_display::FromStr; use schemars::JsonSchema; use serde::de::Error as _; @@ -444,8 +445,7 @@ pub enum SupportBundleState { #[derive(Debug, Clone, JsonSchema, Serialize, Deserialize)] pub struct SupportBundleInfo { - // XXX: Strongly-typed Support Bundle UUID? - pub id: Uuid, + pub id: SupportBundleUuid, pub time_created: DateTime, pub reason_for_creation: String, pub reason_for_failure: Option, diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 12883554630..fe29b2e65f0 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -2390,24 +2390,55 @@ CREATE TABLE IF NOT EXISTS omicron.public.tuf_repo_artifact ( CREATE TYPE IF NOT EXISTS omicron.public.support_bundle_state AS ENUM ( + -- The bundle is currently being created. + -- + -- All storage for this bundle exists in Nexus's transient filesystem, as it + -- is being assembled. 'collecting', + + -- The bundle has been created, and is being transferred to a Sled. + -- + -- This bundle may been partially allocated to a Sled. + 'collected', + + -- The bundle has been explicitly marked cancelled. + -- + -- It may or may not have durable storage which needs to be freed. 'cancelling', + + -- The bundle, for whatever reason, has failed. + -- + -- It should no longer have any managed durable storage. 'failed', + + -- The bundle has been collected successfully, and has storage on + -- a particular sled. 'active' ); CREATE TABLE IF NOT EXISTS omicron.public.support_bundle ( - id UUID NOT NULL, + id UUID PRIMARY KEY, time_created TIMESTAMPTZ NOT NULL, reason_for_creation TEXT NOT NULL, reason_for_failure TEXT, state omicron.public.support_bundle_state NOT NULL, - zpool_id NOT NULL UUID, - dataset_id NOT NULL UUID, + zpool_id UUID NOT NULL, + dataset_id UUID NOT NULL, -- The Nexus which is in charge of collecting the support bundle, -- and later managing its storage. - assigned_nexus UUID, + assigned_nexus UUID +); + +-- The "UNIQUE" part of this index helps enforce that we allow one support bundle +-- per debug dataset. This constraint can be removed, if the query responsible +-- for allocation changes to allocate more intelligently. +CREATE UNIQUE INDEX IF NOT EXISTS one_bundle_per_dataset ON omicron.public.support_bundle ( + dataset_id +); + +CREATE INDEX IF NOT EXISTS lookup_bundle_by_neuxs ON omicron.public.support_bundle ( + assigned_nexus ); /*******************************************************************/ diff --git a/uuid-kinds/src/lib.rs b/uuid-kinds/src/lib.rs index 7947062a82b..50b263fa0b6 100644 --- a/uuid-kinds/src/lib.rs +++ b/uuid-kinds/src/lib.rs @@ -65,6 +65,7 @@ impl_typed_uuid_kind! 
{ RackReset => "rack_reset", Region => "region", Sled => "sled", + SupportBundle => "support_bundle", TufRepo => "tuf_repo", Upstairs => "upstairs", UpstairsRepair => "upstairs_repair", From d6d53c974e66e64a48d99f83e3319d6fcd4a6973 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 12 Nov 2024 17:46:47 -0800 Subject: [PATCH 08/61] Refactor state machine --- nexus/db-model/src/support_bundle.rs | 39 +- .../src/db/datastore/support_bundle.rs | 522 ++++++++++++++---- nexus/types/src/external_api/shared.rs | 12 +- schema/crdb/dbinit.sql | 33 +- 4 files changed, 455 insertions(+), 151 deletions(-) diff --git a/nexus/db-model/src/support_bundle.rs b/nexus/db-model/src/support_bundle.rs index f2450473f54..0e530708770 100644 --- a/nexus/db-model/src/support_bundle.rs +++ b/nexus/db-model/src/support_bundle.rs @@ -30,22 +30,28 @@ impl_enum_type!( // Enum values Collecting => b"collecting" - Collected => b"collected" - Cancelling => b"cancelling" - Failed => b"failed" Active => b"active" + Destroying => b"destroying" + Failing => b"failing" + Failed => b"failed" ); impl SupportBundleState { - pub fn might_have_dataset_storage(&self) -> bool { + /// Returns the list of valid prior states. + /// + /// This is used to confirm that state updates are performed legally, + /// and defines the possible state transitions. + pub fn valid_old_states(&self) -> Vec { use SupportBundleState::*; match self { - Collecting => false, - Collected => true, - Cancelling => true, - Failed => false, - Active => true, + Collecting => vec![], + Active => vec![Collecting], + // The "Destroying" state is terminal. + Destroying => vec![Active, Collecting, Failing], + Failing => vec![Collecting, Active], + // The "Failed" state is terminal. + Failed => vec![Collecting, Active, Failing], } } } @@ -56,12 +62,17 @@ impl From for SupportBundleStateView { match state { Collecting => SupportBundleStateView::Collecting, - // The distinction between "collected" and "collecting" is an - // internal detail that doesn't need to be exposed through the API. - Collected => SupportBundleStateView::Collecting, - Cancelling => SupportBundleStateView::Cancelling, - Failed => SupportBundleStateView::Failed, Active => SupportBundleStateView::Active, + Destroying => SupportBundleStateView::Destroying, + // The distinction between "failing" and "failed" should not be + // visible to end-users. This is internal book-keeping to decide + // whether or not the bundle record can be safely deleted. + // + // Either way, it should be possible to delete the bundle. + // - "Failing" bundles will become "Destroying" + // - "Failed" bundles will be destroyed immediately + Failing => SupportBundleStateView::Failed, + Failed => SupportBundleStateView::Failed, } } } diff --git a/nexus/db-queries/src/db/datastore/support_bundle.rs b/nexus/db-queries/src/db/datastore/support_bundle.rs index d289127f4b7..4d4a6645c2a 100644 --- a/nexus/db-queries/src/db/datastore/support_bundle.rs +++ b/nexus/db-queries/src/db/datastore/support_bundle.rs @@ -15,6 +15,7 @@ use crate::db::model::DatasetKind; use crate::db::model::SupportBundle; use crate::db::model::SupportBundleState; use crate::db::pagination::paginated; +use crate::db::update_and_check::{UpdateAndCheck, UpdateStatus}; use crate::transaction_retry::OptionalError; use async_bb8_diesel::AsyncRunQueryDsl; use diesel::prelude::*; @@ -39,12 +40,29 @@ const CANNOT_ALLOCATE_ERR_MSG: &'static str = no disks are available. 
Either delete old support bundles or add additional \ disks"; +const FAILURE_REASON_NO_DATASET: &'static str = + "Allocated dataset no longer exists"; +const FAILURE_REASON_NO_NEXUS: &'static str = + "Nexus managing this bundle no longer exists"; + /// Provides a report on how many bundle were expunged, and why. -#[derive(Debug, Clone)] +#[derive(Default, Debug, Clone, PartialEq)] pub struct SupportBundleExpungementReport { + /// Bundles marked "failed" because the datasets storing them have been + /// expunged. pub bundles_failed_missing_datasets: usize, - pub bundles_cancelled_missing_nexus: usize, - pub bundles_failed_missing_nexus: usize, + /// Bundles deleted because the datasets storing them have been + /// expunged. + pub bundles_deleted_missing_datasets: usize, + + /// Bundles marked "destroying" because the nexuses managing them have been + /// expunged. + /// + /// These bundles should be re-assigned to a different nexus for cleanup. + pub bundles_failing_missing_nexus: usize, + + /// Bundles which had a new Nexus assigned to them. + pub bundles_reassigned: usize, } impl DataStore { @@ -248,110 +266,108 @@ impl DataStore { async move { use db::schema::support_bundle::dsl; - // Find all bundles on datasets that no longer exist, and - // mark them "failed". + // Find all bundles without backing storage. let bundles_with_bad_datasets = dsl::support_bundle .filter(dsl::dataset_id.eq_any(invalid_datasets)) .select(SupportBundle::as_select()) .load_async(&*conn) .await?; + // Split these bundles into two categories: + // - Ones that are being destroyed anyway, and that can be + // fully deleted. + // - Ones that are NOT being destroyed, and should be marked + // failed so the end-user has visibility into their + // destruction. + let (bundles_to_delete, bundles_to_fail): (Vec<_>, Vec<_>) = + bundles_with_bad_datasets.into_iter().partition( + |bundle| bundle.state == SupportBundleState::Destroying + ); + let bundles_to_delete = bundles_to_delete.into_iter().map(|b| b.id).collect::>(); + let bundles_to_fail = bundles_to_fail.into_iter().map(|b| b.id).collect::>(); + + // Find all non-destroying bundles on datasets that no + // longer exist, and mark them "failed". They skip the + // "failing" state because there is no remaining storage to + // be cleaned up. + let state = SupportBundleState::Failed; let bundles_failed_missing_datasets = diesel::update(dsl::support_bundle) - .filter(dsl::state.ne(SupportBundleState::Failed)) - .filter( - dsl::id.eq_any( - bundles_with_bad_datasets - .iter() - .map(|b| b.id) - .collect::>(), - ), - ) + .filter(dsl::state.eq_any(state.valid_old_states())) + .filter(dsl::id.eq_any(bundles_to_fail)) .set(( - dsl::state.eq(SupportBundleState::Failed), - dsl::reason_for_failure - .eq("Allocated dataset no longer exists"), + dsl::state.eq(state), + dsl::reason_for_failure.eq(FAILURE_REASON_NO_DATASET), )) .execute_async(&*conn) .await?; + // For bundles that are in the process of being destroyed, + // the dataset expungement speeds up the process. + let bundles_deleted_missing_datasets = + diesel::delete(dsl::support_bundle) + .filter(dsl::state.eq_any(state.valid_old_states())) + .filter(dsl::id.eq_any(bundles_to_delete)) + // This check should be redundant (we already + // partitioned above based on this state) but out of + // an abundance of catuion we don't auto-delete a + // bundle in any other state. 
+ .filter(dsl::state.eq(SupportBundleState::Destroying)) + .execute_async(&*conn) + .await?; let Some(arbitrary_valid_nexus) = valid_nexus_zones.get(0).cloned() else { return Err(external::Error::internal_error( - "No valid Nexuses", + "No valid Nexuses, we cannot re-assign this support bundle", ) .into()); }; // Find all bundles on nexuses that no longer exist. - // - // If the bundle might have storage: - // - Mark it cancelled and re-assign the managing Nexus - // - Otherwise: mark it failed let bundles_with_bad_nexuses = dsl::support_bundle .filter(dsl::assigned_nexus.eq_any(invalid_nexus_zones)) .select(SupportBundle::as_select()) .load_async(&*conn) .await?; - let (needs_cancellation, needs_failure): (Vec<_>, Vec<_>) = - bundles_with_bad_nexuses.into_iter().partition( - |bundle| bundle.state.might_have_dataset_storage(), - ); - // Mark these support bundles as cancelled, and assign then + let bundles_to_mark_failing = bundles_with_bad_nexuses.iter() + .map(|b| b.id).collect::>(); + let bundles_to_reassign = bundles_with_bad_nexuses.iter() + .filter_map(|bundle| { + if bundle.state != SupportBundleState::Failed { + Some(bundle.id) + } else { + None + } + }).collect::>(); + + // Mark these support bundles as failing, and assign then // to a nexus that should still exist. // // This should lead to their storage being freed, if it // exists. - let bundles_cancelled_missing_nexus = diesel::update( - dsl::support_bundle, - ) - .filter(dsl::state.ne(SupportBundleState::Cancelling)) - .filter(dsl::state.ne(SupportBundleState::Failed)) - .filter( - dsl::id.eq_any( - needs_cancellation - .iter() - .map(|b| b.id) - .collect::>(), - ), - ) - .set(( - dsl::assigned_nexus.eq(arbitrary_valid_nexus), - dsl::state.eq(SupportBundleState::Cancelling), - dsl::reason_for_failure - .eq("Nexus managing this bundle no longer exists"), - )) - .execute_async(&*conn) - .await?; - - // Mark these support bundles as failed. - // - // If they don't have storage (e.g., they never got that - // far, or their underlying dataset was expunged) there - // isn't anything left to free. This means we can skip the - // "Cancelling" state and jump to "Failed". - let bundles_failed_missing_nexus = diesel::update( - dsl::support_bundle, - ) - .filter(dsl::state.ne(SupportBundleState::Cancelling)) - .filter(dsl::state.ne(SupportBundleState::Failed)) - .filter(dsl::id.eq_any( - needs_failure.iter().map(|b| b.id).collect::>(), - )) - .set(( - dsl::state.eq(SupportBundleState::Failed), - dsl::reason_for_failure - .eq("Nexus managing this bundle no longer exists"), - )) - .execute_async(&*conn) - .await?; + let state = SupportBundleState::Failing; + let bundles_failing_missing_nexus = diesel::update(dsl::support_bundle) + .filter(dsl::state.eq_any(state.valid_old_states())) + .filter(dsl::id.eq_any(bundles_to_mark_failing)) + .set(( + dsl::state.eq(state), + dsl::reason_for_failure.eq(FAILURE_REASON_NO_NEXUS), + )) + .execute_async(&*conn) + .await?; + let bundles_reassigned = diesel::update(dsl::support_bundle) + .filter(dsl::id.eq_any(bundles_to_reassign)) + .set(dsl::assigned_nexus.eq(arbitrary_valid_nexus)) + .execute_async(&*conn) + .await?; Ok(SupportBundleExpungementReport { bundles_failed_missing_datasets, - bundles_cancelled_missing_nexus, - bundles_failed_missing_nexus, + bundles_deleted_missing_datasets, + bundles_failing_missing_nexus, + bundles_reassigned, }) } .boxed() @@ -360,29 +376,35 @@ impl DataStore { .await } - /// Cancels a single support bundle. 
- /// - /// Note that this does not delete the support bundle record, but starts the - /// process by which it may later be deleted. - pub async fn support_bundle_cancel( + /// Updates the state of a support bundle + pub async fn support_bundle_update( &self, opctx: &OpContext, id: SupportBundleUuid, + state: SupportBundleState, ) -> Result<(), Error> { opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; - use db::schema::support_bundle::dsl as support_bundle_dsl; - + use db::schema::support_bundle::dsl; let conn = self.pool_connection_authorized(opctx).await?; - diesel::update(support_bundle_dsl::support_bundle) - .filter(support_bundle_dsl::id.eq(id.into_untyped_uuid())) - .set(support_bundle_dsl::state.eq(SupportBundleState::Cancelling)) - .execute_async(&*conn) + let result = diesel::update(dsl::support_bundle) + .filter(dsl::id.eq(id.into_untyped_uuid())) + .filter(dsl::state.eq_any(state.valid_old_states())) + .set(dsl::state.eq(state)) + .check_if_exists::(id.into_untyped_uuid()) + .execute_and_check(&*conn) .await - .map(|_rows_modified| ()) .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; - Ok(()) + match result.status { + UpdateStatus::Updated => Ok(()), + UpdateStatus::NotUpdatedButExists => { + Err(Error::invalid_request(format!( + "Cannot update support bundle state from {:?} to {:?}", + result.found.state, state + ))) + } + } } /// Deletes a support bundle. @@ -396,11 +418,16 @@ impl DataStore { ) -> Result<(), Error> { opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; - use db::schema::support_bundle::dsl as support_bundle_dsl; + use db::schema::support_bundle::dsl; let conn = self.pool_connection_authorized(opctx).await?; - diesel::delete(support_bundle_dsl::support_bundle) - .filter(support_bundle_dsl::id.eq(id.into_untyped_uuid())) + diesel::delete(dsl::support_bundle) + .filter( + dsl::state + .eq(SupportBundleState::Destroying) + .or(dsl::state.eq(SupportBundleState::Failed)), + ) + .filter(dsl::id.eq(id.into_untyped_uuid())) .execute_async(&*conn) .await .map(|_rows_modified| ()) @@ -425,6 +452,7 @@ mod test { use nexus_types::deployment::Blueprint; use nexus_types::deployment::BlueprintDatasetDisposition; use nexus_types::deployment::BlueprintDatasetFilter; + use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneFilter; use nexus_types::deployment::BlueprintZoneType; use omicron_common::api::internal::shared::DatasetKind::Debug as DebugDatasetKind; @@ -614,7 +642,7 @@ mod test { ) .await; - // If we cancel a bundle, it isn't deleted (yet). + // If we destroy a bundle, it isn't deleted (yet). // This operation should signify that we can start to free up // storage on the dataset, but that needs to happen outside the // database. @@ -622,9 +650,13 @@ mod test { // We should still expect to hit capacity limits. 
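        // support_bundle_update only succeeds when the bundle's current state
        // appears in the target state's `valid_old_states()` list. A
        // standalone sketch of that guard, with a local copy of the
        // transition table from this patch rather than the real
        // nexus-db-model type:

        #[allow(dead_code)]
        #[derive(Clone, Copy, PartialEq, Debug)]
        enum SketchState {
            Collecting,
            Active,
            Destroying,
            Failing,
            Failed,
        }

        fn valid_old_states(new: SketchState) -> Vec<SketchState> {
            use SketchState::*;
            match new {
                Collecting => vec![],
                Active => vec![Collecting],
                Destroying => vec![Active, Collecting, Failing],
                Failing => vec![Collecting, Active],
                Failed => vec![Collecting, Active, Failing],
            }
        }

        fn transition_allowed(current: SketchState, new: SketchState) -> bool {
            valid_old_states(new).contains(&current)
        }

        fn main() {
            // A bundle that finished collecting may be marked active...
            assert!(transition_allowed(SketchState::Collecting, SketchState::Active));
            // ...but nothing can leave the terminal "destroying" state.
            assert!(!transition_allowed(SketchState::Destroying, SketchState::Failed));
            // Destruction may be requested from any non-terminal state.
            assert!(transition_allowed(SketchState::Failing, SketchState::Destroying));
        }
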
datastore - .support_bundle_cancel(&opctx, bundles[0].id.into()) + .support_bundle_update( + &opctx, + bundles[0].id.into(), + SupportBundleState::Destroying, + ) .await - .expect("Should be able to cancel this bundle"); + .expect("Should be able to destroy this bundle"); support_bundle_create_expect_no_capacity( &datastore, &opctx, @@ -638,7 +670,7 @@ mod test { datastore .support_bundle_delete(&opctx, bundles[0].id.into()) .await - .expect("Should be able to cancel this bundle"); + .expect("Should be able to destroy this bundle"); datastore .support_bundle_create(&opctx, "for the test", this_nexus_id) .await @@ -687,24 +719,28 @@ mod test { assert_eq!(1, observed_bundles.len()); assert_eq!(bundle, observed_bundles[0]); - // Cancel the bundle, observe the new state + // Destroy the bundle, observe the new state datastore - .support_bundle_cancel(&opctx, bundle.id.into()) + .support_bundle_update( + &opctx, + bundle.id.into(), + SupportBundleState::Destroying, + ) .await - .expect("Should be able to cancel our bundle"); + .expect("Should be able to destroy our bundle"); let observed_bundle = datastore .support_bundle_get(&opctx, bundle.id.into()) .await .expect("Should be able to get bundle we just created"); - assert_eq!(SupportBundleState::Cancelling, observed_bundle.state); + assert_eq!(SupportBundleState::Destroying, observed_bundle.state); // Delete the bundle, observe that it's gone datastore .support_bundle_delete(&opctx, bundle.id.into()) .await - .expect("Should be able to cancel our bundle"); + .expect("Should be able to destroy our bundle"); let observed_bundles = datastore .support_bundle_list(&opctx, &pagparams) .await @@ -757,9 +793,29 @@ mod test { .collect() } + fn expunge_dataset_for_bundle(bp: &mut Blueprint, bundle: &SupportBundle) { + for datasets in bp.blueprint_datasets.values_mut() { + for dataset in datasets.datasets.values_mut() { + if dataset.id == bundle.dataset_id.into() { + dataset.disposition = BlueprintDatasetDisposition::Expunged; + } + } + } + } + + fn expunge_nexus_for_bundle(bp: &mut Blueprint, bundle: &SupportBundle) { + for zones in bp.blueprint_zones.values_mut() { + for zone in &mut zones.zones { + if zone.id == bundle.assigned_nexus.unwrap().into() { + zone.disposition = BlueprintZoneDisposition::Expunged; + } + } + } + } + #[tokio::test] - async fn test_bundle_expungement() { - static TEST_NAME: &str = "test_bundle_expungement"; + async fn test_bundle_failed_from_expunged_dataset() { + static TEST_NAME: &str = "test_bundle_failed_from_expunged_dataset"; let logctx = dev::test_setup_log(TEST_NAME); let db = TestDatabase::new_with_datastore(&logctx.log).await; let (opctx, datastore) = (db.opctx(), db.datastore()); @@ -811,43 +867,277 @@ mod test { .await .expect("Should be able to create bundle"); assert_eq!(bundle.assigned_nexus, Some(this_nexus_id.into())); - assert!( debug_datasets.contains(&DatasetUuid::from(bundle.dataset_id)), "Bundle should have been allocated from a blueprint dataset" ); - // Expunge the bundle's dataset. + // If we try to "fail support bundles" from expunged datasets/nexuses, + // we should see a no-op. Nothing has been expunged yet! 
+ let report = + datastore.support_bundle_fail_expunged(&opctx, &bp1).await.expect( + "Should have been able to perform no-op support bundle failure", + ); + assert_eq!(SupportBundleExpungementReport::default(), report); + + // Expunge the bundle's dataset (manually) let bp2 = { let mut bp2 = bp1.clone(); bp2.id = Uuid::new_v4(); bp2.parent_blueprint_id = Some(bp1.id); + expunge_dataset_for_bundle(&mut bp2, &bundle); + bp2 + }; + bp_insert_and_make_target(&opctx, &datastore, &bp2).await; - for datasets in bp2.blueprint_datasets.values_mut() { - for dataset in datasets.datasets.values_mut() { - if dataset.id == bundle.dataset_id.into() { - dataset.disposition = - BlueprintDatasetDisposition::Expunged; - } - } - } + datastore + .support_bundle_fail_expunged(&opctx, &bp1) + .await + .expect_err("bp1 is no longer the target; this should fail"); + let report = datastore + .support_bundle_fail_expunged(&opctx, &bp2) + .await + .expect("Should have been able to mark bundle state as failed"); + assert_eq!( + SupportBundleExpungementReport { + bundles_failed_missing_datasets: 1, + ..Default::default() + }, + report + ); + + let observed_bundle = datastore + .support_bundle_get(&opctx, bundle.id.into()) + .await + .expect("Should be able to get bundle we just failed"); + assert_eq!(SupportBundleState::Failed, observed_bundle.state); + assert!(observed_bundle + .reason_for_failure + .unwrap() + .contains(FAILURE_REASON_NO_DATASET)); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_bundle_failed_from_expunged_nexus_no_reassign() { + static TEST_NAME: &str = + "test_bundle_failed_from_expunged_nexus_no_reassign"; + let logctx = dev::test_setup_log(TEST_NAME); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let mut rng = SimRngState::from_seed(TEST_NAME); + let (_example, mut bp1) = ExampleSystemBuilder::new_with_rng( + &logctx.log, + rng.next_system_rng(), + ) + .build(); + + bp1.parent_blueprint_id = None; + bp_insert_and_make_target(&opctx, &datastore, &bp1).await; + + // Manually perform the equivalent of blueprint execution to populate + // database records. + let sleds = TestSled::new_from_blueprint(&bp1); + for sled in &sleds { + sled.create_database_records(&datastore, &opctx).await; + } + + // Extract Nexus and Dataset information from the generated blueprint. + let nexus_ids = get_nexuses_from_blueprint( + &bp1, + BlueprintZoneFilter::ShouldBeRunning, + ); + let debug_datasets = get_debug_datasets_from_blueprint( + &bp1, + BlueprintDatasetFilter::InService, + ); + assert!(!debug_datasets.is_empty()); + + // When we create a bundle, it should exist on a dataset provisioned by + // the blueprint. + let bundle = datastore + .support_bundle_create(&opctx, "for the test", nexus_ids[0]) + .await + .expect("Should be able to create bundle"); + + assert_eq!(bundle.state, SupportBundleState::Collecting); + assert_eq!(bundle.assigned_nexus, Some(nexus_ids[0].into())); + assert!( + debug_datasets.contains(&DatasetUuid::from(bundle.dataset_id)), + "Bundle should have been allocated from a blueprint dataset" + ); + + // Expunge the bundle's dataset. This marks it as "failed", and + // is a prerequisite for the bundle not later being re-assigned. 
+ let bp2 = { + let mut bp2 = bp1.clone(); + bp2.id = Uuid::new_v4(); + bp2.parent_blueprint_id = Some(bp1.id); + expunge_dataset_for_bundle(&mut bp2, &bundle); bp2 }; bp_insert_and_make_target(&opctx, &datastore, &bp2).await; - // TODO: Call this on bp1, observe no changes? let report = datastore .support_bundle_fail_expunged(&opctx, &bp2) .await .expect("Should have been able to mark bundle state as failed"); + assert_eq!( + SupportBundleExpungementReport { + bundles_failed_missing_datasets: 1, + ..Default::default() + }, + report + ); + + let observed_bundle = datastore + .support_bundle_get(&opctx, bundle.id.into()) + .await + .expect("Should be able to get bundle we just failed"); + assert_eq!(SupportBundleState::Failed, observed_bundle.state); + assert!(observed_bundle + .reason_for_failure + .unwrap() + .contains(FAILURE_REASON_NO_DATASET)); + + // Expunge the bundle's Nexus + let bp3 = { + let mut bp3 = bp2.clone(); + bp3.id = Uuid::new_v4(); + bp3.parent_blueprint_id = Some(bp2.id); + expunge_nexus_for_bundle(&mut bp3, &bundle); + bp3 + }; + bp_insert_and_make_target(&opctx, &datastore, &bp3).await; + + let report = datastore + .support_bundle_fail_expunged(&opctx, &bp3) + .await + .expect("Should have been able to mark bundle state as failed"); + + // Although the record for this bundle already exists, it is not + // re-assigned, and the original reason for it failing (dataset loss) is + // preserved. + assert_eq!(SupportBundleExpungementReport::default(), report); + + let observed_bundle = datastore + .support_bundle_get(&opctx, bundle.id.into()) + .await + .expect("Should be able to get bundle we just failed"); + assert_eq!(SupportBundleState::Failed, observed_bundle.state); + assert!(observed_bundle + .reason_for_failure + .unwrap() + .contains(FAILURE_REASON_NO_DATASET)); + + datastore + .support_bundle_delete(&opctx, bundle.id.into()) + .await + .expect("Should have been able to delete support bundle"); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_bundle_failed_from_expunged_nexus_with_reassign() { + static TEST_NAME: &str = + "test_bundle_failed_from_expunged_nexus_with_reassign"; + let logctx = dev::test_setup_log(TEST_NAME); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); - assert_eq!(1, report.bundles_failed_missing_datasets); - assert_eq!(0, report.bundles_cancelled_missing_nexus); - assert_eq!(0, report.bundles_failed_missing_nexus); + let mut rng = SimRngState::from_seed(TEST_NAME); + let (_example, mut bp1) = ExampleSystemBuilder::new_with_rng( + &logctx.log, + rng.next_system_rng(), + ) + .build(); + + bp1.parent_blueprint_id = None; + bp_insert_and_make_target(&opctx, &datastore, &bp1).await; - // TODO: Another test, maybe? - // TODO: Find the nexus where we allocated the bundle - // TODO: expunge it + // Manually perform the equivalent of blueprint execution to populate + // database records. + let sleds = TestSled::new_from_blueprint(&bp1); + for sled in &sleds { + sled.create_database_records(&datastore, &opctx).await; + } + + // Extract Nexus and Dataset information from the generated blueprint. + let nexus_ids = get_nexuses_from_blueprint( + &bp1, + BlueprintZoneFilter::ShouldBeRunning, + ); + let debug_datasets = get_debug_datasets_from_blueprint( + &bp1, + BlueprintDatasetFilter::InService, + ); + assert!(!debug_datasets.is_empty()); + + // When we create a bundle, it should exist on a dataset provisioned by + // the blueprint. 
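        // The reassignment rule exercised by this test: bundles whose managing
        // Nexus was expunged are marked "failing" and handed to a surviving
        // Nexus, except bundles that are already "failed", which keep their
        // record (and original failure reason) untouched. A standalone sketch
        // of that selection, with stand-in types rather than the real model:

        #[derive(Clone, Copy, PartialEq, Debug)]
        enum SketchState {
            Active,
            Failing,
            Failed,
        }

        fn reassignable(orphaned: &[(u32, SketchState)]) -> Vec<u32> {
            orphaned
                .iter()
                .filter_map(|(id, state)| {
                    if *state != SketchState::Failed {
                        Some(*id)
                    } else {
                        None
                    }
                })
                .collect()
        }

        fn main() {
            let orphaned = [
                (1, SketchState::Active),
                (2, SketchState::Failed),
                (3, SketchState::Failing),
            ];
            // Bundle 2 already failed (e.g. its dataset was expunged earlier),
            // so only bundles 1 and 3 move to a new Nexus.
            assert_eq!(reassignable(&orphaned), vec![1, 3]);
        }
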
+ let bundle = datastore + .support_bundle_create(&opctx, "for the test", nexus_ids[0]) + .await + .expect("Should be able to create bundle"); + + assert_eq!(bundle.state, SupportBundleState::Collecting); + assert_eq!(bundle.assigned_nexus, Some(nexus_ids[0].into())); + assert!( + debug_datasets.contains(&DatasetUuid::from(bundle.dataset_id)), + "Bundle should have been allocated from a blueprint dataset" + ); + + // Update the bundle's state. + // + // This is what we would do when we finish collecting, and + // provisioned storage on a sled. + datastore + .support_bundle_update( + &opctx, + bundle.id.into(), + SupportBundleState::Active, + ) + .await + .expect("Should have been able to update state"); + + // Expunge the bundle's Nexus (manually) + let bp2 = { + let mut bp2 = bp1.clone(); + bp2.id = Uuid::new_v4(); + bp2.parent_blueprint_id = Some(bp1.id); + expunge_nexus_for_bundle(&mut bp2, &bundle); + bp2 + }; + bp_insert_and_make_target(&opctx, &datastore, &bp2).await; + + let report = datastore + .support_bundle_fail_expunged(&opctx, &bp2) + .await + .expect("Should have been able to mark bundle state as destroying"); + + assert_eq!( + SupportBundleExpungementReport { + bundles_failing_missing_nexus: 1, + bundles_reassigned: 1, + ..Default::default() + }, + report + ); + + let observed_bundle = datastore + .support_bundle_get(&opctx, bundle.id.into()) + .await + .expect("Should be able to get bundle we just failed"); + assert_eq!(SupportBundleState::Failing, observed_bundle.state); + assert!(observed_bundle + .reason_for_failure + .unwrap() + .contains(FAILURE_REASON_NO_NEXUS)); db.terminate().await; logctx.cleanup_successful(); diff --git a/nexus/types/src/external_api/shared.rs b/nexus/types/src/external_api/shared.rs index b53dd78cdba..9e9bc26ffe8 100644 --- a/nexus/types/src/external_api/shared.rs +++ b/nexus/types/src/external_api/shared.rs @@ -423,20 +423,22 @@ pub enum SupportBundleState { /// Support Bundle still actively being collected. /// /// This is the initial state for a Support Bundle, and it will - /// automatically transition to either "Failed" or "Active". + /// automatically transition to either "Failing" or "Active". /// /// If a user no longer wants to access a Support Bundle, they can - /// request cancellation, which will transition to the "Cancelling" state. + /// request cancellation, which will transition to the "Destroying" state. Collecting, - /// Support Bundle has been cancelled, and will be deleted shortly. - Cancelling, + /// Support Bundle is being destroyed. + /// + /// Once backing storage has been freed, this bundle is destroyed. + Destroying, /// Support Bundle was not created successfully, or was created and has lost /// backing storage. /// /// The record of the bundle still exists for readability, but the only - /// valid operation on these bundles is to delete them. + /// valid operation on these bundles is to destroy them. Failed, /// Support Bundle has been processed, and is ready for usage. diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index fe29b2e65f0..52af30612c7 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -2392,28 +2392,29 @@ CREATE TABLE IF NOT EXISTS omicron.public.tuf_repo_artifact ( CREATE TYPE IF NOT EXISTS omicron.public.support_bundle_state AS ENUM ( -- The bundle is currently being created. -- - -- All storage for this bundle exists in Nexus's transient filesystem, as it - -- is being assembled. + -- It might have storage that is partially allocated on a sled. 
'collecting', - -- The bundle has been created, and is being transferred to a Sled. - -- - -- This bundle may been partially allocated to a Sled. - 'collected', + -- The bundle has been collected successfully, and has storage on + -- a particular sled. + 'active', - -- The bundle has been explicitly marked cancelled. - -- - -- It may or may not have durable storage which needs to be freed. - 'cancelling', + -- The user has explicitly requested that a bundle be destroyed. + -- We must ensure that storage backing that bundle is gone before + -- it is automatically deleted. + 'destroying', - -- The bundle, for whatever reason, has failed. + -- The support bundle is failing. + -- This happens when Nexus is expunged partway through collection. -- - -- It should no longer have any managed durable storage. - 'failed', + -- A different Nexus must ensure that storage is gone before the + -- bundle can be marked "failed". + 'failing', - -- The bundle has been collected successfully, and has storage on - -- a particular sled. - 'active' + -- The bundle has finished failing. + -- + -- The only action that can be taken on this bundle is to delete it. + 'failed' ); CREATE TABLE IF NOT EXISTS omicron.public.support_bundle ( From 483801a9b0456ebc7d9352a9aaa24364491cb761 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 13 Nov 2024 11:33:30 -0800 Subject: [PATCH 09/61] Clippy, schema changes --- nexus/db-model/src/schema_versions.rs | 3 +- .../src/db/datastore/support_bundle.rs | 14 +++++----- openapi/nexus.json | 15 ++++++---- schema/crdb/dbinit.sql | 2 +- schema/crdb/support-bundles/up01.sql | 28 +++++++++++++++++++ schema/crdb/support-bundles/up02.sql | 14 ++++++++++ schema/crdb/support-bundles/up03.sql | 4 +++ schema/crdb/support-bundles/up04.sql | 4 +++ 8 files changed, 69 insertions(+), 15 deletions(-) create mode 100644 schema/crdb/support-bundles/up01.sql create mode 100644 schema/crdb/support-bundles/up02.sql create mode 100644 schema/crdb/support-bundles/up03.sql create mode 100644 schema/crdb/support-bundles/up04.sql diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index 14eba1cb1cf..580b654b75c 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -17,7 +17,7 @@ use std::collections::BTreeMap; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(113, 0, 0); +pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(114, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -29,6 +29,7 @@ static KNOWN_VERSIONS: Lazy> = Lazy::new(|| { // | leaving the first copy as an example for the next person. 
// v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(114, "support-bundles"), KnownVersion::new(113, "add-tx-eq"), KnownVersion::new(112, "blueprint-dataset"), KnownVersion::new(111, "drop-omicron-zone-underlay-address"), diff --git a/nexus/db-queries/src/db/datastore/support_bundle.rs b/nexus/db-queries/src/db/datastore/support_bundle.rs index 4d4a6645c2a..71d10700719 100644 --- a/nexus/db-queries/src/db/datastore/support_bundle.rs +++ b/nexus/db-queries/src/db/datastore/support_bundle.rs @@ -270,7 +270,7 @@ impl DataStore { let bundles_with_bad_datasets = dsl::support_bundle .filter(dsl::dataset_id.eq_any(invalid_datasets)) .select(SupportBundle::as_select()) - .load_async(&*conn) + .load_async(conn) .await?; // Split these bundles into two categories: @@ -299,7 +299,7 @@ impl DataStore { dsl::state.eq(state), dsl::reason_for_failure.eq(FAILURE_REASON_NO_DATASET), )) - .execute_async(&*conn) + .execute_async(conn) .await?; // For bundles that are in the process of being destroyed, // the dataset expungement speeds up the process. @@ -312,7 +312,7 @@ impl DataStore { // an abundance of catuion we don't auto-delete a // bundle in any other state. .filter(dsl::state.eq(SupportBundleState::Destroying)) - .execute_async(&*conn) + .execute_async(conn) .await?; let Some(arbitrary_valid_nexus) = @@ -328,7 +328,7 @@ impl DataStore { let bundles_with_bad_nexuses = dsl::support_bundle .filter(dsl::assigned_nexus.eq_any(invalid_nexus_zones)) .select(SupportBundle::as_select()) - .load_async(&*conn) + .load_async(conn) .await?; let bundles_to_mark_failing = bundles_with_bad_nexuses.iter() @@ -355,12 +355,12 @@ impl DataStore { dsl::state.eq(state), dsl::reason_for_failure.eq(FAILURE_REASON_NO_NEXUS), )) - .execute_async(&*conn) + .execute_async(conn) .await?; let bundles_reassigned = diesel::update(dsl::support_bundle) .filter(dsl::id.eq_any(bundles_to_reassign)) .set(dsl::assigned_nexus.eq(arbitrary_valid_nexus)) - .execute_async(&*conn) + .execute_async(conn) .await?; Ok(SupportBundleExpungementReport { @@ -392,7 +392,7 @@ impl DataStore { .filter(dsl::state.eq_any(state.valid_old_states())) .set(dsl::state.eq(state)) .check_if_exists::(id.into_untyped_uuid()) - .execute_and_check(&*conn) + .execute_and_check(&conn) .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; diff --git a/openapi/nexus.json b/openapi/nexus.json index 1aaba49e9eb..18cb260fd08 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -20183,8 +20183,7 @@ "type": "object", "properties": { "id": { - "type": "string", - "format": "uuid" + "$ref": "#/components/schemas/TypedUuidForSupportBundleKind" }, "reason_for_creation": { "type": "string" @@ -20232,21 +20231,21 @@ "SupportBundleState": { "oneOf": [ { - "description": "Support Bundle still actively being collected.\n\nThis is the initial state for a Support Bundle, and it will automatically transition to either \"Failed\" or \"Active\".\n\nIf a user no longer wants to access a Support Bundle, they can request cancellation, which will transition to the \"Cancelling\" state.", + "description": "Support Bundle still actively being collected.\n\nThis is the initial state for a Support Bundle, and it will automatically transition to either \"Failing\" or \"Active\".\n\nIf a user no longer wants to access a Support Bundle, they can request cancellation, which will transition to the \"Destroying\" state.", "type": "string", "enum": [ "collecting" ] }, { - "description": "Support Bundle has been cancelled, and will be 
deleted shortly.", + "description": "Support Bundle is being destroyed.\n\nOnce backing storage has been freed, this bundle is destroyed.", "type": "string", "enum": [ - "cancelling" + "destroying" ] }, { - "description": "Support Bundle was not created successfully, or was created and has lost backing storage.\n\nThe record of the bundle still exists for readability, but the only valid operation on these bundles is to delete them.", + "description": "Support Bundle was not created successfully, or was created and has lost backing storage.\n\nThe record of the bundle still exists for readability, but the only valid operation on these bundles is to destroy them.", "type": "string", "enum": [ "failed" @@ -21275,6 +21274,10 @@ } } }, + "TypedUuidForSupportBundleKind": { + "type": "string", + "format": "uuid" + }, "UninitializedSled": { "description": "A sled that has not been added to an initialized rack yet", "type": "object", diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 52af30612c7..0beb32c92a6 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -4661,7 +4661,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '113.0.0', NULL) + (TRUE, NOW(), NOW(), '114.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/schema/crdb/support-bundles/up01.sql b/schema/crdb/support-bundles/up01.sql new file mode 100644 index 00000000000..1260e79e18c --- /dev/null +++ b/schema/crdb/support-bundles/up01.sql @@ -0,0 +1,28 @@ +CREATE TYPE IF NOT EXISTS omicron.public.support_bundle_state AS ENUM ( + -- The bundle is currently being created. + -- + -- It might have storage that is partially allocated on a sled. + 'collecting', + + -- The bundle has been collected successfully, and has storage on + -- a particular sled. + 'active', + + -- The user has explicitly requested that a bundle be destroyed. + -- We must ensure that storage backing that bundle is gone before + -- it is automatically deleted. + 'destroying', + + -- The support bundle is failing. + -- This happens when Nexus is expunged partway through collection. + -- + -- A different Nexus must ensure that storage is gone before the + -- bundle can be marked "failed". + 'failing', + + -- The bundle has finished failing. + -- + -- The only action that can be taken on this bundle is to delete it. + 'failed' +); + diff --git a/schema/crdb/support-bundles/up02.sql b/schema/crdb/support-bundles/up02.sql new file mode 100644 index 00000000000..bc61e704800 --- /dev/null +++ b/schema/crdb/support-bundles/up02.sql @@ -0,0 +1,14 @@ +CREATE TABLE IF NOT EXISTS omicron.public.support_bundle ( + id UUID PRIMARY KEY, + time_created TIMESTAMPTZ NOT NULL, + reason_for_creation TEXT NOT NULL, + reason_for_failure TEXT, + state omicron.public.support_bundle_state NOT NULL, + zpool_id UUID NOT NULL, + dataset_id UUID NOT NULL, + + -- The Nexus which is in charge of collecting the support bundle, + -- and later managing its storage. 
+ assigned_nexus UUID +); + diff --git a/schema/crdb/support-bundles/up03.sql b/schema/crdb/support-bundles/up03.sql new file mode 100644 index 00000000000..8d4bdb47caa --- /dev/null +++ b/schema/crdb/support-bundles/up03.sql @@ -0,0 +1,4 @@ +CREATE UNIQUE INDEX IF NOT EXISTS one_bundle_per_dataset ON omicron.public.support_bundle ( + dataset_id +); + diff --git a/schema/crdb/support-bundles/up04.sql b/schema/crdb/support-bundles/up04.sql new file mode 100644 index 00000000000..6df95640fd9 --- /dev/null +++ b/schema/crdb/support-bundles/up04.sql @@ -0,0 +1,4 @@ +CREATE INDEX IF NOT EXISTS lookup_bundle_by_neuxs ON omicron.public.support_bundle ( + assigned_nexus +); + From 5f78af9015371bff473b50651172cc98212611fc Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 13 Nov 2024 11:34:56 -0800 Subject: [PATCH 10/61] more clippy --- nexus/db-queries/src/db/datastore/support_bundle.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/support_bundle.rs b/nexus/db-queries/src/db/datastore/support_bundle.rs index 71d10700719..eb099a1a480 100644 --- a/nexus/db-queries/src/db/datastore/support_bundle.rs +++ b/nexus/db-queries/src/db/datastore/support_bundle.rs @@ -757,7 +757,7 @@ mod test { ) -> Vec { bp.blueprint_zones .values() - .map(|zones_config| { + .flat_map(|zones_config| { let mut nexus_zones = vec![]; for zone in &zones_config.zones { if matches!(zone.zone_type, BlueprintZoneType::Nexus(_)) @@ -768,7 +768,6 @@ mod test { } nexus_zones }) - .flatten() .collect() } @@ -778,7 +777,7 @@ mod test { ) -> Vec { bp.blueprint_datasets .values() - .map(|datasets_config| { + .flat_map(|datasets_config| { let mut debug_datasets = vec![]; for dataset in datasets_config.datasets.values() { if matches!(dataset.kind, DebugDatasetKind) @@ -789,7 +788,6 @@ mod test { } debug_datasets }) - .flatten() .collect() } From 92b7d62aa28bb4e9ae9e2791ba889e6880968a7b Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 13 Nov 2024 12:10:32 -0800 Subject: [PATCH 11/61] Strongly-typed UUID --- nexus/types/src/external_api/shared.rs | 16 +++++++++------- openapi/nexus.json | 15 +++++++++------ uuid-kinds/src/lib.rs | 1 + 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/nexus/types/src/external_api/shared.rs b/nexus/types/src/external_api/shared.rs index 2928fab46bc..9e9bc26ffe8 100644 --- a/nexus/types/src/external_api/shared.rs +++ b/nexus/types/src/external_api/shared.rs @@ -10,6 +10,7 @@ use chrono::DateTime; use chrono::Utc; use omicron_common::api::external::Name; use omicron_common::api::internal::shared::NetworkInterface; +use omicron_uuid_kinds::SupportBundleUuid; use parse_display::FromStr; use schemars::JsonSchema; use serde::de::Error as _; @@ -422,20 +423,22 @@ pub enum SupportBundleState { /// Support Bundle still actively being collected. /// /// This is the initial state for a Support Bundle, and it will - /// automatically transition to either "Failed" or "Active". + /// automatically transition to either "Failing" or "Active". /// /// If a user no longer wants to access a Support Bundle, they can - /// request cancellation, which will transition to the "Cancelling" state. + /// request cancellation, which will transition to the "Destroying" state. Collecting, - /// Support Bundle has been cancelled, and will be deleted shortly. - Cancelling, + /// Support Bundle is being destroyed. + /// + /// Once backing storage has been freed, this bundle is destroyed. 
+ Destroying, /// Support Bundle was not created successfully, or was created and has lost /// backing storage. /// /// The record of the bundle still exists for readability, but the only - /// valid operation on these bundles is to delete them. + /// valid operation on these bundles is to destroy them. Failed, /// Support Bundle has been processed, and is ready for usage. @@ -444,8 +447,7 @@ pub enum SupportBundleState { #[derive(Debug, Clone, JsonSchema, Serialize, Deserialize)] pub struct SupportBundleInfo { - // XXX: Strongly-typed Support Bundle UUID? - pub id: Uuid, + pub id: SupportBundleUuid, pub time_created: DateTime, pub reason_for_creation: String, pub reason_for_failure: Option, diff --git a/openapi/nexus.json b/openapi/nexus.json index 1aaba49e9eb..18cb260fd08 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -20183,8 +20183,7 @@ "type": "object", "properties": { "id": { - "type": "string", - "format": "uuid" + "$ref": "#/components/schemas/TypedUuidForSupportBundleKind" }, "reason_for_creation": { "type": "string" @@ -20232,21 +20231,21 @@ "SupportBundleState": { "oneOf": [ { - "description": "Support Bundle still actively being collected.\n\nThis is the initial state for a Support Bundle, and it will automatically transition to either \"Failed\" or \"Active\".\n\nIf a user no longer wants to access a Support Bundle, they can request cancellation, which will transition to the \"Cancelling\" state.", + "description": "Support Bundle still actively being collected.\n\nThis is the initial state for a Support Bundle, and it will automatically transition to either \"Failing\" or \"Active\".\n\nIf a user no longer wants to access a Support Bundle, they can request cancellation, which will transition to the \"Destroying\" state.", "type": "string", "enum": [ "collecting" ] }, { - "description": "Support Bundle has been cancelled, and will be deleted shortly.", + "description": "Support Bundle is being destroyed.\n\nOnce backing storage has been freed, this bundle is destroyed.", "type": "string", "enum": [ - "cancelling" + "destroying" ] }, { - "description": "Support Bundle was not created successfully, or was created and has lost backing storage.\n\nThe record of the bundle still exists for readability, but the only valid operation on these bundles is to delete them.", + "description": "Support Bundle was not created successfully, or was created and has lost backing storage.\n\nThe record of the bundle still exists for readability, but the only valid operation on these bundles is to destroy them.", "type": "string", "enum": [ "failed" @@ -21275,6 +21274,10 @@ } } }, + "TypedUuidForSupportBundleKind": { + "type": "string", + "format": "uuid" + }, "UninitializedSled": { "description": "A sled that has not been added to an initialized rack yet", "type": "object", diff --git a/uuid-kinds/src/lib.rs b/uuid-kinds/src/lib.rs index 7947062a82b..50b263fa0b6 100644 --- a/uuid-kinds/src/lib.rs +++ b/uuid-kinds/src/lib.rs @@ -65,6 +65,7 @@ impl_typed_uuid_kind! 
{ RackReset => "rack_reset", Region => "region", Sled => "sled", + SupportBundle => "support_bundle", TufRepo => "tuf_repo", Upstairs => "upstairs", UpstairsRepair => "upstairs_repair", From 410ff5bb1c1ae67b39e89f39028ff20551cef47b Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 13 Nov 2024 15:57:08 -0800 Subject: [PATCH 12/61] Skeleton of background task --- nexus-config/src/nexus_config.rs | 23 ++ .../src/db/datastore/support_bundle.rs | 29 +- nexus/examples/config-second.toml | 1 + nexus/examples/config.toml | 1 + nexus/src/app/background/init.rs | 26 +- nexus/src/app/background/tasks/mod.rs | 1 + .../tasks/support_bundle_collector.rs | 254 ++++++++++++++++++ nexus/tests/config.test.toml | 2 + smf/nexus/multi-sled/config-partial.toml | 1 + smf/nexus/single-sled/config-partial.toml | 1 + 10 files changed, 336 insertions(+), 3 deletions(-) create mode 100644 nexus/src/app/background/tasks/support_bundle_collector.rs diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs index 82362f2f0d7..81170e42f85 100644 --- a/nexus-config/src/nexus_config.rs +++ b/nexus-config/src/nexus_config.rs @@ -374,6 +374,8 @@ pub struct BackgroundTaskConfig { pub nat_cleanup: NatCleanupConfig, /// configuration for inventory tasks pub inventory: InventoryConfig, + /// configuration for support bundle collection + pub support_bundle_collector: SupportBundleCollectorConfig, /// configuration for physical disk adoption tasks pub physical_disk_adoption: PhysicalDiskAdoptionConfig, /// configuration for decommissioned disk cleaner task @@ -461,6 +463,20 @@ pub struct ExternalEndpointsConfig { // allow/disallow wildcard certs, don't serve expired certs, etc.) } +#[serde_as] +#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +pub struct SupportBundleCollectorConfig { + /// period (in seconds) for periodic activations of this background task + #[serde_as(as = "DurationSeconds")] + pub period_secs: Duration, + + /// A toggle to disable support bundle collection + /// + /// Default: Off + #[serde(default)] + pub disable: bool, +} + #[serde_as] #[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] pub struct PhysicalDiskAdoptionConfig { @@ -935,6 +951,7 @@ mod test { inventory.period_secs = 10 inventory.nkeep = 11 inventory.disable = false + support_bundle_collector.period_secs = 30 physical_disk_adoption.period_secs = 30 decommissioned_disk_cleaner.period_secs = 30 phantom_disks.period_secs = 30 @@ -1081,6 +1098,11 @@ mod test { nkeep: 11, disable: false, }, + support_bundle_collector: + SupportBundleCollectorConfig { + period_secs: Duration::from_secs(30), + disable: false, + }, physical_disk_adoption: PhysicalDiskAdoptionConfig { period_secs: Duration::from_secs(30), disable: false, @@ -1215,6 +1237,7 @@ mod test { inventory.period_secs = 10 inventory.nkeep = 3 inventory.disable = false + support_bundle_collector.period_secs = 30 physical_disk_adoption.period_secs = 30 decommissioned_disk_cleaner.period_secs = 30 phantom_disks.period_secs = 30 diff --git a/nexus/db-queries/src/db/datastore/support_bundle.rs b/nexus/db-queries/src/db/datastore/support_bundle.rs index eb099a1a480..18ce92ee876 100644 --- a/nexus/db-queries/src/db/datastore/support_bundle.rs +++ b/nexus/db-queries/src/db/datastore/support_bundle.rs @@ -194,6 +194,28 @@ impl DataStore { .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } + /// Lists one page of support bundles in a particular state, assigned to + /// a particular Nexus. 
+ pub async fn support_bundle_list_assigned_to_nexus( + &self, + opctx: &OpContext, + pagparams: &DataPageParams<'_, Uuid>, + nexus_id: OmicronZoneUuid, + states: Vec, + ) -> ListResultVec { + opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; + use db::schema::support_bundle::dsl; + + let conn = self.pool_connection_authorized(opctx).await?; + paginated(dsl::support_bundle, dsl::id, pagparams) + .filter(dsl::assigned_nexus.eq(nexus_id.into_untyped_uuid())) + .filter(dsl::state.eq_any(states)) + .select(SupportBundle::as_select()) + .load_async(&*conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + /// Marks support bundles as failed if their assigned Nexus or backing /// dataset has been destroyed. pub async fn support_bundle_fail_expunged( @@ -376,7 +398,12 @@ impl DataStore { .await } - /// Updates the state of a support bundle + /// Updates the state of a support bundle. + /// + /// Returns: + /// - "Ok" if the bundle was updated successfully. + /// - "Err::InvalidRequest" if the bundle exists, but could not be updated + /// because the state transition is invalid. pub async fn support_bundle_update( &self, opctx: &OpContext, diff --git a/nexus/examples/config-second.toml b/nexus/examples/config-second.toml index a9557665545..86b3e41a6e6 100644 --- a/nexus/examples/config-second.toml +++ b/nexus/examples/config-second.toml @@ -122,6 +122,7 @@ inventory.nkeep = 5 inventory.disable = false phantom_disks.period_secs = 30 physical_disk_adoption.period_secs = 30 +support_bundle_collector.period_secs = 30 decommissioned_disk_cleaner.period_secs = 60 blueprints.period_secs_load = 10 blueprints.period_secs_execute = 60 diff --git a/nexus/examples/config.toml b/nexus/examples/config.toml index ce3dfe5751b..a56a8620228 100644 --- a/nexus/examples/config.toml +++ b/nexus/examples/config.toml @@ -108,6 +108,7 @@ inventory.nkeep = 5 inventory.disable = false phantom_disks.period_secs = 30 physical_disk_adoption.period_secs = 30 +support_bundle_collector.period_secs = 30 decommissioned_disk_cleaner.period_secs = 60 blueprints.period_secs_load = 10 blueprints.period_secs_execute = 60 diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index ad39777054a..6d90108a0fd 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -115,6 +115,7 @@ use super::tasks::region_snapshot_replacement_start::*; use super::tasks::region_snapshot_replacement_step::*; use super::tasks::saga_recovery; use super::tasks::service_firewall_rules; +use super::tasks::support_bundle_collector; use super::tasks::sync_service_zone_nat::ServiceZoneNatTracker; use super::tasks::sync_switch_configuration::SwitchPortSettingsManager; use super::tasks::v2p_mappings::V2PManager; @@ -149,6 +150,7 @@ pub struct BackgroundTasks { pub task_nat_cleanup: Activator, pub task_bfd_manager: Activator, pub task_inventory_collection: Activator, + pub task_support_bundle_collector: Activator, pub task_physical_disk_adoption: Activator, pub task_decommissioned_disk_cleaner: Activator, pub task_phantom_disks: Activator, @@ -235,6 +237,7 @@ impl BackgroundTasksInitializer { task_nat_cleanup: Activator::new(), task_bfd_manager: Activator::new(), task_inventory_collection: Activator::new(), + task_support_bundle_collector: Activator::new(), task_physical_disk_adoption: Activator::new(), task_decommissioned_disk_cleaner: Activator::new(), task_phantom_disks: Activator::new(), @@ -302,6 +305,7 @@ impl BackgroundTasksInitializer { task_nat_cleanup, 
task_bfd_manager, task_inventory_collection, + task_support_bundle_collector, task_physical_disk_adoption, task_decommissioned_disk_cleaner, task_phantom_disks, @@ -487,7 +491,7 @@ impl BackgroundTasksInitializer { period: config.blueprints.period_secs_collect_crdb_node_ids, task_impl: Box::new(crdb_node_id_collector), opctx: opctx.child(BTreeMap::new()), - watchers: vec![Box::new(rx_blueprint)], + watchers: vec![Box::new(rx_blueprint.clone())], activator: task_crdb_node_id_collector, }); @@ -517,13 +521,31 @@ impl BackgroundTasksInitializer { period: config.inventory.period_secs, task_impl: Box::new(collector), opctx: opctx.child(BTreeMap::new()), - watchers: vec![Box::new(rx_blueprint_exec)], + watchers: vec![Box::new(rx_blueprint_exec.clone())], activator: task_inventory_collection, }); inventory_watcher }; + driver.register(TaskDefinition { + name: "support_bundle_collector", + description: + "ensure new support bundles are automatically marked in-service", + period: config.support_bundle_collector.period_secs, + task_impl: Box::new( + support_bundle_collector::SupportBundleCollector::new( + datastore.clone(), + config.support_bundle_collector.disable, + nexus_id, + rx_blueprint, + ), + ), + opctx: opctx.child(BTreeMap::new()), + watchers: vec![Box::new(rx_blueprint_exec)], + activator: task_support_bundle_collector, + }); + driver.register(TaskDefinition { name: "physical_disk_adoption", description: diff --git a/nexus/src/app/background/tasks/mod.rs b/nexus/src/app/background/tasks/mod.rs index e4bbbfe6d04..2e58e58b76a 100644 --- a/nexus/src/app/background/tasks/mod.rs +++ b/nexus/src/app/background/tasks/mod.rs @@ -32,6 +32,7 @@ pub mod region_snapshot_replacement_start; pub mod region_snapshot_replacement_step; pub mod saga_recovery; pub mod service_firewall_rules; +pub mod support_bundle_collector; pub mod sync_service_zone_nat; pub mod sync_switch_configuration; pub mod v2p_mappings; diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs new file mode 100644 index 00000000000..699dcda4d55 --- /dev/null +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -0,0 +1,254 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
Background task for managing Support Bundles + +use crate::app::background::BackgroundTask; +use futures::future::BoxFuture; +use futures::FutureExt; +use nexus_db_model::SupportBundleState; +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::DataStore; +use nexus_types::deployment::{Blueprint, BlueprintTarget}; +use omicron_common::api::external::DataPageParams; +use omicron_common::api::external::Error; +use omicron_uuid_kinds::OmicronZoneUuid; +use serde::Serialize; +use serde_json::json; +use std::sync::Arc; +use tokio::sync::watch; + +pub struct SupportBundleCollector { + datastore: Arc, + disable: bool, + nexus_id: OmicronZoneUuid, + rx_blueprint: watch::Receiver>>, +} + +#[derive(Debug, Default, Serialize)] +struct CleanupReport { + // Responses from our requests to Sled Agents + sled_bundles_deleted_ok: usize, + sled_bundles_deleted_not_found: usize, + sled_bundles_delete_failed: usize, + + // Results from updating our database records + db_destroying_bundles_removed: usize, + db_failing_bundles_updated: usize, +} + +#[derive(Debug, Default, Serialize)] +struct CollectionReport {} + +impl SupportBundleCollector { + pub fn new( + datastore: Arc, + disable: bool, + nexus_id: OmicronZoneUuid, + rx_blueprint: watch::Receiver< + Option>, + >, + ) -> Self { + SupportBundleCollector { datastore, disable, nexus_id, rx_blueprint } + } + + // Monitors all bundles that are "destroying" or "failing" and assigned to + // this Nexus, and attempts to clear their storage from Sled Agents. + async fn cleanup_destroyed_bundles<'a>( + &'a self, + opctx: &'a OpContext, + ) -> Result { + let pagparams = DataPageParams::max_page(); + let result = self + .datastore + .support_bundle_list_assigned_to_nexus( + opctx, + &pagparams, + self.nexus_id, + vec![ + SupportBundleState::Destroying, + SupportBundleState::Failing, + ], + ) + .await; + + let bundles_to_destroy = match result { + Ok(r) => r, + Err(err) => { + warn!( + &opctx.log, + "SupportBundleCollector: Failed to list terminating bundles"; + "err" => %err + ); + return Err(format!("failed to query database: {:#}", err)); + } + }; + + let mut report = CleanupReport::default(); + + // NOTE: This could be concurrent? + for bundle in bundles_to_destroy { + // XXX: Contact the Sled Agent holding this bundle, release it. + // If OK: report.bundles_deleted_ok++ + // If 404: report.bundles_deleted_not_found++ + // Otherwise: report.bundles_deleted_failed++, and continue. + + match bundle.state { + SupportBundleState::Destroying => { + // Destroying is a terminal state; no one should be able to + // change this state from underneath us. + self.datastore.support_bundle_delete( + opctx, + bundle.id.into(), + ).await.map_err(|err| { + warn!( + &opctx.log, + "SupportBundleCollector: Could not delete 'destroying' bundle"; + "err" => %err + ); + format!("Could not delete 'destroying' bundle: {:#}", err) + })?; + + report.db_destroying_bundles_removed += 1; + } + SupportBundleState::Failing => { + if let Err(err) = self + .datastore + .support_bundle_update( + &opctx, + bundle.id.into(), + SupportBundleState::Failed, + ) + .await + { + if matches!(err, Error::InvalidRequest { .. }) { + // It's possible that the bundle is marked "destroying" by a + // user request, concurrently with our operation. + // + // In this case, we log that this happened, but do nothing. + // The next iteration of this background task should treat + // this as the "Destroying" case, and delete the bundle. 
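            // The surrounding branch treats an invalid-state-transition error
            // as a concurrent user cancellation rather than a real failure.
            // A standalone sketch of that pattern, with a stand-in error type
            // instead of omicron's external Error:

            #[derive(Debug, PartialEq)]
            enum SketchError {
                InvalidStateTransition,
                Other(String),
            }

            // Returns Ok(true) if the bundle was marked failed, Ok(false) if a
            // concurrent request already moved it on (handled on the next
            // activation), and Err(_) only for genuine database problems.
            fn mark_failed(update: Result<(), SketchError>) -> Result<bool, String> {
                match update {
                    Ok(()) => Ok(true),
                    Err(SketchError::InvalidStateTransition) => Ok(false),
                    Err(SketchError::Other(msg)) => Err(msg),
                }
            }

            fn main() {
                assert_eq!(mark_failed(Ok(())), Ok(true));
                assert_eq!(
                    mark_failed(Err(SketchError::InvalidStateTransition)),
                    Ok(false)
                );
                assert!(
                    mark_failed(Err(SketchError::Other("db unavailable".into())))
                        .is_err()
                );
            }
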
+ info!( + &opctx.log, + "SupportBundleCollector: Concurrent state change"; + "err" => ?err, + ); + } else { + return Err(format!( + "Could not delete 'failing' bundle: {:#}", + err + )); + } + } + + report.db_failing_bundles_updated += 1; + } + other => { + // We should be filtering to only see "Destroying" and + // "Failing" bundles in our database request above. + error!( + &opctx.log, + "SupportBundleCollector: Cleaning bundle in unexpected state"; + "id" => %bundle.id, + "state" => ?other + ); + } + } + } + Ok(report) + } + + async fn collect_bundle<'a>( + &'a self, + opctx: &'a OpContext, + ) -> Result { + let mut report = CollectionReport::default(); + + let pagparams = DataPageParams::max_page(); + let result = self + .datastore + .support_bundle_list_assigned_to_nexus( + opctx, + &pagparams, + self.nexus_id, + vec![SupportBundleState::Collecting], + ) + .await; + + let bundle_to_collect = match result { + Ok(bundles) => { + if let Some(bundle) = bundles.get(0) { + info!( + &opctx.log, + "SupportBundleCollector: Found bundle to collect"; + "bundle" => %bundle.id, + "bundles_in_queue" => bundles.len() + ); + bundle.clone() + } else { + info!(&opctx.log, "No bundles to collect"); + return Ok(report); + } + } + Err(err) => { + warn!( + &opctx.log, + "SupportBundleCollector: Failed to list collecting bundles"; + "err" => %err + ); + return Err(format!("failed to query database: {:#}", err)); + } + }; + + // TODO: Actually collect the bundle here. + + // TODO: Periodically check the state of the bundle we're collecting - + // it might have been cancelled if this is taking a while. + // + // Whenever we check, we're basically creating a "yield" point. + // This should perhaps be set up on a timer. + + // TODO: Store the bundle once collection has finished. + + Ok(report) + } +} + +impl BackgroundTask for SupportBundleCollector { + fn activate<'a>( + &'a mut self, + opctx: &'a OpContext, + ) -> BoxFuture<'a, serde_json::Value> { + async move { + if self.disable { + return json!({ "error": "task disabled" }); + } + + let mut cleanup_report = None; + let mut cleanup_err = None; + let mut collection_report = None; + let mut collection_err = None; + + match self.cleanup_destroyed_bundles(&opctx).await { + Ok(report) => cleanup_report = Some(report), + Err(err) => cleanup_err = Some(json!({ "cleanup_error": err })), + }; + + match self.collect_bundle(&opctx).await { + Ok(report) => collection_report = Some(report), + Err(err) => { + collection_err = Some(json!({ "collect_error": err })) + } + }; + + json!({ + "cleanup_report": cleanup_report, + "cleanup_err": cleanup_err, + "collection_report": collection_report, + "collection_err": collection_err, + }) + } + .boxed() + } +} diff --git a/nexus/tests/config.test.toml b/nexus/tests/config.test.toml index 94d22d491fe..247caafcb5b 100644 --- a/nexus/tests/config.test.toml +++ b/nexus/tests/config.test.toml @@ -104,6 +104,8 @@ phantom_disks.period_secs = 30 physical_disk_adoption.period_secs = 30 # Disable automatic disk adoption to avoid interfering with tests. physical_disk_adoption.disable = true +support_bundle_collector.period_secs = 30 +support_bundle_collector.disable = true decommissioned_disk_cleaner.period_secs = 60 # Disable disk decommissioning cleanup to avoid interfering with tests. 
decommissioned_disk_cleaner.disable = true diff --git a/smf/nexus/multi-sled/config-partial.toml b/smf/nexus/multi-sled/config-partial.toml index b824f4b7c97..73f2720efc4 100644 --- a/smf/nexus/multi-sled/config-partial.toml +++ b/smf/nexus/multi-sled/config-partial.toml @@ -50,6 +50,7 @@ inventory.nkeep = 3 inventory.disable = false phantom_disks.period_secs = 30 physical_disk_adoption.period_secs = 30 +support_bundle_collector.period_secs = 30 decommissioned_disk_cleaner.period_secs = 60 blueprints.period_secs_load = 10 blueprints.period_secs_execute = 60 diff --git a/smf/nexus/single-sled/config-partial.toml b/smf/nexus/single-sled/config-partial.toml index e966b3eabd4..ee48e4e0c06 100644 --- a/smf/nexus/single-sled/config-partial.toml +++ b/smf/nexus/single-sled/config-partial.toml @@ -50,6 +50,7 @@ inventory.nkeep = 3 inventory.disable = false phantom_disks.period_secs = 30 physical_disk_adoption.period_secs = 30 +support_bundle_collector.period_secs = 30 decommissioned_disk_cleaner.period_secs = 60 blueprints.period_secs_load = 10 blueprints.period_secs_execute = 60 From 22343010f1185293bf41413d4f51170a85ec86c8 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 13 Nov 2024 16:09:55 -0800 Subject: [PATCH 13/61] less bp --- nexus/src/app/background/init.rs | 8 +++- .../tasks/support_bundle_collector.rs | 44 ++++++++++++++----- 2 files changed, 38 insertions(+), 14 deletions(-) diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 6d90108a0fd..304f4f20c59 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -491,7 +491,7 @@ impl BackgroundTasksInitializer { period: config.blueprints.period_secs_collect_crdb_node_ids, task_impl: Box::new(crdb_node_id_collector), opctx: opctx.child(BTreeMap::new()), - watchers: vec![Box::new(rx_blueprint.clone())], + watchers: vec![Box::new(rx_blueprint)], activator: task_crdb_node_id_collector, }); @@ -528,6 +528,11 @@ impl BackgroundTasksInitializer { inventory_watcher }; + // Cleans up and collects support bundles. + // + // This task is triggered by blueprint execution, since blueprint + // execution may cause bundles to start failing and need garbage + // collection. 
driver.register(TaskDefinition { name: "support_bundle_collector", description: @@ -538,7 +543,6 @@ impl BackgroundTasksInitializer { datastore.clone(), config.support_bundle_collector.disable, nexus_id, - rx_blueprint, ), ), opctx: opctx.child(BTreeMap::new()), diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 699dcda4d55..94d415b64a4 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -10,25 +10,23 @@ use futures::FutureExt; use nexus_db_model::SupportBundleState; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; -use nexus_types::deployment::{Blueprint, BlueprintTarget}; use omicron_common::api::external::DataPageParams; use omicron_common::api::external::Error; use omicron_uuid_kinds::OmicronZoneUuid; +use omicron_uuid_kinds::SupportBundleUuid; use serde::Serialize; use serde_json::json; use std::sync::Arc; -use tokio::sync::watch; pub struct SupportBundleCollector { datastore: Arc, disable: bool, nexus_id: OmicronZoneUuid, - rx_blueprint: watch::Receiver>>, } #[derive(Debug, Default, Serialize)] struct CleanupReport { - // Responses from our requests to Sled Agents + // Responses from Sled Agents sled_bundles_deleted_ok: usize, sled_bundles_deleted_not_found: usize, sled_bundles_delete_failed: usize, @@ -39,18 +37,17 @@ struct CleanupReport { } #[derive(Debug, Default, Serialize)] -struct CollectionReport {} +struct CollectionReport { + bundle: Option, +} impl SupportBundleCollector { pub fn new( datastore: Arc, disable: bool, nexus_id: OmicronZoneUuid, - rx_blueprint: watch::Receiver< - Option>, - >, ) -> Self { - SupportBundleCollector { datastore, disable, nexus_id, rx_blueprint } + SupportBundleCollector { datastore, disable, nexus_id } } // Monitors all bundles that are "destroying" or "failing" and assigned to @@ -87,7 +84,7 @@ impl SupportBundleCollector { let mut report = CleanupReport::default(); - // NOTE: This could be concurrent? + // NOTE: This could be concurrent, but the priority for that also seems low for bundle in bundles_to_destroy { // XXX: Contact the Sled Agent holding this bundle, release it. // If OK: report.bundles_deleted_ok++ @@ -131,7 +128,8 @@ impl SupportBundleCollector { // this as the "Destroying" case, and delete the bundle. info!( &opctx.log, - "SupportBundleCollector: Concurrent state change"; + "SupportBundleCollector: Concurrent state change failing bundle"; + "bundle" => %bundle.id, "err" => ?err, ); } else { @@ -176,7 +174,7 @@ impl SupportBundleCollector { ) .await; - let bundle_to_collect = match result { + let bundle = match result { Ok(bundles) => { if let Some(bundle) = bundles.get(0) { info!( @@ -200,6 +198,7 @@ impl SupportBundleCollector { return Err(format!("failed to query database: {:#}", err)); } }; + report.bundle = Some(bundle.id.into()); // TODO: Actually collect the bundle here. @@ -211,6 +210,27 @@ impl SupportBundleCollector { // TODO: Store the bundle once collection has finished. + // TODO: Remove the temporary storage we used in the Nexus zone to + // collect the bundle. 
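+        // With collection done, try to mark the bundle "Active" in the
+        // database. A concurrent state change (e.g. the bundle was cancelled
+        // while we were collecting) is logged rather than treated as a task
+        // failure.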
+ + if let Err(err) = self + .datastore + .support_bundle_update( + &opctx, + bundle.id.into(), + SupportBundleState::Active, + ) + .await + { + info!( + &opctx.log, + "SupportBundleCollector: Concurrent state change activating bundle"; + "bundle" => %bundle.id, + "err" => ?err, + ); + return Ok(report); + } + Ok(report) } } From 6c529d1b8cdf0f5b7b6b536eb07aca636f9e27a7 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 13 Nov 2024 16:30:15 -0800 Subject: [PATCH 14/61] Add list-by-nexus method, test it --- .../src/db/datastore/support_bundle.rs | 156 +++++++++++++++++- 1 file changed, 155 insertions(+), 1 deletion(-) diff --git a/nexus/db-queries/src/db/datastore/support_bundle.rs b/nexus/db-queries/src/db/datastore/support_bundle.rs index eb099a1a480..562b1ec2b12 100644 --- a/nexus/db-queries/src/db/datastore/support_bundle.rs +++ b/nexus/db-queries/src/db/datastore/support_bundle.rs @@ -194,6 +194,29 @@ impl DataStore { .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } + /// Lists one page of support bundles in a particular state, assigned to + /// a particular Nexus. + pub async fn support_bundle_list_assigned_to_nexus( + &self, + opctx: &OpContext, + pagparams: &DataPageParams<'_, Uuid>, + nexus_id: OmicronZoneUuid, + states: Vec, + ) -> ListResultVec { + opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; + use db::schema::support_bundle::dsl; + + let conn = self.pool_connection_authorized(opctx).await?; + paginated(dsl::support_bundle, dsl::id, pagparams) + .filter(dsl::assigned_nexus.eq(nexus_id.into_untyped_uuid())) + .filter(dsl::state.eq_any(states)) + .order(dsl::time_created.asc()) + .select(SupportBundle::as_select()) + .load_async(&*conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + /// Marks support bundles as failed if their assigned Nexus or backing /// dataset has been destroyed. pub async fn support_bundle_fail_expunged( @@ -376,7 +399,12 @@ impl DataStore { .await } - /// Updates the state of a support bundle + /// Updates the state of a support bundle. + /// + /// Returns: + /// - "Ok" if the bundle was updated successfully. + /// - "Err::InvalidRequest" if the bundle exists, but could not be updated + /// because the state transition is invalid. 
pub async fn support_bundle_update( &self, opctx: &OpContext, @@ -597,6 +625,132 @@ mod test { ); } + #[tokio::test] + async fn test_bundle_list_filtering() { + let logctx = dev::test_setup_log("test_bundle_create_capacity_limits"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let nexus_a = OmicronZoneUuid::new_v4(); + let nexus_b = OmicronZoneUuid::new_v4(); + + let _test_sled = create_sled_and_zpools(&datastore, &opctx, 5).await; + + let pagparams = DataPageParams::max_page(); + + // No bundles exist yet, so the list should be empty + + assert_eq!( + datastore + .support_bundle_list_assigned_to_nexus( + &opctx, + &pagparams, + nexus_a, + vec![SupportBundleState::Collecting] + ) + .await + .expect("Should always be able to list bundles"), + vec![] + ); + + // Create two bundles on "nexus A", one bundle on "nexus B" + + let bundle_a1 = datastore + .support_bundle_create(&opctx, "for the test", nexus_a) + .await + .expect("Should be able to create bundle"); + let bundle_a2 = datastore + .support_bundle_create(&opctx, "for the test", nexus_a) + .await + .expect("Should be able to create bundle"); + let bundle_b1 = datastore + .support_bundle_create(&opctx, "for the test", nexus_b) + .await + .expect("Should be able to create bundle"); + + assert_eq!( + datastore + .support_bundle_list_assigned_to_nexus( + &opctx, + &pagparams, + nexus_a, + vec![SupportBundleState::Collecting] + ) + .await + .expect("Should always be able to list bundles") + .iter() + .map(|b| b.id) + .collect::>(), + vec![bundle_a1.id, bundle_a2.id,] + ); + assert_eq!( + datastore + .support_bundle_list_assigned_to_nexus( + &opctx, + &pagparams, + nexus_b, + vec![SupportBundleState::Collecting] + ) + .await + .expect("Should always be able to list bundles") + .iter() + .map(|b| b.id) + .collect::>(), + vec![bundle_b1.id,] + ); + + // When we update the state of the bundles, the list results + // should also be filtered. + datastore + .support_bundle_update( + &opctx, + bundle_a1.id.into(), + SupportBundleState::Active, + ) + .await + .expect("Should have been able to update state"); + + // "bundle_a1" is no longer collecting, so it won't appear here. + assert_eq!( + datastore + .support_bundle_list_assigned_to_nexus( + &opctx, + &pagparams, + nexus_a, + vec![SupportBundleState::Collecting] + ) + .await + .expect("Should always be able to list bundles") + .iter() + .map(|b| b.id) + .collect::>(), + vec![bundle_a2.id,] + ); + + // ... 
but if we ask for enough states, it'll show up + assert_eq!( + datastore + .support_bundle_list_assigned_to_nexus( + &opctx, + &pagparams, + nexus_a, + vec![ + SupportBundleState::Active, + SupportBundleState::Collecting + ] + ) + .await + .expect("Should always be able to list bundles") + .iter() + .map(|b| b.id) + .collect::>(), + vec![bundle_a1.id, bundle_a2.id,] + ); + + db.terminate().await; + logctx.cleanup_successful(); + } + #[tokio::test] async fn test_bundle_create_capacity_limits() { let logctx = dev::test_setup_log("test_bundle_create_capacity_limits"); From e61365a1c000f22b06500685c23c1cc7e07babb3 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 25 Nov 2024 13:21:23 -0800 Subject: [PATCH 15/61] merge --- nexus/test-utils/src/resource_helpers.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/test-utils/src/resource_helpers.rs b/nexus/test-utils/src/resource_helpers.rs index ce1fb38f016..6f704fac00e 100644 --- a/nexus/test-utils/src/resource_helpers.rs +++ b/nexus/test-utils/src/resource_helpers.rs @@ -1175,7 +1175,7 @@ impl<'a, N: NexusServer> DiskTest<'a, N> { for disk in &disks_config.disks { self.add_zpool_with_dataset_ext( *sled_id, - disk.id, + PhysicalDiskUuid::from_untyped_uuid(disk.id), disk.pool_id, DatasetUuid::new_v4(), Self::DEFAULT_ZPOOL_SIZE_GIB, From 01949c97d3406e494c2b8e4ea83d26d3ee427318 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 25 Nov 2024 13:23:41 -0800 Subject: [PATCH 16/61] Merge UUID changes --- nexus/db-queries/src/db/datastore/support_bundle.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/support_bundle.rs b/nexus/db-queries/src/db/datastore/support_bundle.rs index 562b1ec2b12..07179769084 100644 --- a/nexus/db-queries/src/db/datastore/support_bundle.rs +++ b/nexus/db-queries/src/db/datastore/support_bundle.rs @@ -28,7 +28,6 @@ use omicron_common::api::external::Error; use omicron_common::api::external::ListResultVec; use omicron_common::api::external::LookupResult; use omicron_uuid_kinds::BlueprintUuid; -use omicron_uuid_kinds::DatasetUuid; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::OmicronZoneUuid; use omicron_uuid_kinds::SupportBundleUuid; @@ -131,7 +130,7 @@ impl DataStore { let bundle = SupportBundle::new( reason_for_creation, ZpoolUuid::from_untyped_uuid(dataset.pool_id), - DatasetUuid::from_untyped_uuid(dataset.id()), + dataset.id(), this_nexus_id, ); @@ -581,7 +580,7 @@ mod test { .expect("failed to upsert zpool"); let dataset = Dataset::new( - pool.dataset.into_untyped_uuid(), + pool.dataset, pool.pool.into_untyped_uuid(), None, DebugDatasetKind, From 81c7121fa3c13e42167386dd579aaf86e70e1c08 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 25 Nov 2024 13:23:41 -0800 Subject: [PATCH 17/61] Merge UUID changes --- nexus/db-queries/src/db/datastore/support_bundle.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/support_bundle.rs b/nexus/db-queries/src/db/datastore/support_bundle.rs index 562b1ec2b12..b89ce5cec09 100644 --- a/nexus/db-queries/src/db/datastore/support_bundle.rs +++ b/nexus/db-queries/src/db/datastore/support_bundle.rs @@ -28,7 +28,6 @@ use omicron_common::api::external::Error; use omicron_common::api::external::ListResultVec; use omicron_common::api::external::LookupResult; use omicron_uuid_kinds::BlueprintUuid; -use omicron_uuid_kinds::DatasetUuid; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::OmicronZoneUuid; use 
omicron_uuid_kinds::SupportBundleUuid; @@ -131,7 +130,7 @@ impl DataStore { let bundle = SupportBundle::new( reason_for_creation, ZpoolUuid::from_untyped_uuid(dataset.pool_id), - DatasetUuid::from_untyped_uuid(dataset.id()), + dataset.id(), this_nexus_id, ); @@ -485,6 +484,7 @@ mod test { use nexus_types::deployment::BlueprintZoneType; use omicron_common::api::internal::shared::DatasetKind::Debug as DebugDatasetKind; use omicron_test_utils::dev; + use omicron_uuid_kinds::DatasetUuid; use omicron_uuid_kinds::SledUuid; use rand::Rng; @@ -581,7 +581,7 @@ mod test { .expect("failed to upsert zpool"); let dataset = Dataset::new( - pool.dataset.into_untyped_uuid(), + pool.dataset, pool.pool.into_untyped_uuid(), None, DebugDatasetKind, From 744e351de6f71540a9d1974a2675dac0e7cc2778 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 25 Nov 2024 16:53:52 -0800 Subject: [PATCH 18/61] Implement storage, use sled agent APIs --- Cargo.lock | 3 + clients/sled-agent-client/src/lib.rs | 1 + nexus/Cargo.toml | 3 + nexus/db-queries/src/db/datastore/zpool.rs | 22 + .../tasks/support_bundle_collector.rs | 437 ++++++++++++++++-- 5 files changed, 424 insertions(+), 42 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 709aaca3f20..3713c061359 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6925,6 +6925,7 @@ dependencies = [ "hubtools", "hyper", "hyper-rustls 0.26.0", + "hyper-staticfile", "illumos-utils", "internal-dns-resolver", "internal-dns-types", @@ -6997,6 +6998,7 @@ dependencies = [ "serde_json", "serde_urlencoded", "serde_with", + "sha2", "similar-asserts", "sled-agent-client", "sled-agent-types", @@ -7021,6 +7023,7 @@ dependencies = [ "update-common", "update-engine", "uuid", + "zip 2.1.3", ] [[package]] diff --git a/clients/sled-agent-client/src/lib.rs b/clients/sled-agent-client/src/lib.rs index 800123b1161..9530a59e3ba 100644 --- a/clients/sled-agent-client/src/lib.rs +++ b/clients/sled-agent-client/src/lib.rs @@ -78,6 +78,7 @@ progenitor::generate_api!( TypedUuidForOmicronZoneKind = omicron_uuid_kinds::OmicronZoneUuid, TypedUuidForPropolisKind = omicron_uuid_kinds::PropolisUuid, TypedUuidForSledKind = omicron_uuid_kinds::SledUuid, + TypedUuidForSupportBundleKind = omicron_uuid_kinds::SupportBundleUuid, TypedUuidForZpoolKind = omicron_uuid_kinds::ZpoolUuid, Vni = omicron_common::api::external::Vni, ZpoolKind = omicron_common::zpool_name::ZpoolKind, diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml index edf9cf5ae27..89a4f7ce704 100644 --- a/nexus/Cargo.toml +++ b/nexus/Cargo.toml @@ -42,6 +42,7 @@ hickory-resolver.workspace = true http.workspace = true http-body-util.workspace = true hyper.workspace = true +hyper-staticfile.workspace = true illumos-utils.workspace = true internal-dns-resolver.workspace = true internal-dns-types.workspace = true @@ -81,6 +82,7 @@ serde.workspace = true serde_json.workspace = true serde_urlencoded.workspace = true serde_with.workspace = true +sha2.workspace = true sled-agent-client.workspace = true slog.workspace = true slog-async.workspace = true @@ -119,6 +121,7 @@ update-common.workspace = true update-engine.workspace = true omicron-workspace-hack.workspace = true omicron-uuid-kinds.workspace = true +zip.workspace = true [dev-dependencies] async-bb8-diesel.workspace = true diff --git a/nexus/db-queries/src/db/datastore/zpool.rs b/nexus/db-queries/src/db/datastore/zpool.rs index a314b9d3716..9d128b2541c 100644 --- a/nexus/db-queries/src/db/datastore/zpool.rs +++ b/nexus/db-queries/src/db/datastore/zpool.rs @@ -31,9 +31,11 @@ use 
omicron_common::api::external::DataPageParams; use omicron_common::api::external::DeleteResult; use omicron_common::api::external::Error; use omicron_common::api::external::ListResultVec; +use omicron_common::api::external::LookupResult; use omicron_common::api::external::LookupType; use omicron_common::api::external::ResourceType; use omicron_uuid_kinds::GenericUuid; +use omicron_uuid_kinds::SledUuid; use omicron_uuid_kinds::ZpoolUuid; use uuid::Uuid; @@ -270,4 +272,24 @@ impl DataStore { Ok(()) } + + pub async fn zpool_get_sled( + &self, + opctx: &OpContext, + id: ZpoolUuid, + ) -> LookupResult { + opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; + use db::schema::zpool::dsl; + + let conn = self.pool_connection_authorized(opctx).await?; + let id = dsl::zpool + .filter(dsl::id.eq(id.into_untyped_uuid())) + .filter(dsl::time_deleted.is_null()) + .select(dsl::sled_id) + .first_async::(&*conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + + Ok(SledUuid::from_untyped_uuid(id)) + } } diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 94d415b64a4..b74ffb889ca 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -5,18 +5,36 @@ //! Background task for managing Support Bundles use crate::app::background::BackgroundTask; +use camino::Utf8Path; +use camino_tempfile::tempdir; +use camino_tempfile::tempfile; +use camino_tempfile::Utf8TempDir; use futures::future::BoxFuture; use futures::FutureExt; +use futures::TryStreamExt; +use nexus_db_model::SupportBundle; use nexus_db_model::SupportBundleState; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; +use nexus_types::deployment::SledFilter; +use nexus_types::identity::Asset; use omicron_common::api::external::DataPageParams; use omicron_common::api::external::Error; +use omicron_common::update::ArtifactHash; +use omicron_uuid_kinds::DatasetUuid; +use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::OmicronZoneUuid; use omicron_uuid_kinds::SupportBundleUuid; +use omicron_uuid_kinds::ZpoolUuid; use serde::Serialize; use serde_json::json; +use sha2::{Digest, Sha256}; +use std::io::Write; use std::sync::Arc; +use tokio::io::AsyncReadExt; +use tokio::io::AsyncSeekExt; +use tokio::io::SeekFrom; +use zip::ZipWriter; pub struct SupportBundleCollector { datastore: Arc, @@ -36,11 +54,6 @@ struct CleanupReport { db_failing_bundles_updated: usize, } -#[derive(Debug, Default, Serialize)] -struct CollectionReport { - bundle: Option, -} - impl SupportBundleCollector { pub fn new( datastore: Arc, @@ -52,10 +65,10 @@ impl SupportBundleCollector { // Monitors all bundles that are "destroying" or "failing" and assigned to // this Nexus, and attempts to clear their storage from Sled Agents. 
- async fn cleanup_destroyed_bundles<'a>( - &'a self, - opctx: &'a OpContext, - ) -> Result { + async fn cleanup_destroyed_bundles( + &self, + opctx: &OpContext, + ) -> anyhow::Result { let pagparams = DataPageParams::max_page(); let result = self .datastore @@ -78,7 +91,7 @@ impl SupportBundleCollector { "SupportBundleCollector: Failed to list terminating bundles"; "err" => %err ); - return Err(format!("failed to query database: {:#}", err)); + anyhow::bail!("failed to query database: {:#}", err); } }; @@ -86,10 +99,76 @@ impl SupportBundleCollector { // NOTE: This could be concurrent, but the priority for that also seems low for bundle in bundles_to_destroy { - // XXX: Contact the Sled Agent holding this bundle, release it. - // If OK: report.bundles_deleted_ok++ - // If 404: report.bundles_deleted_not_found++ - // Otherwise: report.bundles_deleted_failed++, and continue. + info!( + &opctx.log, + "SupportBundleCollector starting bundle deletion"; + "id" => %bundle.id + ); + + // Find the sled where we're storing this bundle. + // + // TODO: What happens if the zpool is concurrently expunged here? + // TODO: Should we "continue" to other bundles rather than bailing + // early? + let sled_id = self + .datastore + .zpool_get_sled(&opctx, bundle.zpool_id.into()) + .await?; + let sled_client = nexus_networking::sled_client( + &self.datastore, + &opctx, + sled_id.into_untyped_uuid(), + &opctx.log, + ) + .await?; + + let result = sled_client + .support_bundle_delete( + &ZpoolUuid::from(bundle.zpool_id), + &DatasetUuid::from(bundle.dataset_id), + &SupportBundleUuid::from(bundle.id), + ) + .await; + + match result { + Ok(_) => { + info!( + &opctx.log, + "SupportBundleCollector deleted bundle"; + "id" => %bundle.id + ); + report.sled_bundles_deleted_ok += 1; + + // Safe to fall-through; we deleted sled-local state. + } + Err(progenitor_client::Error::ErrorResponse(err)) + if err.status() == http::StatusCode::NOT_FOUND => + { + warn!( + &opctx.log, + "SupportBundleCollector could not delete bundle (not found)"; + "id" => %bundle.id + ); + + report.sled_bundles_deleted_not_found += 1; + + // Safe to fall-through; sled-local state does not exist + } + err => { + warn!( + &opctx.log, + "SupportBundleCollector could not delete bundle"; + "id" => %bundle.id, + "err" => ?err, + ); + + report.sled_bundles_delete_failed += 1; + + // We don't delete this bundle -- the sled storage may be + // experiencing a transient error. 
+ continue; + } + } match bundle.state { SupportBundleState::Destroying => { @@ -104,7 +183,7 @@ impl SupportBundleCollector { "SupportBundleCollector: Could not delete 'destroying' bundle"; "err" => %err ); - format!("Could not delete 'destroying' bundle: {:#}", err) + anyhow::anyhow!("Could not delete 'destroying' bundle: {:#}", err) })?; report.db_destroying_bundles_removed += 1; @@ -133,10 +212,10 @@ impl SupportBundleCollector { "err" => ?err, ); } else { - return Err(format!( + anyhow::bail!( "Could not delete 'failing' bundle: {:#}", err - )); + ); } } @@ -157,12 +236,10 @@ impl SupportBundleCollector { Ok(report) } - async fn collect_bundle<'a>( - &'a self, - opctx: &'a OpContext, - ) -> Result { - let mut report = CollectionReport::default(); - + async fn collect_bundle( + &self, + opctx: &OpContext, + ) -> anyhow::Result> { let pagparams = DataPageParams::max_page(); let result = self .datastore @@ -186,7 +263,7 @@ impl SupportBundleCollector { bundle.clone() } else { info!(&opctx.log, "No bundles to collect"); - return Ok(report); + return Ok(None); } } Err(err) => { @@ -195,24 +272,12 @@ impl SupportBundleCollector { "SupportBundleCollector: Failed to list collecting bundles"; "err" => %err ); - return Err(format!("failed to query database: {:#}", err)); + anyhow::bail!("failed to query database: {:#}", err); } }; - report.bundle = Some(bundle.id.into()); - - // TODO: Actually collect the bundle here. - - // TODO: Periodically check the state of the bundle we're collecting - - // it might have been cancelled if this is taking a while. - // - // Whenever we check, we're basically creating a "yield" point. - // This should perhaps be set up on a timer. - - // TODO: Store the bundle once collection has finished. - - // TODO: Remove the temporary storage we used in the Nexus zone to - // collect the bundle. + let mut report = + self.collect_bundle_and_store_on_sled(&opctx, &bundle).await?; if let Err(err) = self .datastore .support_bundle_update( @@ -228,9 +293,113 @@ impl SupportBundleCollector { "bundle" => %bundle.id, "err" => ?err, ); - return Ok(report); + return Ok(Some(report)); } + report.activated_in_db_ok = true; + Ok(Some(report)) + } + + // Collect the bundle within Nexus, and store it on a target sled. + async fn collect_bundle_and_store_on_sled( + &self, + opctx: &OpContext, + bundle: &SupportBundle, + ) -> anyhow::Result { + // Create a temporary directory where we'll store the support bundle + // as it's being collected. + let dir = tempdir()?; + let mut collection = Box::pin(collect_bundle_as_file( + &self.datastore, + opctx, + &dir, + bundle, + )); + + // We periodically check the state of the support bundle - if a user + // explicitly cancels it, we should stop the collection process and + // return. + let work_duration = tokio::time::Duration::from_secs(5); + let mut yield_interval = tokio::time::interval_at( + tokio::time::Instant::now() + work_duration, + work_duration, + ); + + let report = loop { + tokio::select! { + // Timer fired mid-collection - let's check if we should stop. 
+ _ = yield_interval.tick() => { + info!( + &opctx.log, + "Checking if Bundle Collection cancelled"; + "bundle" => %bundle.id + ); + + let bundle = self.datastore.support_bundle_get( + &opctx, + bundle.id.into() + ).await?; + if !matches!(bundle.state, SupportBundleState::Collecting) { + warn!( + &opctx.log, + "Support Bundle cancelled - stopping collection"; + "bundle" => %bundle.id, + "state" => ?bundle.state + ); + anyhow::bail!("Support Bundle Cancelled"); + } + }, + // Otherwise, keep making progress on the collection itself. + report = &mut collection => { + info!( + &opctx.log, + "Bundle Collection collected"; + "bundle" => %bundle.id + ); + break report?; + }, + } + }; + + // Create the zipfile as a temporary file + let mut zipfile = tokio::fs::File::from_std(bundle_to_zipfile(&dir)?); + // Verify the hash locally before we send it over the network + zipfile.seek(SeekFrom::Start(0)).await?; + let hash = sha2_hash(&mut zipfile).await?; + + // Find the sled where we're storing this bundle. + let sled_id = self + .datastore + .zpool_get_sled(&opctx, bundle.zpool_id.into()) + .await?; + let sled_client = nexus_networking::sled_client( + &self.datastore, + &opctx, + sled_id.into_untyped_uuid(), + &opctx.log, + ) + .await?; + + // Stream the zipfile to the sled where it should be kept + zipfile.seek(SeekFrom::Start(0)).await?; + let file_access = hyper_staticfile::vfs::TokioFileAccess::new(zipfile); + let file_stream = + hyper_staticfile::util::FileBytesStream::new(file_access); + let body = + reqwest::Body::wrap(hyper_staticfile::Body::Full(file_stream)); + + sled_client + .support_bundle_create( + &ZpoolUuid::from(bundle.zpool_id), + &DatasetUuid::from(bundle.dataset_id), + &SupportBundleUuid::from(bundle.id), + &hash.to_string(), + body, + ) + .await?; + + // Returning from this method should drop all temporary storage + // allocated locally for this support bundle. Ok(report) } } @@ -252,13 +421,17 @@ impl BackgroundTask for SupportBundleCollector { match self.cleanup_destroyed_bundles(&opctx).await { Ok(report) => cleanup_report = Some(report), - Err(err) => cleanup_err = Some(json!({ "cleanup_error": err })), + Err(err) => { + cleanup_err = + Some(json!({ "cleanup_error": err.to_string() })) + } }; match self.collect_bundle(&opctx).await { Ok(report) => collection_report = Some(report), Err(err) => { - collection_err = Some(json!({ "collect_error": err })) + collection_err = + Some(json!({ "collect_error": err.to_string() })) } }; @@ -272,3 +445,183 @@ impl BackgroundTask for SupportBundleCollector { .boxed() } } + +// Takes a directory "dir", and zips the contents into a single zipfile. +fn bundle_to_zipfile(dir: &Utf8TempDir) -> anyhow::Result { + let tempfile = tempfile()?; + let mut zip = ZipWriter::new(tempfile); + + recursively_add_directory_to_zipfile(&mut zip, dir.path())?; + + Ok(zip.finish()?) +} + +fn recursively_add_directory_to_zipfile( + zip: &mut ZipWriter, + dir_path: &Utf8Path, +) -> anyhow::Result<()> { + for entry in dir_path.read_dir_utf8()? 
{ + let entry = entry?; + let file_type = entry.file_type()?; + if file_type.is_file() { + zip.deep_copy_file_from_path( + entry.path(), + entry.path().strip_prefix(dir_path)?, + )?; + } + if file_type.is_dir() { + recursively_add_directory_to_zipfile(zip, entry.path())?; + } + } + Ok(()) +} + +async fn sha2_hash(file: &mut tokio::fs::File) -> anyhow::Result { + let mut buf = vec![0u8; 65536]; + let mut ctx = Sha256::new(); + loop { + let n = file.read(&mut buf).await?; + if n == 0 { + break; + } + ctx.write_all(&buf[0..n])?; + } + + let digest = ctx.finalize(); + Ok(ArtifactHash(digest.as_slice().try_into()?)) +} + +// Identifies what we could or could not store within this support bundle. +// +// This struct will get emitted as part of the background task infrastructure. +#[derive(Debug, Serialize)] +struct CollectionReport { + bundle: SupportBundleUuid, + + // True iff we could list in-service sleds + listed_in_service_sleds: bool, + + // True iff the bundle was successfully made 'active' in the database. + activated_in_db_ok: bool, +} + +impl CollectionReport { + fn new(bundle: SupportBundleUuid) -> Self { + Self { + bundle, + listed_in_service_sleds: false, + activated_in_db_ok: false, + } + } +} + +// Perform the work of collecting the support bundle into a temporary directory +// +// - "dir" is a directory where data can be stored. +// - "bundle" is metadata about the bundle being collected. +// +// If a partial bundle can be collected, it should be returned as +// an Ok(CollectionReport). Any failures from this function will prevent +// the support bundle from being collected altogether. +// +// NOTE: The background task infrastructure will periodically check to see +// if the bundle has been cancelled by a user while it is being collected. +// If that happens, this function will be CANCELLED at an await point. +// +// As a result, it is important that this function be implemented as +// cancel-safe. +async fn collect_bundle_as_file( + datastore: &Arc, + opctx: &OpContext, + dir: &Utf8TempDir, + bundle: &SupportBundle, +) -> anyhow::Result { + let mut report = CollectionReport::new(bundle.id.into()); + + tokio::fs::write(dir.path().join("bundle_id.txt"), bundle.id.to_string()) + .await?; + + if let Ok(all_sleds) = + datastore.sled_list_all_batched(&opctx, SledFilter::InService).await + { + report.listed_in_service_sleds = true; + + // NOTE: This could be, and probably should be, done concurrently. 
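+        // For each in-service sled, write out basic sled metadata and the
+        // output of a few debug commands (dladm, ipadm, zoneadm) gathered via
+        // the sled agent. A sled that cannot be reached is recorded with an
+        // "error.txt" instead of failing the whole collection.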
+ for sled in &all_sleds { + let sled_path = dir + .path() + .join("rack") + .join(sled.rack_id.to_string()) + .join("sled") + .join(sled.id().to_string()); + tokio::fs::create_dir_all(&sled_path).await?; + tokio::fs::write(sled_path.join("sled.txt"), format!("{sled:?}")) + .await?; + + let Ok(sled_client) = nexus_networking::sled_client( + &datastore, + &opctx, + sled.id(), + &opctx.log, + ) + .await + else { + tokio::fs::write( + sled_path.join("error.txt"), + "Could not contact sled", + ) + .await?; + continue; + }; + + write_command_result_or_error( + &sled_path, + "dladm", + sled_client.support_dladm_info().await, + ) + .await?; + write_command_result_or_error( + &sled_path, + "ipadm", + sled_client.support_ipadm_info().await, + ) + .await?; + write_command_result_or_error( + &sled_path, + "zoneadm", + sled_client.support_zoneadm_info().await, + ) + .await?; + } + } + + Ok(report) +} + +async fn write_command_result_or_error( + path: &Utf8Path, + command: &str, + result: Result< + sled_agent_client::ResponseValue, + sled_agent_client::Error, + >, +) -> anyhow::Result<()> { + match result { + Ok(result) => { + let output = result + .into_inner_stream() + .try_collect::() + .await?; + tokio::fs::write(path.join(format!("{command}.txt")), output) + .await?; + } + Err(err) => { + tokio::fs::write( + path.join(format!("{command}_err.txt")), + err.to_string(), + ) + .await?; + } + } + Ok(()) +} From 7c33f19af9504fba317ae3587602b8a47832b34b Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 27 Nov 2024 11:11:48 -0800 Subject: [PATCH 19/61] fix typed uuid merge --- nexus/test-utils/src/resource_helpers.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/test-utils/src/resource_helpers.rs b/nexus/test-utils/src/resource_helpers.rs index ebf652b8b29..c5cb0231d18 100644 --- a/nexus/test-utils/src/resource_helpers.rs +++ b/nexus/test-utils/src/resource_helpers.rs @@ -1175,7 +1175,7 @@ impl<'a, N: NexusServer> DiskTest<'a, N> { for disk in &disks_config.disks { self.add_zpool_with_dataset_ext( *sled_id, - PhysicalDiskUuid::from_untyped_uuid(disk.id), + disk.id, disk.pool_id, DatasetUuid::new_v4(), Self::DEFAULT_ZPOOL_SIZE_GIB, From 10c3745349f5ef3c2890e208928b29dca7f74ef6 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 27 Nov 2024 11:16:30 -0800 Subject: [PATCH 20/61] more typed UUID merges --- nexus/db-queries/src/db/datastore/support_bundle.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nexus/db-queries/src/db/datastore/support_bundle.rs b/nexus/db-queries/src/db/datastore/support_bundle.rs index b89ce5cec09..3f04cb11352 100644 --- a/nexus/db-queries/src/db/datastore/support_bundle.rs +++ b/nexus/db-queries/src/db/datastore/support_bundle.rs @@ -485,6 +485,7 @@ mod test { use omicron_common::api::internal::shared::DatasetKind::Debug as DebugDatasetKind; use omicron_test_utils::dev; use omicron_uuid_kinds::DatasetUuid; + use omicron_uuid_kinds::PhysicalDiskUuid; use omicron_uuid_kinds::SledUuid; use rand::Rng; @@ -573,7 +574,7 @@ mod test { let zpool = Zpool::new( *pool.pool.as_untyped_uuid(), *self.sled.as_untyped_uuid(), - Uuid::new_v4(), + PhysicalDiskUuid::new_v4(), ); datastore .zpool_insert(opctx, zpool) From 8b8fab709f73719a6225d0b31f676d82e05eb82f Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 27 Nov 2024 11:27:44 -0800 Subject: [PATCH 21/61] update omdb output --- dev-tools/omdb/tests/env.out | 12 ++++++++++++ dev-tools/omdb/tests/successes.out | 18 ++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git 
a/dev-tools/omdb/tests/env.out b/dev-tools/omdb/tests/env.out index f6b56e2b7bc..a7dc291bd68 100644 --- a/dev-tools/omdb/tests/env.out +++ b/dev-tools/omdb/tests/env.out @@ -162,6 +162,10 @@ task: "service_zone_nat_tracker" ensures service zone nat records are recorded in NAT RPW table +task: "support_bundle_collector" + ensure new support bundles are automatically marked in-service + + task: "switch_port_config_manager" manages switch port settings for rack switches @@ -333,6 +337,10 @@ task: "service_zone_nat_tracker" ensures service zone nat records are recorded in NAT RPW table +task: "support_bundle_collector" + ensure new support bundles are automatically marked in-service + + task: "switch_port_config_manager" manages switch port settings for rack switches @@ -491,6 +499,10 @@ task: "service_zone_nat_tracker" ensures service zone nat records are recorded in NAT RPW table +task: "support_bundle_collector" + ensure new support bundles are automatically marked in-service + + task: "switch_port_config_manager" manages switch port settings for rack switches diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index dcf1671d8f4..b374816d2db 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -380,6 +380,10 @@ task: "service_zone_nat_tracker" ensures service zone nat records are recorded in NAT RPW table +task: "support_bundle_collector" + ensure new support bundles are automatically marked in-service + + task: "switch_port_config_manager" manages switch port settings for rack switches @@ -691,6 +695,13 @@ task: "service_zone_nat_tracker" started at (s ago) and ran for ms last completion reported error: inventory collection is None +task: "support_bundle_collector" + configured period: every s + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + last completion reported error: task disabled + task: "switch_port_config_manager" configured period: every s currently executing: no @@ -1134,6 +1145,13 @@ task: "service_zone_nat_tracker" started at (s ago) and ran for ms last completion reported error: inventory collection is None +task: "support_bundle_collector" + configured period: every s + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + last completion reported error: task disabled + task: "switch_port_config_manager" configured period: every s currently executing: no From 03af36d1ced986f7cb3fc50468bf176becff5ab3 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 27 Nov 2024 16:31:18 -0800 Subject: [PATCH 22/61] [nexus] Implement Support Bundle API --- Cargo.lock | 1 + clients/sled-agent-client/src/lib.rs | 2 + common/src/api/external/mod.rs | 18 +++ nexus/Cargo.toml | 1 + nexus/external-api/src/lib.rs | 17 ++- nexus/src/app/mod.rs | 1 + nexus/src/app/support_bundles.rs | 130 +++++++++++++++++++++ nexus/src/external_api/http_entrypoints.rs | 119 ++++++++++++++----- nexus/types/src/external_api/params.rs | 2 +- sled-agent/api/src/lib.rs | 19 +-- 10 files changed, 264 insertions(+), 46 deletions(-) create mode 100644 nexus/src/app/support_bundles.rs diff --git a/Cargo.lock b/Cargo.lock index e056f89fa84..de165704f99 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6984,6 +6984,7 @@ dependencies = [ "propolis-client 0.1.0 (git+https://github.com/oxidecomputer/propolis?rev=8f8fbb74662b4e19b643c500d55d2d384a6cee5e)", "qorb", "rand", + "range-requests", "rcgen", 
"ref-cast", "regex", diff --git a/clients/sled-agent-client/src/lib.rs b/clients/sled-agent-client/src/lib.rs index f0fd4ceeaa1..f818f241fa4 100644 --- a/clients/sled-agent-client/src/lib.rs +++ b/clients/sled-agent-client/src/lib.rs @@ -75,6 +75,8 @@ progenitor::generate_api!( RouterVersion = omicron_common::api::internal::shared::RouterVersion, SledRole = nexus_sled_agent_shared::inventory::SledRole, SourceNatConfig = omicron_common::api::internal::shared::SourceNatConfig, + SupportBundleGetQueryParams = omicron_common::api::external::SupportBundleGetQueryParams, + SupportBundleQueryType = omicron_common::api::external::SupportBundleQueryType, SwitchLocation = omicron_common::api::external::SwitchLocation, TypedUuidForDatasetKind = omicron_uuid_kinds::DatasetUuid, TypedUuidForInstanceKind = omicron_uuid_kinds::InstanceUuid, diff --git a/common/src/api/external/mod.rs b/common/src/api/external/mod.rs index 38a9de05648..6db4d5c1890 100644 --- a/common/src/api/external/mod.rs +++ b/common/src/api/external/mod.rs @@ -3090,6 +3090,24 @@ pub enum ImportExportPolicy { Allow(Vec), } +/// Query parameters for reading the support bundle +#[derive(Deserialize, Serialize, JsonSchema)] +pub struct SupportBundleGetQueryParams { + pub query_type: SupportBundleQueryType, +} + +/// Describes the type of access to the support bundle +#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] +#[serde(tag = "type", rename_all = "snake_case")] +pub enum SupportBundleQueryType { + /// Access the whole support bundle + Whole, + /// Access the names of all files within the support bundle + Index, + /// Access a specific file within the support bundle + Path { file_path: String }, +} + #[cfg(test)] mod test { use serde::Deserialize; diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml index 89a4f7ce704..cc9c056cc1c 100644 --- a/nexus/Cargo.toml +++ b/nexus/Cargo.toml @@ -72,6 +72,7 @@ progenitor-client.workspace = true propolis-client.workspace = true qorb.workspace = true rand.workspace = true +range-requests.workspace = true ref-cast.workspace = true reqwest = { workspace = true, features = ["json"] } ring.workspace = true diff --git a/nexus/external-api/src/lib.rs b/nexus/external-api/src/lib.rs index 2860db740e6..4c65825396a 100644 --- a/nexus/external-api/src/lib.rs +++ b/nexus/external-api/src/lib.rs @@ -2793,6 +2793,19 @@ pub trait NexusExternalApi { async fn support_bundle_download( rqctx: RequestContext, path_params: Path, + body: TypedBody, + ) -> Result, HttpError>; + + /// Download the metadata of a single support bundle + #[endpoint { + method = HEAD, + path = "/experimental/v1/system/support-bundles/{support_bundle}/download", + tags = ["hidden"], // system/support-bundles: only one tag is allowed + }] + async fn support_bundle_head( + rqctx: RequestContext, + path_params: Path, + body: TypedBody, ) -> Result, HttpError>; /// Create a new support bundle @@ -2803,7 +2816,7 @@ pub trait NexusExternalApi { }] async fn support_bundle_create( rqctx: RequestContext, - ) -> Result, HttpError>; + ) -> Result, HttpError>; /// Delete an existing support bundle /// @@ -2817,7 +2830,7 @@ pub trait NexusExternalApi { async fn support_bundle_delete( rqctx: RequestContext, path_params: Path, - ) -> Result, HttpError>; + ) -> Result; // Probes (experimental) diff --git a/nexus/src/app/mod.rs b/nexus/src/app/mod.rs index 435ca2a56d1..a4f833c9712 100644 --- a/nexus/src/app/mod.rs +++ b/nexus/src/app/mod.rs @@ -77,6 +77,7 @@ mod sled; mod sled_instance; mod snapshot; mod ssh_key; +mod support_bundles; mod switch; mod 
switch_interface; mod switch_port; diff --git a/nexus/src/app/support_bundles.rs b/nexus/src/app/support_bundles.rs new file mode 100644 index 00000000000..7f91d3b8d59 --- /dev/null +++ b/nexus/src/app/support_bundles.rs @@ -0,0 +1,130 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Access to Support Bundles + +use dropshot::Body; +use futures::TryStreamExt; +use http::Response; +use nexus_db_model::SupportBundle; +use nexus_db_model::SupportBundleState; +use nexus_db_queries::context::OpContext; +use omicron_common::api::external::CreateResult; +use omicron_common::api::external::DataPageParams; +use omicron_common::api::external::DeleteResult; +use omicron_common::api::external::Error; +use omicron_common::api::external::ListResultVec; +use omicron_common::api::external::LookupResult; +use omicron_common::api::external::SupportBundleGetQueryParams; +use omicron_uuid_kinds::DatasetUuid; +use omicron_uuid_kinds::SupportBundleUuid; +use omicron_uuid_kinds::ZpoolUuid; +use range_requests::PotentialRange; +use uuid::Uuid; + +impl super::Nexus { + pub async fn support_bundle_list( + &self, + opctx: &OpContext, + pagparams: &DataPageParams<'_, Uuid>, + ) -> ListResultVec { + self.db_datastore.support_bundle_list(&opctx, pagparams).await + } + + pub async fn support_bundle_view( + &self, + opctx: &OpContext, + id: SupportBundleUuid, + ) -> LookupResult { + self.db_datastore.support_bundle_get(&opctx, id).await + } + + pub async fn support_bundle_create( + &self, + opctx: &OpContext, + reason: &'static str, + ) -> CreateResult { + self.db_datastore.support_bundle_create(&opctx, reason, self.id).await + } + + pub async fn support_bundle_download( + &self, + opctx: &OpContext, + id: SupportBundleUuid, + query: &SupportBundleGetQueryParams, + head: bool, + _range: Option, + ) -> Result, Error> { + // Lookup the bundle, confirm it's accessible + let bundle = self.db_datastore.support_bundle_get(&opctx, id).await?; + if !matches!(bundle.state, SupportBundleState::Active) { + return Err(Error::invalid_request( + "Cannot download bundle in non-active state", + )); + } + + // Lookup the sled holding the bundle, forward the request there + let sled_id = self + .db_datastore + .zpool_get_sled(&opctx, bundle.zpool_id.into()) + .await?; + let client = self.sled_client(&sled_id).await?; + + // TODO: Use "range"? + + let response = if head { + client + .support_bundle_head( + &ZpoolUuid::from(bundle.zpool_id), + &DatasetUuid::from(bundle.dataset_id), + &SupportBundleUuid::from(bundle.id), + &query, + ) + .await + } else { + client + .support_bundle_get( + &ZpoolUuid::from(bundle.zpool_id), + &DatasetUuid::from(bundle.dataset_id), + &SupportBundleUuid::from(bundle.id), + &query, + ) + .await + }; + + let response = + response.map_err(|err| Error::internal_error(&err.to_string()))?; + + // The result from the sled agent a "ResponseValue", but we + // need to coerce that type into a "Response" while preserving the + // status, headers, and body. 
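+        // Stream the body back to the client rather than buffering the
+        // entire bundle in Nexus.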
+ let mut builder = Response::builder().status(response.status()); + let headers = builder.headers_mut().unwrap(); + headers.extend( + response.headers().iter().map(|(k, v)| (k.clone(), v.clone())), + ); + let body = http_body_util::StreamBody::new( + response + .into_inner_stream() + .map_ok(|b| hyper::body::Frame::data(b)), + ); + Ok(builder.body(Body::wrap(body)).unwrap()) + } + + pub async fn support_bundle_delete( + &self, + opctx: &OpContext, + id: SupportBundleUuid, + ) -> DeleteResult { + // NOTE: We can't necessarily delete the support bundle + // immediately - it might have state that needs cleanup + // by a background task - so, instead, we mark it deleting. + // + // This is a terminal state + self.db_datastore + .support_bundle_update(&opctx, id, SupportBundleState::Destroying) + .await?; + Ok(()) + } +} diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index 004a3482c56..c00912a6624 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -81,6 +81,7 @@ use omicron_common::api::external::NameOrId; use omicron_common::api::external::Probe; use omicron_common::api::external::RouterRoute; use omicron_common::api::external::RouterRouteKind; +use omicron_common::api::external::SupportBundleGetQueryParams; use omicron_common::api::external::SwitchPort; use omicron_common::api::external::SwitchPortSettings; use omicron_common::api::external::SwitchPortSettingsView; @@ -90,11 +91,13 @@ use omicron_common::api::external::VpcFirewallRuleUpdateParams; use omicron_common::api::external::VpcFirewallRules; use omicron_common::bail_unless; use omicron_uuid_kinds::GenericUuid; +use omicron_uuid_kinds::SupportBundleUuid; use propolis_client::support::tungstenite::protocol::frame::coding::CloseCode; use propolis_client::support::tungstenite::protocol::{ CloseFrame, Role as WebSocketRole, }; use propolis_client::support::WebSocketStream; +use range_requests::RequestContextEx; use ref_cast::RefCast; type NexusApiDescription = ApiDescription; @@ -6001,20 +6004,33 @@ impl NexusExternalApi for NexusExternalApiImpl { async fn support_bundle_list( rqctx: RequestContext, - _query_params: Query, + query_params: Query, ) -> Result>, HttpError> { let apictx = rqctx.context(); let handler = async { let nexus = &apictx.context.nexus; + let query = query_params.into_inner(); + let pagparams = data_page_params_for(&rqctx, &query)?; + let opctx = crate::context::op_context_for_external_api(&rqctx).await?; - Err(nexus - .unimplemented_todo(&opctx, crate::app::Unimpl::Public) - .await - .into()) + let bundles = nexus + .support_bundle_list(&opctx, &pagparams) + .await? 
+ .into_iter() + .map(|p| p.into()) + .collect(); + + Ok(HttpResponseOk(ScanById::results_page( + &query, + bundles, + &|_, bundle: &shared::SupportBundleInfo| { + bundle.id.into_untyped_uuid() + }, + )?)) }; apictx .context @@ -6025,19 +6041,24 @@ impl NexusExternalApi for NexusExternalApiImpl { async fn support_bundle_view( rqctx: RequestContext, - _path_params: Path, + path_params: Path, ) -> Result, HttpError> { let apictx = rqctx.context(); let handler = async { let nexus = &apictx.context.nexus; + let path = path_params.into_inner(); let opctx = crate::context::op_context_for_external_api(&rqctx).await?; - Err(nexus - .unimplemented_todo(&opctx, crate::app::Unimpl::Public) - .await - .into()) + let bundle = nexus + .support_bundle_view( + &opctx, + SupportBundleUuid::from_untyped_uuid(path.support_bundle), + ) + .await?; + + Ok(HttpResponseOk(bundle.into())) }; apictx .context @@ -6048,19 +6069,60 @@ impl NexusExternalApi for NexusExternalApiImpl { async fn support_bundle_download( rqctx: RequestContext, - _path_params: Path, + path_params: Path, + body: TypedBody, ) -> Result, HttpError> { let apictx = rqctx.context(); let handler = async { let nexus = &apictx.context.nexus; + let path = path_params.into_inner(); + let opctx = + crate::context::op_context_for_external_api(&rqctx).await?; + let head = false; + let range = rqctx.range(); + let body = nexus + .support_bundle_download( + &opctx, + SupportBundleUuid::from_untyped_uuid(path.support_bundle), + &body.into_inner(), + head, + range, + ) + .await?; + Ok(body) + }; + apictx + .context + .external_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } + + async fn support_bundle_head( + rqctx: RequestContext, + path_params: Path, + body: TypedBody, + ) -> Result, HttpError> { + let apictx = rqctx.context(); + let handler = async { + let nexus = &apictx.context.nexus; + let path = path_params.into_inner(); let opctx = crate::context::op_context_for_external_api(&rqctx).await?; + let head = true; + let range = rqctx.range(); - Err(nexus - .unimplemented_todo(&opctx, crate::app::Unimpl::Public) - .await - .into()) + let body = nexus + .support_bundle_download( + &opctx, + SupportBundleUuid::from_untyped_uuid(path.support_bundle), + &body.into_inner(), + head, + range, + ) + .await?; + Ok(body) }; apictx .context @@ -6071,7 +6133,7 @@ impl NexusExternalApi for NexusExternalApiImpl { async fn support_bundle_create( rqctx: RequestContext, - ) -> Result, HttpError> { + ) -> Result, HttpError> { let apictx = rqctx.context(); let handler = async { let nexus = &apictx.context.nexus; @@ -6079,10 +6141,10 @@ impl NexusExternalApi for NexusExternalApiImpl { let opctx = crate::context::op_context_for_external_api(&rqctx).await?; - Err(nexus - .unimplemented_todo(&opctx, crate::app::Unimpl::Public) - .await - .into()) + let bundle = nexus + .support_bundle_create(&opctx, "Created by external API") + .await?; + Ok(HttpResponseCreated(bundle.into())) }; apictx .context @@ -6093,19 +6155,24 @@ impl NexusExternalApi for NexusExternalApiImpl { async fn support_bundle_delete( rqctx: RequestContext, - _path_params: Path, - ) -> Result, HttpError> { + path_params: Path, + ) -> Result { let apictx = rqctx.context(); let handler = async { let nexus = &apictx.context.nexus; + let path = path_params.into_inner(); let opctx = crate::context::op_context_for_external_api(&rqctx).await?; - Err(nexus - .unimplemented_todo(&opctx, crate::app::Unimpl::Public) - .await - .into()) + nexus + .support_bundle_delete( + &opctx, + 
SupportBundleUuid::from_untyped_uuid(path.support_bundle), + ) + .await?; + + Ok(HttpResponseDeleted()) }; apictx .context diff --git a/nexus/types/src/external_api/params.rs b/nexus/types/src/external_api/params.rs index 0929b2c04b5..4a4af8b052f 100644 --- a/nexus/types/src/external_api/params.rs +++ b/nexus/types/src/external_api/params.rs @@ -89,8 +89,8 @@ path_param!(SshKeyPath, ssh_key, "SSH key"); path_param!(AddressLotPath, address_lot, "address lot"); path_param!(ProbePath, probe, "probe"); path_param!(CertificatePath, certificate, "certificate"); -path_param!(SupportBundlePath, support_bundle, "support bundle"); +id_path_param!(SupportBundlePath, support_bundle, "support bundle"); id_path_param!(GroupPath, group_id, "group"); // TODO: The hardware resources should be represented by its UUID or a hardware diff --git a/sled-agent/api/src/lib.rs b/sled-agent/api/src/lib.rs index 634640079a2..be0dbdf1d8f 100644 --- a/sled-agent/api/src/lib.rs +++ b/sled-agent/api/src/lib.rs @@ -702,23 +702,8 @@ pub struct SupportBundleGetHeaders { range: String, } -/// Query parameters for reading the support bundle -#[derive(Deserialize, Serialize, JsonSchema)] -pub struct SupportBundleGetQueryParams { - pub query_type: SupportBundleQueryType, -} - -/// Describes the type of access to the support bundle -#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] -#[serde(tag = "type", rename_all = "snake_case")] -pub enum SupportBundleQueryType { - /// Access the whole support bundle - Whole, - /// Access the names of all files within the support bundle - Index, - /// Access a specific file within the support bundle - Path { file_path: String }, -} +pub use omicron_common::api::external::SupportBundleGetQueryParams; +pub use omicron_common::api::external::SupportBundleQueryType; #[derive(Deserialize, Debug, Serialize, JsonSchema, PartialEq)] #[serde(rename_all = "snake_case")] From 2b42d0e9eba874261cfe04e8dada79942d9b9633 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 27 Nov 2024 16:47:23 -0800 Subject: [PATCH 23/61] API updates pulled from 7187 as I built it --- clients/sled-agent-client/src/lib.rs | 3 + common/src/api/external/mod.rs | 18 +++ nexus/external-api/output/nexus_tags.txt | 1 + nexus/external-api/src/lib.rs | 17 ++- nexus/src/external_api/http_entrypoints.rs | 30 ++++- nexus/types/src/external_api/params.rs | 2 +- openapi/nexus.json | 145 ++++++++++++++++++--- sled-agent/api/src/lib.rs | 19 +-- 8 files changed, 196 insertions(+), 39 deletions(-) diff --git a/clients/sled-agent-client/src/lib.rs b/clients/sled-agent-client/src/lib.rs index f2eb957650c..f818f241fa4 100644 --- a/clients/sled-agent-client/src/lib.rs +++ b/clients/sled-agent-client/src/lib.rs @@ -75,12 +75,15 @@ progenitor::generate_api!( RouterVersion = omicron_common::api::internal::shared::RouterVersion, SledRole = nexus_sled_agent_shared::inventory::SledRole, SourceNatConfig = omicron_common::api::internal::shared::SourceNatConfig, + SupportBundleGetQueryParams = omicron_common::api::external::SupportBundleGetQueryParams, + SupportBundleQueryType = omicron_common::api::external::SupportBundleQueryType, SwitchLocation = omicron_common::api::external::SwitchLocation, TypedUuidForDatasetKind = omicron_uuid_kinds::DatasetUuid, TypedUuidForInstanceKind = omicron_uuid_kinds::InstanceUuid, TypedUuidForOmicronZoneKind = omicron_uuid_kinds::OmicronZoneUuid, TypedUuidForPropolisKind = omicron_uuid_kinds::PropolisUuid, TypedUuidForSledKind = omicron_uuid_kinds::SledUuid, + TypedUuidForSupportBundleKind = 
omicron_uuid_kinds::SupportBundleUuid, TypedUuidForZpoolKind = omicron_uuid_kinds::ZpoolUuid, Vni = omicron_common::api::external::Vni, ZpoolKind = omicron_common::zpool_name::ZpoolKind, diff --git a/common/src/api/external/mod.rs b/common/src/api/external/mod.rs index 38a9de05648..6db4d5c1890 100644 --- a/common/src/api/external/mod.rs +++ b/common/src/api/external/mod.rs @@ -3090,6 +3090,24 @@ pub enum ImportExportPolicy { Allow(Vec), } +/// Query parameters for reading the support bundle +#[derive(Deserialize, Serialize, JsonSchema)] +pub struct SupportBundleGetQueryParams { + pub query_type: SupportBundleQueryType, +} + +/// Describes the type of access to the support bundle +#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] +#[serde(tag = "type", rename_all = "snake_case")] +pub enum SupportBundleQueryType { + /// Access the whole support bundle + Whole, + /// Access the names of all files within the support bundle + Index, + /// Access a specific file within the support bundle + Path { file_path: String }, +} + #[cfg(test)] mod test { use serde::Deserialize; diff --git a/nexus/external-api/output/nexus_tags.txt b/nexus/external-api/output/nexus_tags.txt index 0526a10b33b..11e942496ec 100644 --- a/nexus/external-api/output/nexus_tags.txt +++ b/nexus/external-api/output/nexus_tags.txt @@ -33,6 +33,7 @@ probe_view GET /experimental/v1/probes/{probe support_bundle_create POST /experimental/v1/system/support-bundles support_bundle_delete DELETE /experimental/v1/system/support-bundles/{support_bundle} support_bundle_download GET /experimental/v1/system/support-bundles/{support_bundle}/download +support_bundle_head HEAD /experimental/v1/system/support-bundles/{support_bundle}/download support_bundle_list GET /experimental/v1/system/support-bundles support_bundle_view GET /experimental/v1/system/support-bundles/{support_bundle} diff --git a/nexus/external-api/src/lib.rs b/nexus/external-api/src/lib.rs index 2860db740e6..4c65825396a 100644 --- a/nexus/external-api/src/lib.rs +++ b/nexus/external-api/src/lib.rs @@ -2793,6 +2793,19 @@ pub trait NexusExternalApi { async fn support_bundle_download( rqctx: RequestContext, path_params: Path, + body: TypedBody, + ) -> Result, HttpError>; + + /// Download the metadata of a single support bundle + #[endpoint { + method = HEAD, + path = "/experimental/v1/system/support-bundles/{support_bundle}/download", + tags = ["hidden"], // system/support-bundles: only one tag is allowed + }] + async fn support_bundle_head( + rqctx: RequestContext, + path_params: Path, + body: TypedBody, ) -> Result, HttpError>; /// Create a new support bundle @@ -2803,7 +2816,7 @@ pub trait NexusExternalApi { }] async fn support_bundle_create( rqctx: RequestContext, - ) -> Result, HttpError>; + ) -> Result, HttpError>; /// Delete an existing support bundle /// @@ -2817,7 +2830,7 @@ pub trait NexusExternalApi { async fn support_bundle_delete( rqctx: RequestContext, path_params: Path, - ) -> Result, HttpError>; + ) -> Result; // Probes (experimental) diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index 004a3482c56..bab84e8c38d 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -81,6 +81,7 @@ use omicron_common::api::external::NameOrId; use omicron_common::api::external::Probe; use omicron_common::api::external::RouterRoute; use omicron_common::api::external::RouterRouteKind; +use omicron_common::api::external::SupportBundleGetQueryParams; use 
omicron_common::api::external::SwitchPort; use omicron_common::api::external::SwitchPortSettings; use omicron_common::api::external::SwitchPortSettingsView; @@ -6049,6 +6050,31 @@ impl NexusExternalApi for NexusExternalApiImpl { async fn support_bundle_download( rqctx: RequestContext, _path_params: Path, + _body: TypedBody, + ) -> Result, HttpError> { + let apictx = rqctx.context(); + let handler = async { + let nexus = &apictx.context.nexus; + + let opctx = + crate::context::op_context_for_external_api(&rqctx).await?; + + Err(nexus + .unimplemented_todo(&opctx, crate::app::Unimpl::Public) + .await + .into()) + }; + apictx + .context + .external_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } + + async fn support_bundle_head( + rqctx: RequestContext, + _path_params: Path, + _body: TypedBody, ) -> Result, HttpError> { let apictx = rqctx.context(); let handler = async { @@ -6071,7 +6097,7 @@ impl NexusExternalApi for NexusExternalApiImpl { async fn support_bundle_create( rqctx: RequestContext, - ) -> Result, HttpError> { + ) -> Result, HttpError> { let apictx = rqctx.context(); let handler = async { let nexus = &apictx.context.nexus; @@ -6094,7 +6120,7 @@ impl NexusExternalApi for NexusExternalApiImpl { async fn support_bundle_delete( rqctx: RequestContext, _path_params: Path, - ) -> Result, HttpError> { + ) -> Result { let apictx = rqctx.context(); let handler = async { let nexus = &apictx.context.nexus; diff --git a/nexus/types/src/external_api/params.rs b/nexus/types/src/external_api/params.rs index 0929b2c04b5..4a4af8b052f 100644 --- a/nexus/types/src/external_api/params.rs +++ b/nexus/types/src/external_api/params.rs @@ -89,8 +89,8 @@ path_param!(SshKeyPath, ssh_key, "SSH key"); path_param!(AddressLotPath, address_lot, "address lot"); path_param!(ProbePath, probe, "probe"); path_param!(CertificatePath, certificate, "certificate"); -path_param!(SupportBundlePath, support_bundle, "support bundle"); +id_path_param!(SupportBundlePath, support_bundle, "support bundle"); id_path_param!(GroupPath, group_id, "group"); // TODO: The hardware resources should be represented by its UUID or a hardware diff --git a/openapi/nexus.json b/openapi/nexus.json index c9da9dac5a2..fe91ff98e31 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -366,8 +366,8 @@ "summary": "Create a new support bundle", "operationId": "support_bundle_create", "responses": { - "200": { - "description": "successful operation", + "201": { + "description": "successful creation", "content": { "application/json": { "schema": { @@ -396,10 +396,11 @@ { "in": "path", "name": "support_bundle", - "description": "Name or ID of the support bundle", + "description": "ID of the support bundle", "required": true, "schema": { - "$ref": "#/components/schemas/NameOrId" + "type": "string", + "format": "uuid" } } ], @@ -433,23 +434,17 @@ { "in": "path", "name": "support_bundle", - "description": "Name or ID of the support bundle", + "description": "ID of the support bundle", "required": true, "schema": { - "$ref": "#/components/schemas/NameOrId" + "type": "string", + "format": "uuid" } } ], "responses": { - "200": { - "description": "successful operation", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/SupportBundleInfo" - } - } - } + "204": { + "description": "successful deletion" }, "4XX": { "$ref": "#/components/responses/Error" @@ -471,13 +466,63 @@ { "in": "path", "name": "support_bundle", - "description": "Name or ID of the support bundle", + "description": "ID of the 
support bundle", "required": true, "schema": { - "$ref": "#/components/schemas/NameOrId" + "type": "string", + "format": "uuid" } } ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SupportBundleGetQueryParams" + } + } + }, + "required": true + }, + "responses": { + "default": { + "description": "", + "content": { + "*/*": { + "schema": {} + } + } + } + } + }, + "head": { + "tags": [ + "hidden" + ], + "summary": "Download the metadata of a single support bundle", + "operationId": "support_bundle_head", + "parameters": [ + { + "in": "path", + "name": "support_bundle", + "description": "ID of the support bundle", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SupportBundleGetQueryParams" + } + } + }, + "required": true + }, "responses": { "default": { "description": "", @@ -20199,6 +20244,18 @@ "items" ] }, + "SupportBundleGetQueryParams": { + "description": "Query parameters for reading the support bundle", + "type": "object", + "properties": { + "query_type": { + "$ref": "#/components/schemas/SupportBundleQueryType" + } + }, + "required": [ + "query_type" + ] + }, "SupportBundleInfo": { "type": "object", "properties": { @@ -20248,6 +20305,60 @@ "items" ] }, + "SupportBundleQueryType": { + "description": "Describes the type of access to the support bundle", + "oneOf": [ + { + "description": "Access the whole support bundle", + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "whole" + ] + } + }, + "required": [ + "type" + ] + }, + { + "description": "Access the names of all files within the support bundle", + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "index" + ] + } + }, + "required": [ + "type" + ] + }, + { + "description": "Access a specific file within the support bundle", + "type": "object", + "properties": { + "file_path": { + "type": "string" + }, + "type": { + "type": "string", + "enum": [ + "path" + ] + } + }, + "required": [ + "file_path", + "type" + ] + } + ] + }, "SupportBundleState": { "oneOf": [ { diff --git a/sled-agent/api/src/lib.rs b/sled-agent/api/src/lib.rs index 634640079a2..be0dbdf1d8f 100644 --- a/sled-agent/api/src/lib.rs +++ b/sled-agent/api/src/lib.rs @@ -702,23 +702,8 @@ pub struct SupportBundleGetHeaders { range: String, } -/// Query parameters for reading the support bundle -#[derive(Deserialize, Serialize, JsonSchema)] -pub struct SupportBundleGetQueryParams { - pub query_type: SupportBundleQueryType, -} - -/// Describes the type of access to the support bundle -#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] -#[serde(tag = "type", rename_all = "snake_case")] -pub enum SupportBundleQueryType { - /// Access the whole support bundle - Whole, - /// Access the names of all files within the support bundle - Index, - /// Access a specific file within the support bundle - Path { file_path: String }, -} +pub use omicron_common::api::external::SupportBundleGetQueryParams; +pub use omicron_common::api::external::SupportBundleQueryType; #[derive(Deserialize, Debug, Serialize, JsonSchema, PartialEq)] #[serde(rename_all = "snake_case")] From 1fccea8089bf39ff8806a6c0f767da59a3b11aa3 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 2 Dec 2024 10:33:20 -0800 Subject: [PATCH 24/61] expectorate --- nexus/tests/output/uncovered-authz-endpoints.txt | 1 + 1 file changed, 1 insertion(+) diff --git 
a/nexus/tests/output/uncovered-authz-endpoints.txt b/nexus/tests/output/uncovered-authz-endpoints.txt index ec4790c071d..13fb389d6ec 100644 --- a/nexus/tests/output/uncovered-authz-endpoints.txt +++ b/nexus/tests/output/uncovered-authz-endpoints.txt @@ -8,6 +8,7 @@ support_bundle_view (get "/experimental/v1/system/suppor support_bundle_download (get "/experimental/v1/system/support-bundles/{support_bundle}/download") ping (get "/v1/ping") networking_switch_port_status (get "/v1/system/hardware/switch-port/{port}/status") +support_bundle_head (head "/experimental/v1/system/support-bundles/{support_bundle}/download") device_auth_request (post "/device/auth") device_auth_confirm (post "/device/confirm") device_access_token (post "/device/token") From 417cab94031d44701778ec513deff6d7aa0f53a1 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 2 Dec 2024 11:18:01 -0800 Subject: [PATCH 25/61] update description --- dev-tools/omdb/tests/env.out | 6 +++--- dev-tools/omdb/tests/successes.out | 2 +- nexus/src/app/background/init.rs | 3 +-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/dev-tools/omdb/tests/env.out b/dev-tools/omdb/tests/env.out index a7dc291bd68..787a83236ea 100644 --- a/dev-tools/omdb/tests/env.out +++ b/dev-tools/omdb/tests/env.out @@ -163,7 +163,7 @@ task: "service_zone_nat_tracker" task: "support_bundle_collector" - ensure new support bundles are automatically marked in-service + Manage support bundle collection and cleanup task: "switch_port_config_manager" @@ -338,7 +338,7 @@ task: "service_zone_nat_tracker" task: "support_bundle_collector" - ensure new support bundles are automatically marked in-service + Manage support bundle collection and cleanup task: "switch_port_config_manager" @@ -500,7 +500,7 @@ task: "service_zone_nat_tracker" task: "support_bundle_collector" - ensure new support bundles are automatically marked in-service + Manage support bundle collection and cleanup task: "switch_port_config_manager" diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index b374816d2db..0c1d5402fbc 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -381,7 +381,7 @@ task: "service_zone_nat_tracker" task: "support_bundle_collector" - ensure new support bundles are automatically marked in-service + Manage support bundle collection and cleanup task: "switch_port_config_manager" diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 304f4f20c59..19cff790575 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -535,8 +535,7 @@ impl BackgroundTasksInitializer { // collection. 
driver.register(TaskDefinition { name: "support_bundle_collector", - description: - "ensure new support bundles are automatically marked in-service", + description: "Manage support bundle collection and cleanup", period: config.support_bundle_collector.period_secs, task_impl: Box::new( support_bundle_collector::SupportBundleCollector::new( From d10e0346b8c7495b9feb6c61359becbbe08cb31c Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 4 Dec 2024 17:15:28 -0800 Subject: [PATCH 26/61] big one, should be split up - Made an API abstraction above support bundles so it can be shared by the real/simulated sled agent - To use this API: - Refactored simulated sled agent storage APIs (internal mutability) - Changed a bunch of sled agent locks to std mutex - Implemented a "nested dataset" API in the simulated sled agent storage - Started using this simulated sled agent API in the support bundle collector background task tests --- .../tasks/support_bundle_collector.rs | 698 ++++++++++++++---- nexus/src/app/sagas/disk_create.rs | 11 +- nexus/src/app/sagas/instance_create.rs | 2 +- nexus/src/app/sagas/instance_ip_attach.rs | 4 +- nexus/src/app/sagas/instance_ip_detach.rs | 4 +- nexus/test-utils/src/resource_helpers.rs | 65 +- .../crucible_replacements.rs | 1 - nexus/tests/integration_tests/disks.rs | 10 +- nexus/tests/integration_tests/instances.rs | 8 +- nexus/tests/integration_tests/snapshots.rs | 4 +- .../integration_tests/volume_management.rs | 12 +- sled-agent/src/sim/artifact_store.rs | 9 +- sled-agent/src/sim/http_entrypoints.rs | 76 +- sled-agent/src/sim/http_entrypoints_pantry.rs | 27 +- .../src/sim/http_entrypoints_storage.rs | 34 +- sled-agent/src/sim/mod.rs | 1 + sled-agent/src/sim/server.rs | 45 +- sled-agent/src/sim/sled_agent.rs | 215 +++--- sled-agent/src/sim/storage.rs | 632 +++++++++------- sled-agent/src/support_bundle/storage.rs | 159 +++- 20 files changed, 1298 insertions(+), 719 deletions(-) diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index b74ffb889ca..7a6e9f06046 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -34,15 +34,18 @@ use std::sync::Arc; use tokio::io::AsyncReadExt; use tokio::io::AsyncSeekExt; use tokio::io::SeekFrom; +use zip::write::FullFileOptions; use zip::ZipWriter; -pub struct SupportBundleCollector { - datastore: Arc, - disable: bool, - nexus_id: OmicronZoneUuid, +// Specifies the data to be collected within the Support Bundle. +#[derive(Default)] +struct BundleRequest { + // If "false": Skip collecting host-specific info from each sled. + skip_sled_info: bool, } -#[derive(Debug, Default, Serialize)] +// Describes what happened while attempting to clean up Support Bundles. 
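+// Each field is a simple counter for one cleanup outcome, so a single report
+// can describe partial progress (some deletions succeeding while others fail).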
+#[derive(Debug, Default, Serialize, Eq, PartialEq)] struct CleanupReport { // Responses from Sled Agents sled_bundles_deleted_ok: usize, @@ -54,6 +57,13 @@ struct CleanupReport { db_failing_bundles_updated: usize, } +/// The background task responsible for cleaning and collecting support bundles +pub struct SupportBundleCollector { + datastore: Arc, + disable: bool, + nexus_id: OmicronZoneUuid, +} + impl SupportBundleCollector { pub fn new( datastore: Arc, @@ -239,6 +249,7 @@ impl SupportBundleCollector { async fn collect_bundle( &self, opctx: &OpContext, + request: &BundleRequest, ) -> anyhow::Result> { let pagparams = DataPageParams::max_page(); let result = self @@ -276,8 +287,15 @@ impl SupportBundleCollector { } }; - let mut report = - self.collect_bundle_and_store_on_sled(&opctx, &bundle).await?; + let collection = BundleCollection { + collector: &self, + log: opctx.log.new(slog::o!("bundle" => bundle.id.to_string())), + opctx, + request, + bundle: &bundle, + }; + + let mut report = collection.collect_bundle_and_store_on_sled().await?; if let Err(err) = self .datastore .support_bundle_update( @@ -298,22 +316,30 @@ impl SupportBundleCollector { report.activated_in_db_ok = true; Ok(Some(report)) } +} + +// Wraps up all arguments to perform a single support bundle collection +struct BundleCollection<'a> { + // The task responsible for this collection + collector: &'a SupportBundleCollector, + log: slog::Logger, + opctx: &'a OpContext, + request: &'a BundleRequest, + bundle: &'a SupportBundle, +} + +impl<'a> BundleCollection<'a> { // Collect the bundle within Nexus, and store it on a target sled. async fn collect_bundle_and_store_on_sled( &self, - opctx: &OpContext, - bundle: &SupportBundle, ) -> anyhow::Result { // Create a temporary directory where we'll store the support bundle // as it's being collected. let dir = tempdir()?; - let mut collection = Box::pin(collect_bundle_as_file( - &self.datastore, - opctx, - &dir, - bundle, - )); + + println!("created tempdir, starting collection"); + let mut collection = Box::pin(self.collect_bundle_as_file(&dir)); // We periodically check the state of the support bundle - if a user // explicitly cancels it, we should stop the collection process and @@ -329,21 +355,21 @@ impl SupportBundleCollector { // Timer fired mid-collection - let's check if we should stop. _ = yield_interval.tick() => { info!( - &opctx.log, + &self.log, "Checking if Bundle Collection cancelled"; - "bundle" => %bundle.id + "bundle" => %self.bundle.id ); - let bundle = self.datastore.support_bundle_get( - &opctx, - bundle.id.into() + let bundle = self.collector.datastore.support_bundle_get( + &self.opctx, + self.bundle.id.into() ).await?; if !matches!(bundle.state, SupportBundleState::Collecting) { warn!( - &opctx.log, + &self.log, "Support Bundle cancelled - stopping collection"; - "bundle" => %bundle.id, - "state" => ?bundle.state + "bundle" => %self.bundle.id, + "state" => ?self.bundle.state ); anyhow::bail!("Support Bundle Cancelled"); } @@ -351,9 +377,9 @@ impl SupportBundleCollector { // Otherwise, keep making progress on the collection itself. report = &mut collection => { info!( - &opctx.log, - "Bundle Collection collected"; - "bundle" => %bundle.id + &self.log, + "Bundle Collection completed"; + "bundle" => %self.bundle.id ); break report?; }, @@ -369,14 +395,15 @@ impl SupportBundleCollector { // Find the sled where we're storing this bundle. 
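+        // (The bundle records which zpool it was allocated on, and that zpool's
+        // sled is where the finished archive is transferred below.)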
let sled_id = self + .collector .datastore - .zpool_get_sled(&opctx, bundle.zpool_id.into()) + .zpool_get_sled(&self.opctx, self.bundle.zpool_id.into()) .await?; let sled_client = nexus_networking::sled_client( - &self.datastore, - &opctx, + &self.collector.datastore, + &self.opctx, sled_id.into_untyped_uuid(), - &opctx.log, + &self.log, ) .await?; @@ -390,9 +417,9 @@ impl SupportBundleCollector { sled_client .support_bundle_create( - &ZpoolUuid::from(bundle.zpool_id), - &DatasetUuid::from(bundle.dataset_id), - &SupportBundleUuid::from(bundle.id), + &ZpoolUuid::from(self.bundle.zpool_id), + &DatasetUuid::from(self.bundle.dataset_id), + &SupportBundleUuid::from(self.bundle.id), &hash.to_string(), body, ) @@ -402,6 +429,104 @@ impl SupportBundleCollector { // allocated locally for this support bundle. Ok(report) } + + // Perform the work of collecting the support bundle into a temporary directory + // + // - "dir" is a directory where data can be stored. + // - "bundle" is metadata about the bundle being collected. + // + // If a partial bundle can be collected, it should be returned as + // an Ok(CollectionReport). Any failures from this function will prevent + // the support bundle from being collected altogether. + // + // NOTE: The background task infrastructure will periodically check to see + // if the bundle has been cancelled by a user while it is being collected. + // If that happens, this function will be CANCELLED at an await point. + // + // As a result, it is important that this function be implemented as + // cancel-safe. + async fn collect_bundle_as_file( + &self, + dir: &Utf8TempDir, + ) -> anyhow::Result { + let log = &self.log; + + info!(&log, "Collecting bundle as local file"); + let mut report = CollectionReport::new(self.bundle.id.into()); + + tokio::fs::write( + dir.path().join("bundle_id.txt"), + self.bundle.id.to_string(), + ) + .await?; + + if let Ok(all_sleds) = self + .collector + .datastore + .sled_list_all_batched(&self.opctx, SledFilter::InService) + .await + { + report.listed_in_service_sleds = true; + + // NOTE: This could be, and probably should be, done concurrently. 
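+            // (One option would be to drive the per-sled requests below through
+            // something like futures::stream::FuturesUnordered rather than this
+            // sequential loop.)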
+ for sled in &all_sleds { + info!(&log, "Collecting bundle info from sled"; "sled" => %sled.id()); + let sled_path = dir + .path() + .join("rack") + .join(sled.rack_id.to_string()) + .join("sled") + .join(sled.id().to_string()); + tokio::fs::create_dir_all(&sled_path).await?; + tokio::fs::write( + sled_path.join("sled.txt"), + format!("{sled:?}"), + ) + .await?; + + if self.request.skip_sled_info { + continue; + } + + let Ok(sled_client) = nexus_networking::sled_client( + &self.collector.datastore, + &self.opctx, + sled.id(), + log, + ) + .await + else { + tokio::fs::write( + sled_path.join("error.txt"), + "Could not contact sled", + ) + .await?; + continue; + }; + + write_command_result_or_error( + &sled_path, + "dladm", + sled_client.support_dladm_info().await, + ) + .await?; + write_command_result_or_error( + &sled_path, + "ipadm", + sled_client.support_ipadm_info().await, + ) + .await?; + write_command_result_or_error( + &sled_path, + "zoneadm", + sled_client.support_zoneadm_info().await, + ) + .await?; + } + } + + Ok(report) + } } impl BackgroundTask for SupportBundleCollector { @@ -427,7 +552,8 @@ impl BackgroundTask for SupportBundleCollector { } }; - match self.collect_bundle(&opctx).await { + let request = BundleRequest::default(); + match self.collect_bundle(&opctx, &request).await { Ok(report) => collection_report = Some(report), Err(err) => { collection_err = @@ -451,26 +577,34 @@ fn bundle_to_zipfile(dir: &Utf8TempDir) -> anyhow::Result { let tempfile = tempfile()?; let mut zip = ZipWriter::new(tempfile); - recursively_add_directory_to_zipfile(&mut zip, dir.path())?; + recursively_add_directory_to_zipfile(&mut zip, dir.path(), dir.path())?; Ok(zip.finish()?) } +// TODO: Could maybe test this in isolation + fn recursively_add_directory_to_zipfile( zip: &mut ZipWriter, + root_path: &Utf8Path, dir_path: &Utf8Path, ) -> anyhow::Result<()> { for entry in dir_path.read_dir_utf8()? { let entry = entry?; let file_type = entry.file_type()?; if file_type.is_file() { - zip.deep_copy_file_from_path( - entry.path(), - entry.path().strip_prefix(dir_path)?, - )?; + let opts = FullFileOptions::default(); + let src = entry.path(); + let dst = entry.path().strip_prefix(root_path)?; + + zip.start_file_from_path(dst, opts)?; + let buf = std::fs::read(&src)?; + zip.write_all(&buf)?; } if file_type.is_dir() { - recursively_add_directory_to_zipfile(zip, entry.path())?; + let opts = FullFileOptions::default(); + zip.add_directory_from_path(entry.path(), opts)?; + recursively_add_directory_to_zipfile(zip, root_path, entry.path())?; } } Ok(()) @@ -494,7 +628,7 @@ async fn sha2_hash(file: &mut tokio::fs::File) -> anyhow::Result { // Identifies what we could or could not store within this support bundle. // // This struct will get emitted as part of the background task infrastructure. -#[derive(Debug, Serialize)] +#[derive(Debug, Serialize, PartialEq, Eq)] struct CollectionReport { bundle: SupportBundleUuid, @@ -515,89 +649,6 @@ impl CollectionReport { } } -// Perform the work of collecting the support bundle into a temporary directory -// -// - "dir" is a directory where data can be stored. -// - "bundle" is metadata about the bundle being collected. -// -// If a partial bundle can be collected, it should be returned as -// an Ok(CollectionReport). Any failures from this function will prevent -// the support bundle from being collected altogether. -// -// NOTE: The background task infrastructure will periodically check to see -// if the bundle has been cancelled by a user while it is being collected. 
-// If that happens, this function will be CANCELLED at an await point. -// -// As a result, it is important that this function be implemented as -// cancel-safe. -async fn collect_bundle_as_file( - datastore: &Arc, - opctx: &OpContext, - dir: &Utf8TempDir, - bundle: &SupportBundle, -) -> anyhow::Result { - let mut report = CollectionReport::new(bundle.id.into()); - - tokio::fs::write(dir.path().join("bundle_id.txt"), bundle.id.to_string()) - .await?; - - if let Ok(all_sleds) = - datastore.sled_list_all_batched(&opctx, SledFilter::InService).await - { - report.listed_in_service_sleds = true; - - // NOTE: This could be, and probably should be, done concurrently. - for sled in &all_sleds { - let sled_path = dir - .path() - .join("rack") - .join(sled.rack_id.to_string()) - .join("sled") - .join(sled.id().to_string()); - tokio::fs::create_dir_all(&sled_path).await?; - tokio::fs::write(sled_path.join("sled.txt"), format!("{sled:?}")) - .await?; - - let Ok(sled_client) = nexus_networking::sled_client( - &datastore, - &opctx, - sled.id(), - &opctx.log, - ) - .await - else { - tokio::fs::write( - sled_path.join("error.txt"), - "Could not contact sled", - ) - .await?; - continue; - }; - - write_command_result_or_error( - &sled_path, - "dladm", - sled_client.support_dladm_info().await, - ) - .await?; - write_command_result_or_error( - &sled_path, - "ipadm", - sled_client.support_ipadm_info().await, - ) - .await?; - write_command_result_or_error( - &sled_path, - "zoneadm", - sled_client.support_zoneadm_info().await, - ) - .await?; - } - } - - Ok(report) -} - async fn write_command_result_or_error( path: &Utf8Path, command: &str, @@ -625,3 +676,402 @@ async fn write_command_result_or_error( } Ok(()) } + +#[cfg(test)] +mod test { + use super::*; + + use nexus_db_model::Dataset; + use nexus_db_model::PhysicalDisk; + use nexus_db_model::PhysicalDiskKind; + use nexus_db_model::PhysicalDiskPolicy; + use nexus_db_model::Zpool; + use nexus_test_utils::SLED_AGENT_UUID; + use nexus_test_utils_macros::nexus_test; + use omicron_common::api::external::Generation; + use omicron_common::api::internal::shared::DatasetKind; + use omicron_common::disk::DatasetConfig; + use omicron_common::disk::DatasetName; + use omicron_common::disk::DatasetsConfig; + use omicron_common::disk::SharedDatasetConfig; + use omicron_common::zpool_name::ZpoolName; + use omicron_uuid_kinds::{DatasetUuid, PhysicalDiskUuid, SledUuid}; + use uuid::Uuid; + + type ControlPlaneTestContext = + nexus_test_utils::ControlPlaneTestContext; + + // If we have not populated any bundles needing cleanup, the cleanup + // process should succeed with an empty cleanup report. + #[nexus_test(server = crate::Server)] + async fn test_cleanup_noop(cptestctx: &ControlPlaneTestContext) { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.clone(), + datastore.clone(), + ); + let collector = + SupportBundleCollector::new(datastore.clone(), false, nexus.id()); + + let report = collector + .cleanup_destroyed_bundles(&opctx) + .await + .expect("Cleanup should succeed with no work to do"); + + assert_eq!(report, CleanupReport::default()); + } + + // If there are no bundles in need of collection, the collection task should + // run without error, but return nothing. 
+ #[nexus_test(server = crate::Server)] + async fn test_collect_noop(cptestctx: &ControlPlaneTestContext) { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.clone(), + datastore.clone(), + ); + let collector = + SupportBundleCollector::new(datastore.clone(), false, nexus.id()); + + let request = BundleRequest::default(); + let report = collector + .collect_bundle(&opctx, &request) + .await + .expect("Collection should succeed with no bundles"); + assert!(report.is_none()); + } + + async fn add_zpool_and_debug_dataset( + datastore: &DataStore, + opctx: &OpContext, + id: PhysicalDiskUuid, + sled_id: SledUuid, + ) -> (ZpoolUuid, DatasetUuid) { + let zpool = datastore + .zpool_insert( + opctx, + Zpool::new(Uuid::new_v4(), sled_id.into_untyped_uuid(), id), + ) + .await + .unwrap(); + + let dataset = datastore + .dataset_upsert(Dataset::new( + DatasetUuid::new_v4(), + zpool.id(), + None, + DatasetKind::Debug, + )) + .await + .unwrap(); + (ZpoolUuid::from_untyped_uuid(zpool.id()), dataset.id()) + } + + async fn make_disk_in_db( + datastore: &DataStore, + opctx: &OpContext, + i: usize, + sled_id: SledUuid, + ) -> PhysicalDiskUuid { + let id = PhysicalDiskUuid::new_v4(); + let physical_disk = PhysicalDisk::new( + id, + "v".into(), + format!("s-{i})"), + "m".into(), + PhysicalDiskKind::U2, + sled_id.into_untyped_uuid(), + ); + datastore + .physical_disk_insert(&opctx, physical_disk.clone()) + .await + .unwrap(); + id + } + + struct TestDataset { + disk_id: PhysicalDiskUuid, + zpool_id: ZpoolUuid, + dataset_id: DatasetUuid, + } + + impl TestDataset { + // For the purposes of this test, we're going to provision support + // bundles to a single sled. + async fn setup( + cptestctx: &ControlPlaneTestContext, + datastore: &Arc, + opctx: &OpContext, + count: usize, + ) -> Vec { + let sled_id = SledUuid::from_untyped_uuid( + SLED_AGENT_UUID.parse::().unwrap(), + ); + + let mut disks = vec![]; + + for i in 0..count { + // Create the (disk, zpool, dataset) tuple in Nexus + let disk_id = + make_disk_in_db(datastore, opctx, i, sled_id).await; + let (zpool_id, dataset_id) = add_zpool_and_debug_dataset( + &datastore, &opctx, disk_id, sled_id, + ) + .await; + + // Tell the simulated sled agent to create this storage. + // + // (We could do this via HTTP request, but it's in the test process, + // so we can just directly call the method on the sled agent) + cptestctx.sled_agent.sled_agent.create_zpool( + zpool_id, + disk_id, + 1 << 40, + ); + disks.push(Self { disk_id, zpool_id, dataset_id }) + } + + // Create a configuration for the sled agent consisting of all these + // debug datasets. + let datasets = disks + .iter() + .map(|TestDataset { zpool_id, dataset_id, .. 
}| { + ( + *dataset_id, + DatasetConfig { + id: *dataset_id, + name: DatasetName::new( + ZpoolName::new_external(*zpool_id), + DatasetKind::Debug, + ), + inner: SharedDatasetConfig::default(), + }, + ) + }) + .collect(); + let dataset_config = + DatasetsConfig { generation: Generation::new(), datasets }; + + let res = cptestctx + .sled_agent + .sled_agent + .datasets_ensure(dataset_config) + .unwrap(); + assert!(!res.has_error()); + + disks + } + + async fn expunge_disk( + &self, + datastore: &Arc, + opctx: &OpContext, + ) { + datastore + .physical_disk_update_policy( + &opctx, + self.disk_id, + PhysicalDiskPolicy::Expunged, + ) + .await + .unwrap(); + } + } + + #[nexus_test(server = crate::Server)] + async fn test_collect_one(cptestctx: &ControlPlaneTestContext) { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.clone(), + datastore.clone(), + ); + + // Before we can create any bundles, we need to create the + // space for them to be provisioned. + let _datasets = + TestDataset::setup(cptestctx, &datastore, &opctx, 1).await; + + // Assign a bundle to ourselves. We expect to collect it on + // the next call to "collect_bundle". + let bundle = datastore + .support_bundle_create(&opctx, "For collection testing", nexus.id()) + .await + .expect("Couldn't allocate a support bundle"); + assert_eq!(bundle.state, SupportBundleState::Collecting); + + let collector = + SupportBundleCollector::new(datastore.clone(), false, nexus.id()); + + // The bundle collection should complete successfully. + let request = BundleRequest { + // NOTE: The support bundle querying interface isn't supported on + // the simulated sled agent (yet?) so we're skipping this step. + skip_sled_info: true, + ..Default::default() + }; + let report = collector + .collect_bundle(&opctx, &request) + .await + .expect("Collection should have succeeded under test") + .expect("Collecting the bundle should have generated a report"); + assert_eq!(report.bundle, bundle.id.into()); + assert!(report.listed_in_service_sleds); + assert!(report.activated_in_db_ok); + + let observed_bundle = datastore + .support_bundle_get(&opctx, bundle.id.into()) + .await + .expect("Bundle should definitely be in db by this point"); + assert_eq!(observed_bundle.state, SupportBundleState::Active); + + // If we retry bundle collection, nothing should happen. + // The bundle has already been collected. + let report = collector + .collect_bundle(&opctx, &request) + .await + .expect("Collection should be a no-op the second time"); + assert!(report.is_none()); + } + + #[nexus_test(server = crate::Server)] + async fn test_collect_many(cptestctx: &ControlPlaneTestContext) { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.clone(), + datastore.clone(), + ); + + // Before we can create any bundles, we need to create the + // space for them to be provisioned. + let _datasets = + TestDataset::setup(cptestctx, &datastore, &opctx, 2).await; + + // Assign two bundles to ourselves. 
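+        // (Each "collect_bundle" call below collects only one of them, so the
+        // test calls it twice and checks the bundle states in between.)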
+ let bundle1 = datastore + .support_bundle_create(&opctx, "For collection testing", nexus.id()) + .await + .expect("Couldn't allocate a support bundle"); + let bundle2 = datastore + .support_bundle_create(&opctx, "For collection testing", nexus.id()) + .await + .expect("Couldn't allocate a second support bundle"); + + let collector = + SupportBundleCollector::new(datastore.clone(), false, nexus.id()); + + // Each time we call "collect_bundle", we collect a SINGLE bundle. + let request = + BundleRequest { skip_sled_info: true, ..Default::default() }; + let report = collector + .collect_bundle(&opctx, &request) + .await + .expect("Collection should have succeeded under test") + .expect("Collecting the bundle should have generated a report"); + assert_eq!(report.bundle, bundle1.id.into()); + assert!(report.listed_in_service_sleds); + assert!(report.activated_in_db_ok); + + // This is observable by checking the state of bundle1 and bundle2: + // the first is active, the second is still collecting. + let observed_bundle = datastore + .support_bundle_get(&opctx, bundle1.id.into()) + .await + .unwrap(); + assert_eq!(observed_bundle.state, SupportBundleState::Active); + let observed_bundle = datastore + .support_bundle_get(&opctx, bundle2.id.into()) + .await + .unwrap(); + assert_eq!(observed_bundle.state, SupportBundleState::Collecting); + + let report = collector + .collect_bundle(&opctx, &request) + .await + .expect("Collection should have succeeded under test") + .expect("Collecting the bundle should have generated a report"); + assert_eq!(report.bundle, bundle2.id.into()); + assert!(report.listed_in_service_sleds); + assert!(report.activated_in_db_ok); + + // After another collection request, we'll see that both bundles have + // been collected. + let observed_bundle = datastore + .support_bundle_get(&opctx, bundle1.id.into()) + .await + .unwrap(); + assert_eq!(observed_bundle.state, SupportBundleState::Active); + let observed_bundle = datastore + .support_bundle_get(&opctx, bundle2.id.into()) + .await + .unwrap(); + assert_eq!(observed_bundle.state, SupportBundleState::Active); + } + + #[nexus_test(server = crate::Server)] + async fn test_bundle_cleanup_cancel_before_collect( + cptestctx: &ControlPlaneTestContext, + ) { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.clone(), + datastore.clone(), + ); + + // Before we can create any bundles, we need to create the + // space for them to be provisioned. + let _datasets = + TestDataset::setup(cptestctx, &datastore, &opctx, 2).await; + + // If we delete the bundle before we start collection, we can delete it + // immediately. + let bundle = datastore + .support_bundle_create(&opctx, "For collection testing", nexus.id()) + .await + .expect("Couldn't allocate a support bundle"); + assert_eq!(bundle.state, SupportBundleState::Collecting); + + // Cancel the bundle immediately + datastore + .support_bundle_update( + &opctx, + bundle.id.into(), + SupportBundleState::Destroying, + ) + .await + .unwrap(); + + let collector = + SupportBundleCollector::new(datastore.clone(), false, nexus.id()); + + let report = collector + .cleanup_destroyed_bundles(&opctx) + .await + .expect("Cleanup should succeed with no work to do"); + assert_eq!( + report, + CleanupReport { + // Nothing was provisioned on the sled, since we hadn't started + // collection yet. + sled_bundles_deleted_not_found: 1, + // The database state was "destroying", and now it's gone. 
+ db_destroying_bundles_removed: 1, + ..Default::default() + } + ); + } + + // TODO: Delete the bundle after collection? + + // TODO: Try to cancel a bundle mid-collection + + // TODO: Collect a bundle from... + // ... an expunged disk? + // ... an expunged sled? + + // TODO: Verify nested dataset storage is cleared? +} diff --git a/nexus/src/app/sagas/disk_create.rs b/nexus/src/app/sagas/disk_create.rs index 51f6d29de10..7e04dbe8578 100644 --- a/nexus/src/app/sagas/disk_create.rs +++ b/nexus/src/app/sagas/disk_create.rs @@ -1034,15 +1034,12 @@ pub(crate) mod test { true } - async fn no_regions_ensured( - sled_agent: &SledAgent, - test: &DiskTest<'_>, - ) -> bool { + fn no_regions_ensured(sled_agent: &SledAgent, test: &DiskTest<'_>) -> bool { for zpool in test.zpools() { for dataset in &zpool.datasets { let crucible_dataset = - sled_agent.get_crucible_dataset(zpool.id, dataset.id).await; - if !crucible_dataset.is_empty().await { + sled_agent.get_crucible_dataset(zpool.id, dataset.id); + if !crucible_dataset.is_empty() { return false; } } @@ -1073,7 +1070,7 @@ pub(crate) mod test { .await ); assert!(no_region_allocations_exist(datastore, &test).await); - assert!(no_regions_ensured(&sled_agent, &test).await); + assert!(no_regions_ensured(&sled_agent, &test)); assert!(test.crucible_resources_deleted().await); } diff --git a/nexus/src/app/sagas/instance_create.rs b/nexus/src/app/sagas/instance_create.rs index 07f7911ef54..aa181d7b792 100644 --- a/nexus/src/app/sagas/instance_create.rs +++ b/nexus/src/app/sagas/instance_create.rs @@ -1346,7 +1346,7 @@ pub mod test { assert!(disk_is_detached(datastore).await); assert!(no_instances_or_disks_on_sled(&sled_agent).await); - let v2p_mappings = &*sled_agent.v2p_mappings.lock().await; + let v2p_mappings = &*sled_agent.v2p_mappings.lock().unwrap(); assert!(v2p_mappings.is_empty()); } diff --git a/nexus/src/app/sagas/instance_ip_attach.rs b/nexus/src/app/sagas/instance_ip_attach.rs index e6fb8654ea0..c890e65d333 100644 --- a/nexus/src/app/sagas/instance_ip_attach.rs +++ b/nexus/src/app/sagas/instance_ip_attach.rs @@ -435,7 +435,7 @@ pub(crate) mod test { &instance_id, ) .await; - let mut eips = sled_agent.external_ips.lock().await; + let mut eips = sled_agent.external_ips.lock().unwrap(); let my_eips = eips.entry(vmm_id).or_default(); assert!(my_eips .iter() @@ -497,7 +497,7 @@ pub(crate) mod test { &instance_id, ) .await; - let mut eips = sled_agent.external_ips.lock().await; + let mut eips = sled_agent.external_ips.lock().unwrap(); let my_eips = eips.entry(vmm_id).or_default(); assert!(my_eips.is_empty()); } diff --git a/nexus/src/app/sagas/instance_ip_detach.rs b/nexus/src/app/sagas/instance_ip_detach.rs index d9da9fc05cf..f96fbdf702c 100644 --- a/nexus/src/app/sagas/instance_ip_detach.rs +++ b/nexus/src/app/sagas/instance_ip_detach.rs @@ -405,7 +405,7 @@ pub(crate) mod test { &instance_id, ) .await; - let mut eips = sled_agent.external_ips.lock().await; + let mut eips = sled_agent.external_ips.lock().unwrap(); let my_eips = eips.entry(vmm_id).or_default(); assert!(my_eips.is_empty()); @@ -467,7 +467,7 @@ pub(crate) mod test { assert!(db_eips.iter().any(|v| v.kind == IpKind::SNat)); // No IP bindings remain on sled-agent. 
- let eips = &*sled_agent.external_ips.lock().await; + let eips = &*sled_agent.external_ips.lock().unwrap(); for (_nic_id, eip_set) in eips { assert_eq!(eip_set.len(), 2); } diff --git a/nexus/test-utils/src/resource_helpers.rs b/nexus/test-utils/src/resource_helpers.rs index c5cb0231d18..89f453c873c 100644 --- a/nexus/test-utils/src/resource_helpers.rs +++ b/nexus/test-utils/src/resource_helpers.rs @@ -1282,26 +1282,24 @@ impl<'a, N: NexusServer> DiskTest<'a, N> { // Tell the simulated sled agent to create the disk and zpool containing // these datasets. - sled_agent - .create_external_physical_disk( - physical_disk_id, - disk_identity.clone(), - ) - .await; - sled_agent - .create_zpool(zpool.id, physical_disk_id, zpool.size.to_bytes()) - .await; + sled_agent.create_external_physical_disk( + physical_disk_id, + disk_identity.clone(), + ); + sled_agent.create_zpool( + zpool.id, + physical_disk_id, + zpool.size.to_bytes(), + ); for dataset in &zpool.datasets { // Sled Agent side: Create the Dataset, make sure regions can be // created immediately if Nexus requests anything. let address = - sled_agent.create_crucible_dataset(zpool.id, dataset.id).await; + sled_agent.create_crucible_dataset(zpool.id, dataset.id); let crucible = - sled_agent.get_crucible_dataset(zpool.id, dataset.id).await; - crucible - .set_create_callback(Box::new(|_| RegionState::Created)) - .await; + sled_agent.get_crucible_dataset(zpool.id, dataset.id); + crucible.set_create_callback(Box::new(|_| RegionState::Created)); // Nexus side: Notify Nexus of the physical disk/zpool/dataset // combination that exists. @@ -1381,23 +1379,19 @@ impl<'a, N: NexusServer> DiskTest<'a, N> { for dataset in &zpool.datasets { let crucible = self .get_sled(*sled_id) - .get_crucible_dataset(zpool.id, dataset.id) - .await; + .get_crucible_dataset(zpool.id, dataset.id); let called = std::sync::atomic::AtomicBool::new(false); - crucible - .set_create_callback(Box::new(move |_| { - if !called.load(std::sync::atomic::Ordering::SeqCst) - { - called.store( - true, - std::sync::atomic::Ordering::SeqCst, - ); - RegionState::Requested - } else { - RegionState::Created - } - })) - .await; + crucible.set_create_callback(Box::new(move |_| { + if !called.load(std::sync::atomic::Ordering::SeqCst) { + called.store( + true, + std::sync::atomic::Ordering::SeqCst, + ); + RegionState::Requested + } else { + RegionState::Created + } + })); } } } @@ -1409,11 +1403,9 @@ impl<'a, N: NexusServer> DiskTest<'a, N> { for dataset in &zpool.datasets { let crucible = self .get_sled(*sled_id) - .get_crucible_dataset(zpool.id, dataset.id) - .await; + .get_crucible_dataset(zpool.id, dataset.id); crucible - .set_create_callback(Box::new(|_| RegionState::Failed)) - .await; + .set_create_callback(Box::new(|_| RegionState::Failed)); } } } @@ -1430,9 +1422,8 @@ impl<'a, N: NexusServer> DiskTest<'a, N> { for dataset in &zpool.datasets { let crucible = self .get_sled(*sled_id) - .get_crucible_dataset(zpool.id, dataset.id) - .await; - if !crucible.is_empty().await { + .get_crucible_dataset(zpool.id, dataset.id); + if !crucible.is_empty() { return false; } } diff --git a/nexus/tests/integration_tests/crucible_replacements.rs b/nexus/tests/integration_tests/crucible_replacements.rs index 5f7e92c3937..621df474482 100644 --- a/nexus/tests/integration_tests/crucible_replacements.rs +++ b/nexus/tests/integration_tests/crucible_replacements.rs @@ -373,7 +373,6 @@ impl<'a> RegionReplacementDeletedVolumeTest<'a> { .activate_background_attachment( region_replacement.volume_id.to_string(), ) - 
.await .unwrap(); } diff --git a/nexus/tests/integration_tests/disks.rs b/nexus/tests/integration_tests/disks.rs index 54cf9036af6..cdcf037c66f 100644 --- a/nexus/tests/integration_tests/disks.rs +++ b/nexus/tests/integration_tests/disks.rs @@ -1342,9 +1342,7 @@ async fn test_disk_virtual_provisioning_collection_failed_delete( .sled_agent .sled_agent .get_crucible_dataset(zpool.id, dataset.id) - .await - .set_region_deletion_error(true) - .await; + .set_region_deletion_error(true); // Delete the disk - expect this to fail NexusRequest::new( @@ -1380,9 +1378,7 @@ async fn test_disk_virtual_provisioning_collection_failed_delete( .sled_agent .sled_agent .get_crucible_dataset(zpool.id, dataset.id) - .await - .set_region_deletion_error(false) - .await; + .set_region_deletion_error(false); // Request disk delete again NexusRequest::new( @@ -2480,7 +2476,7 @@ async fn test_no_halt_disk_delete_one_region_on_expunged_agent( let zpool = disk_test.zpools().next().expect("Expected at least one zpool"); let dataset = &zpool.datasets[0]; - cptestctx.sled_agent.sled_agent.drop_dataset(zpool.id, dataset.id).await; + cptestctx.sled_agent.sled_agent.drop_dataset(zpool.id, dataset.id); // Spawn a task that tries to delete the disk let disk_url = get_disk_url(DISK_NAME); diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index 2949ea45600..85dfd0153c6 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -647,7 +647,7 @@ async fn test_instance_start_creates_networking_state( sled_agents.push(&cptestctx.sled_agent.sled_agent); for agent in &sled_agents { - agent.v2p_mappings.lock().await.clear(); + agent.v2p_mappings.lock().unwrap().clear(); } // Start the instance and make sure that it gets to Running. 
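+    // (The V2P mappings were cleared above so the test can observe them being
+    // recreated when the instance starts.)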
@@ -6245,7 +6245,7 @@ async fn test_instance_v2p_mappings(cptestctx: &ControlPlaneTestContext) { // Validate that every sled no longer has the V2P mapping for this instance for sled_agent in &sled_agents { let condition = || async { - let v2p_mappings = sled_agent.v2p_mappings.lock().await; + let v2p_mappings = sled_agent.v2p_mappings.lock().unwrap(); if v2p_mappings.is_empty() { Ok(()) } else { @@ -6502,7 +6502,7 @@ async fn assert_sled_v2p_mappings( vni: Vni, ) { let condition = || async { - let v2p_mappings = sled_agent.v2p_mappings.lock().await; + let v2p_mappings = sled_agent.v2p_mappings.lock().unwrap(); let mapping = v2p_mappings.iter().find(|mapping| { mapping.virtual_ip == nic.ip && mapping.virtual_mac == nic.mac @@ -6574,7 +6574,7 @@ pub async fn assert_sled_vpc_routes( kind: RouterKind::Custom(db_subnet.ipv4_block.0.into()), }; - let vpc_routes = sled_agent.vpc_routes.lock().await; + let vpc_routes = sled_agent.vpc_routes.lock().unwrap(); let sys_routes_found = vpc_routes .iter() .any(|(id, set)| *id == sys_key && set.routes == system_routes); diff --git a/nexus/tests/integration_tests/snapshots.rs b/nexus/tests/integration_tests/snapshots.rs index accb4470fb4..dff07732c7d 100644 --- a/nexus/tests/integration_tests/snapshots.rs +++ b/nexus/tests/integration_tests/snapshots.rs @@ -980,9 +980,7 @@ async fn test_snapshot_unwind(cptestctx: &ControlPlaneTestContext) { .sled_agent .sled_agent .get_crucible_dataset(zpool.id, dataset.id) - .await - .set_creating_a_running_snapshot_should_fail() - .await; + .set_creating_a_running_snapshot_should_fail(); // Issue snapshot request, expecting it to fail let snapshots_url = format!("/v1/snapshots?project={}", PROJECT_NAME); diff --git a/nexus/tests/integration_tests/volume_management.rs b/nexus/tests/integration_tests/volume_management.rs index 6a9ce283899..543e66ee303 100644 --- a/nexus/tests/integration_tests/volume_management.rs +++ b/nexus/tests/integration_tests/volume_management.rs @@ -2528,9 +2528,7 @@ async fn test_disk_create_saga_unwinds_correctly( .sled_agent .sled_agent .get_crucible_dataset(zpool.id, dataset.id) - .await - .set_region_creation_error(true) - .await; + .set_region_creation_error(true); let disk_size = ByteCount::from_gibibytes_u32(2); let base_disk = params::DiskCreate { @@ -2598,9 +2596,7 @@ async fn test_snapshot_create_saga_unwinds_correctly( .sled_agent .sled_agent .get_crucible_dataset(zpool.id, dataset.id) - .await - .set_region_creation_error(true) - .await; + .set_region_creation_error(true); // Create a snapshot let snapshot_create = params::SnapshotCreate { @@ -4225,11 +4221,9 @@ async fn test_read_only_region_reference_counting( TypedUuid::from_untyped_uuid(db_read_only_dataset.pool_id), db_read_only_dataset.id(), ) - .await .get(crucible_agent_client::types::RegionId( read_only_region.id().to_string() )) - .await .unwrap() .state, crucible_agent_client::types::State::Created @@ -4297,11 +4291,9 @@ async fn test_read_only_region_reference_counting( TypedUuid::from_untyped_uuid(db_read_only_dataset.pool_id), db_read_only_dataset.id(), ) - .await .get(crucible_agent_client::types::RegionId( read_only_region.id().to_string() )) - .await .unwrap() .state, crucible_agent_client::types::State::Destroyed diff --git a/sled-agent/src/sim/artifact_store.rs b/sled-agent/src/sim/artifact_store.rs index d73f5a28802..efb4fa9f36e 100644 --- a/sled-agent/src/sim/artifact_store.rs +++ b/sled-agent/src/sim/artifact_store.rs @@ -5,10 +5,7 @@ //! 
Implementation of `crate::artifact_store::StorageBackend` for our simulated //! storage. -use std::sync::Arc; - use camino_tempfile::Utf8TempDir; -use futures::lock::Mutex; use sled_storage::error::Error as StorageError; use super::storage::Storage; @@ -16,11 +13,11 @@ use crate::artifact_store::DatasetsManager; pub(super) struct SimArtifactStorage { root: Utf8TempDir, - backend: Arc>, + backend: Storage, } impl SimArtifactStorage { - pub(super) fn new(backend: Arc>) -> SimArtifactStorage { + pub(super) fn new(backend: Storage) -> SimArtifactStorage { SimArtifactStorage { root: camino_tempfile::tempdir().unwrap(), backend, @@ -36,9 +33,7 @@ impl DatasetsManager for SimArtifactStorage { let config = self .backend .lock() - .await .datasets_config_list() - .await .map_err(|_| StorageError::LedgerNotFound)?; Ok(crate::artifact_store::filter_dataset_mountpoints( config, diff --git a/sled-agent/src/sim/http_entrypoints.rs b/sled-agent/src/sim/http_entrypoints.rs index 2d23f9150b2..ac4a9a2605b 100644 --- a/sled-agent/src/sim/http_entrypoints.rs +++ b/sled-agent/src/sim/http_entrypoints.rs @@ -37,6 +37,7 @@ use omicron_common::disk::DatasetsManagementResult; use omicron_common::disk::DisksManagementResult; use omicron_common::disk::OmicronPhysicalDisksConfig; use omicron_common::update::ArtifactHash; +use range_requests::RequestContextEx; use sled_agent_api::*; use sled_agent_types::boot_disk::BootDiskOsWriteStatus; use sled_agent_types::boot_disk::BootDiskPathParams; @@ -240,7 +241,6 @@ impl SledAgentApi for SledAgentSimImpl { path_params.disk_id, body.snapshot_id, ) - .await .map_err(|e| HttpError::for_internal_error(e.to_string()))?; Ok(HttpResponseOk(VmmIssueDiskSnapshotRequestResponse { @@ -268,7 +268,6 @@ impl SledAgentApi for SledAgentSimImpl { let body_args = body.into_inner(); sa.set_virtual_nic_host(&body_args) - .await .map_err(|e| HttpError::for_internal_error(e.to_string()))?; Ok(HttpResponseUpdatedNoContent()) @@ -282,7 +281,6 @@ impl SledAgentApi for SledAgentSimImpl { let body_args = body.into_inner(); sa.unset_virtual_nic_host(&body_args) - .await .map_err(|e| HttpError::for_internal_error(e.to_string()))?; Ok(HttpResponseUpdatedNoContent()) @@ -294,7 +292,7 @@ impl SledAgentApi for SledAgentSimImpl { { let sa = rqctx.context(); - let vnics = sa.list_virtual_nics().await.map_err(HttpError::from)?; + let vnics = sa.list_virtual_nics().map_err(HttpError::from)?; Ok(HttpResponseOk(vnics)) } @@ -310,7 +308,7 @@ impl SledAgentApi for SledAgentSimImpl { rqctx: RequestContext, ) -> Result, HttpError> { let config = - rqctx.context().bootstore_network_config.lock().await.clone(); + rqctx.context().bootstore_network_config.lock().unwrap().clone(); Ok(HttpResponseOk(config)) } @@ -318,7 +316,8 @@ impl SledAgentApi for SledAgentSimImpl { rqctx: RequestContext, body: TypedBody, ) -> Result { - let mut config = rqctx.context().bootstore_network_config.lock().await; + let mut config = + rqctx.context().bootstore_network_config.lock().unwrap(); *config = body.into_inner(); Ok(HttpResponseUpdatedNoContent()) } @@ -329,7 +328,7 @@ impl SledAgentApi for SledAgentSimImpl { ) -> Result, HttpError> { let sa = rqctx.context(); Ok(HttpResponseOk( - sa.inventory(rqctx.server.local_addr).await.map_err(|e| { + sa.inventory(rqctx.server.local_addr).map_err(|e| { HttpError::for_internal_error(format!("{:#}", e)) })?, )) @@ -341,7 +340,7 @@ impl SledAgentApi for SledAgentSimImpl { ) -> Result, HttpError> { let sa = rqctx.context(); let body_args = body.into_inner(); - let result = 
sa.datasets_ensure(body_args).await?; + let result = sa.datasets_ensure(body_args)?; Ok(HttpResponseOk(result)) } @@ -349,7 +348,7 @@ impl SledAgentApi for SledAgentSimImpl { rqctx: RequestContext, ) -> Result, HttpError> { let sa = rqctx.context(); - Ok(HttpResponseOk(sa.datasets_config_list().await?)) + Ok(HttpResponseOk(sa.datasets_config_list()?)) } async fn omicron_physical_disks_put( @@ -358,7 +357,7 @@ impl SledAgentApi for SledAgentSimImpl { ) -> Result, HttpError> { let sa = rqctx.context(); let body_args = body.into_inner(); - let result = sa.omicron_physical_disks_ensure(body_args).await?; + let result = sa.omicron_physical_disks_ensure(body_args)?; Ok(HttpResponseOk(result)) } @@ -366,7 +365,7 @@ impl SledAgentApi for SledAgentSimImpl { rqctx: RequestContext, ) -> Result, HttpError> { let sa = rqctx.context(); - Ok(HttpResponseOk(sa.omicron_physical_disks_list().await?)) + Ok(HttpResponseOk(sa.omicron_physical_disks_list()?)) } async fn omicron_zones_put( @@ -375,7 +374,7 @@ impl SledAgentApi for SledAgentSimImpl { ) -> Result { let sa = rqctx.context(); let body_args = body.into_inner(); - sa.omicron_zones_ensure(body_args).await; + sa.omicron_zones_ensure(body_args); Ok(HttpResponseUpdatedNoContent()) } @@ -390,7 +389,7 @@ impl SledAgentApi for SledAgentSimImpl { rqctx: RequestContext, ) -> Result>, HttpError> { let sa = rqctx.context(); - Ok(HttpResponseOk(sa.list_vpc_routes().await)) + Ok(HttpResponseOk(sa.list_vpc_routes())) } async fn set_vpc_routes( @@ -398,7 +397,7 @@ impl SledAgentApi for SledAgentSimImpl { body: TypedBody>, ) -> Result { let sa = rqctx.context(); - sa.set_vpc_routes(body.into_inner()).await; + sa.set_vpc_routes(body.into_inner()); Ok(HttpResponseUpdatedNoContent()) } @@ -419,7 +418,7 @@ impl SledAgentApi for SledAgentSimImpl { rqctx: RequestContext, path_params: Path, query_params: Query, - _body: StreamingBody, + body: StreamingBody, ) -> Result, HttpError> { let sa = rqctx.context(); @@ -433,6 +432,7 @@ impl SledAgentApi for SledAgentSimImpl { dataset_id, support_bundle_id, hash, + body.into_stream(), ) .await?, )) @@ -441,43 +441,45 @@ impl SledAgentApi for SledAgentSimImpl { async fn support_bundle_get( rqctx: RequestContext, path_params: Path, - _body: TypedBody, + body: TypedBody, ) -> Result, HttpError> { let sa = rqctx.context(); let SupportBundlePathParam { zpool_id, dataset_id, support_bundle_id } = path_params.into_inner(); - sa.support_bundle_get(zpool_id, dataset_id, support_bundle_id).await?; - - Ok(http::Response::builder() - .status(http::StatusCode::OK) - .header(http::header::CONTENT_TYPE, "text/html") - .body(dropshot::Body::with_content( - "simulated support bundle; do not eat", - )) - .unwrap()) + let range = rqctx.range(); + let query = body.into_inner().query_type; + Ok(sa + .support_bundle_get( + zpool_id, + dataset_id, + support_bundle_id, + range, + query, + ) + .await?) 
} async fn support_bundle_head( rqctx: RequestContext, path_params: Path, - _body: TypedBody, + body: TypedBody, ) -> Result, HttpError> { let sa = rqctx.context(); let SupportBundlePathParam { zpool_id, dataset_id, support_bundle_id } = path_params.into_inner(); - sa.support_bundle_get(zpool_id, dataset_id, support_bundle_id).await?; - - let fictional_length = 10000; - - Ok(http::Response::builder() - .status(http::StatusCode::OK) - .header(http::header::CONTENT_TYPE, "text/html") - .header(hyper::header::ACCEPT_RANGES, "bytes") - .header(hyper::header::CONTENT_LENGTH, fictional_length) - .body(dropshot::Body::empty()) - .unwrap()) + let range = rqctx.range(); + let query = body.into_inner().query_type; + Ok(sa + .support_bundle_get( + zpool_id, + dataset_id, + support_bundle_id, + range, + query, + ) + .await?) } async fn support_bundle_delete( diff --git a/sled-agent/src/sim/http_entrypoints_pantry.rs b/sled-agent/src/sim/http_entrypoints_pantry.rs index c98c7db665a..cbdc9329d46 100644 --- a/sled-agent/src/sim/http_entrypoints_pantry.rs +++ b/sled-agent/src/sim/http_entrypoints_pantry.rs @@ -69,7 +69,7 @@ async fn pantry_status( ) -> Result, HttpError> { let pantry = rc.context(); - let status = pantry.status().await?; + let status = pantry.status()?; Ok(HttpResponseOk(status)) } @@ -103,7 +103,7 @@ async fn volume_status( let path = path.into_inner(); let pantry = rc.context(); - let status = pantry.volume_status(path.id.clone()).await?; + let status = pantry.volume_status(path.id.clone())?; Ok(HttpResponseOk(status)) } @@ -134,7 +134,6 @@ async fn attach( pantry .attach(path.id.clone(), body.volume_construction_request) - .await .map_err(|e| HttpError::for_internal_error(e.to_string()))?; Ok(HttpResponseOk(AttachResult { id: path.id })) @@ -161,13 +160,11 @@ async fn attach_activate_background( let body = body.into_inner(); let pantry = rc.context(); - pantry - .attach_activate_background( - path.id.clone(), - body.job_id, - body.volume_construction_request, - ) - .await?; + pantry.attach_activate_background( + path.id.clone(), + body.job_id, + body.volume_construction_request, + )?; Ok(HttpResponseUpdatedNoContent()) } @@ -194,7 +191,7 @@ async fn is_job_finished( let path = path.into_inner(); let pantry = rc.context(); - let job_is_finished = pantry.is_job_finished(path.id).await?; + let job_is_finished = pantry.is_job_finished(path.id)?; Ok(HttpResponseOk(JobPollResponse { job_is_finished })) } @@ -217,7 +214,7 @@ async fn job_result_ok( let path = path.into_inner(); let pantry = rc.context(); - let job_result = pantry.get_job_result(path.id).await?; + let job_result = pantry.get_job_result(path.id)?; match job_result { Ok(job_result_ok) => { @@ -260,7 +257,6 @@ async fn import_from_url( let job_id = pantry .import_from_url(path.id.clone(), body.url, body.expected_digest) - .await .map_err(|e| HttpError::for_internal_error(e.to_string()))?; Ok(HttpResponseOk(ImportFromUrlResponse { job_id })) @@ -287,7 +283,6 @@ async fn snapshot( pantry .snapshot(path.id.clone(), body.snapshot_id) - .await .map_err(|e| HttpError::for_internal_error(e.to_string()))?; Ok(HttpResponseUpdatedNoContent()) @@ -320,7 +315,7 @@ async fn bulk_write( ) .map_err(|e| HttpError::for_bad_request(None, e.to_string()))?; - pantry.bulk_write(path.id.clone(), body.offset, data).await?; + pantry.bulk_write(path.id.clone(), body.offset, data)?; Ok(HttpResponseUpdatedNoContent()) } @@ -344,7 +339,6 @@ async fn scrub( let job_id = pantry .scrub(path.id.clone()) - .await .map_err(|e| 
HttpError::for_internal_error(e.to_string()))?; Ok(HttpResponseOk(ScrubResponse { job_id })) @@ -364,7 +358,6 @@ async fn detach( pantry .detach(path.id) - .await .map_err(|e| HttpError::for_internal_error(e.to_string()))?; Ok(HttpResponseDeleted()) diff --git a/sled-agent/src/sim/http_entrypoints_storage.rs b/sled-agent/src/sim/http_entrypoints_storage.rs index 34e26f51915..cb53d96fd11 100644 --- a/sled-agent/src/sim/http_entrypoints_storage.rs +++ b/sled-agent/src/sim/http_entrypoints_storage.rs @@ -63,7 +63,7 @@ async fn region_list( rc: RequestContext>, ) -> Result>, HttpError> { let crucible = rc.context(); - Ok(HttpResponseOk(crucible.list().await)) + Ok(HttpResponseOk(crucible.list())) } #[endpoint { @@ -77,7 +77,7 @@ async fn region_create( let params = body.into_inner(); let crucible = rc.context(); - let region = crucible.create(params).await.map_err(|e| { + let region = crucible.create(params).map_err(|e| { HttpError::for_internal_error( format!("region create failure: {:?}", e,), ) @@ -97,7 +97,7 @@ async fn region_get( let id = path.into_inner().id; let crucible = rc.context(); - match crucible.get(id).await { + match crucible.get(id) { Some(region) => Ok(HttpResponseOk(region)), None => { Err(HttpError::for_not_found(None, "Region not found".to_string())) @@ -118,7 +118,6 @@ async fn region_delete( crucible .delete(id) - .await .map_err(|e| HttpError::for_bad_request(None, e.to_string()))?; Ok(HttpResponseDeleted()) @@ -136,16 +135,16 @@ async fn region_get_snapshots( let crucible = rc.context(); - if crucible.get(id.clone()).await.is_none() { + if crucible.get(id.clone()).is_none() { return Err(HttpError::for_not_found( None, "Region not found".to_string(), )); } - let snapshots = crucible.snapshots_for_region(&id).await; + let snapshots = crucible.snapshots_for_region(&id); - let running_snapshots = crucible.running_snapshots_for_id(&id).await; + let running_snapshots = crucible.running_snapshots_for_id(&id); Ok(HttpResponseOk(GetSnapshotResponse { snapshots, running_snapshots })) } @@ -167,14 +166,14 @@ async fn region_get_snapshot( let p = path.into_inner(); let crucible = rc.context(); - if crucible.get(p.id.clone()).await.is_none() { + if crucible.get(p.id.clone()).is_none() { return Err(HttpError::for_not_found( None, "Region not found".to_string(), )); } - for snapshot in &crucible.snapshots_for_region(&p.id).await { + for snapshot in &crucible.snapshots_for_region(&p.id) { if snapshot.name == p.name { return Ok(HttpResponseOk(snapshot.clone())); } @@ -203,7 +202,7 @@ async fn region_delete_snapshot( let p = path.into_inner(); let crucible = rc.context(); - if crucible.get(p.id.clone()).await.is_none() { + if crucible.get(p.id.clone()).is_none() { return Err(HttpError::for_not_found( None, "Region not found".to_string(), @@ -212,7 +211,6 @@ async fn region_delete_snapshot( crucible .delete_snapshot(&p.id, &p.name) - .await .map_err(|e| HttpError::for_bad_request(None, e.to_string()))?; Ok(HttpResponseDeleted()) @@ -235,14 +233,14 @@ async fn region_run_snapshot( let p = path.into_inner(); let crucible = rc.context(); - if crucible.get(p.id.clone()).await.is_none() { + if crucible.get(p.id.clone()).is_none() { return Err(HttpError::for_not_found( None, "Region not found".to_string(), )); } - let snapshots = crucible.snapshots_for_region(&p.id).await; + let snapshots = crucible.snapshots_for_region(&p.id); if !snapshots.iter().any(|x| x.name == p.name) { return Err(HttpError::for_not_found( @@ -251,10 +249,8 @@ async fn region_run_snapshot( )); } - let running_snapshot 
= crucible - .create_running_snapshot(&p.id, &p.name) - .await - .map_err(|e| { + let running_snapshot = + crucible.create_running_snapshot(&p.id, &p.name).map_err(|e| { HttpError::for_internal_error(format!( "running snapshot create failure: {:?}", e, @@ -275,14 +271,14 @@ async fn region_delete_running_snapshot( let p = path.into_inner(); let crucible = rc.context(); - if crucible.get(p.id.clone()).await.is_none() { + if crucible.get(p.id.clone()).is_none() { return Err(HttpError::for_not_found( None, "Region not found".to_string(), )); } - crucible.delete_running_snapshot(&p.id, &p.name).await.map_err(|e| { + crucible.delete_running_snapshot(&p.id, &p.name).map_err(|e| { HttpError::for_internal_error(format!( "running snapshot create failure: {:?}", e, diff --git a/sled-agent/src/sim/mod.rs b/sled-agent/src/sim/mod.rs index ab3b155b36d..c59af4ccce7 100644 --- a/sled-agent/src/sim/mod.rs +++ b/sled-agent/src/sim/mod.rs @@ -24,3 +24,4 @@ pub use config::{ }; pub use server::{run_standalone_server, RssArgs, Server}; pub use sled_agent::SledAgent; +pub(crate) use storage::Storage; diff --git a/sled-agent/src/sim/server.rs b/sled-agent/src/sim/server.rs index 3833d5ca7c6..ef13cdfcdd8 100644 --- a/sled-agent/src/sim/server.rs +++ b/sled-agent/src/sim/server.rs @@ -188,23 +188,19 @@ impl Server { let vendor = "synthetic-vendor".to_string(); let serial = format!("synthetic-serial-{zpool_id}"); let model = "synthetic-model".to_string(); - sled_agent - .create_external_physical_disk( - physical_disk_id, - DiskIdentity { - vendor: vendor.clone(), - serial: serial.clone(), - model: model.clone(), - }, - ) - .await; + sled_agent.create_external_physical_disk( + physical_disk_id, + DiskIdentity { + vendor: vendor.clone(), + serial: serial.clone(), + model: model.clone(), + }, + ); - sled_agent - .create_zpool(zpool_id, physical_disk_id, zpool.size) - .await; + sled_agent.create_zpool(zpool_id, physical_disk_id, zpool.size); let dataset_id = DatasetUuid::new_v4(); let address = - sled_agent.create_crucible_dataset(zpool_id, dataset_id).await; + sled_agent.create_crucible_dataset(zpool_id, dataset_id); datasets.push(NexusTypes::DatasetCreateRequest { zpool_id: zpool_id.into_untyped_uuid(), @@ -218,10 +214,8 @@ impl Server { // Whenever Nexus tries to allocate a region, it should complete // immediately. What efficiency! 
let crucible = - sled_agent.get_crucible_dataset(zpool_id, dataset_id).await; - crucible - .set_create_callback(Box::new(|_| RegionState::Created)) - .await; + sled_agent.get_crucible_dataset(zpool_id, dataset_id); + crucible.set_create_callback(Box::new(|_| RegionState::Created)) } Ok(Server { @@ -240,8 +234,7 @@ impl Server { self.log.new(o!("kind" => "pantry")), self.config.storage.ip, self.sled_agent.clone(), - ) - .await; + ); self.pantry_server = Some(pantry_server); self.pantry_server.as_ref().unwrap() } @@ -370,7 +363,7 @@ pub async fn run_standalone_server( dns.initialize_with_config(&log, &dns_config).await?; let internal_dns_version = dns_config.generation; - let all_u2_zpools = server.sled_agent.get_zpools().await; + let all_u2_zpools = server.sled_agent.get_zpools(); let get_random_zpool = || { use rand::seq::SliceRandom; let pool = all_u2_zpools @@ -516,12 +509,12 @@ pub async fn run_standalone_server( }; let mut datasets = vec![]; - let physical_disks = server.sled_agent.get_all_physical_disks().await; - let zpools = server.sled_agent.get_zpools().await; + let physical_disks = server.sled_agent.get_all_physical_disks(); + let zpools = server.sled_agent.get_zpools(); for zpool in &zpools { let zpool_id = ZpoolUuid::from_untyped_uuid(zpool.id); for (dataset_id, address) in - server.sled_agent.get_crucible_datasets(zpool_id).await + server.sled_agent.get_crucible_datasets(zpool_id) { datasets.push(NexusTypes::DatasetCreateRequest { zpool_id: zpool.id, @@ -540,7 +533,7 @@ pub async fn run_standalone_server( }; let omicron_physical_disks_config = - server.sled_agent.omicron_physical_disks_list().await?; + server.sled_agent.omicron_physical_disks_list()?; let mut sled_configs = BTreeMap::new(); sled_configs.insert( config.id, @@ -559,7 +552,7 @@ pub async fn run_standalone_server( }) .collect(), }, - datasets: server.sled_agent.datasets_config_list().await?, + datasets: server.sled_agent.datasets_config_list()?, zones, }, ); diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index 1f099fc0366..72dcd4a8d52 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -17,8 +17,10 @@ use crate::sim::simulatable::Simulatable; use crate::updates::UpdateManager; use anyhow::bail; use anyhow::Context; +use bytes::Bytes; +use dropshot::Body; use dropshot::HttpError; -use futures::lock::Mutex; +use futures::Stream; use nexus_sled_agent_shared::inventory::{ Inventory, InventoryDataset, InventoryDisk, InventoryZpool, OmicronZonesConfig, SledRole, @@ -47,8 +49,9 @@ use oxnet::Ipv6Net; use propolis_client::{ types::VolumeConstructionRequest, Client as PropolisClient, }; +use range_requests::PotentialRange; use sled_agent_api::SupportBundleMetadata; -use sled_agent_api::SupportBundleState; +use sled_agent_api::SupportBundleQueryType; use sled_agent_types::disk::DiskStateRequested; use sled_agent_types::early_networking::{ EarlyNetworkConfig, EarlyNetworkConfigBody, @@ -62,6 +65,7 @@ use std::collections::{HashMap, HashSet, VecDeque}; use std::net::{IpAddr, Ipv4Addr, Ipv6Addr, SocketAddr}; use std::str::FromStr; use std::sync::Arc; +use std::sync::Mutex; use std::time::Duration; use uuid::Uuid; @@ -79,14 +83,15 @@ pub struct SledAgent { vmms: Arc>, /// collection of simulated disks, indexed by disk uuid disks: Arc>, - storage: Arc>, + storage: Storage, updates: UpdateManager, nexus_address: SocketAddr, pub nexus_client: Arc, disk_id_to_region_ids: Mutex>>, pub v2p_mappings: Mutex>, - mock_propolis: - Mutex>, + mock_propolis: 
futures::lock::Mutex< + Option<(propolis_mock_server::Server, PropolisClient)>, + >, /// lists of external IPs assigned to instances pub external_ips: Mutex>>, @@ -173,11 +178,11 @@ impl SledAgent { }, }); - let storage = Arc::new(Mutex::new(Storage::new( + let storage = Storage::new( id.into_untyped_uuid(), config.storage.ip, storage_log, - ))); + ); let artifacts = ArtifactStore::new(&log, SimArtifactStorage::new(storage.clone())); @@ -202,7 +207,7 @@ impl SledAgent { v2p_mappings: Mutex::new(HashSet::new()), external_ips: Mutex::new(HashMap::new()), vpc_routes: Mutex::new(HashMap::new()), - mock_propolis: Mutex::new(None), + mock_propolis: futures::lock::Mutex::new(None), config: config.clone(), fake_zones: Mutex::new(OmicronZonesConfig { generation: Generation::new(), @@ -222,7 +227,7 @@ impl SledAgent { /// three crucible regions). Extract the region addresses, lookup the region /// from the port and pair disk id with region ids. This map is referred to /// later when making snapshots. - pub async fn map_disk_ids_to_region_ids( + pub fn map_disk_ids_to_region_ids( &self, volume_construction_request: &VolumeConstructionRequest, ) -> Result<(), Error> { @@ -243,11 +248,10 @@ impl SledAgent { let mut region_ids = Vec::new(); - let storage = self.storage.lock().await; + let storage = self.storage.lock(); for target in targets { let region = storage .get_region_for_port(target.port()) - .await .ok_or_else(|| { Error::internal_error(&format!( "no region for port {}", @@ -259,7 +263,8 @@ impl SledAgent { region_ids.push(region_id); } - let mut disk_id_to_region_ids = self.disk_id_to_region_ids.lock().await; + let mut disk_id_to_region_ids = + self.disk_id_to_region_ids.lock().unwrap(); disk_id_to_region_ids.insert(disk_id.to_string(), region_ids.clone()); Ok(()) @@ -398,10 +403,10 @@ impl SledAgent { for disk_request in &hardware.disks { let vcr = &disk_request.volume_construction_request; - self.map_disk_ids_to_region_ids(&vcr).await?; + self.map_disk_ids_to_region_ids(&vcr)?; } - let mut routes = self.vpc_routes.lock().await; + let mut routes = self.vpc_routes.lock().unwrap(); for nic in &hardware.nics { let my_routers = [ RouterId { vni: nic.vni, kind: RouterKind::System }, @@ -449,7 +454,8 @@ impl SledAgent { propolis_id: PropolisUuid, state: VmmStateRequested, ) -> Result { - if let Some(e) = self.instance_ensure_state_error.lock().await.as_ref() + if let Some(e) = + self.instance_ensure_state_error.lock().unwrap().as_ref() { return Err(e.clone().into()); } @@ -552,7 +558,7 @@ impl SledAgent { } pub async fn set_instance_ensure_state_error(&self, error: Option) { - *self.instance_ensure_state_error.lock().await = error; + *self.instance_ensure_state_error.lock().unwrap() = error; } /// Idempotently ensures that the given API Disk (described by `api_disk`) @@ -592,89 +598,58 @@ impl SledAgent { } /// Adds a Physical Disk to the simulated sled agent. 
- pub async fn create_external_physical_disk( + pub fn create_external_physical_disk( &self, id: PhysicalDiskUuid, identity: DiskIdentity, ) { let variant = DiskVariant::U2; - self.storage - .lock() - .await - .insert_physical_disk(id, identity, variant) - .await; + self.storage.lock().insert_physical_disk(id, identity, variant); } - pub async fn get_all_physical_disks( + pub fn get_all_physical_disks( &self, ) -> Vec { - self.storage.lock().await.get_all_physical_disks() + self.storage.lock().get_all_physical_disks() } - pub async fn get_zpools( - &self, - ) -> Vec { - self.storage.lock().await.get_all_zpools() + pub fn get_zpools(&self) -> Vec { + self.storage.lock().get_all_zpools() } - pub async fn get_crucible_datasets( + pub fn get_crucible_datasets( &self, zpool_id: ZpoolUuid, ) -> Vec<(DatasetUuid, SocketAddr)> { - self.storage.lock().await.get_all_crucible_datasets(zpool_id) + self.storage.lock().get_all_crucible_datasets(zpool_id) } /// Adds a Zpool to the simulated sled agent. - pub async fn create_zpool( + pub fn create_zpool( &self, id: ZpoolUuid, physical_disk_id: PhysicalDiskUuid, size: u64, ) { - self.storage - .lock() - .await - .insert_zpool(id, physical_disk_id, size) - .await; - } - - /// Adds a debug dataset within a zpool - pub async fn create_debug_dataset( - &self, - zpool_id: ZpoolUuid, - dataset_id: DatasetUuid, - ) { - self.storage - .lock() - .await - .insert_debug_dataset(zpool_id, dataset_id) - .await + self.storage.lock().insert_zpool(id, physical_disk_id, size); } /// Adds a Crucible Dataset within a zpool. - pub async fn create_crucible_dataset( + pub fn create_crucible_dataset( &self, zpool_id: ZpoolUuid, dataset_id: DatasetUuid, ) -> SocketAddr { - self.storage - .lock() - .await - .insert_crucible_dataset(zpool_id, dataset_id) - .await + self.storage.lock().insert_crucible_dataset(zpool_id, dataset_id) } /// Returns a crucible dataset within a particular zpool. - pub async fn get_crucible_dataset( + pub fn get_crucible_dataset( &self, zpool_id: ZpoolUuid, dataset_id: DatasetUuid, ) -> Arc { - self.storage - .lock() - .await - .get_crucible_dataset(zpool_id, dataset_id) - .await + self.storage.lock().get_crucible_dataset(zpool_id, dataset_id) } /// Issue a snapshot request for a Crucible disk attached to an instance. @@ -686,7 +661,7 @@ impl SledAgent { /// /// We're not simulating the propolis server, so directly create a /// snapshot here. - pub async fn instance_issue_disk_snapshot_request( + pub fn instance_issue_disk_snapshot_request( &self, _propolis_id: PropolisUuid, disk_id: Uuid, @@ -696,7 +671,7 @@ impl SledAgent { // for each region that makes up the disk. Use the disk_id_to_region_ids // map to perform lookup based on this function's disk id argument. 
- let disk_id_to_region_ids = self.disk_id_to_region_ids.lock().await; + let disk_id_to_region_ids = self.disk_id_to_region_ids.lock().unwrap(); let region_ids = disk_id_to_region_ids.get(&disk_id.to_string()); let region_ids = region_ids.ok_or_else(|| { @@ -705,16 +680,14 @@ impl SledAgent { info!(self.log, "disk id {} region ids are {:?}", disk_id, region_ids); - let storage = self.storage.lock().await; + let storage = self.storage.lock(); for region_id in region_ids { - let crucible_data = - storage.get_dataset_for_region(*region_id).await; + let crucible_data = storage.get_dataset_for_region(*region_id); if let Some(crucible_data) = crucible_data { crucible_data .create_snapshot(*region_id, snapshot_id) - .await .map_err(|e| Error::internal_error(&e.to_string()))?; } else { return Err(Error::not_found_by_id( @@ -727,28 +700,28 @@ impl SledAgent { Ok(()) } - pub async fn set_virtual_nic_host( + pub fn set_virtual_nic_host( &self, mapping: &VirtualNetworkInterfaceHost, ) -> Result<(), Error> { - let mut v2p_mappings = self.v2p_mappings.lock().await; + let mut v2p_mappings = self.v2p_mappings.lock().unwrap(); v2p_mappings.insert(mapping.clone()); Ok(()) } - pub async fn unset_virtual_nic_host( + pub fn unset_virtual_nic_host( &self, mapping: &VirtualNetworkInterfaceHost, ) -> Result<(), Error> { - let mut v2p_mappings = self.v2p_mappings.lock().await; + let mut v2p_mappings = self.v2p_mappings.lock().unwrap(); v2p_mappings.remove(mapping); Ok(()) } - pub async fn list_virtual_nics( + pub fn list_virtual_nics( &self, ) -> Result, Error> { - let v2p_mappings = self.v2p_mappings.lock().await; + let v2p_mappings = self.v2p_mappings.lock().unwrap(); Ok(Vec::from_iter(v2p_mappings.clone())) } @@ -763,7 +736,7 @@ impl SledAgent { )); } - let mut eips = self.external_ips.lock().await; + let mut eips = self.external_ips.lock().unwrap(); let my_eips = eips.entry(propolis_id).or_default(); // High-level behaviour: this should always succeed UNLESS @@ -796,7 +769,7 @@ impl SledAgent { )); } - let mut eips = self.external_ips.lock().await; + let mut eips = self.external_ips.lock().unwrap(); let my_eips = eips.entry(propolis_id).or_default(); my_eips.remove(&body_args); @@ -841,10 +814,7 @@ impl SledAgent { Ok(addr) } - pub async fn inventory( - &self, - addr: SocketAddr, - ) -> anyhow::Result { + pub fn inventory(&self, addr: SocketAddr) -> anyhow::Result { let sled_agent_address = match addr { SocketAddr::V4(_) => { bail!("sled_agent_ip must be v6 for inventory") @@ -852,7 +822,7 @@ impl SledAgent { SocketAddr::V6(v6) => v6, }; - let storage = self.storage.lock().await; + let storage = self.storage.lock(); Ok(Inventory { sled_id: self.id, sled_agent_address, @@ -867,7 +837,7 @@ impl SledAgent { self.config.hardware.reservoir_ram, ) .context("reservoir_size")?, - omicron_zones: self.fake_zones.lock().await.clone(), + omicron_zones: self.fake_zones.lock().unwrap().clone(), disks: storage .physical_disks() .values() @@ -899,7 +869,6 @@ impl SledAgent { // represent the "real" datasets the sled agent can observe. 
datasets: storage .datasets_config_list() - .await .map(|config| { config .datasets @@ -926,10 +895,10 @@ impl SledAgent { dataset_id: DatasetUuid, ) -> Result, HttpError> { self.storage - .lock() - .await - .support_bundle_list(zpool_id, dataset_id) + .as_support_bundle_storage(&self.log) + .list(zpool_id, dataset_id) .await + .map_err(|err| err.into()) } pub async fn support_bundle_create( @@ -938,22 +907,19 @@ impl SledAgent { dataset_id: DatasetUuid, support_bundle_id: SupportBundleUuid, expected_hash: ArtifactHash, + stream: impl Stream>, ) -> Result { self.storage - .lock() - .await - .support_bundle_create( + .as_support_bundle_storage(&self.log) + .create( zpool_id, dataset_id, support_bundle_id, expected_hash, + stream, ) - .await?; - - Ok(SupportBundleMetadata { - support_bundle_id, - state: SupportBundleState::Complete, - }) + .await + .map_err(|err| err.into()) } pub async fn support_bundle_get( @@ -961,12 +927,14 @@ impl SledAgent { zpool_id: ZpoolUuid, dataset_id: DatasetUuid, support_bundle_id: SupportBundleUuid, - ) -> Result<(), HttpError> { + range: Option, + query: SupportBundleQueryType, + ) -> Result, HttpError> { self.storage - .lock() - .await - .support_bundle_exists(zpool_id, dataset_id, support_bundle_id) + .as_support_bundle_storage(&self.log) + .get(zpool_id, dataset_id, support_bundle_id, range, query) .await + .map_err(|err| err.into()) } pub async fn support_bundle_delete( @@ -976,67 +944,58 @@ impl SledAgent { support_bundle_id: SupportBundleUuid, ) -> Result<(), HttpError> { self.storage - .lock() - .await - .support_bundle_delete(zpool_id, dataset_id, support_bundle_id) + .as_support_bundle_storage(&self.log) + .delete(zpool_id, dataset_id, support_bundle_id) .await + .map_err(|err| err.into()) } - pub async fn datasets_ensure( + pub fn datasets_ensure( &self, config: DatasetsConfig, ) -> Result { - self.storage.lock().await.datasets_ensure(config).await + self.storage.lock().datasets_ensure(config) } - pub async fn datasets_config_list( - &self, - ) -> Result { - self.storage.lock().await.datasets_config_list().await + pub fn datasets_config_list(&self) -> Result { + self.storage.lock().datasets_config_list() } - pub async fn omicron_physical_disks_list( + pub fn omicron_physical_disks_list( &self, ) -> Result { - self.storage.lock().await.omicron_physical_disks_list().await + self.storage.lock().omicron_physical_disks_list() } - pub async fn omicron_physical_disks_ensure( + pub fn omicron_physical_disks_ensure( &self, config: OmicronPhysicalDisksConfig, ) -> Result { - self.storage.lock().await.omicron_physical_disks_ensure(config).await + self.storage.lock().omicron_physical_disks_ensure(config) } - pub async fn omicron_zones_list(&self) -> OmicronZonesConfig { - self.fake_zones.lock().await.clone() + pub fn omicron_zones_list(&self) -> OmicronZonesConfig { + self.fake_zones.lock().unwrap().clone() } - pub async fn omicron_zones_ensure( - &self, - requested_zones: OmicronZonesConfig, - ) { - *self.fake_zones.lock().await = requested_zones; + pub fn omicron_zones_ensure(&self, requested_zones: OmicronZonesConfig) { + *self.fake_zones.lock().unwrap() = requested_zones; } - pub async fn drop_dataset( - &self, - zpool_id: ZpoolUuid, - dataset_id: DatasetUuid, - ) { - self.storage.lock().await.drop_dataset(zpool_id, dataset_id) + pub fn drop_dataset(&self, zpool_id: ZpoolUuid, dataset_id: DatasetUuid) { + self.storage.lock().drop_dataset(zpool_id, dataset_id) } - pub async fn list_vpc_routes(&self) -> Vec { - let routes = self.vpc_routes.lock().await; + pub 
fn list_vpc_routes(&self) -> Vec { + let routes = self.vpc_routes.lock().unwrap(); routes .iter() .map(|(k, v)| ResolvedVpcRouteState { id: *k, version: v.version }) .collect() } - pub async fn set_vpc_routes(&self, new_routes: Vec) { - let mut routes = self.vpc_routes.lock().await; + pub fn set_vpc_routes(&self, new_routes: Vec) { + let mut routes = self.vpc_routes.lock().unwrap(); for new in new_routes { // Disregard any route information for a subnet we don't have. let Some(old) = routes.get(&new.id) else { diff --git a/sled-agent/src/sim/storage.rs b/sled-agent/src/sim/storage.rs index dc8cf63fe41..2c44e4a4891 100644 --- a/sled-agent/src/sim/storage.rs +++ b/sled-agent/src/sim/storage.rs @@ -12,15 +12,18 @@ use crate::sim::http_entrypoints_pantry::ExpectedDigest; use crate::sim::http_entrypoints_pantry::PantryStatus; use crate::sim::http_entrypoints_pantry::VolumeStatus; use crate::sim::SledAgent; +use crate::support_bundle::storage::SupportBundleManager; use anyhow::{self, bail, Result}; +use camino::Utf8Path; +use camino_tempfile::Utf8TempDir; use chrono::prelude::*; use crucible_agent_client::types::{ CreateRegion, Region, RegionId, RunningSnapshot, Snapshot, State, }; use dropshot::HandlerTaskMode; use dropshot::HttpError; -use futures::lock::Mutex; use omicron_common::disk::DatasetManagementStatus; +use omicron_common::disk::DatasetName; use omicron_common::disk::DatasetsConfig; use omicron_common::disk::DatasetsManagementResult; use omicron_common::disk::DiskIdentity; @@ -28,24 +31,26 @@ use omicron_common::disk::DiskManagementStatus; use omicron_common::disk::DiskVariant; use omicron_common::disk::DisksManagementResult; use omicron_common::disk::OmicronPhysicalDisksConfig; -use omicron_common::update::ArtifactHash; +use omicron_common::disk::SharedDatasetConfig; use omicron_uuid_kinds::DatasetUuid; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::OmicronZoneUuid; use omicron_uuid_kinds::PhysicalDiskUuid; use omicron_uuid_kinds::PropolisUuid; -use omicron_uuid_kinds::SupportBundleUuid; use omicron_uuid_kinds::ZpoolUuid; use propolis_client::types::VolumeConstructionRequest; use serde::Serialize; -use sled_agent_api::SupportBundleMetadata; -use sled_agent_api::SupportBundleState; +use sled_storage::manager::NestedDatasetConfig; +use sled_storage::manager::NestedDatasetListOptions; +use sled_storage::manager::NestedDatasetLocation; use slog::Logger; +use std::collections::BTreeMap; use std::collections::HashMap; use std::collections::HashSet; use std::net::{IpAddr, SocketAddr}; use std::str::FromStr; use std::sync::Arc; +use std::sync::Mutex; use uuid::Uuid; type CreateCallback = Box State + Send + 'static>; @@ -676,91 +681,90 @@ impl CrucibleData { } } - pub async fn set_create_callback(&self, callback: CreateCallback) { - self.inner.lock().await.set_create_callback(callback); + pub fn set_create_callback(&self, callback: CreateCallback) { + self.inner.lock().unwrap().set_create_callback(callback); } - pub async fn list(&self) -> Vec { - self.inner.lock().await.list() + pub fn list(&self) -> Vec { + self.inner.lock().unwrap().list() } - pub async fn create(&self, params: CreateRegion) -> Result { - self.inner.lock().await.create(params) + pub fn create(&self, params: CreateRegion) -> Result { + self.inner.lock().unwrap().create(params) } - pub async fn get(&self, id: RegionId) -> Option { - self.inner.lock().await.get(id) + pub fn get(&self, id: RegionId) -> Option { + self.inner.lock().unwrap().get(id) } - pub async fn delete(&self, id: RegionId) -> Result> { - 
self.inner.lock().await.delete(id) + pub fn delete(&self, id: RegionId) -> Result> { + self.inner.lock().unwrap().delete(id) } - pub async fn create_snapshot( + pub fn create_snapshot( &self, id: Uuid, snapshot_id: Uuid, ) -> Result { - self.inner.lock().await.create_snapshot(id, snapshot_id) + self.inner.lock().unwrap().create_snapshot(id, snapshot_id) } - pub async fn snapshots_for_region(&self, id: &RegionId) -> Vec { - self.inner.lock().await.snapshots_for_region(id) + pub fn snapshots_for_region(&self, id: &RegionId) -> Vec { + self.inner.lock().unwrap().snapshots_for_region(id) } - pub async fn get_snapshot_for_region( + pub fn get_snapshot_for_region( &self, id: &RegionId, snapshot_id: &str, ) -> Option { - self.inner.lock().await.get_snapshot_for_region(id, snapshot_id) + self.inner.lock().unwrap().get_snapshot_for_region(id, snapshot_id) } - pub async fn running_snapshots_for_id( + pub fn running_snapshots_for_id( &self, id: &RegionId, ) -> HashMap { - self.inner.lock().await.running_snapshots_for_id(id) + self.inner.lock().unwrap().running_snapshots_for_id(id) } - pub async fn delete_snapshot( - &self, - id: &RegionId, - name: &str, - ) -> Result<()> { - self.inner.lock().await.delete_snapshot(id, name) + pub fn delete_snapshot(&self, id: &RegionId, name: &str) -> Result<()> { + self.inner.lock().unwrap().delete_snapshot(id, name) } - pub async fn set_creating_a_running_snapshot_should_fail(&self) { - self.inner.lock().await.set_creating_a_running_snapshot_should_fail(); + pub fn set_creating_a_running_snapshot_should_fail(&self) { + self.inner + .lock() + .unwrap() + .set_creating_a_running_snapshot_should_fail(); } - pub async fn set_region_creation_error(&self, value: bool) { - self.inner.lock().await.set_region_creation_error(value); + pub fn set_region_creation_error(&self, value: bool) { + self.inner.lock().unwrap().set_region_creation_error(value); } - pub async fn set_region_deletion_error(&self, value: bool) { - self.inner.lock().await.set_region_deletion_error(value); + pub fn set_region_deletion_error(&self, value: bool) { + self.inner.lock().unwrap().set_region_deletion_error(value); } - pub async fn create_running_snapshot( + pub fn create_running_snapshot( &self, id: &RegionId, name: &str, ) -> Result { - self.inner.lock().await.create_running_snapshot(id, name) + self.inner.lock().unwrap().create_running_snapshot(id, name) } - pub async fn delete_running_snapshot( + pub fn delete_running_snapshot( &self, id: &RegionId, name: &str, ) -> Result<()> { - self.inner.lock().await.delete_running_snapshot(id, name) + self.inner.lock().unwrap().delete_running_snapshot(id, name) } - pub async fn is_empty(&self) -> bool { - self.inner.lock().await.is_empty() + pub fn is_empty(&self) -> bool { + self.inner.lock().unwrap().is_empty() } } @@ -814,11 +818,6 @@ impl CrucibleServer { } } -#[derive(Default)] -pub(crate) struct DebugData { - bundles: HashMap, -} - pub(crate) struct PhysicalDisk { pub(crate) identity: DiskIdentity, pub(crate) variant: DiskVariant, @@ -828,7 +827,6 @@ pub(crate) struct PhysicalDisk { /// Describes data being simulated within a dataset. 
pub(crate) enum DatasetContents { Crucible(CrucibleServer), - Debug(DebugData), } pub(crate) struct Zpool { @@ -847,10 +845,6 @@ impl Zpool { Zpool { id, physical_disk_id, total_size, datasets: HashMap::new() } } - fn insert_debug_dataset(&mut self, id: DatasetUuid) { - self.datasets.insert(id, DatasetContents::Debug(DebugData::default())); - } - fn insert_crucible_dataset( &mut self, log: &Logger, @@ -871,10 +865,7 @@ impl Zpool { let DatasetContents::Crucible(crucible) = self .datasets .get(&id) - .expect("Failed to get the dataset we just inserted") - else { - panic!("Should have just inserted Crucible dataset"); - }; + .expect("Failed to get the dataset we just inserted"); crucible } @@ -882,17 +873,16 @@ impl Zpool { self.total_size } - pub async fn get_dataset_for_region( + pub fn get_dataset_for_region( &self, region_id: Uuid, ) -> Option> { for dataset in self.datasets.values() { - if let DatasetContents::Crucible(dataset) = dataset { - for region in &dataset.data().list().await { - let id = Uuid::from_str(®ion.id.0).unwrap(); - if id == region_id { - return Some(dataset.data()); - } + let DatasetContents::Crucible(dataset) = dataset; + for region in &dataset.data().list() { + let id = Uuid::from_str(®ion.id.0).unwrap(); + if id == region_id { + return Some(dataset.data()); } } } @@ -900,19 +890,18 @@ impl Zpool { None } - pub async fn get_region_for_port(&self, port: u16) -> Option { + pub fn get_region_for_port(&self, port: u16) -> Option { let mut regions = vec![]; for dataset in self.datasets.values() { - if let DatasetContents::Crucible(dataset) = dataset { - for region in &dataset.data().list().await { - if region.state == State::Destroyed { - continue; - } + let DatasetContents::Crucible(dataset) = dataset; + for region in &dataset.data().list() { + if region.state == State::Destroyed { + continue; + } - if port == region.port_number { - regions.push(region.clone()); - } + if port == region.port_number { + regions.push(region.clone()); } } } @@ -928,12 +917,90 @@ impl Zpool { } } +/// Represents a nested dataset +pub struct NestedDatasetStorage { + config: NestedDatasetConfig, + // We intentionally store the children before the mountpoint, + // so they are deleted first. + children: BTreeMap, + // We store this directory as a temporary directory so it gets + // removed when this struct is dropped. + #[allow(dead_code)] + mountpoint: Utf8TempDir, +} + +impl NestedDatasetStorage { + fn new( + zpool_root: &Utf8Path, + dataset_root: DatasetName, + path: String, + shared_config: SharedDatasetConfig, + ) -> Self { + let name = NestedDatasetLocation { path, root: dataset_root }; + + // Create a mountpoint for the nested dataset storage that lasts + // as long as the nested dataset does. + let mountpoint = name.mountpoint(zpool_root); + println!("NestedDatasetStorage: Mountpoint {mountpoint}"); + let parent = mountpoint.as_path().parent().unwrap(); + println!("NestedDatasetStorage: Creating parent dir: {parent}"); + std::fs::create_dir_all(&parent).unwrap(); + + let new_dir_name = mountpoint.as_path().file_name().unwrap(); + println!("NestedDatasetStorage: New dir name: {new_dir_name}"); + let mountpoint = camino_tempfile::Builder::new() + .rand_bytes(0) + .prefix(new_dir_name) + .tempdir_in(parent) + .unwrap(); + + Self { + config: NestedDatasetConfig { name, inner: shared_config }, + children: BTreeMap::new(), + mountpoint, + } + } +} + /// Simulated representation of all storage on a sled. 
+#[derive(Clone)] pub struct Storage { - sled_id: Uuid, + inner: Arc>, +} + +impl Storage { + pub fn new(sled_id: Uuid, crucible_ip: IpAddr, log: Logger) -> Self { + Self { + inner: Arc::new(Mutex::new(StorageInner::new( + sled_id, + crucible_ip, + log, + ))), + } + } + + pub fn lock(&self) -> std::sync::MutexGuard { + self.inner.lock().unwrap() + } + + pub fn as_support_bundle_storage<'a>( + &'a self, + log: &'a Logger, + ) -> SupportBundleManager<'a> { + SupportBundleManager::new(log, self) + } +} + +/// Simulated representation of all storage on a sled. +/// +/// Guarded by a mutex from [Storage]. +pub struct StorageInner { log: Logger, + sled_id: Uuid, + root: Utf8TempDir, config: Option, dataset_config: Option, + nested_datasets: HashMap, physical_disks: HashMap, next_disk_slot: i64, zpools: HashMap, @@ -941,13 +1008,15 @@ pub struct Storage { next_crucible_port: u16, } -impl Storage { +impl StorageInner { pub fn new(sled_id: Uuid, crucible_ip: IpAddr, log: Logger) -> Self { Self { sled_id, log, + root: camino_tempfile::tempdir().unwrap(), config: None, dataset_config: None, + nested_datasets: HashMap::new(), physical_disks: HashMap::new(), next_disk_slot: 0, zpools: HashMap::new(), @@ -956,14 +1025,17 @@ impl Storage { } } + /// Returns a path to the "zpool root" for storage. + pub fn root(&self) -> &Utf8Path { + self.root.path() + } + /// Returns an immutable reference to all (currently known) physical disks pub fn physical_disks(&self) -> &HashMap { &self.physical_disks } - pub async fn datasets_config_list( - &self, - ) -> Result { + pub fn datasets_config_list(&self) -> Result { let Some(config) = self.dataset_config.as_ref() else { return Err(HttpError::for_not_found( None, @@ -973,7 +1045,7 @@ impl Storage { Ok(config.clone()) } - pub async fn datasets_ensure( + pub fn datasets_ensure( &mut self, config: DatasetsConfig, ) -> Result { @@ -997,6 +1069,27 @@ impl Storage { } self.dataset_config.replace(config.clone()); + // Add a "nested dataset" entry for all datasets that should exist, + // and remove it for all datasets that have been removed. 
+ let dataset_names: HashSet<_> = config + .datasets + .values() + .map(|config| config.name.clone()) + .collect(); + for dataset in &dataset_names { + let root = self.root().to_path_buf(); + self.nested_datasets.entry(dataset.clone()).or_insert_with(|| { + NestedDatasetStorage::new( + &root, + dataset.clone(), + String::new(), + SharedDatasetConfig::default(), + ) + }); + } + self.nested_datasets + .retain(|dataset, _| dataset_names.contains(&dataset)); + Ok(DatasetsManagementResult { status: config .datasets @@ -1009,7 +1102,149 @@ impl Storage { }) } - pub async fn omicron_physical_disks_list( + pub fn nested_dataset_list( + &self, + name: NestedDatasetLocation, + options: NestedDatasetListOptions, + ) -> Result, HttpError> { + let Some(mut nested_dataset) = self.nested_datasets.get(&name.root) + else { + return Err(HttpError::for_not_found( + None, + "Dataset not found".to_string(), + )); + }; + + for path_component in name.path.split('/') { + if path_component.is_empty() { + continue; + } + match nested_dataset.children.get(path_component) { + Some(dataset) => nested_dataset = dataset, + None => { + return Err(HttpError::for_not_found( + None, + "Dataset not found".to_string(), + )) + } + }; + } + + let mut children: Vec<_> = nested_dataset + .children + .values() + .map(|storage| storage.config.clone()) + .collect(); + + match options { + NestedDatasetListOptions::ChildrenOnly => return Ok(children), + NestedDatasetListOptions::SelfAndChildren => { + children.insert(0, nested_dataset.config.clone()); + return Ok(children); + } + } + } + + pub fn nested_dataset_ensure( + &mut self, + config: NestedDatasetConfig, + ) -> Result<(), HttpError> { + let name = &config.name; + let nested_path = name.path.to_string(); + let zpool_root = self.root().to_path_buf(); + let Some(mut nested_dataset) = self.nested_datasets.get_mut(&name.root) + else { + return Err(HttpError::for_not_found( + None, + "Dataset not found".to_string(), + )); + }; + + for path_component in nested_path.split('/') { + if path_component.is_empty() { + continue; + } + + // Final component of path -- insert it here if it doesn't exist + // already. + if !path_component.contains('/') { + let entry = + nested_dataset.children.entry(path_component.to_string()); + entry + .and_modify(|storage| { + storage.config = config.clone(); + }) + .or_insert_with(|| { + NestedDatasetStorage::new( + &zpool_root, + config.name.root, + nested_path, + config.inner, + ) + }); + return Ok(()); + } + + match nested_dataset.children.get_mut(path_component) { + Some(dataset) => nested_dataset = dataset, + None => { + return Err(HttpError::for_not_found( + None, + "Dataset not found".to_string(), + )) + } + }; + } + return Err(HttpError::for_not_found( + None, + "Nested Dataset not found".to_string(), + )); + } + + pub fn nested_dataset_destroy( + &mut self, + name: NestedDatasetLocation, + ) -> Result<(), HttpError> { + let Some(mut nested_dataset) = self.nested_datasets.get_mut(&name.root) + else { + return Err(HttpError::for_not_found( + None, + "Dataset not found".to_string(), + )); + }; + + for path_component in name.path.split('/') { + if path_component.is_empty() { + continue; + } + + // Final component of path -- remove it if it exists. 
+ if !path_component.contains('/') { + if nested_dataset.children.remove(path_component).is_none() { + return Err(HttpError::for_not_found( + None, + "Nested Dataset not found".to_string(), + )); + }; + return Ok(()); + } + match nested_dataset.children.get_mut(path_component) { + Some(dataset) => nested_dataset = dataset, + None => { + return Err(HttpError::for_not_found( + None, + "Dataset not found".to_string(), + )) + } + }; + } + return Err(HttpError::for_not_found( + None, + "Nested Dataset not found".to_string(), + )); + } + + pub fn omicron_physical_disks_list( &mut self, ) -> Result { let Some(config) = self.config.as_ref() else { @@ -1021,7 +1256,7 @@ impl Storage { Ok(config.clone()) } - pub async fn omicron_physical_disks_ensure( + pub fn omicron_physical_disks_ensure( &mut self, config: OmicronPhysicalDisksConfig, ) -> Result { @@ -1057,7 +1292,7 @@ impl Storage { }) } - pub async fn insert_physical_disk( + pub fn insert_physical_disk( &mut self, id: PhysicalDiskUuid, identity: DiskIdentity, @@ -1070,7 +1305,7 @@ impl Storage { } /// Adds a Zpool to the sled's simulated storage. - pub async fn insert_zpool( + pub fn insert_zpool( &mut self, zpool_id: ZpoolUuid, disk_id: PhysicalDiskUuid, @@ -1085,143 +1320,8 @@ impl Storage { &self.zpools } - fn get_debug_dataset( - &self, - zpool_id: ZpoolUuid, - dataset_id: DatasetUuid, - ) -> Result<&DebugData, HttpError> { - let Some(zpool) = self.zpools.get(&zpool_id) else { - return Err(HttpError::for_not_found( - None, - format!("zpool does not exist {zpool_id}"), - )); - }; - let Some(dataset) = zpool.datasets.get(&dataset_id) else { - return Err(HttpError::for_not_found( - None, - format!("dataset does not exist {dataset_id}"), - )); - }; - - let DatasetContents::Debug(debug) = dataset else { - return Err(HttpError::for_bad_request( - None, - format!("Not a debug dataset: {zpool_id} / {dataset_id}"), - )); - }; - - Ok(debug) - } - - fn get_debug_dataset_mut( - &mut self, - zpool_id: ZpoolUuid, - dataset_id: DatasetUuid, - ) -> Result<&mut DebugData, HttpError> { - let Some(zpool) = self.zpools.get_mut(&zpool_id) else { - return Err(HttpError::for_not_found( - None, - format!("zpool does not exist {zpool_id}"), - )); - }; - let Some(dataset) = zpool.datasets.get_mut(&dataset_id) else { - return Err(HttpError::for_not_found( - None, - format!("dataset does not exist {dataset_id}"), - )); - }; - - let DatasetContents::Debug(debug) = dataset else { - return Err(HttpError::for_bad_request( - None, - format!("Not a debug dataset: {zpool_id} / {dataset_id}"), - )); - }; - - Ok(debug) - } - - pub async fn support_bundle_list( - &self, - zpool_id: ZpoolUuid, - dataset_id: DatasetUuid, - ) -> Result, HttpError> { - let debug = self.get_debug_dataset(zpool_id, dataset_id)?; - - Ok(debug - .bundles - .keys() - .map(|id| SupportBundleMetadata { - support_bundle_id: *id, - state: SupportBundleState::Complete, - }) - .collect()) - } - - pub async fn support_bundle_create( - &mut self, - zpool_id: ZpoolUuid, - dataset_id: DatasetUuid, - support_bundle_id: SupportBundleUuid, - hash: ArtifactHash, - ) -> Result<(), HttpError> { - let debug = self.get_debug_dataset_mut(zpool_id, dataset_id)?; - - // This is for the simulated server, so we totally ignore the "contents" - // of the bundle and just accept that it should exist. 
- debug.bundles.insert(support_bundle_id, hash); - - Ok(()) - } - - pub async fn support_bundle_exists( - &self, - zpool_id: ZpoolUuid, - dataset_id: DatasetUuid, - support_bundle_id: SupportBundleUuid, - ) -> Result<(), HttpError> { - let debug = self.get_debug_dataset(zpool_id, dataset_id)?; - - if !debug.bundles.contains_key(&support_bundle_id) { - return Err(HttpError::for_not_found( - None, - format!("Support bundle not found {support_bundle_id}"), - )); - } - Ok(()) - } - - pub async fn support_bundle_delete( - &mut self, - zpool_id: ZpoolUuid, - dataset_id: DatasetUuid, - support_bundle_id: SupportBundleUuid, - ) -> Result<(), HttpError> { - let debug = self.get_debug_dataset_mut(zpool_id, dataset_id)?; - - if debug.bundles.remove(&support_bundle_id).is_none() { - return Err(HttpError::for_not_found( - None, - format!("Support bundle not found {support_bundle_id}"), - )); - } - Ok(()) - } - - /// Adds a debug dataset to the sled's simulated storage - pub async fn insert_debug_dataset( - &mut self, - zpool_id: ZpoolUuid, - dataset_id: DatasetUuid, - ) { - self.zpools - .get_mut(&zpool_id) - .expect("Zpool does not exist") - .insert_debug_dataset(dataset_id); - } - /// Adds a Crucible dataset to the sled's simulated storage. - pub async fn insert_crucible_dataset( + pub fn insert_crucible_dataset( &mut self, zpool_id: ZpoolUuid, dataset_id: DatasetUuid, @@ -1295,12 +1395,11 @@ impl Storage { DatasetContents::Crucible(server) => { Some((*id, server.address())) } - _ => None, }) .collect() } - pub async fn get_dataset( + pub fn get_dataset( &self, zpool_id: ZpoolUuid, dataset_id: DatasetUuid, @@ -1313,24 +1412,22 @@ impl Storage { .expect("Dataset does not exist") } - pub async fn get_crucible_dataset( + pub fn get_crucible_dataset( &self, zpool_id: ZpoolUuid, dataset_id: DatasetUuid, ) -> Arc { - match self.get_dataset(zpool_id, dataset_id).await { + match self.get_dataset(zpool_id, dataset_id) { DatasetContents::Crucible(crucible) => crucible.data.clone(), - _ => panic!("{zpool_id} / {dataset_id} is not a crucible dataset"), } } - pub async fn get_dataset_for_region( + pub fn get_dataset_for_region( &self, region_id: Uuid, ) -> Option> { for zpool in self.zpools.values() { - if let Some(dataset) = zpool.get_dataset_for_region(region_id).await - { + if let Some(dataset) = zpool.get_dataset_for_region(region_id) { return Some(dataset); } } @@ -1338,10 +1435,10 @@ impl Storage { None } - pub async fn get_region_for_port(&self, port: u16) -> Option { + pub fn get_region_for_port(&self, port: u16) -> Option { let mut regions = vec![]; for zpool in self.zpools.values() { - if let Some(region) = zpool.get_region_for_port(port).await { + if let Some(region) = zpool.get_region_for_port(port) { regions.push(region); } } @@ -1389,18 +1486,18 @@ impl Pantry { } } - pub async fn status(&self) -> Result { + pub fn status(&self) -> Result { Ok(PantryStatus { - volumes: self.volumes.lock().await.keys().cloned().collect(), - num_job_handles: self.jobs.lock().await.len(), + volumes: self.volumes.lock().unwrap().keys().cloned().collect(), + num_job_handles: self.jobs.lock().unwrap().len(), }) } - pub async fn entry( + pub fn entry( &self, volume_id: String, ) -> Result { - let volumes = self.volumes.lock().await; + let volumes = self.volumes.lock().unwrap(); match volumes.get(&volume_id) { Some(entry) => Ok(entry.vcr.clone()), @@ -1408,12 +1505,12 @@ impl Pantry { } } - pub async fn attach( + pub fn attach( &self, volume_id: String, volume_construction_request: VolumeConstructionRequest, ) -> Result<()> 
{ - let mut volumes = self.volumes.lock().await; + let mut volumes = self.volumes.lock().unwrap(); volumes.insert( volume_id, @@ -1431,14 +1528,14 @@ impl Pantry { Ok(()) } - pub async fn attach_activate_background( + pub fn attach_activate_background( &self, volume_id: String, activate_job_id: String, volume_construction_request: VolumeConstructionRequest, ) -> Result<(), HttpError> { - let mut volumes = self.volumes.lock().await; - let mut jobs = self.jobs.lock().await; + let mut volumes = self.volumes.lock().unwrap(); + let mut jobs = self.jobs.lock().unwrap(); volumes.insert( volume_id, @@ -1458,30 +1555,30 @@ impl Pantry { Ok(()) } - pub async fn activate_background_attachment( + pub fn activate_background_attachment( &self, volume_id: String, ) -> Result { let activate_job = { - let volumes = self.volumes.lock().await; + let volumes = self.volumes.lock().unwrap(); volumes.get(&volume_id).unwrap().activate_job.clone().unwrap() }; - let mut status = self.volume_status(volume_id.clone()).await?; + let mut status = self.volume_status(volume_id.clone())?; status.active = true; status.seen_active = true; - self.update_volume_status(volume_id, status).await?; + self.update_volume_status(volume_id, status)?; Ok(activate_job) } - pub async fn volume_status( + pub fn volume_status( &self, volume_id: String, ) -> Result { - let volumes = self.volumes.lock().await; + let volumes = self.volumes.lock().unwrap(); match volumes.get(&volume_id) { Some(pantry_volume) => Ok(pantry_volume.status.clone()), @@ -1490,12 +1587,12 @@ impl Pantry { } } - pub async fn update_volume_status( + pub fn update_volume_status( &self, volume_id: String, status: VolumeStatus, ) -> Result<(), HttpError> { - let mut volumes = self.volumes.lock().await; + let mut volumes = self.volumes.lock().unwrap(); match volumes.get_mut(&volume_id) { Some(pantry_volume) => { @@ -1507,22 +1604,19 @@ impl Pantry { } } - pub async fn is_job_finished( - &self, - job_id: String, - ) -> Result { - let jobs = self.jobs.lock().await; + pub fn is_job_finished(&self, job_id: String) -> Result { + let jobs = self.jobs.lock().unwrap(); if !jobs.contains(&job_id) { return Err(HttpError::for_not_found(None, job_id)); } Ok(true) } - pub async fn get_job_result( + pub fn get_job_result( &self, job_id: String, ) -> Result, HttpError> { - let mut jobs = self.jobs.lock().await; + let mut jobs = self.jobs.lock().unwrap(); if !jobs.contains(&job_id) { return Err(HttpError::for_not_found(None, job_id)); } @@ -1530,23 +1624,23 @@ impl Pantry { Ok(Ok(true)) } - pub async fn import_from_url( + pub fn import_from_url( &self, volume_id: String, _url: String, _expected_digest: Option, ) -> Result { - self.entry(volume_id).await?; + self.entry(volume_id)?; // Make up job - let mut jobs = self.jobs.lock().await; + let mut jobs = self.jobs.lock().unwrap(); let job_id = Uuid::new_v4().to_string(); jobs.insert(job_id.clone()); Ok(job_id) } - pub async fn snapshot( + pub fn snapshot( &self, volume_id: String, snapshot_id: String, @@ -1555,12 +1649,11 @@ impl Pantry { // the simulated instance ensure, then call // [`instance_issue_disk_snapshot_request`] as the snapshot logic is the // same. 
- let volumes = self.volumes.lock().await; + let volumes = self.volumes.lock().unwrap(); let volume_construction_request = &volumes.get(&volume_id).unwrap().vcr; self.sled_agent - .map_disk_ids_to_region_ids(volume_construction_request) - .await?; + .map_disk_ids_to_region_ids(volume_construction_request)?; self.sled_agent .instance_issue_disk_snapshot_request( @@ -1568,17 +1661,16 @@ impl Pantry { volume_id.parse().unwrap(), snapshot_id.parse().unwrap(), ) - .await .map_err(|e| HttpError::for_internal_error(e.to_string())) } - pub async fn bulk_write( + pub fn bulk_write( &self, volume_id: String, offset: u64, data: Vec, ) -> Result<(), HttpError> { - let vcr = self.entry(volume_id).await?; + let vcr = self.entry(volume_id)?; // Currently, Nexus will only make volumes where the first subvolume is // a Region. This will change in the future! @@ -1632,19 +1724,19 @@ impl Pantry { Ok(()) } - pub async fn scrub(&self, volume_id: String) -> Result { - self.entry(volume_id).await?; + pub fn scrub(&self, volume_id: String) -> Result { + self.entry(volume_id)?; // Make up job - let mut jobs = self.jobs.lock().await; + let mut jobs = self.jobs.lock().unwrap(); let job_id = Uuid::new_v4().to_string(); jobs.insert(job_id.clone()); Ok(job_id) } - pub async fn detach(&self, volume_id: String) -> Result<()> { - let mut volumes = self.volumes.lock().await; + pub fn detach(&self, volume_id: String) -> Result<()> { + let mut volumes = self.volumes.lock().unwrap(); volumes.remove(&volume_id); Ok(()) } @@ -1656,11 +1748,7 @@ pub struct PantryServer { } impl PantryServer { - pub async fn new( - log: Logger, - ip: IpAddr, - sled_agent: Arc, - ) -> Self { + pub fn new(log: Logger, ip: IpAddr, sled_agent: Arc) -> Self { let pantry = Arc::new(Pantry::new(sled_agent)); let server = dropshot::ServerBuilder::new( diff --git a/sled-agent/src/support_bundle/storage.rs b/sled-agent/src/support_bundle/storage.rs index e51f35e1460..ff761372e4e 100644 --- a/sled-agent/src/support_bundle/storage.rs +++ b/sled-agent/src/support_bundle/storage.rs @@ -4,6 +4,7 @@ //! Management of and access to Support Bundles +use async_trait::async_trait; use bytes::Bytes; use camino::Utf8Path; use dropshot::Body; @@ -13,6 +14,7 @@ use futures::StreamExt; use omicron_common::api::external::Error as ExternalError; use omicron_common::disk::CompressionAlgorithm; use omicron_common::disk::DatasetConfig; +use omicron_common::disk::DatasetsConfig; use omicron_common::disk::SharedDatasetConfig; use omicron_common::update::ArtifactHash; use omicron_uuid_kinds::DatasetUuid; @@ -30,6 +32,7 @@ use sled_storage::manager::NestedDatasetLocation; use sled_storage::manager::StorageHandle; use slog::Logger; use slog_error_chain::InlineErrorChain; +use std::borrow::Cow; use std::io::Write; use tokio::io::AsyncReadExt; use tokio::io::AsyncSeekExt; @@ -37,6 +40,16 @@ use tokio::io::AsyncWriteExt; use tokio_util::io::ReaderStream; use zip::result::ZipError; +// The final name of the bundle, as it is stored within the dedicated +// datasets. +// +// The full path is of the form: +// +// /pool/ext/$(POOL_UUID)/crypt/$(DATASET_TYPE)/$(BUNDLE_UUID)/bundle.zip +// | | This is a per-bundle nested dataset +// | This is a Debug dataset +const BUNDLE_FILE_NAME: &str = "bundle.zip"; + #[derive(thiserror::Error, Debug)] pub enum Error { #[error(transparent)] @@ -100,6 +113,116 @@ impl From for HttpError { } } +/// Abstracts the storage APIs for accessing datasets. +/// +/// Allows support bundle storage to work on both simulated and non-simulated +/// sled agents. 
+#[async_trait] +pub trait LocalStorage: Sync { + // These methods are all prefixed as "dyn_" to avoid duplicating the name + // with the real implementations. + // + // Dispatch is a little silly; if I use the same name as the real + // implementation, then a "missing function" dispatches to the trait instead + // and results in infinite recursion. + + /// Returns all configured datasets + async fn dyn_datasets_config_list(&self) -> Result; + + /// Returns all nested datasets within an existing dataset + async fn dyn_nested_dataset_list( + &self, + name: NestedDatasetLocation, + options: NestedDatasetListOptions, + ) -> Result, Error>; + + /// Ensures a nested dataset exists + async fn dyn_nested_dataset_ensure( + &self, + config: NestedDatasetConfig, + ) -> Result<(), Error>; + + /// Destroys a nested dataset + async fn dyn_nested_dataset_destroy( + &self, + name: NestedDatasetLocation, + ) -> Result<(), Error>; + + /// Returns the root filesystem path where datasets are mounted. + /// + /// This is typically "/" in prod, but can be a temporary directory + /// for tests to isolate storage that typically appears globally. + fn zpool_mountpoint_root(&self) -> Cow; +} + +/// This implementation is effectively a pass-through to the real methods +#[async_trait] +impl LocalStorage for StorageHandle { + async fn dyn_datasets_config_list(&self) -> Result { + self.datasets_config_list().await.map_err(|err| err.into()) + } + + async fn dyn_nested_dataset_list( + &self, + name: NestedDatasetLocation, + options: NestedDatasetListOptions, + ) -> Result, Error> { + self.nested_dataset_list(name, options).await.map_err(|err| err.into()) + } + + async fn dyn_nested_dataset_ensure( + &self, + config: NestedDatasetConfig, + ) -> Result<(), Error> { + self.nested_dataset_ensure(config).await.map_err(|err| err.into()) + } + + async fn dyn_nested_dataset_destroy( + &self, + name: NestedDatasetLocation, + ) -> Result<(), Error> { + self.nested_dataset_destroy(name).await.map_err(|err| err.into()) + } + + fn zpool_mountpoint_root(&self) -> Cow { + Cow::Borrowed(illumos_utils::zpool::ZPOOL_MOUNTPOINT_ROOT.into()) + } +} + +/// This implementation allows storage bundles to be stored on simulated storage +#[async_trait] +impl LocalStorage for crate::sim::Storage { + async fn dyn_datasets_config_list(&self) -> Result { + self.lock().datasets_config_list().map_err(|err| err.into()) + } + + async fn dyn_nested_dataset_list( + &self, + name: NestedDatasetLocation, + options: NestedDatasetListOptions, + ) -> Result, Error> { + self.lock().nested_dataset_list(name, options).map_err(|err| err.into()) + } + + async fn dyn_nested_dataset_ensure( + &self, + config: NestedDatasetConfig, + ) -> Result<(), Error> { + self.lock().nested_dataset_ensure(config).map_err(|err| err.into()) + } + + async fn dyn_nested_dataset_destroy( + &self, + name: NestedDatasetLocation, + ) -> Result<(), Error> { + self.lock().nested_dataset_destroy(name).map_err(|err| err.into()) + } + + fn zpool_mountpoint_root(&self) -> Cow { + Cow::Owned(self.lock().root().to_path_buf()) + } +} + // Implements "seeking" and "putting a capacity on a file" manually. // // TODO: When https://github.com/zip-rs/zip2/issues/231 is resolved, @@ -226,7 +349,7 @@ fn stream_zip_entry( /// APIs to manage support bundle storage. pub struct SupportBundleManager<'a> { log: &'a Logger, - storage: &'a StorageHandle, + storage: &'a dyn LocalStorage, } impl<'a> SupportBundleManager<'a> { @@ -234,7 +357,7 @@ impl<'a> SupportBundleManager<'a> { /// to support bundle CRUD APIs. 
pub fn new( log: &'a Logger, - storage: &'a StorageHandle, + storage: &'a dyn LocalStorage, ) -> SupportBundleManager<'a> { Self { log, storage } } @@ -245,7 +368,7 @@ impl<'a> SupportBundleManager<'a> { zpool_id: ZpoolUuid, dataset_id: DatasetUuid, ) -> Result { - let datasets_config = self.storage.datasets_config_list().await?; + let datasets_config = self.storage.dyn_datasets_config_list().await?; let dataset = datasets_config .datasets .get(&dataset_id) @@ -279,7 +402,7 @@ impl<'a> SupportBundleManager<'a> { NestedDatasetLocation { path: String::from(""), root }; let datasets = self .storage - .nested_dataset_list( + .dyn_nested_dataset_list( dataset_location, NestedDatasetListOptions::ChildrenOnly, ) @@ -298,8 +421,8 @@ impl<'a> SupportBundleManager<'a> { // The dataset for a support bundle exists. let support_bundle_path = dataset .name - .mountpoint(illumos_utils::zpool::ZPOOL_MOUNTPOINT_ROOT.into()) - .join("bundle"); + .mountpoint(&*self.storage.zpool_mountpoint_root()) + .join(BUNDLE_FILE_NAME); // Identify whether or not the final "bundle" file exists. // @@ -388,9 +511,9 @@ impl<'a> SupportBundleManager<'a> { let dataset = NestedDatasetLocation { path: support_bundle_id.to_string(), root }; // The mounted root of the support bundle dataset - let support_bundle_dir = dataset - .mountpoint(illumos_utils::zpool::ZPOOL_MOUNTPOINT_ROOT.into()); - let support_bundle_path = support_bundle_dir.join("bundle"); + let support_bundle_dir = + dataset.mountpoint(&*self.storage.zpool_mountpoint_root()); + let support_bundle_path = support_bundle_dir.join(BUNDLE_FILE_NAME); let support_bundle_path_tmp = support_bundle_dir.join(format!( "bundle-{}.tmp", thread_rng() @@ -403,7 +526,7 @@ impl<'a> SupportBundleManager<'a> { // Ensure that the dataset exists. info!(log, "Ensuring dataset exists for bundle"); self.storage - .nested_dataset_ensure(NestedDatasetConfig { + .dyn_nested_dataset_ensure(NestedDatasetConfig { name: dataset, inner: SharedDatasetConfig { compression: CompressionAlgorithm::On, @@ -412,6 +535,7 @@ impl<'a> SupportBundleManager<'a> { }, }) .await?; + info!(log, "Dataset does exist for bundle"); // Exit early if the support bundle already exists if tokio::fs::try_exists(&support_bundle_path).await? { @@ -435,9 +559,14 @@ impl<'a> SupportBundleManager<'a> { // Stream the file into the dataset, first as a temporary file, // and then renaming to the final location. 
- info!(log, "Streaming bundle to storage"); + info!( + log, + "Streaming bundle to storage"; + "path" => ?support_bundle_path_tmp, + ); let tmp_file = tokio::fs::File::create(&support_bundle_path_tmp).await?; + if let Err(err) = Self::write_and_finalize_bundle( tmp_file, &support_bundle_path_tmp, @@ -481,7 +610,7 @@ impl<'a> SupportBundleManager<'a> { let root = self.get_configured_dataset(zpool_id, dataset_id).await?.name; self.storage - .nested_dataset_destroy(NestedDatasetLocation { + .dyn_nested_dataset_destroy(NestedDatasetLocation { path: support_bundle_id.to_string(), root, }) @@ -501,9 +630,9 @@ impl<'a> SupportBundleManager<'a> { let dataset = NestedDatasetLocation { path: support_bundle_id.to_string(), root }; // The mounted root of the support bundle dataset - let support_bundle_dir = dataset - .mountpoint(illumos_utils::zpool::ZPOOL_MOUNTPOINT_ROOT.into()); - let path = support_bundle_dir.join("bundle"); + let support_bundle_dir = + dataset.mountpoint(&*self.storage.zpool_mountpoint_root()); + let path = support_bundle_dir.join(BUNDLE_FILE_NAME); let f = tokio::fs::File::open(&path).await?; Ok(f) From 3c5047d6e238ede2975799f3242e6e1c3e508176 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 5 Dec 2024 13:17:03 -0800 Subject: [PATCH 27/61] Fixing clippy --- .../tasks/support_bundle_collector.rs | 68 +++++++++++++++++-- nexus/src/app/sagas/instance_ip_attach.rs | 18 ++--- nexus/src/app/sagas/instance_ip_detach.rs | 8 ++- sled-agent/src/sim/http_entrypoints.rs | 34 +++++----- sled-agent/src/sim/storage.rs | 6 +- sled-agent/src/support_bundle/storage.rs | 6 +- 6 files changed, 99 insertions(+), 41 deletions(-) diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 7a6e9f06046..46061c4ce21 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -910,7 +910,6 @@ mod test { // NOTE: The support bundle querying interface isn't supported on // the simulated sled agent (yet?) so we're skipping this step. skip_sled_info: true, - ..Default::default() }; let report = collector .collect_bundle(&opctx, &request) @@ -964,8 +963,7 @@ mod test { SupportBundleCollector::new(datastore.clone(), false, nexus.id()); // Each time we call "collect_bundle", we collect a SINGLE bundle. - let request = - BundleRequest { skip_sled_info: true, ..Default::default() }; + let request = BundleRequest { skip_sled_info: true }; let report = collector .collect_bundle(&opctx, &request) .await @@ -1065,9 +1063,69 @@ mod test { ); } - // TODO: Delete the bundle after collection? + #[nexus_test(server = crate::Server)] + async fn test_bundle_cleanup_cancel_after_collect( + cptestctx: &ControlPlaneTestContext, + ) { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.clone(), + datastore.clone(), + ); + + // Before we can create any bundles, we need to create the + // space for them to be provisioned. 
+ let _datasets = + TestDataset::setup(cptestctx, &datastore, &opctx, 1).await; - // TODO: Try to cancel a bundle mid-collection + // We can allocate a support bundle and collect it + let bundle = datastore + .support_bundle_create(&opctx, "For collection testing", nexus.id()) + .await + .expect("Couldn't allocate a support bundle"); + assert_eq!(bundle.state, SupportBundleState::Collecting); + + let collector = + SupportBundleCollector::new(datastore.clone(), false, nexus.id()); + let request = BundleRequest { skip_sled_info: true }; + let report = collector + .collect_bundle(&opctx, &request) + .await + .expect("Collection should have succeeded under test") + .expect("Collecting the bundle should have generated a report"); + assert_eq!(report.bundle, bundle.id.into()); + assert!(report.listed_in_service_sleds); + assert!(report.activated_in_db_ok); + + // Cancel the bundle after collection has completed + datastore + .support_bundle_update( + &opctx, + bundle.id.into(), + SupportBundleState::Destroying, + ) + .await + .unwrap(); + + // When we perform cleanup, we should see that it was removed from the + // underlying sled. + let report = collector + .cleanup_destroyed_bundles(&opctx) + .await + .expect("Cleanup should succeed with no work to do"); + assert_eq!( + report, + CleanupReport { + // Nothing was provisioned on the sled, since we hadn't started + // collection yet. + sled_bundles_deleted_ok: 1, + // The database state was "destroying", and now it's gone. + db_destroying_bundles_removed: 1, + ..Default::default() + } + ); + } // TODO: Collect a bundle from... // ... an expunged disk? diff --git a/nexus/src/app/sagas/instance_ip_attach.rs b/nexus/src/app/sagas/instance_ip_attach.rs index c890e65d333..b0d51f72014 100644 --- a/nexus/src/app/sagas/instance_ip_attach.rs +++ b/nexus/src/app/sagas/instance_ip_attach.rs @@ -435,14 +435,16 @@ pub(crate) mod test { &instance_id, ) .await; - let mut eips = sled_agent.external_ips.lock().unwrap(); - let my_eips = eips.entry(vmm_id).or_default(); - assert!(my_eips - .iter() - .any(|v| matches!(v, InstanceExternalIpBody::Floating(_)))); - assert!(my_eips - .iter() - .any(|v| matches!(v, InstanceExternalIpBody::Ephemeral(_)))); + { + let mut eips = sled_agent.external_ips.lock().unwrap(); + let my_eips = eips.entry(vmm_id).or_default(); + assert!(my_eips + .iter() + .any(|v| matches!(v, InstanceExternalIpBody::Floating(_)))); + assert!(my_eips + .iter() + .any(|v| matches!(v, InstanceExternalIpBody::Ephemeral(_)))); + } // DB has records for SNAT plus the new IPs. let db_eips = datastore diff --git a/nexus/src/app/sagas/instance_ip_detach.rs b/nexus/src/app/sagas/instance_ip_detach.rs index f96fbdf702c..bec46f0269e 100644 --- a/nexus/src/app/sagas/instance_ip_detach.rs +++ b/nexus/src/app/sagas/instance_ip_detach.rs @@ -405,9 +405,11 @@ pub(crate) mod test { &instance_id, ) .await; - let mut eips = sled_agent.external_ips.lock().unwrap(); - let my_eips = eips.entry(vmm_id).or_default(); - assert!(my_eips.is_empty()); + { + let mut eips = sled_agent.external_ips.lock().unwrap(); + let my_eips = eips.entry(vmm_id).or_default(); + assert!(my_eips.is_empty()); + } // DB only has record for SNAT. 
let db_eips = datastore diff --git a/sled-agent/src/sim/http_entrypoints.rs b/sled-agent/src/sim/http_entrypoints.rs index ac4a9a2605b..67f9851725e 100644 --- a/sled-agent/src/sim/http_entrypoints.rs +++ b/sled-agent/src/sim/http_entrypoints.rs @@ -449,15 +449,14 @@ impl SledAgentApi for SledAgentSimImpl { let range = rqctx.range(); let query = body.into_inner().query_type; - Ok(sa - .support_bundle_get( - zpool_id, - dataset_id, - support_bundle_id, - range, - query, - ) - .await?) + sa.support_bundle_get( + zpool_id, + dataset_id, + support_bundle_id, + range, + query, + ) + .await } async fn support_bundle_head( @@ -471,15 +470,14 @@ impl SledAgentApi for SledAgentSimImpl { let range = rqctx.range(); let query = body.into_inner().query_type; - Ok(sa - .support_bundle_get( - zpool_id, - dataset_id, - support_bundle_id, - range, - query, - ) - .await?) + sa.support_bundle_get( + zpool_id, + dataset_id, + support_bundle_id, + range, + query, + ) + .await } async fn support_bundle_delete( diff --git a/sled-agent/src/sim/storage.rs b/sled-agent/src/sim/storage.rs index 2c44e4a4891..e6d999e7e8b 100644 --- a/sled-agent/src/sim/storage.rs +++ b/sled-agent/src/sim/storage.rs @@ -1391,10 +1391,8 @@ impl StorageInner { zpool .datasets .iter() - .filter_map(|(id, dataset)| match dataset { - DatasetContents::Crucible(server) => { - Some((*id, server.address())) - } + .map(|(id, dataset)| match dataset { + DatasetContents::Crucible(server) => (*id, server.address()), }) .collect() } diff --git a/sled-agent/src/support_bundle/storage.rs b/sled-agent/src/support_bundle/storage.rs index ff761372e4e..bc19811c613 100644 --- a/sled-agent/src/support_bundle/storage.rs +++ b/sled-agent/src/support_bundle/storage.rs @@ -421,7 +421,7 @@ impl<'a> SupportBundleManager<'a> { // The dataset for a support bundle exists. let support_bundle_path = dataset .name - .mountpoint(&*self.storage.zpool_mountpoint_root()) + .mountpoint(&self.storage.zpool_mountpoint_root()) .join(BUNDLE_FILE_NAME); // Identify whether or not the final "bundle" file exists. 
@@ -512,7 +512,7 @@ impl<'a> SupportBundleManager<'a> { NestedDatasetLocation { path: support_bundle_id.to_string(), root }; // The mounted root of the support bundle dataset let support_bundle_dir = - dataset.mountpoint(&*self.storage.zpool_mountpoint_root()); + dataset.mountpoint(&self.storage.zpool_mountpoint_root()); let support_bundle_path = support_bundle_dir.join(BUNDLE_FILE_NAME); let support_bundle_path_tmp = support_bundle_dir.join(format!( "bundle-{}.tmp", @@ -631,7 +631,7 @@ impl<'a> SupportBundleManager<'a> { NestedDatasetLocation { path: support_bundle_id.to_string(), root }; // The mounted root of the support bundle dataset let support_bundle_dir = - dataset.mountpoint(&*self.storage.zpool_mountpoint_root()); + dataset.mountpoint(&self.storage.zpool_mountpoint_root()); let path = support_bundle_dir.join(BUNDLE_FILE_NAME); let f = tokio::fs::File::open(&path).await?; From 59612b02de52bb286b2277add5e129dedfa139e7 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 5 Dec 2024 20:19:28 -0800 Subject: [PATCH 28/61] Fix (and test) ordering, output paths --- .../tasks/support_bundle_collector.rs | 47 ++++++++++++++++--- 1 file changed, 41 insertions(+), 6 deletions(-) diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 46061c4ce21..823e06bb4ae 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -5,6 +5,7 @@ //! Background task for managing Support Bundles use crate::app::background::BackgroundTask; +use camino::Utf8DirEntry; use camino::Utf8Path; use camino_tempfile::tempdir; use camino_tempfile::tempfile; @@ -572,6 +573,8 @@ impl BackgroundTask for SupportBundleCollector { } } +// TODO: Could maybe test this in isolation + // Takes a directory "dir", and zips the contents into a single zipfile. fn bundle_to_zipfile(dir: &Utf8TempDir) -> anyhow::Result { let tempfile = tempfile()?; @@ -582,20 +585,28 @@ fn bundle_to_zipfile(dir: &Utf8TempDir) -> anyhow::Result { Ok(zip.finish()?) } -// TODO: Could maybe test this in isolation - fn recursively_add_directory_to_zipfile( zip: &mut ZipWriter, root_path: &Utf8Path, dir_path: &Utf8Path, ) -> anyhow::Result<()> { - for entry in dir_path.read_dir_utf8()? { - let entry = entry?; + // Readdir might return entries in a non-deterministic order. + // Let's sort it for the zipfile, to be nice. + let mut entries = dir_path + .read_dir_utf8()? + .filter_map(Result::ok) + .collect::>(); + entries.sort_by(|a, b| a.file_name().cmp(&b.file_name())); + + for entry in &entries { + // Remove the "/tmp/..." prefix from the path when we're storing it in the + // zipfile. 
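// To make the comment above concrete: `strip_prefix` drops the temporary
// collection root so only the bundle-relative path becomes the zip entry
// name. A tiny self-contained illustration (paths invented for the example):
fn zip_entry_name_example() {
    use camino::Utf8Path;
    let root = Utf8Path::new("/tmp/bundle-abc123");
    let file = Utf8Path::new("/tmp/bundle-abc123/dir-a/file-a");
    assert_eq!(file.strip_prefix(root).unwrap(), Utf8Path::new("dir-a/file-a"));
}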
+ let dst = entry.path().strip_prefix(root_path)?; + let file_type = entry.file_type()?; if file_type.is_file() { let opts = FullFileOptions::default(); let src = entry.path(); - let dst = entry.path().strip_prefix(root_path)?; zip.start_file_from_path(dst, opts)?; let buf = std::fs::read(&src)?; @@ -603,7 +614,7 @@ fn recursively_add_directory_to_zipfile( } if file_type.is_dir() { let opts = FullFileOptions::default(); - zip.add_directory_from_path(entry.path(), opts)?; + zip.add_directory_from_path(dst, opts)?; recursively_add_directory_to_zipfile(zip, root_path, entry.path())?; } } @@ -701,6 +712,30 @@ mod test { type ControlPlaneTestContext = nexus_test_utils::ControlPlaneTestContext; + // Ensure that we can convert a temporary directory into a zipfile + #[test] + fn test_zipfile_creation() { + let dir = tempdir().unwrap(); + + std::fs::create_dir_all(dir.path().join("dir-a")).unwrap(); + std::fs::create_dir_all(dir.path().join("dir-b")).unwrap(); + std::fs::write(dir.path().join("dir-a").join("file-a"), "some data") + .unwrap(); + std::fs::write(dir.path().join("file-b"), "more data").unwrap(); + + let zipfile = bundle_to_zipfile(&dir) + .expect("Should have been able to bundle zipfile"); + let archive = zip::read::ZipArchive::new(zipfile).unwrap(); + + // We expect the order to be deterministically alphabetical + let mut names = archive.file_names(); + assert_eq!(names.next(), Some("dir-a/")); + assert_eq!(names.next(), Some("dir-a/file-a")); + assert_eq!(names.next(), Some("dir-b/")); + assert_eq!(names.next(), Some("file-b")); + assert_eq!(names.next(), None); + } + // If we have not populated any bundles needing cleanup, the cleanup // process should succeed with an empty cleanup report. #[nexus_test(server = crate::Server)] From 569a28c343d0ff95095d3683d50464bacaa34fba Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 9 Dec 2024 18:02:41 -0800 Subject: [PATCH 29/61] Wire up bundles to blueprints, more tests --- nexus/db-queries/src/db/datastore/mod.rs | 1 + .../src/db/datastore/support_bundle.rs | 4 +- nexus/reconfigurator/execution/src/lib.rs | 31 ++++ .../tasks/support_bundle_collector.rs | 155 +++++++++++++++--- nexus/types/src/deployment/execution/spec.rs | 1 + 5 files changed, 167 insertions(+), 25 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 6d3f3f071e3..2b4556d604f 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -123,6 +123,7 @@ pub use region::RegionAllocationParameters; pub use silo::Discoverability; pub use sled::SledTransition; pub use sled::TransitionError; +pub use support_bundle::SupportBundleExpungementReport; pub use switch_port::SwitchPortSettingsCombinedResult; pub use virtual_provisioning_collection::StorageType; pub use vmm::VmmStateUpdateResult; diff --git a/nexus/db-queries/src/db/datastore/support_bundle.rs b/nexus/db-queries/src/db/datastore/support_bundle.rs index 3f04cb11352..e5467a77a6d 100644 --- a/nexus/db-queries/src/db/datastore/support_bundle.rs +++ b/nexus/db-queries/src/db/datastore/support_bundle.rs @@ -331,7 +331,7 @@ impl DataStore { .filter(dsl::id.eq_any(bundles_to_delete)) // This check should be redundant (we already // partitioned above based on this state) but out of - // an abundance of catuion we don't auto-delete a + // an abundance of caution we don't auto-delete a // bundle in any other state. 
.filter(dsl::state.eq(SupportBundleState::Destroying)) .execute_async(conn) @@ -364,7 +364,7 @@ impl DataStore { } }).collect::>(); - // Mark these support bundles as failing, and assign then + // Mark these support bundles as failing, and assign them // to a nexus that should still exist. // // This should lead to their storage being freed, if it diff --git a/nexus/reconfigurator/execution/src/lib.rs b/nexus/reconfigurator/execution/src/lib.rs index 5ba1665483c..096e77b7119 100644 --- a/nexus/reconfigurator/execution/src/lib.rs +++ b/nexus/reconfigurator/execution/src/lib.rs @@ -104,6 +104,13 @@ pub async fn realize_blueprint_with_overrides( blueprint, ); + register_support_bundle_failure_step( + &engine.for_component(ExecutionComponent::SupportBundles), + &opctx, + datastore, + blueprint, + ); + let sled_list = register_sled_list_step( &engine.for_component(ExecutionComponent::SledList), &opctx, @@ -244,6 +251,30 @@ fn register_zone_external_networking_step<'a>( .register(); } +fn register_support_bundle_failure_step<'a>( + registrar: &ComponentRegistrar<'_, 'a>, + opctx: &'a OpContext, + datastore: &'a DataStore, + blueprint: &'a Blueprint, +) { + registrar + .new_step( + ExecutionStepId::Ensure, + "Mark support bundles as failed if they rely on an expunged disk or sled", + move |_cx| async move { + datastore + .support_bundle_fail_expunged( + &opctx, blueprint, + ) + .await + .map_err(|err| anyhow!(err))?; + + StepSuccess::new(()).into() + }, + ) + .register(); +} + fn register_sled_list_step<'a>( registrar: &ComponentRegistrar<'_, 'a>, opctx: &'a OpContext, diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 823e06bb4ae..3e29aa6d717 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -695,7 +695,6 @@ mod test { use nexus_db_model::Dataset; use nexus_db_model::PhysicalDisk; use nexus_db_model::PhysicalDiskKind; - use nexus_db_model::PhysicalDiskPolicy; use nexus_db_model::Zpool; use nexus_test_utils::SLED_AGENT_UUID; use nexus_test_utils_macros::nexus_test; @@ -827,7 +826,6 @@ mod test { } struct TestDataset { - disk_id: PhysicalDiskUuid, zpool_id: ZpoolUuid, dataset_id: DatasetUuid, } @@ -865,14 +863,14 @@ mod test { disk_id, 1 << 40, ); - disks.push(Self { disk_id, zpool_id, dataset_id }) + disks.push(Self { zpool_id, dataset_id }) } // Create a configuration for the sled agent consisting of all these // debug datasets. let datasets = disks .iter() - .map(|TestDataset { zpool_id, dataset_id, .. }| { + .map(|TestDataset { zpool_id, dataset_id }| { ( *dataset_id, DatasetConfig { @@ -898,21 +896,6 @@ mod test { disks } - - async fn expunge_disk( - &self, - datastore: &Arc, - opctx: &OpContext, - ) { - datastore - .physical_disk_update_policy( - &opctx, - self.disk_id, - PhysicalDiskPolicy::Expunged, - ) - .await - .unwrap(); - } } #[nexus_test(server = crate::Server)] @@ -1162,9 +1145,135 @@ mod test { ); } - // TODO: Collect a bundle from... - // ... an expunged disk? - // ... an expunged sled? 
+ #[nexus_test(server = crate::Server)] + async fn test_bundle_cleanup_failed_bundle_before_collection( + cptestctx: &ControlPlaneTestContext, + ) { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.clone(), + datastore.clone(), + ); + + // Before we can create any bundles, we need to create the + // space for them to be provisioned. + let _datasets = + TestDataset::setup(cptestctx, &datastore, &opctx, 1).await; + + // We can allocate a support bundle, though we'll fail it before it gets + // collected. + let bundle = datastore + .support_bundle_create(&opctx, "For collection testing", nexus.id()) + .await + .expect("Couldn't allocate a support bundle"); + assert_eq!(bundle.state, SupportBundleState::Collecting); + + // Mark the bundle as "failing" - this should be triggered + // automatically by the blueprint executor if the corresponding + // storage has been expunged. + datastore + .support_bundle_update( + &opctx, + bundle.id.into(), + SupportBundleState::Failing, + ) + .await + .unwrap(); + + let collector = + SupportBundleCollector::new(datastore.clone(), false, nexus.id()); + + let report = collector + .cleanup_destroyed_bundles(&opctx) + .await + .expect("Cleanup should delete failing bundle"); + assert_eq!( + report, + CleanupReport { + // Nothing was provisioned on the sled, since we hadn't started + // collection yet. + sled_bundles_deleted_not_found: 1, + // The database state was "failing", and now it's updated. + // + // Note that it isn't immediately deleted, so the end-user + // should have a chance to observe the new state. + db_failing_bundles_updated: 1, + ..Default::default() + } + ); + } + + #[nexus_test(server = crate::Server)] + async fn test_bundle_cleanup_failed_bundle_after_collection( + cptestctx: &ControlPlaneTestContext, + ) { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.clone(), + datastore.clone(), + ); - // TODO: Verify nested dataset storage is cleared? + // Before we can create any bundles, we need to create the + // space for them to be provisioned. + let _datasets = + TestDataset::setup(cptestctx, &datastore, &opctx, 1).await; + + // We can allocate a support bundle and collect it + let bundle = datastore + .support_bundle_create(&opctx, "For collection testing", nexus.id()) + .await + .expect("Couldn't allocate a support bundle"); + assert_eq!(bundle.state, SupportBundleState::Collecting); + + let collector = + SupportBundleCollector::new(datastore.clone(), false, nexus.id()); + let request = BundleRequest { skip_sled_info: true }; + let report = collector + .collect_bundle(&opctx, &request) + .await + .expect("Collection should have succeeded under test") + .expect("Collecting the bundle should have generated a report"); + assert_eq!(report.bundle, bundle.id.into()); + + // Mark the bundle as "failing" - this should be triggered + // automatically by the blueprint executor if the corresponding + // storage has been expunged. + datastore + .support_bundle_update( + &opctx, + bundle.id.into(), + SupportBundleState::Failing, + ) + .await + .unwrap(); + + // We can cleanup this bundle, even though it has already been + // collected. 
+ let report = collector + .cleanup_destroyed_bundles(&opctx) + .await + .expect("Cleanup should delete failing bundle"); + assert_eq!( + report, + CleanupReport { + // The bundle was provisioned on the sled, so we should have + // successfully removed it when we later talk to the sled. + // + // This won't always be the case for removal - if the entire + // underlying sled was expunged, we won't get an affirmative + // HTTP response from it. But in our simulated environment, + // we have simply marked the bundle "failed" and kept the + // simulated server running. + sled_bundles_deleted_ok: 1, + // The database state was "failing", and now it's updated. + // + // Note that it isn't immediately deleted, so the end-user + // should have a chance to observe the new state. + db_failing_bundles_updated: 1, + ..Default::default() + } + ); + } } diff --git a/nexus/types/src/deployment/execution/spec.rs b/nexus/types/src/deployment/execution/spec.rs index 2472a47d03e..f6264d4deee 100644 --- a/nexus/types/src/deployment/execution/spec.rs +++ b/nexus/types/src/deployment/execution/spec.rs @@ -28,6 +28,7 @@ impl StepSpec for ReconfiguratorExecutionSpec { )] pub enum ExecutionComponent { ExternalNetworking, + SupportBundles, SledList, PhysicalDisks, OmicronZones, From 83bdc2e02594ad6b7b5ac145682f9e64d0f5feb1 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 10 Dec 2024 09:18:13 -0800 Subject: [PATCH 30/61] Support bundle re-assignment is no-op without any bundles to fail --- .../src/db/datastore/support_bundle.rs | 46 ++++++++++++------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/support_bundle.rs b/nexus/db-queries/src/db/datastore/support_bundle.rs index e5467a77a6d..26822c7f5fc 100644 --- a/nexus/db-queries/src/db/datastore/support_bundle.rs +++ b/nexus/db-queries/src/db/datastore/support_bundle.rs @@ -337,15 +337,6 @@ impl DataStore { .execute_async(conn) .await?; - let Some(arbitrary_valid_nexus) = - valid_nexus_zones.get(0).cloned() - else { - return Err(external::Error::internal_error( - "No valid Nexuses, we cannot re-assign this support bundle", - ) - .into()); - }; - // Find all bundles on nexuses that no longer exist. let bundles_with_bad_nexuses = dsl::support_bundle .filter(dsl::assigned_nexus.eq_any(invalid_nexus_zones)) @@ -379,18 +370,41 @@ impl DataStore { )) .execute_async(conn) .await?; - let bundles_reassigned = diesel::update(dsl::support_bundle) + + let mut report = SupportBundleExpungementReport { + bundles_failed_missing_datasets, + bundles_deleted_missing_datasets, + bundles_failing_missing_nexus, + bundles_reassigned: 0, + }; + + // Exit a little early if there are no bundles to re-assign. + // + // This is a tiny optimization, but really, it means that + // tests without Nexuses in their blueprints can succeed if + // they also have no support bundles. In practice, this is + // rare, but in our existing test framework, it's fairly + // common. 
+ if bundles_to_reassign.is_empty() { + return Ok(report); + } + + let Some(arbitrary_valid_nexus) = + valid_nexus_zones.get(0).cloned() + else { + return Err(external::Error::internal_error( + "No valid Nexuses, we cannot re-assign this support bundle", + ) + .into()); + }; + + report.bundles_reassigned = diesel::update(dsl::support_bundle) .filter(dsl::id.eq_any(bundles_to_reassign)) .set(dsl::assigned_nexus.eq(arbitrary_valid_nexus)) .execute_async(conn) .await?; - Ok(SupportBundleExpungementReport { - bundles_failed_missing_datasets, - bundles_deleted_missing_datasets, - bundles_failing_missing_nexus, - bundles_reassigned, - }) + Ok(report) } .boxed() }, From dcf4bed7a072efccf18b6a188cfd16252e3907c5 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 10 Dec 2024 14:56:38 -0800 Subject: [PATCH 31/61] comment cleaning --- nexus/src/app/background/tasks/support_bundle_collector.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 3e29aa6d717..6533c0854bd 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -573,8 +573,6 @@ impl BackgroundTask for SupportBundleCollector { } } -// TODO: Could maybe test this in isolation - // Takes a directory "dir", and zips the contents into a single zipfile. fn bundle_to_zipfile(dir: &Utf8TempDir) -> anyhow::Result { let tempfile = tempfile()?; From 55e9c24a7d44ea9768041e6ce623ea86683e852c Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 13 Dec 2024 13:14:38 -0800 Subject: [PATCH 32/61] use path params more --- nexus/external-api/output/nexus_tags.txt | 3 + nexus/external-api/src/lib.rs | 41 ++++- nexus/src/external_api/http_entrypoints.rs | 72 +++++++- nexus/types/src/external_api/params.rs | 9 + openapi/nexus.json | 193 ++++++++++++--------- 5 files changed, 225 insertions(+), 93 deletions(-) diff --git a/nexus/external-api/output/nexus_tags.txt b/nexus/external-api/output/nexus_tags.txt index 4c266f87b5f..4fc92b18d8a 100644 --- a/nexus/external-api/output/nexus_tags.txt +++ b/nexus/external-api/output/nexus_tags.txt @@ -33,7 +33,10 @@ probe_view GET /experimental/v1/probes/{probe support_bundle_create POST /experimental/v1/system/support-bundles support_bundle_delete DELETE /experimental/v1/system/support-bundles/{support_bundle} support_bundle_download GET /experimental/v1/system/support-bundles/{support_bundle}/download +support_bundle_download_file GET /experimental/v1/system/support-bundles/{support_bundle}/download/{file} support_bundle_head HEAD /experimental/v1/system/support-bundles/{support_bundle}/download +support_bundle_head_file HEAD /experimental/v1/system/support-bundles/{support_bundle}/download/{file} +support_bundle_index GET /experimental/v1/system/support-bundles/{support_bundle}/index support_bundle_list GET /experimental/v1/system/support-bundles support_bundle_view GET /experimental/v1/system/support-bundles/{support_bundle} timeseries_query POST /v1/timeseries/query diff --git a/nexus/external-api/src/lib.rs b/nexus/external-api/src/lib.rs index 6429cfd06b5..54ba3ab34bc 100644 --- a/nexus/external-api/src/lib.rs +++ b/nexus/external-api/src/lib.rs @@ -2793,7 +2793,7 @@ pub trait NexusExternalApi { query_params: Query, ) -> Result>, HttpError>; - /// View a single support bundle + /// View a support bundle #[endpoint { method = GET, path = "/experimental/v1/system/support-bundles/{support_bundle}", @@ -2804,7 
+2804,18 @@ pub trait NexusExternalApi { path_params: Path, ) -> Result, HttpError>; - /// Download the contents of a single support bundle + /// Download the index of a support bundle + #[endpoint { + method = GET, + path = "/experimental/v1/system/support-bundles/{support_bundle}/index", + tags = ["hidden"], // system/support-bundles: only one tag is allowed + }] + async fn support_bundle_index( + rqctx: RequestContext, + path_params: Path, + ) -> Result, HttpError>; + + /// Download the contents of a support bundle #[endpoint { method = GET, path = "/experimental/v1/system/support-bundles/{support_bundle}/download", @@ -2813,10 +2824,20 @@ pub trait NexusExternalApi { async fn support_bundle_download( rqctx: RequestContext, path_params: Path, - body: TypedBody, ) -> Result, HttpError>; - /// Download the metadata of a single support bundle + /// Download a file within a support bundle + #[endpoint { + method = GET, + path = "/experimental/v1/system/support-bundles/{support_bundle}/download/{file}", + tags = ["hidden"], // system/support-bundles: only one tag is allowed + }] + async fn support_bundle_download_file( + rqctx: RequestContext, + path_params: Path, + ) -> Result, HttpError>; + + /// Download the metadata of a support bundle #[endpoint { method = HEAD, path = "/experimental/v1/system/support-bundles/{support_bundle}/download", @@ -2825,7 +2846,17 @@ pub trait NexusExternalApi { async fn support_bundle_head( rqctx: RequestContext, path_params: Path, - body: TypedBody, + ) -> Result, HttpError>; + + /// Download the metadata of a file within the support bundle + #[endpoint { + method = HEAD, + path = "/experimental/v1/system/support-bundles/{support_bundle}/download/{file}", + tags = ["hidden"], // system/support-bundles: only one tag is allowed + }] + async fn support_bundle_head_file( + rqctx: RequestContext, + path_params: Path, ) -> Result, HttpError>; /// Create a new support bundle diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index bc451e2a763..cfc9f99851a 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -81,7 +81,6 @@ use omicron_common::api::external::NameOrId; use omicron_common::api::external::Probe; use omicron_common::api::external::RouterRoute; use omicron_common::api::external::RouterRouteKind; -use omicron_common::api::external::SupportBundleGetQueryParams; use omicron_common::api::external::SwitchPort; use omicron_common::api::external::SwitchPortSettings; use omicron_common::api::external::SwitchPortSettingsView; @@ -6074,10 +6073,55 @@ impl NexusExternalApi for NexusExternalApiImpl { .await } + async fn support_bundle_index( + rqctx: RequestContext, + _path_params: Path, + ) -> Result, HttpError> { + let apictx = rqctx.context(); + let handler = async { + let nexus = &apictx.context.nexus; + + let opctx = + crate::context::op_context_for_external_api(&rqctx).await?; + + Err(nexus + .unimplemented_todo(&opctx, crate::app::Unimpl::Public) + .await + .into()) + }; + apictx + .context + .external_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } + async fn support_bundle_download( rqctx: RequestContext, _path_params: Path, - _body: TypedBody, + ) -> Result, HttpError> { + let apictx = rqctx.context(); + let handler = async { + let nexus = &apictx.context.nexus; + + let opctx = + crate::context::op_context_for_external_api(&rqctx).await?; + + Err(nexus + .unimplemented_todo(&opctx, crate::app::Unimpl::Public) + .await + .into()) + 
}; + apictx + .context + .external_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } + + async fn support_bundle_download_file( + rqctx: RequestContext, + _path_params: Path, ) -> Result, HttpError> { let apictx = rqctx.context(); let handler = async { @@ -6101,7 +6145,29 @@ impl NexusExternalApi for NexusExternalApiImpl { async fn support_bundle_head( rqctx: RequestContext, _path_params: Path, - _body: TypedBody, + ) -> Result, HttpError> { + let apictx = rqctx.context(); + let handler = async { + let nexus = &apictx.context.nexus; + + let opctx = + crate::context::op_context_for_external_api(&rqctx).await?; + + Err(nexus + .unimplemented_todo(&opctx, crate::app::Unimpl::Public) + .await + .into()) + }; + apictx + .context + .external_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } + + async fn support_bundle_head_file( + rqctx: RequestContext, + _path_params: Path, ) -> Result, HttpError> { let apictx = rqctx.context(); let handler = async { diff --git a/nexus/types/src/external_api/params.rs b/nexus/types/src/external_api/params.rs index 4a4af8b052f..4e616e698f9 100644 --- a/nexus/types/src/external_api/params.rs +++ b/nexus/types/src/external_api/params.rs @@ -143,6 +143,15 @@ impl From for SiloSelector { } } +#[derive(Serialize, Deserialize, JsonSchema)] +pub struct SupportBundleFilePath { + #[serde(flatten)] + pub bundle: SupportBundlePath, + + /// The file within the bundle to download + pub file: String, +} + #[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, PartialEq)] pub struct OptionalSiloSelector { /// Name or ID of the silo diff --git a/openapi/nexus.json b/openapi/nexus.json index fff479b9df0..bc043059dd2 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -390,7 +390,7 @@ "tags": [ "hidden" ], - "summary": "View a single support bundle", + "summary": "View a support bundle", "operationId": "support_bundle_view", "parameters": [ { @@ -460,7 +460,7 @@ "tags": [ "hidden" ], - "summary": "Download the contents of a single support bundle", + "summary": "Download the contents of a support bundle", "operationId": "support_bundle_download", "parameters": [ { @@ -474,16 +474,75 @@ } } ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/SupportBundleGetQueryParams" + "responses": { + "default": { + "description": "", + "content": { + "*/*": { + "schema": {} + } + } + } + } + }, + "head": { + "tags": [ + "hidden" + ], + "summary": "Download the metadata of a support bundle", + "operationId": "support_bundle_head", + "parameters": [ + { + "in": "path", + "name": "support_bundle", + "description": "ID of the support bundle", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "responses": { + "default": { + "description": "", + "content": { + "*/*": { + "schema": {} } } + } + } + } + }, + "/experimental/v1/system/support-bundles/{support_bundle}/download/{file}": { + "get": { + "tags": [ + "hidden" + ], + "summary": "Download a file within a support bundle", + "operationId": "support_bundle_download_file", + "parameters": [ + { + "in": "path", + "name": "file", + "description": "The file within the bundle to download", + "required": true, + "schema": { + "type": "string" + } }, - "required": true - }, + { + "in": "path", + "name": "support_bundle", + "description": "ID of the support bundle", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], "responses": { "default": { "description": "", @@ -499,9 
+558,18 @@ "tags": [ "hidden" ], - "summary": "Download the metadata of a single support bundle", - "operationId": "support_bundle_head", + "summary": "Download the metadata of a file within the support bundle", + "operationId": "support_bundle_head_file", "parameters": [ + { + "in": "path", + "name": "file", + "description": "The file within the bundle to download", + "required": true, + "schema": { + "type": "string" + } + }, { "in": "path", "name": "support_bundle", @@ -513,16 +581,37 @@ } } ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/SupportBundleGetQueryParams" + "responses": { + "default": { + "description": "", + "content": { + "*/*": { + "schema": {} } } - }, - "required": true - }, + } + } + } + }, + "/experimental/v1/system/support-bundles/{support_bundle}/index": { + "get": { + "tags": [ + "hidden" + ], + "summary": "Download the index of a support bundle", + "operationId": "support_bundle_index", + "parameters": [ + { + "in": "path", + "name": "support_bundle", + "description": "ID of the support bundle", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], "responses": { "default": { "description": "", @@ -20293,18 +20382,6 @@ "items" ] }, - "SupportBundleGetQueryParams": { - "description": "Query parameters for reading the support bundle", - "type": "object", - "properties": { - "query_type": { - "$ref": "#/components/schemas/SupportBundleQueryType" - } - }, - "required": [ - "query_type" - ] - }, "SupportBundleInfo": { "type": "object", "properties": { @@ -20354,60 +20431,6 @@ "items" ] }, - "SupportBundleQueryType": { - "description": "Describes the type of access to the support bundle", - "oneOf": [ - { - "description": "Access the whole support bundle", - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": [ - "whole" - ] - } - }, - "required": [ - "type" - ] - }, - { - "description": "Access the names of all files within the support bundle", - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": [ - "index" - ] - } - }, - "required": [ - "type" - ] - }, - { - "description": "Access a specific file within the support bundle", - "type": "object", - "properties": { - "file_path": { - "type": "string" - }, - "type": { - "type": "string", - "enum": [ - "path" - ] - } - }, - "required": [ - "file_path", - "type" - ] - } - ] - }, "SupportBundleState": { "oneOf": [ { From efca59637727022106bf19f7f9db0f4c28a3afdb Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 13 Dec 2024 14:02:55 -0800 Subject: [PATCH 33/61] Review feedback --- nexus/db-model/src/support_bundle.rs | 5 +- .../src/db/datastore/support_bundle.rs | 142 ++++++++++++++++-- schema/crdb/dbinit.sql | 2 +- schema/crdb/support-bundles/up04.sql | 2 +- 4 files changed, 133 insertions(+), 18 deletions(-) diff --git a/nexus/db-model/src/support_bundle.rs b/nexus/db-model/src/support_bundle.rs index 0e530708770..a4b14d363b2 100644 --- a/nexus/db-model/src/support_bundle.rs +++ b/nexus/db-model/src/support_bundle.rs @@ -51,7 +51,7 @@ impl SupportBundleState { Destroying => vec![Active, Collecting, Failing], Failing => vec![Collecting, Active], // The "Failed" state is terminal. - Failed => vec![Collecting, Active, Failing], + Failed => vec![Active, Collecting, Failing], } } } @@ -69,8 +69,9 @@ impl From for SupportBundleStateView { // whether or not the bundle record can be safely deleted. // // Either way, it should be possible to delete the bundle. 
+ // If a user requests that we delete a bundle in these states: // - "Failing" bundles will become "Destroying" - // - "Failed" bundles will be destroyed immediately + // - "Failed" bundles can be deleted immediately Failing => SupportBundleStateView::Failed, Failed => SupportBundleStateView::Failed, } diff --git a/nexus/db-queries/src/db/datastore/support_bundle.rs b/nexus/db-queries/src/db/datastore/support_bundle.rs index 3f04cb11352..8f96c4fd7c2 100644 --- a/nexus/db-queries/src/db/datastore/support_bundle.rs +++ b/nexus/db-queries/src/db/datastore/support_bundle.rs @@ -36,8 +36,8 @@ use uuid::Uuid; const CANNOT_ALLOCATE_ERR_MSG: &'static str = "Current policy limits support bundle creation to 'one per external disk', and \ - no disks are available. Either delete old support bundles or add additional \ - disks"; + no disks are available. You must delete old support bundles before new ones \ + can be created"; const FAILURE_REASON_NO_DATASET: &'static str = "Allocated dataset no longer exists"; @@ -50,8 +50,8 @@ pub struct SupportBundleExpungementReport { /// Bundles marked "failed" because the datasets storing them have been /// expunged. pub bundles_failed_missing_datasets: usize, - /// Bundles deleted because the datasets storing them have been - /// expunged. + /// Bundles already in the "destroying" state that have been deleted because + /// the datasets storing them have been expunged. pub bundles_deleted_missing_datasets: usize, /// Bundles marked "destroying" because the nexuses managing them have been @@ -79,7 +79,7 @@ impl DataStore { reason_for_creation: &'static str, this_nexus_id: OmicronZoneUuid, ) -> CreateResult { - opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; + opctx.authorize(authz::Action::Modify, &authz::FLEET).await?; let conn = self.pool_connection_authorized(opctx).await?; #[derive(Debug)] @@ -164,7 +164,7 @@ impl DataStore { opctx: &OpContext, id: SupportBundleUuid, ) -> LookupResult { - opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; + opctx.authorize(authz::Action::Read, &authz::FLEET).await?; use db::schema::support_bundle::dsl; let conn = self.pool_connection_authorized(opctx).await?; @@ -182,7 +182,7 @@ impl DataStore { opctx: &OpContext, pagparams: &DataPageParams<'_, Uuid>, ) -> ListResultVec { - opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; + opctx.authorize(authz::Action::Read, &authz::FLEET).await?; use db::schema::support_bundle::dsl; let conn = self.pool_connection_authorized(opctx).await?; @@ -202,7 +202,7 @@ impl DataStore { nexus_id: OmicronZoneUuid, states: Vec, ) -> ListResultVec { - opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; + opctx.authorize(authz::Action::Read, &authz::FLEET).await?; use db::schema::support_bundle::dsl; let conn = self.pool_connection_authorized(opctx).await?; @@ -223,7 +223,7 @@ impl DataStore { opctx: &OpContext, blueprint: &nexus_types::deployment::Blueprint, ) -> Result { - opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; + opctx.authorize(authz::Action::Modify, &authz::FLEET).await?; // For this blueprint: The set of all expunged Nexus zones let invalid_nexus_zones = blueprint @@ -327,11 +327,10 @@ impl DataStore { // the dataset expungement speeds up the process. 
let bundles_deleted_missing_datasets = diesel::delete(dsl::support_bundle) - .filter(dsl::state.eq_any(state.valid_old_states())) .filter(dsl::id.eq_any(bundles_to_delete)) // This check should be redundant (we already // partitioned above based on this state) but out of - // an abundance of catuion we don't auto-delete a + // an abundance of caution we don't auto-delete a // bundle in any other state. .filter(dsl::state.eq(SupportBundleState::Destroying)) .execute_async(conn) @@ -410,7 +409,7 @@ impl DataStore { id: SupportBundleUuid, state: SupportBundleState, ) -> Result<(), Error> { - opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; + opctx.authorize(authz::Action::Modify, &authz::FLEET).await?; use db::schema::support_bundle::dsl; let conn = self.pool_connection_authorized(opctx).await?; @@ -443,7 +442,7 @@ impl DataStore { opctx: &OpContext, id: SupportBundleUuid, ) -> Result<(), Error> { - opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; + opctx.authorize(authz::Action::Modify, &authz::FLEET).await?; use db::schema::support_bundle::dsl; @@ -899,7 +898,7 @@ mod test { let observed_bundles = datastore .support_bundle_list(&opctx, &pagparams) .await - .expect("Should be able to get bundle we just created"); + .expect("Should be able to query when no bundles exist"); assert!(observed_bundles.is_empty()); db.terminate().await; @@ -1073,6 +1072,121 @@ mod test { logctx.cleanup_successful(); } + #[tokio::test] + async fn test_bundle_deleted_from_expunged_dataset() { + static TEST_NAME: &str = "test_bundle_deleted_from_expunged_dataset"; + let logctx = dev::test_setup_log(TEST_NAME); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let mut rng = SimRngState::from_seed(TEST_NAME); + let (_example, mut bp1) = ExampleSystemBuilder::new_with_rng( + &logctx.log, + rng.next_system_rng(), + ) + .build(); + + // Weirdly, the "ExampleSystemBuilder" blueprint has a parent blueprint, + // but which isn't exposed through the API. Since we're only able to see + // the blueprint it emits, that means we can't actually make it the + // target because "the parent blueprint is not the current target". + // + // Instead of dealing with that, we lie: claim this is the primordial + // blueprint, with no parent. + // + // Regardless, make this starter blueprint our target. + bp1.parent_blueprint_id = None; + bp_insert_and_make_target(&opctx, &datastore, &bp1).await; + + // Manually perform the equivalent of blueprint execution to populate + // database records. + let sleds = TestSled::new_from_blueprint(&bp1); + for sled in &sleds { + sled.create_database_records(&datastore, &opctx).await; + } + + // Extract Nexus and Dataset information from the generated blueprint. + let this_nexus_id = get_nexuses_from_blueprint( + &bp1, + BlueprintZoneFilter::ShouldBeRunning, + ) + .get(0) + .map(|id| *id) + .expect("There should be a Nexus in the example blueprint"); + let debug_datasets = get_debug_datasets_from_blueprint( + &bp1, + BlueprintDatasetFilter::InService, + ); + assert!(!debug_datasets.is_empty()); + + // When we create a bundle, it should exist on a dataset provisioned by + // the blueprint. 
+ let bundle = datastore + .support_bundle_create(&opctx, "for the test", this_nexus_id) + .await + .expect("Should be able to create bundle"); + assert_eq!(bundle.assigned_nexus, Some(this_nexus_id.into())); + assert!( + debug_datasets.contains(&DatasetUuid::from(bundle.dataset_id)), + "Bundle should have been allocated from a blueprint dataset" + ); + + // Start the deletion of this bundle + datastore + .support_bundle_update( + &opctx, + bundle.id.into(), + SupportBundleState::Destroying, + ) + .await + .expect("Should have been able to update state"); + + // If we try to "fail support bundles" from expunged datasets/nexuses, + // we should see a no-op. Nothing has been expunged yet! + let report = + datastore.support_bundle_fail_expunged(&opctx, &bp1).await.expect( + "Should have been able to perform no-op support bundle failure", + ); + assert_eq!(SupportBundleExpungementReport::default(), report); + + // Expunge the bundle's dataset (manually) + let bp2 = { + let mut bp2 = bp1.clone(); + bp2.id = Uuid::new_v4(); + bp2.parent_blueprint_id = Some(bp1.id); + expunge_dataset_for_bundle(&mut bp2, &bundle); + bp2 + }; + bp_insert_and_make_target(&opctx, &datastore, &bp2).await; + + datastore + .support_bundle_fail_expunged(&opctx, &bp1) + .await + .expect_err("bp1 is no longer the target; this should fail"); + let report = datastore + .support_bundle_fail_expunged(&opctx, &bp2) + .await + .expect("Should have been able to mark bundle state as failed"); + assert_eq!( + SupportBundleExpungementReport { + bundles_deleted_missing_datasets: 1, + ..Default::default() + }, + report + ); + + // Should observe no bundles (it should have been deleted) + let pagparams = DataPageParams::max_page(); + let observed_bundles = datastore + .support_bundle_list(&opctx, &pagparams) + .await + .expect("Should be able to query when no bundles exist"); + assert!(observed_bundles.is_empty()); + + db.terminate().await; + logctx.cleanup_successful(); + } + #[tokio::test] async fn test_bundle_failed_from_expunged_nexus_no_reassign() { static TEST_NAME: &str = diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 399b06570a8..37380f6e432 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -2449,7 +2449,7 @@ CREATE UNIQUE INDEX IF NOT EXISTS one_bundle_per_dataset ON omicron.public.suppo dataset_id ); -CREATE INDEX IF NOT EXISTS lookup_bundle_by_neuxs ON omicron.public.support_bundle ( +CREATE INDEX IF NOT EXISTS lookup_bundle_by_nexus ON omicron.public.support_bundle ( assigned_nexus ); diff --git a/schema/crdb/support-bundles/up04.sql b/schema/crdb/support-bundles/up04.sql index 6df95640fd9..58b903e5009 100644 --- a/schema/crdb/support-bundles/up04.sql +++ b/schema/crdb/support-bundles/up04.sql @@ -1,4 +1,4 @@ -CREATE INDEX IF NOT EXISTS lookup_bundle_by_neuxs ON omicron.public.support_bundle ( +CREATE INDEX IF NOT EXISTS lookup_bundle_by_nexus ON omicron.public.support_bundle ( assigned_nexus ); From e29733de837068a67fa8c1e367cb6291dc964db6 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 13 Dec 2024 14:25:51 -0800 Subject: [PATCH 34/61] Add to uncovered endpoints, since these aren't impl'd yet --- nexus/tests/output/uncovered-authz-endpoints.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/nexus/tests/output/uncovered-authz-endpoints.txt b/nexus/tests/output/uncovered-authz-endpoints.txt index 13fb389d6ec..0a9e62707f0 100644 --- a/nexus/tests/output/uncovered-authz-endpoints.txt +++ b/nexus/tests/output/uncovered-authz-endpoints.txt @@ -6,9 +6,12 @@ probe_view (get 
"/experimental/v1/probes/{probe support_bundle_list (get "/experimental/v1/system/support-bundles") support_bundle_view (get "/experimental/v1/system/support-bundles/{support_bundle}") support_bundle_download (get "/experimental/v1/system/support-bundles/{support_bundle}/download") +support_bundle_download_file (get "/experimental/v1/system/support-bundles/{support_bundle}/download/{file}") +support_bundle_index (get "/experimental/v1/system/support-bundles/{support_bundle}/index") ping (get "/v1/ping") networking_switch_port_status (get "/v1/system/hardware/switch-port/{port}/status") support_bundle_head (head "/experimental/v1/system/support-bundles/{support_bundle}/download") +support_bundle_head_file (head "/experimental/v1/system/support-bundles/{support_bundle}/download/{file}") device_auth_request (post "/device/auth") device_auth_confirm (post "/device/confirm") device_access_token (post "/device/token") From 51be611b154d6523a051f358540870a80b18cc71 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 16 Dec 2024 14:51:29 -0800 Subject: [PATCH 35/61] Handle bundle cleanup from deleted zpool --- nexus/db-queries/src/db/datastore/zpool.rs | 10 +- .../tasks/support_bundle_collector.rs | 380 ++++++++++++------ 2 files changed, 275 insertions(+), 115 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/zpool.rs b/nexus/db-queries/src/db/datastore/zpool.rs index 9d128b2541c..0938ed5ba60 100644 --- a/nexus/db-queries/src/db/datastore/zpool.rs +++ b/nexus/db-queries/src/db/datastore/zpool.rs @@ -288,7 +288,15 @@ impl DataStore { .select(dsl::sled_id) .first_async::(&*conn) .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + .map_err(|e| { + public_error_from_diesel( + e, + ErrorHandler::NotFoundByLookup( + ResourceType::Zpool, + LookupType::by_id(id.into_untyped_uuid()), + ), + ) + })?; Ok(SledUuid::from_untyped_uuid(id)) } diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 6533c0854bd..abc3ccefb70 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -21,10 +21,12 @@ use nexus_types::deployment::SledFilter; use nexus_types::identity::Asset; use omicron_common::api::external::DataPageParams; use omicron_common::api::external::Error; +use omicron_common::api::external::ResourceType; use omicron_common::update::ArtifactHash; use omicron_uuid_kinds::DatasetUuid; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::OmicronZoneUuid; +use omicron_uuid_kinds::SledUuid; use omicron_uuid_kinds::SupportBundleUuid; use omicron_uuid_kinds::ZpoolUuid; use serde::Serialize; @@ -58,6 +60,20 @@ struct CleanupReport { db_failing_bundles_updated: usize, } +// Result of asking a sled agent to clean up a bundle +enum SledAgentBundleCleanupResult { + Deleted, + NotFound, + Failed, +} + +// Result of asking the database to delete a bundle +enum DatabaseBundleCleanupResult { + DestroyingBundleRemoved, + FailingBundleUpdated, + BadState, +} + /// The background task responsible for cleaning and collecting support bundles pub struct SupportBundleCollector { datastore: Arc, @@ -74,6 +90,135 @@ impl SupportBundleCollector { SupportBundleCollector { datastore, disable, nexus_id } } + // Tells a sled agent to delete a support bundle + async fn cleanup_bundle_from_sled_agent( + &self, + opctx: &OpContext, + sled_id: SledUuid, + bundle: &SupportBundle, + ) -> anyhow::Result { + let sled_client = 
nexus_networking::sled_client( + &self.datastore, + &opctx, + sled_id.into_untyped_uuid(), + &opctx.log, + ) + .await?; + + let result = sled_client + .support_bundle_delete( + &ZpoolUuid::from(bundle.zpool_id), + &DatasetUuid::from(bundle.dataset_id), + &SupportBundleUuid::from(bundle.id), + ) + .await; + + match result { + Ok(_) => { + info!( + &opctx.log, + "SupportBundleCollector deleted bundle"; + "id" => %bundle.id + ); + return Ok(SledAgentBundleCleanupResult::Deleted); + } + Err(progenitor_client::Error::ErrorResponse(err)) + if err.status() == http::StatusCode::NOT_FOUND => + { + warn!( + &opctx.log, + "SupportBundleCollector could not delete bundle (not found)"; + "id" => %bundle.id + ); + + return Ok(SledAgentBundleCleanupResult::NotFound); + } + err => { + warn!( + &opctx.log, + "SupportBundleCollector could not delete bundle"; + "id" => %bundle.id, + "err" => ?err, + ); + + return Ok(SledAgentBundleCleanupResult::Failed); + } + } + } + + async fn cleanup_bundle_from_database( + &self, + opctx: &OpContext, + bundle: &SupportBundle, + ) -> anyhow::Result { + match bundle.state { + SupportBundleState::Destroying => { + // Destroying is a terminal state; no one should be able to + // change this state from underneath us. + self.datastore.support_bundle_delete( + opctx, + bundle.id.into(), + ).await.map_err(|err| { + warn!( + &opctx.log, + "SupportBundleCollector: Could not delete 'destroying' bundle"; + "err" => %err + ); + anyhow::anyhow!("Could not delete 'destroying' bundle: {:#}", err) + })?; + + return Ok( + DatabaseBundleCleanupResult::DestroyingBundleRemoved, + ); + } + SupportBundleState::Failing => { + if let Err(err) = self + .datastore + .support_bundle_update( + &opctx, + bundle.id.into(), + SupportBundleState::Failed, + ) + .await + { + if matches!(err, Error::InvalidRequest { .. }) { + // It's possible that the bundle is marked "destroying" by a + // user request, concurrently with our operation. + // + // In this case, we log that this happened, but do nothing. + // The next iteration of this background task should treat + // this as the "Destroying" case, and delete the bundle. + info!( + &opctx.log, + "SupportBundleCollector: Concurrent state change failing bundle"; + "bundle" => %bundle.id, + "err" => ?err, + ); + return Ok(DatabaseBundleCleanupResult::BadState); + } else { + anyhow::bail!( + "Could not delete 'failing' bundle: {:#}", + err + ); + } + } + + return Ok(DatabaseBundleCleanupResult::FailingBundleUpdated); + } + other => { + // We should be filtering to only see "Destroying" and + // "Failing" bundles in our database request above. + error!( + &opctx.log, + "SupportBundleCollector: Cleaning bundle in unexpected state"; + "id" => %bundle.id, + "state" => ?other + ); + return Ok(DatabaseBundleCleanupResult::BadState); + } + } + } + // Monitors all bundles that are "destroying" or "failing" and assigned to // this Nexus, and attempts to clear their storage from Sled Agents. async fn cleanup_destroyed_bundles( @@ -117,130 +262,65 @@ impl SupportBundleCollector { ); // Find the sled where we're storing this bundle. - // - // TODO: What happens if the zpool is concurrently expunged here? - // TODO: Should we "continue" to other bundles rather than bailing - // early? 
- let sled_id = self + let result = self .datastore .zpool_get_sled(&opctx, bundle.zpool_id.into()) - .await?; - let sled_client = nexus_networking::sled_client( - &self.datastore, - &opctx, - sled_id.into_untyped_uuid(), - &opctx.log, - ) - .await?; - - let result = sled_client - .support_bundle_delete( - &ZpoolUuid::from(bundle.zpool_id), - &DatasetUuid::from(bundle.dataset_id), - &SupportBundleUuid::from(bundle.id), - ) .await; - match result { - Ok(_) => { - info!( - &opctx.log, - "SupportBundleCollector deleted bundle"; - "id" => %bundle.id - ); - report.sled_bundles_deleted_ok += 1; - - // Safe to fall-through; we deleted sled-local state. - } - Err(progenitor_client::Error::ErrorResponse(err)) - if err.status() == http::StatusCode::NOT_FOUND => - { - warn!( - &opctx.log, - "SupportBundleCollector could not delete bundle (not found)"; - "id" => %bundle.id - ); - - report.sled_bundles_deleted_not_found += 1; - - // Safe to fall-through; sled-local state does not exist - } - err => { - warn!( - &opctx.log, - "SupportBundleCollector could not delete bundle"; - "id" => %bundle.id, - "err" => ?err, - ); - - report.sled_bundles_delete_failed += 1; - - // We don't delete this bundle -- the sled storage may be - // experiencing a transient error. - continue; - } - } - - match bundle.state { - SupportBundleState::Destroying => { - // Destroying is a terminal state; no one should be able to - // change this state from underneath us. - self.datastore.support_bundle_delete( - opctx, - bundle.id.into(), - ).await.map_err(|err| { - warn!( - &opctx.log, - "SupportBundleCollector: Could not delete 'destroying' bundle"; - "err" => %err - ); - anyhow::anyhow!("Could not delete 'destroying' bundle: {:#}", err) - })?; + println!("zpool_get_sled result: {result:?}"); - report.db_destroying_bundles_removed += 1; - } - SupportBundleState::Failing => { - if let Err(err) = self - .datastore - .support_bundle_update( - &opctx, - bundle.id.into(), - SupportBundleState::Failed, + let delete_from_db = match result { + Ok(sled_id) => { + match self + .cleanup_bundle_from_sled_agent( + &opctx, sled_id, &bundle, ) - .await + .await? { - if matches!(err, Error::InvalidRequest { .. }) { - // It's possible that the bundle is marked "destroying" by a - // user request, concurrently with our operation. - // - // In this case, we log that this happened, but do nothing. - // The next iteration of this background task should treat - // this as the "Destroying" case, and delete the bundle. - info!( - &opctx.log, - "SupportBundleCollector: Concurrent state change failing bundle"; - "bundle" => %bundle.id, - "err" => ?err, - ); - } else { - anyhow::bail!( - "Could not delete 'failing' bundle: {:#}", - err - ); + SledAgentBundleCleanupResult::Deleted => { + report.sled_bundles_deleted_ok += 1; + true } - } + SledAgentBundleCleanupResult::NotFound => { + report.sled_bundles_deleted_not_found += 1; + true + } + SledAgentBundleCleanupResult::Failed => { + report.sled_bundles_delete_failed += 1; - report.db_failing_bundles_updated += 1; + // If the sled agent reports any other error, don't + // delete the bundle from the database. It might be + // transiently unavailable. + false + } + } } - other => { - // We should be filtering to only see "Destroying" and - // "Failing" bundles in our database request above. 
- error!( - &opctx.log, - "SupportBundleCollector: Cleaning bundle in unexpected state"; - "id" => %bundle.id, - "state" => ?other - ); + Err(e) + if matches!( + e, + Error::ObjectNotFound { + type_name: ResourceType::Zpool, + .. + } + ) => + { + // If the pool wasn't found in the database, it was + // expunged. Delete the support bundle, since there is no + // sled agent state to manage anymore. + true + } + Err(_) => false, + }; + + if delete_from_db { + match self.cleanup_bundle_from_database(opctx, &bundle).await? { + DatabaseBundleCleanupResult::DestroyingBundleRemoved => { + report.db_destroying_bundles_removed += 1; + } + DatabaseBundleCleanupResult::FailingBundleUpdated => { + report.db_failing_bundles_updated += 1; + } + DatabaseBundleCleanupResult::BadState => {} } } } @@ -1274,4 +1354,76 @@ mod test { } ); } + + #[nexus_test(server = crate::Server)] + async fn test_bundle_cleanup_after_zpool_deletion( + cptestctx: &ControlPlaneTestContext, + ) { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.clone(), + datastore.clone(), + ); + + // Before we can create any bundles, we need to create the + // space for them to be provisioned. + let _datasets = + TestDataset::setup(cptestctx, &datastore, &opctx, 1).await; + + // We can allocate a support bundle and collect it + let bundle = datastore + .support_bundle_create(&opctx, "For collection testing", nexus.id()) + .await + .expect("Couldn't allocate a support bundle"); + assert_eq!(bundle.state, SupportBundleState::Collecting); + + let collector = + SupportBundleCollector::new(datastore.clone(), false, nexus.id()); + let request = BundleRequest { skip_sled_info: true }; + let report = collector + .collect_bundle(&opctx, &request) + .await + .expect("Collection should have succeeded under test") + .expect("Collecting the bundle should have generated a report"); + assert_eq!(report.bundle, bundle.id.into()); + + // Mark the bundle as "failing" - this should be triggered + // automatically by the blueprint executor if the corresponding + // storage has been expunged. + datastore + .support_bundle_update( + &opctx, + bundle.id.into(), + SupportBundleState::Failing, + ) + .await + .unwrap(); + + // Delete the zpool holding the bundle. + // + // This should call the "zpool_get_sled" call to fail! + datastore + .zpool_delete_self_and_all_datasets(&opctx, bundle.zpool_id.into()) + .await + .unwrap(); + + // We can cleanup this bundle, even though it has already been + // collected. + let report = collector + .cleanup_destroyed_bundles(&opctx) + .await + .expect("Cleanup should delete failing bundle"); + assert_eq!( + report, + CleanupReport { + // The database state was "failing", and now it's updated. + // + // Note that it isn't immediately deleted, so the end-user + // should have a chance to observe the new state. 
+ db_failing_bundles_updated: 1, + ..Default::default() + } + ); + } } From 660fa66d587c78b64bed55401c41b28ff381f62b Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 17 Dec 2024 13:06:35 -0800 Subject: [PATCH 36/61] using new sled agent API --- nexus/src/app/support_bundles.rs | 99 ++++++++++++++++++-------------- 1 file changed, 57 insertions(+), 42 deletions(-) diff --git a/nexus/src/app/support_bundles.rs b/nexus/src/app/support_bundles.rs index 01e3d135b95..784a1596b2f 100644 --- a/nexus/src/app/support_bundles.rs +++ b/nexus/src/app/support_bundles.rs @@ -33,26 +33,6 @@ pub enum SupportBundleQueryType { Path { file_path: String }, } -// TODO: Remove me once https://github.com/oxidecomputer/omicron/pull/7259 -// merges -use omicron_common::api::external::SupportBundleQueryType as SledAgentSupportBundleQueryType; - -impl From for SledAgentSupportBundleQueryType { - fn from(query: SupportBundleQueryType) -> Self { - match query { - SupportBundleQueryType::Whole => { - SledAgentSupportBundleQueryType::Whole - } - SupportBundleQueryType::Index => { - SledAgentSupportBundleQueryType::Index - } - SupportBundleQueryType::Path { file_path } => { - SledAgentSupportBundleQueryType::Path { file_path } - } - } - } -} - impl super::Nexus { pub async fn support_bundle_list( &self, @@ -103,28 +83,63 @@ impl super::Nexus { // TODO: Use "range"? - let query = - omicron_common::api::external::SupportBundleGetQueryParams { - query_type: query.into(), - }; - let response = if head { - client - .support_bundle_head( - &ZpoolUuid::from(bundle.zpool_id), - &DatasetUuid::from(bundle.dataset_id), - &SupportBundleUuid::from(bundle.id), - &query, - ) - .await - } else { - client - .support_bundle_get( - &ZpoolUuid::from(bundle.zpool_id), - &DatasetUuid::from(bundle.dataset_id), - &SupportBundleUuid::from(bundle.id), - &query, - ) - .await + let response = match (query, head) { + (SupportBundleQueryType::Whole, true) => { + client + .support_bundle_head( + &ZpoolUuid::from(bundle.zpool_id), + &DatasetUuid::from(bundle.dataset_id), + &SupportBundleUuid::from(bundle.id), + ) + .await + } + (SupportBundleQueryType::Whole, false) => { + client + .support_bundle_download( + &ZpoolUuid::from(bundle.zpool_id), + &DatasetUuid::from(bundle.dataset_id), + &SupportBundleUuid::from(bundle.id), + ) + .await + } + (SupportBundleQueryType::Index, true) => { + client + .support_bundle_head_index( + &ZpoolUuid::from(bundle.zpool_id), + &DatasetUuid::from(bundle.dataset_id), + &SupportBundleUuid::from(bundle.id), + ) + .await + } + (SupportBundleQueryType::Index, false) => { + client + .support_bundle_index( + &ZpoolUuid::from(bundle.zpool_id), + &DatasetUuid::from(bundle.dataset_id), + &SupportBundleUuid::from(bundle.id), + ) + .await + } + (SupportBundleQueryType::Path { file_path }, true) => { + client + .support_bundle_head_file( + &ZpoolUuid::from(bundle.zpool_id), + &DatasetUuid::from(bundle.dataset_id), + &SupportBundleUuid::from(bundle.id), + &file_path, + ) + .await + } + (SupportBundleQueryType::Path { file_path }, false) => { + client + .support_bundle_download_file( + &ZpoolUuid::from(bundle.zpool_id), + &DatasetUuid::from(bundle.dataset_id), + &SupportBundleUuid::from(bundle.id), + &file_path, + ) + .await + } }; let response = From 80edd57883884011dda58153edaabdb60515902f Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 17 Dec 2024 13:39:22 -0800 Subject: [PATCH 37/61] Make zpool_get_sled more paranoid --- nexus/db-queries/src/db/datastore/zpool.rs | 22 ++++++++++++++----- 
.../tasks/support_bundle_collector.rs | 11 ++++++---- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/zpool.rs b/nexus/db-queries/src/db/datastore/zpool.rs index 0938ed5ba60..8c85b43ed64 100644 --- a/nexus/db-queries/src/db/datastore/zpool.rs +++ b/nexus/db-queries/src/db/datastore/zpool.rs @@ -15,6 +15,7 @@ use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; use crate::db::identity::Asset; use crate::db::model::PhysicalDisk; +use crate::db::model::PhysicalDiskPolicy; use crate::db::model::PhysicalDiskState; use crate::db::model::Sled; use crate::db::model::Zpool; @@ -273,19 +274,28 @@ impl DataStore { Ok(()) } - pub async fn zpool_get_sled( + pub async fn zpool_get_sled_if_in_service( &self, opctx: &OpContext, id: ZpoolUuid, ) -> LookupResult { opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; - use db::schema::zpool::dsl; + use db::schema::physical_disk::dsl as physical_disk_dsl; + use db::schema::zpool::dsl as zpool_dsl; let conn = self.pool_connection_authorized(opctx).await?; - let id = dsl::zpool - .filter(dsl::id.eq(id.into_untyped_uuid())) - .filter(dsl::time_deleted.is_null()) - .select(dsl::sled_id) + let id = zpool_dsl::zpool + .filter(zpool_dsl::id.eq(id.into_untyped_uuid())) + .filter(zpool_dsl::time_deleted.is_null()) + .inner_join( + physical_disk_dsl::physical_disk + .on(zpool_dsl::physical_disk_id.eq(physical_disk_dsl::id)), + ) + .filter( + physical_disk_dsl::disk_policy + .eq(PhysicalDiskPolicy::InService), + ) + .select(zpool_dsl::sled_id) .first_async::(&*conn) .await .map_err(|e| { diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index ad0e3d85b53..5a8a114a3e5 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -264,10 +264,10 @@ impl SupportBundleCollector { // Find the sled where we're storing this bundle. let result = self .datastore - .zpool_get_sled(&opctx, bundle.zpool_id.into()) + .zpool_get_sled_if_in_service(&opctx, bundle.zpool_id.into()) .await; - println!("zpool_get_sled result: {result:?}"); + println!("zpool_get_sled_if_in_service result: {result:?}"); let delete_from_db = match result { Ok(sled_id) => { @@ -473,7 +473,10 @@ impl<'a> BundleCollection<'a> { let sled_id = self .collector .datastore - .zpool_get_sled(&self.opctx, self.bundle.zpool_id.into()) + .zpool_get_sled_if_in_service( + &self.opctx, + self.bundle.zpool_id.into(), + ) .await?; let sled_client = nexus_networking::sled_client( &self.collector.datastore, @@ -1397,7 +1400,7 @@ mod test { // Delete the zpool holding the bundle. // - // This should call the "zpool_get_sled" call to fail! + // This should call the "zpool_get_sled_if_in_service" call to fail! 
datastore .zpool_delete_self_and_all_datasets(&opctx, bundle.zpool_id.into()) .await From d16fda0a6203306ee9d22ea42b9b9366057370da Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 17 Dec 2024 13:48:22 -0800 Subject: [PATCH 38/61] Use a shared support bundle implementation for the simulated sled agent --- nexus/src/app/sagas/disk_create.rs | 11 +- nexus/src/app/sagas/instance_create.rs | 2 +- nexus/src/app/sagas/instance_ip_attach.rs | 20 +- nexus/src/app/sagas/instance_ip_detach.rs | 10 +- nexus/test-utils/src/resource_helpers.rs | 65 +- .../crucible_replacements.rs | 1 - nexus/tests/integration_tests/disks.rs | 10 +- nexus/tests/integration_tests/instances.rs | 8 +- nexus/tests/integration_tests/snapshots.rs | 4 +- .../integration_tests/volume_management.rs | 12 +- sled-agent/src/sim/artifact_store.rs | 9 +- sled-agent/src/sim/http_entrypoints.rs | 149 ++-- sled-agent/src/sim/http_entrypoints_pantry.rs | 27 +- .../src/sim/http_entrypoints_storage.rs | 34 +- sled-agent/src/sim/mod.rs | 1 + sled-agent/src/sim/server.rs | 45 +- sled-agent/src/sim/sled_agent.rs | 230 +++---- sled-agent/src/sim/storage.rs | 638 ++++++++++-------- sled-agent/src/support_bundle/storage.rs | 159 ++++- 19 files changed, 787 insertions(+), 648 deletions(-) diff --git a/nexus/src/app/sagas/disk_create.rs b/nexus/src/app/sagas/disk_create.rs index 51f6d29de10..7e04dbe8578 100644 --- a/nexus/src/app/sagas/disk_create.rs +++ b/nexus/src/app/sagas/disk_create.rs @@ -1034,15 +1034,12 @@ pub(crate) mod test { true } - async fn no_regions_ensured( - sled_agent: &SledAgent, - test: &DiskTest<'_>, - ) -> bool { + fn no_regions_ensured(sled_agent: &SledAgent, test: &DiskTest<'_>) -> bool { for zpool in test.zpools() { for dataset in &zpool.datasets { let crucible_dataset = - sled_agent.get_crucible_dataset(zpool.id, dataset.id).await; - if !crucible_dataset.is_empty().await { + sled_agent.get_crucible_dataset(zpool.id, dataset.id); + if !crucible_dataset.is_empty() { return false; } } @@ -1073,7 +1070,7 @@ pub(crate) mod test { .await ); assert!(no_region_allocations_exist(datastore, &test).await); - assert!(no_regions_ensured(&sled_agent, &test).await); + assert!(no_regions_ensured(&sled_agent, &test)); assert!(test.crucible_resources_deleted().await); } diff --git a/nexus/src/app/sagas/instance_create.rs b/nexus/src/app/sagas/instance_create.rs index 07f7911ef54..aa181d7b792 100644 --- a/nexus/src/app/sagas/instance_create.rs +++ b/nexus/src/app/sagas/instance_create.rs @@ -1346,7 +1346,7 @@ pub mod test { assert!(disk_is_detached(datastore).await); assert!(no_instances_or_disks_on_sled(&sled_agent).await); - let v2p_mappings = &*sled_agent.v2p_mappings.lock().await; + let v2p_mappings = &*sled_agent.v2p_mappings.lock().unwrap(); assert!(v2p_mappings.is_empty()); } diff --git a/nexus/src/app/sagas/instance_ip_attach.rs b/nexus/src/app/sagas/instance_ip_attach.rs index e6fb8654ea0..b0d51f72014 100644 --- a/nexus/src/app/sagas/instance_ip_attach.rs +++ b/nexus/src/app/sagas/instance_ip_attach.rs @@ -435,14 +435,16 @@ pub(crate) mod test { &instance_id, ) .await; - let mut eips = sled_agent.external_ips.lock().await; - let my_eips = eips.entry(vmm_id).or_default(); - assert!(my_eips - .iter() - .any(|v| matches!(v, InstanceExternalIpBody::Floating(_)))); - assert!(my_eips - .iter() - .any(|v| matches!(v, InstanceExternalIpBody::Ephemeral(_)))); + { + let mut eips = sled_agent.external_ips.lock().unwrap(); + let my_eips = eips.entry(vmm_id).or_default(); + assert!(my_eips + .iter() + .any(|v| matches!(v, 
InstanceExternalIpBody::Floating(_)))); + assert!(my_eips + .iter() + .any(|v| matches!(v, InstanceExternalIpBody::Ephemeral(_)))); + } // DB has records for SNAT plus the new IPs. let db_eips = datastore @@ -497,7 +499,7 @@ pub(crate) mod test { &instance_id, ) .await; - let mut eips = sled_agent.external_ips.lock().await; + let mut eips = sled_agent.external_ips.lock().unwrap(); let my_eips = eips.entry(vmm_id).or_default(); assert!(my_eips.is_empty()); } diff --git a/nexus/src/app/sagas/instance_ip_detach.rs b/nexus/src/app/sagas/instance_ip_detach.rs index d9da9fc05cf..bec46f0269e 100644 --- a/nexus/src/app/sagas/instance_ip_detach.rs +++ b/nexus/src/app/sagas/instance_ip_detach.rs @@ -405,9 +405,11 @@ pub(crate) mod test { &instance_id, ) .await; - let mut eips = sled_agent.external_ips.lock().await; - let my_eips = eips.entry(vmm_id).or_default(); - assert!(my_eips.is_empty()); + { + let mut eips = sled_agent.external_ips.lock().unwrap(); + let my_eips = eips.entry(vmm_id).or_default(); + assert!(my_eips.is_empty()); + } // DB only has record for SNAT. let db_eips = datastore @@ -467,7 +469,7 @@ pub(crate) mod test { assert!(db_eips.iter().any(|v| v.kind == IpKind::SNat)); // No IP bindings remain on sled-agent. - let eips = &*sled_agent.external_ips.lock().await; + let eips = &*sled_agent.external_ips.lock().unwrap(); for (_nic_id, eip_set) in eips { assert_eq!(eip_set.len(), 2); } diff --git a/nexus/test-utils/src/resource_helpers.rs b/nexus/test-utils/src/resource_helpers.rs index c5cb0231d18..89f453c873c 100644 --- a/nexus/test-utils/src/resource_helpers.rs +++ b/nexus/test-utils/src/resource_helpers.rs @@ -1282,26 +1282,24 @@ impl<'a, N: NexusServer> DiskTest<'a, N> { // Tell the simulated sled agent to create the disk and zpool containing // these datasets. - sled_agent - .create_external_physical_disk( - physical_disk_id, - disk_identity.clone(), - ) - .await; - sled_agent - .create_zpool(zpool.id, physical_disk_id, zpool.size.to_bytes()) - .await; + sled_agent.create_external_physical_disk( + physical_disk_id, + disk_identity.clone(), + ); + sled_agent.create_zpool( + zpool.id, + physical_disk_id, + zpool.size.to_bytes(), + ); for dataset in &zpool.datasets { // Sled Agent side: Create the Dataset, make sure regions can be // created immediately if Nexus requests anything. let address = - sled_agent.create_crucible_dataset(zpool.id, dataset.id).await; + sled_agent.create_crucible_dataset(zpool.id, dataset.id); let crucible = - sled_agent.get_crucible_dataset(zpool.id, dataset.id).await; - crucible - .set_create_callback(Box::new(|_| RegionState::Created)) - .await; + sled_agent.get_crucible_dataset(zpool.id, dataset.id); + crucible.set_create_callback(Box::new(|_| RegionState::Created)); // Nexus side: Notify Nexus of the physical disk/zpool/dataset // combination that exists. 
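(Note on the changes above: patch 38 moves much of the simulated sled agent's shared state from futures::lock::Mutex to std::sync::Mutex, which is why the DiskTest helpers lose their .await calls and why the IP attach/detach tests now wrap the guard in a narrow block. A minimal sketch of that guard-scoping pattern follows; it assumes the futures crate for block_on, and the Agent type and field names are invented for illustration, not taken from the patch.)

    use std::collections::HashSet;
    use std::sync::Mutex;

    struct Agent {
        // Synchronous mutex: cheap to lock, but holding its guard across an
        // .await would make the future non-Send and risks deadlocks.
        external_ips: Mutex<HashSet<String>>,
    }

    async fn verify(agent: &Agent) {
        {
            // Take the lock in a narrow scope; the guard is dropped at the
            // closing brace, before any asynchronous work happens.
            let eips = agent.external_ips.lock().unwrap();
            assert!(eips.is_empty());
        }
        // Safe to await here; the guard has already been released.
        query_database().await;
    }

    async fn query_database() {}

    fn main() {
        let agent = Agent { external_ips: Mutex::new(HashSet::new()) };
        futures::executor::block_on(verify(&agent));
    }
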
@@ -1381,23 +1379,19 @@ impl<'a, N: NexusServer> DiskTest<'a, N> { for dataset in &zpool.datasets { let crucible = self .get_sled(*sled_id) - .get_crucible_dataset(zpool.id, dataset.id) - .await; + .get_crucible_dataset(zpool.id, dataset.id); let called = std::sync::atomic::AtomicBool::new(false); - crucible - .set_create_callback(Box::new(move |_| { - if !called.load(std::sync::atomic::Ordering::SeqCst) - { - called.store( - true, - std::sync::atomic::Ordering::SeqCst, - ); - RegionState::Requested - } else { - RegionState::Created - } - })) - .await; + crucible.set_create_callback(Box::new(move |_| { + if !called.load(std::sync::atomic::Ordering::SeqCst) { + called.store( + true, + std::sync::atomic::Ordering::SeqCst, + ); + RegionState::Requested + } else { + RegionState::Created + } + })); } } } @@ -1409,11 +1403,9 @@ impl<'a, N: NexusServer> DiskTest<'a, N> { for dataset in &zpool.datasets { let crucible = self .get_sled(*sled_id) - .get_crucible_dataset(zpool.id, dataset.id) - .await; + .get_crucible_dataset(zpool.id, dataset.id); crucible - .set_create_callback(Box::new(|_| RegionState::Failed)) - .await; + .set_create_callback(Box::new(|_| RegionState::Failed)); } } } @@ -1430,9 +1422,8 @@ impl<'a, N: NexusServer> DiskTest<'a, N> { for dataset in &zpool.datasets { let crucible = self .get_sled(*sled_id) - .get_crucible_dataset(zpool.id, dataset.id) - .await; - if !crucible.is_empty().await { + .get_crucible_dataset(zpool.id, dataset.id); + if !crucible.is_empty() { return false; } } diff --git a/nexus/tests/integration_tests/crucible_replacements.rs b/nexus/tests/integration_tests/crucible_replacements.rs index 5f7e92c3937..621df474482 100644 --- a/nexus/tests/integration_tests/crucible_replacements.rs +++ b/nexus/tests/integration_tests/crucible_replacements.rs @@ -373,7 +373,6 @@ impl<'a> RegionReplacementDeletedVolumeTest<'a> { .activate_background_attachment( region_replacement.volume_id.to_string(), ) - .await .unwrap(); } diff --git a/nexus/tests/integration_tests/disks.rs b/nexus/tests/integration_tests/disks.rs index d9888f9ccda..5da4e49a3a4 100644 --- a/nexus/tests/integration_tests/disks.rs +++ b/nexus/tests/integration_tests/disks.rs @@ -1341,9 +1341,7 @@ async fn test_disk_virtual_provisioning_collection_failed_delete( .sled_agent .sled_agent .get_crucible_dataset(zpool.id, dataset.id) - .await - .set_region_deletion_error(true) - .await; + .set_region_deletion_error(true); // Delete the disk - expect this to fail NexusRequest::new( @@ -1379,9 +1377,7 @@ async fn test_disk_virtual_provisioning_collection_failed_delete( .sled_agent .sled_agent .get_crucible_dataset(zpool.id, dataset.id) - .await - .set_region_deletion_error(false) - .await; + .set_region_deletion_error(false); // Request disk delete again NexusRequest::new( @@ -2479,7 +2475,7 @@ async fn test_no_halt_disk_delete_one_region_on_expunged_agent( let zpool = disk_test.zpools().next().expect("Expected at least one zpool"); let dataset = &zpool.datasets[0]; - cptestctx.sled_agent.sled_agent.drop_dataset(zpool.id, dataset.id).await; + cptestctx.sled_agent.sled_agent.drop_dataset(zpool.id, dataset.id); // Spawn a task that tries to delete the disk let disk_url = get_disk_url(DISK_NAME); diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index 163869896f1..a1ffd730201 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -646,7 +646,7 @@ async fn test_instance_start_creates_networking_state( 
sled_agents.push(&cptestctx.sled_agent.sled_agent); for agent in &sled_agents { - agent.v2p_mappings.lock().await.clear(); + agent.v2p_mappings.lock().unwrap().clear(); } // Start the instance and make sure that it gets to Running. @@ -6244,7 +6244,7 @@ async fn test_instance_v2p_mappings(cptestctx: &ControlPlaneTestContext) { // Validate that every sled no longer has the V2P mapping for this instance for sled_agent in &sled_agents { let condition = || async { - let v2p_mappings = sled_agent.v2p_mappings.lock().await; + let v2p_mappings = sled_agent.v2p_mappings.lock().unwrap(); if v2p_mappings.is_empty() { Ok(()) } else { @@ -6501,7 +6501,7 @@ async fn assert_sled_v2p_mappings( vni: Vni, ) { let condition = || async { - let v2p_mappings = sled_agent.v2p_mappings.lock().await; + let v2p_mappings = sled_agent.v2p_mappings.lock().unwrap(); let mapping = v2p_mappings.iter().find(|mapping| { mapping.virtual_ip == nic.ip && mapping.virtual_mac == nic.mac @@ -6573,7 +6573,7 @@ pub async fn assert_sled_vpc_routes( kind: RouterKind::Custom(db_subnet.ipv4_block.0.into()), }; - let vpc_routes = sled_agent.vpc_routes.lock().await; + let vpc_routes = sled_agent.vpc_routes.lock().unwrap(); let sys_routes_found = vpc_routes .iter() .any(|(id, set)| *id == sys_key && set.routes == system_routes); diff --git a/nexus/tests/integration_tests/snapshots.rs b/nexus/tests/integration_tests/snapshots.rs index accb4470fb4..dff07732c7d 100644 --- a/nexus/tests/integration_tests/snapshots.rs +++ b/nexus/tests/integration_tests/snapshots.rs @@ -980,9 +980,7 @@ async fn test_snapshot_unwind(cptestctx: &ControlPlaneTestContext) { .sled_agent .sled_agent .get_crucible_dataset(zpool.id, dataset.id) - .await - .set_creating_a_running_snapshot_should_fail() - .await; + .set_creating_a_running_snapshot_should_fail(); // Issue snapshot request, expecting it to fail let snapshots_url = format!("/v1/snapshots?project={}", PROJECT_NAME); diff --git a/nexus/tests/integration_tests/volume_management.rs b/nexus/tests/integration_tests/volume_management.rs index 6a9ce283899..543e66ee303 100644 --- a/nexus/tests/integration_tests/volume_management.rs +++ b/nexus/tests/integration_tests/volume_management.rs @@ -2528,9 +2528,7 @@ async fn test_disk_create_saga_unwinds_correctly( .sled_agent .sled_agent .get_crucible_dataset(zpool.id, dataset.id) - .await - .set_region_creation_error(true) - .await; + .set_region_creation_error(true); let disk_size = ByteCount::from_gibibytes_u32(2); let base_disk = params::DiskCreate { @@ -2598,9 +2596,7 @@ async fn test_snapshot_create_saga_unwinds_correctly( .sled_agent .sled_agent .get_crucible_dataset(zpool.id, dataset.id) - .await - .set_region_creation_error(true) - .await; + .set_region_creation_error(true); // Create a snapshot let snapshot_create = params::SnapshotCreate { @@ -4225,11 +4221,9 @@ async fn test_read_only_region_reference_counting( TypedUuid::from_untyped_uuid(db_read_only_dataset.pool_id), db_read_only_dataset.id(), ) - .await .get(crucible_agent_client::types::RegionId( read_only_region.id().to_string() )) - .await .unwrap() .state, crucible_agent_client::types::State::Created @@ -4297,11 +4291,9 @@ async fn test_read_only_region_reference_counting( TypedUuid::from_untyped_uuid(db_read_only_dataset.pool_id), db_read_only_dataset.id(), ) - .await .get(crucible_agent_client::types::RegionId( read_only_region.id().to_string() )) - .await .unwrap() .state, crucible_agent_client::types::State::Destroyed diff --git a/sled-agent/src/sim/artifact_store.rs 
b/sled-agent/src/sim/artifact_store.rs index d73f5a28802..efb4fa9f36e 100644 --- a/sled-agent/src/sim/artifact_store.rs +++ b/sled-agent/src/sim/artifact_store.rs @@ -5,10 +5,7 @@ //! Implementation of `crate::artifact_store::StorageBackend` for our simulated //! storage. -use std::sync::Arc; - use camino_tempfile::Utf8TempDir; -use futures::lock::Mutex; use sled_storage::error::Error as StorageError; use super::storage::Storage; @@ -16,11 +13,11 @@ use crate::artifact_store::DatasetsManager; pub(super) struct SimArtifactStorage { root: Utf8TempDir, - backend: Arc>, + backend: Storage, } impl SimArtifactStorage { - pub(super) fn new(backend: Arc>) -> SimArtifactStorage { + pub(super) fn new(backend: Storage) -> SimArtifactStorage { SimArtifactStorage { root: camino_tempfile::tempdir().unwrap(), backend, @@ -36,9 +33,7 @@ impl DatasetsManager for SimArtifactStorage { let config = self .backend .lock() - .await .datasets_config_list() - .await .map_err(|_| StorageError::LedgerNotFound)?; Ok(crate::artifact_store::filter_dataset_mountpoints( config, diff --git a/sled-agent/src/sim/http_entrypoints.rs b/sled-agent/src/sim/http_entrypoints.rs index 60dcb1be310..227270200dd 100644 --- a/sled-agent/src/sim/http_entrypoints.rs +++ b/sled-agent/src/sim/http_entrypoints.rs @@ -5,6 +5,7 @@ //! HTTP entrypoint functions for the sled agent's exposed API use super::collection::PokeMode; +use crate::support_bundle::storage::SupportBundleQueryType; use camino::Utf8PathBuf; use dropshot::endpoint; use dropshot::ApiDescription; @@ -37,6 +38,7 @@ use omicron_common::disk::DatasetsManagementResult; use omicron_common::disk::DisksManagementResult; use omicron_common::disk::OmicronPhysicalDisksConfig; use omicron_common::update::ArtifactHash; +use range_requests::RequestContextEx; use sled_agent_api::*; use sled_agent_types::boot_disk::BootDiskOsWriteStatus; use sled_agent_types::boot_disk::BootDiskPathParams; @@ -240,7 +242,6 @@ impl SledAgentApi for SledAgentSimImpl { path_params.disk_id, body.snapshot_id, ) - .await .map_err(|e| HttpError::for_internal_error(e.to_string()))?; Ok(HttpResponseOk(VmmIssueDiskSnapshotRequestResponse { @@ -268,7 +269,6 @@ impl SledAgentApi for SledAgentSimImpl { let body_args = body.into_inner(); sa.set_virtual_nic_host(&body_args) - .await .map_err(|e| HttpError::for_internal_error(e.to_string()))?; Ok(HttpResponseUpdatedNoContent()) @@ -282,7 +282,6 @@ impl SledAgentApi for SledAgentSimImpl { let body_args = body.into_inner(); sa.unset_virtual_nic_host(&body_args) - .await .map_err(|e| HttpError::for_internal_error(e.to_string()))?; Ok(HttpResponseUpdatedNoContent()) @@ -294,7 +293,7 @@ impl SledAgentApi for SledAgentSimImpl { { let sa = rqctx.context(); - let vnics = sa.list_virtual_nics().await.map_err(HttpError::from)?; + let vnics = sa.list_virtual_nics().map_err(HttpError::from)?; Ok(HttpResponseOk(vnics)) } @@ -310,7 +309,7 @@ impl SledAgentApi for SledAgentSimImpl { rqctx: RequestContext, ) -> Result, HttpError> { let config = - rqctx.context().bootstore_network_config.lock().await.clone(); + rqctx.context().bootstore_network_config.lock().unwrap().clone(); Ok(HttpResponseOk(config)) } @@ -318,7 +317,8 @@ impl SledAgentApi for SledAgentSimImpl { rqctx: RequestContext, body: TypedBody, ) -> Result { - let mut config = rqctx.context().bootstore_network_config.lock().await; + let mut config = + rqctx.context().bootstore_network_config.lock().unwrap(); *config = body.into_inner(); Ok(HttpResponseUpdatedNoContent()) } @@ -329,7 +329,7 @@ impl SledAgentApi for SledAgentSimImpl 
{ ) -> Result, HttpError> { let sa = rqctx.context(); Ok(HttpResponseOk( - sa.inventory(rqctx.server.local_addr).await.map_err(|e| { + sa.inventory(rqctx.server.local_addr).map_err(|e| { HttpError::for_internal_error(format!("{:#}", e)) })?, )) @@ -341,7 +341,7 @@ impl SledAgentApi for SledAgentSimImpl { ) -> Result, HttpError> { let sa = rqctx.context(); let body_args = body.into_inner(); - let result = sa.datasets_ensure(body_args).await?; + let result = sa.datasets_ensure(body_args)?; Ok(HttpResponseOk(result)) } @@ -349,7 +349,7 @@ impl SledAgentApi for SledAgentSimImpl { rqctx: RequestContext, ) -> Result, HttpError> { let sa = rqctx.context(); - Ok(HttpResponseOk(sa.datasets_config_list().await?)) + Ok(HttpResponseOk(sa.datasets_config_list()?)) } async fn omicron_physical_disks_put( @@ -358,7 +358,7 @@ impl SledAgentApi for SledAgentSimImpl { ) -> Result, HttpError> { let sa = rqctx.context(); let body_args = body.into_inner(); - let result = sa.omicron_physical_disks_ensure(body_args).await?; + let result = sa.omicron_physical_disks_ensure(body_args)?; Ok(HttpResponseOk(result)) } @@ -366,7 +366,7 @@ impl SledAgentApi for SledAgentSimImpl { rqctx: RequestContext, ) -> Result, HttpError> { let sa = rqctx.context(); - Ok(HttpResponseOk(sa.omicron_physical_disks_list().await?)) + Ok(HttpResponseOk(sa.omicron_physical_disks_list()?)) } async fn omicron_zones_put( @@ -375,7 +375,7 @@ impl SledAgentApi for SledAgentSimImpl { ) -> Result { let sa = rqctx.context(); let body_args = body.into_inner(); - sa.omicron_zones_ensure(body_args).await; + sa.omicron_zones_ensure(body_args); Ok(HttpResponseUpdatedNoContent()) } @@ -390,7 +390,7 @@ impl SledAgentApi for SledAgentSimImpl { rqctx: RequestContext, ) -> Result>, HttpError> { let sa = rqctx.context(); - Ok(HttpResponseOk(sa.list_vpc_routes().await)) + Ok(HttpResponseOk(sa.list_vpc_routes())) } async fn set_vpc_routes( @@ -398,7 +398,7 @@ impl SledAgentApi for SledAgentSimImpl { body: TypedBody>, ) -> Result { let sa = rqctx.context(); - sa.set_vpc_routes(body.into_inner()).await; + sa.set_vpc_routes(body.into_inner()); Ok(HttpResponseUpdatedNoContent()) } @@ -419,7 +419,7 @@ impl SledAgentApi for SledAgentSimImpl { rqctx: RequestContext, path_params: Path, query_params: Query, - _body: StreamingBody, + body: StreamingBody, ) -> Result, HttpError> { let sa = rqctx.context(); @@ -433,6 +433,7 @@ impl SledAgentApi for SledAgentSimImpl { dataset_id, support_bundle_id, hash, + body.into_stream(), ) .await?, )) @@ -446,15 +447,15 @@ impl SledAgentApi for SledAgentSimImpl { let SupportBundlePathParam { zpool_id, dataset_id, support_bundle_id } = path_params.into_inner(); - sa.support_bundle_get(zpool_id, dataset_id, support_bundle_id).await?; - - Ok(http::Response::builder() - .status(http::StatusCode::OK) - .header(http::header::CONTENT_TYPE, "text/html") - .body(dropshot::Body::with_content( - "simulated support bundle; do not eat", - )) - .unwrap()) + let range = rqctx.range(); + sa.support_bundle_get( + zpool_id, + dataset_id, + support_bundle_id, + range, + SupportBundleQueryType::Whole, + ) + .await } async fn support_bundle_download_file( @@ -465,18 +466,18 @@ impl SledAgentApi for SledAgentSimImpl { let SupportBundleFilePathParam { parent: SupportBundlePathParam { zpool_id, dataset_id, support_bundle_id }, - file: _, + file, } = path_params.into_inner(); - sa.support_bundle_get(zpool_id, dataset_id, support_bundle_id).await?; - - Ok(http::Response::builder() - .status(http::StatusCode::OK) - .header(http::header::CONTENT_TYPE, 
"text/html") - .body(dropshot::Body::with_content( - "simulated support bundle file; do not eat", - )) - .unwrap()) + let range = rqctx.range(); + sa.support_bundle_get( + zpool_id, + dataset_id, + support_bundle_id, + range, + SupportBundleQueryType::Path { file_path: file }, + ) + .await } async fn support_bundle_index( @@ -487,15 +488,15 @@ impl SledAgentApi for SledAgentSimImpl { let SupportBundlePathParam { zpool_id, dataset_id, support_bundle_id } = path_params.into_inner(); - sa.support_bundle_get(zpool_id, dataset_id, support_bundle_id).await?; - - Ok(http::Response::builder() - .status(http::StatusCode::OK) - .header(http::header::CONTENT_TYPE, "text/html") - .body(dropshot::Body::with_content( - "simulated support bundle index; do not eat", - )) - .unwrap()) + let range = rqctx.range(); + sa.support_bundle_get( + zpool_id, + dataset_id, + support_bundle_id, + range, + SupportBundleQueryType::Index, + ) + .await } async fn support_bundle_head( @@ -506,17 +507,15 @@ impl SledAgentApi for SledAgentSimImpl { let SupportBundlePathParam { zpool_id, dataset_id, support_bundle_id } = path_params.into_inner(); - sa.support_bundle_get(zpool_id, dataset_id, support_bundle_id).await?; - - let fictional_length = 10000; - - Ok(http::Response::builder() - .status(http::StatusCode::OK) - .header(http::header::CONTENT_TYPE, "text/html") - .header(hyper::header::ACCEPT_RANGES, "bytes") - .header(hyper::header::CONTENT_LENGTH, fictional_length) - .body(dropshot::Body::empty()) - .unwrap()) + let range = rqctx.range(); + sa.support_bundle_head( + zpool_id, + dataset_id, + support_bundle_id, + range, + SupportBundleQueryType::Whole, + ) + .await } async fn support_bundle_head_file( @@ -527,20 +526,18 @@ impl SledAgentApi for SledAgentSimImpl { let SupportBundleFilePathParam { parent: SupportBundlePathParam { zpool_id, dataset_id, support_bundle_id }, - file: _, + file, } = path_params.into_inner(); - sa.support_bundle_get(zpool_id, dataset_id, support_bundle_id).await?; - - let fictional_length = 10000; - - Ok(http::Response::builder() - .status(http::StatusCode::OK) - .header(http::header::CONTENT_TYPE, "text/html") - .header(hyper::header::ACCEPT_RANGES, "bytes") - .header(hyper::header::CONTENT_LENGTH, fictional_length) - .body(dropshot::Body::empty()) - .unwrap()) + let range = rqctx.range(); + sa.support_bundle_get( + zpool_id, + dataset_id, + support_bundle_id, + range, + SupportBundleQueryType::Path { file_path: file }, + ) + .await } async fn support_bundle_head_index( @@ -551,17 +548,15 @@ impl SledAgentApi for SledAgentSimImpl { let SupportBundlePathParam { zpool_id, dataset_id, support_bundle_id } = path_params.into_inner(); - sa.support_bundle_get(zpool_id, dataset_id, support_bundle_id).await?; - - let fictional_length = 10000; - - Ok(http::Response::builder() - .status(http::StatusCode::OK) - .header(http::header::CONTENT_TYPE, "text/html") - .header(hyper::header::ACCEPT_RANGES, "bytes") - .header(hyper::header::CONTENT_LENGTH, fictional_length) - .body(dropshot::Body::empty()) - .unwrap()) + let range = rqctx.range(); + sa.support_bundle_head( + zpool_id, + dataset_id, + support_bundle_id, + range, + SupportBundleQueryType::Index, + ) + .await } async fn support_bundle_delete( diff --git a/sled-agent/src/sim/http_entrypoints_pantry.rs b/sled-agent/src/sim/http_entrypoints_pantry.rs index c98c7db665a..cbdc9329d46 100644 --- a/sled-agent/src/sim/http_entrypoints_pantry.rs +++ b/sled-agent/src/sim/http_entrypoints_pantry.rs @@ -69,7 +69,7 @@ async fn pantry_status( ) -> Result, 
HttpError> { let pantry = rc.context(); - let status = pantry.status().await?; + let status = pantry.status()?; Ok(HttpResponseOk(status)) } @@ -103,7 +103,7 @@ async fn volume_status( let path = path.into_inner(); let pantry = rc.context(); - let status = pantry.volume_status(path.id.clone()).await?; + let status = pantry.volume_status(path.id.clone())?; Ok(HttpResponseOk(status)) } @@ -134,7 +134,6 @@ async fn attach( pantry .attach(path.id.clone(), body.volume_construction_request) - .await .map_err(|e| HttpError::for_internal_error(e.to_string()))?; Ok(HttpResponseOk(AttachResult { id: path.id })) @@ -161,13 +160,11 @@ async fn attach_activate_background( let body = body.into_inner(); let pantry = rc.context(); - pantry - .attach_activate_background( - path.id.clone(), - body.job_id, - body.volume_construction_request, - ) - .await?; + pantry.attach_activate_background( + path.id.clone(), + body.job_id, + body.volume_construction_request, + )?; Ok(HttpResponseUpdatedNoContent()) } @@ -194,7 +191,7 @@ async fn is_job_finished( let path = path.into_inner(); let pantry = rc.context(); - let job_is_finished = pantry.is_job_finished(path.id).await?; + let job_is_finished = pantry.is_job_finished(path.id)?; Ok(HttpResponseOk(JobPollResponse { job_is_finished })) } @@ -217,7 +214,7 @@ async fn job_result_ok( let path = path.into_inner(); let pantry = rc.context(); - let job_result = pantry.get_job_result(path.id).await?; + let job_result = pantry.get_job_result(path.id)?; match job_result { Ok(job_result_ok) => { @@ -260,7 +257,6 @@ async fn import_from_url( let job_id = pantry .import_from_url(path.id.clone(), body.url, body.expected_digest) - .await .map_err(|e| HttpError::for_internal_error(e.to_string()))?; Ok(HttpResponseOk(ImportFromUrlResponse { job_id })) @@ -287,7 +283,6 @@ async fn snapshot( pantry .snapshot(path.id.clone(), body.snapshot_id) - .await .map_err(|e| HttpError::for_internal_error(e.to_string()))?; Ok(HttpResponseUpdatedNoContent()) @@ -320,7 +315,7 @@ async fn bulk_write( ) .map_err(|e| HttpError::for_bad_request(None, e.to_string()))?; - pantry.bulk_write(path.id.clone(), body.offset, data).await?; + pantry.bulk_write(path.id.clone(), body.offset, data)?; Ok(HttpResponseUpdatedNoContent()) } @@ -344,7 +339,6 @@ async fn scrub( let job_id = pantry .scrub(path.id.clone()) - .await .map_err(|e| HttpError::for_internal_error(e.to_string()))?; Ok(HttpResponseOk(ScrubResponse { job_id })) @@ -364,7 +358,6 @@ async fn detach( pantry .detach(path.id) - .await .map_err(|e| HttpError::for_internal_error(e.to_string()))?; Ok(HttpResponseDeleted()) diff --git a/sled-agent/src/sim/http_entrypoints_storage.rs b/sled-agent/src/sim/http_entrypoints_storage.rs index 34e26f51915..cb53d96fd11 100644 --- a/sled-agent/src/sim/http_entrypoints_storage.rs +++ b/sled-agent/src/sim/http_entrypoints_storage.rs @@ -63,7 +63,7 @@ async fn region_list( rc: RequestContext>, ) -> Result>, HttpError> { let crucible = rc.context(); - Ok(HttpResponseOk(crucible.list().await)) + Ok(HttpResponseOk(crucible.list())) } #[endpoint { @@ -77,7 +77,7 @@ async fn region_create( let params = body.into_inner(); let crucible = rc.context(); - let region = crucible.create(params).await.map_err(|e| { + let region = crucible.create(params).map_err(|e| { HttpError::for_internal_error( format!("region create failure: {:?}", e,), ) @@ -97,7 +97,7 @@ async fn region_get( let id = path.into_inner().id; let crucible = rc.context(); - match crucible.get(id).await { + match crucible.get(id) { Some(region) => 
Ok(HttpResponseOk(region)), None => { Err(HttpError::for_not_found(None, "Region not found".to_string())) @@ -118,7 +118,6 @@ async fn region_delete( crucible .delete(id) - .await .map_err(|e| HttpError::for_bad_request(None, e.to_string()))?; Ok(HttpResponseDeleted()) @@ -136,16 +135,16 @@ async fn region_get_snapshots( let crucible = rc.context(); - if crucible.get(id.clone()).await.is_none() { + if crucible.get(id.clone()).is_none() { return Err(HttpError::for_not_found( None, "Region not found".to_string(), )); } - let snapshots = crucible.snapshots_for_region(&id).await; + let snapshots = crucible.snapshots_for_region(&id); - let running_snapshots = crucible.running_snapshots_for_id(&id).await; + let running_snapshots = crucible.running_snapshots_for_id(&id); Ok(HttpResponseOk(GetSnapshotResponse { snapshots, running_snapshots })) } @@ -167,14 +166,14 @@ async fn region_get_snapshot( let p = path.into_inner(); let crucible = rc.context(); - if crucible.get(p.id.clone()).await.is_none() { + if crucible.get(p.id.clone()).is_none() { return Err(HttpError::for_not_found( None, "Region not found".to_string(), )); } - for snapshot in &crucible.snapshots_for_region(&p.id).await { + for snapshot in &crucible.snapshots_for_region(&p.id) { if snapshot.name == p.name { return Ok(HttpResponseOk(snapshot.clone())); } @@ -203,7 +202,7 @@ async fn region_delete_snapshot( let p = path.into_inner(); let crucible = rc.context(); - if crucible.get(p.id.clone()).await.is_none() { + if crucible.get(p.id.clone()).is_none() { return Err(HttpError::for_not_found( None, "Region not found".to_string(), @@ -212,7 +211,6 @@ async fn region_delete_snapshot( crucible .delete_snapshot(&p.id, &p.name) - .await .map_err(|e| HttpError::for_bad_request(None, e.to_string()))?; Ok(HttpResponseDeleted()) @@ -235,14 +233,14 @@ async fn region_run_snapshot( let p = path.into_inner(); let crucible = rc.context(); - if crucible.get(p.id.clone()).await.is_none() { + if crucible.get(p.id.clone()).is_none() { return Err(HttpError::for_not_found( None, "Region not found".to_string(), )); } - let snapshots = crucible.snapshots_for_region(&p.id).await; + let snapshots = crucible.snapshots_for_region(&p.id); if !snapshots.iter().any(|x| x.name == p.name) { return Err(HttpError::for_not_found( @@ -251,10 +249,8 @@ async fn region_run_snapshot( )); } - let running_snapshot = crucible - .create_running_snapshot(&p.id, &p.name) - .await - .map_err(|e| { + let running_snapshot = + crucible.create_running_snapshot(&p.id, &p.name).map_err(|e| { HttpError::for_internal_error(format!( "running snapshot create failure: {:?}", e, @@ -275,14 +271,14 @@ async fn region_delete_running_snapshot( let p = path.into_inner(); let crucible = rc.context(); - if crucible.get(p.id.clone()).await.is_none() { + if crucible.get(p.id.clone()).is_none() { return Err(HttpError::for_not_found( None, "Region not found".to_string(), )); } - crucible.delete_running_snapshot(&p.id, &p.name).await.map_err(|e| { + crucible.delete_running_snapshot(&p.id, &p.name).map_err(|e| { HttpError::for_internal_error(format!( "running snapshot create failure: {:?}", e, diff --git a/sled-agent/src/sim/mod.rs b/sled-agent/src/sim/mod.rs index ab3b155b36d..c59af4ccce7 100644 --- a/sled-agent/src/sim/mod.rs +++ b/sled-agent/src/sim/mod.rs @@ -24,3 +24,4 @@ pub use config::{ }; pub use server::{run_standalone_server, RssArgs, Server}; pub use sled_agent::SledAgent; +pub(crate) use storage::Storage; diff --git a/sled-agent/src/sim/server.rs b/sled-agent/src/sim/server.rs index 
3833d5ca7c6..ef13cdfcdd8 100644 --- a/sled-agent/src/sim/server.rs +++ b/sled-agent/src/sim/server.rs @@ -188,23 +188,19 @@ impl Server { let vendor = "synthetic-vendor".to_string(); let serial = format!("synthetic-serial-{zpool_id}"); let model = "synthetic-model".to_string(); - sled_agent - .create_external_physical_disk( - physical_disk_id, - DiskIdentity { - vendor: vendor.clone(), - serial: serial.clone(), - model: model.clone(), - }, - ) - .await; + sled_agent.create_external_physical_disk( + physical_disk_id, + DiskIdentity { + vendor: vendor.clone(), + serial: serial.clone(), + model: model.clone(), + }, + ); - sled_agent - .create_zpool(zpool_id, physical_disk_id, zpool.size) - .await; + sled_agent.create_zpool(zpool_id, physical_disk_id, zpool.size); let dataset_id = DatasetUuid::new_v4(); let address = - sled_agent.create_crucible_dataset(zpool_id, dataset_id).await; + sled_agent.create_crucible_dataset(zpool_id, dataset_id); datasets.push(NexusTypes::DatasetCreateRequest { zpool_id: zpool_id.into_untyped_uuid(), @@ -218,10 +214,8 @@ impl Server { // Whenever Nexus tries to allocate a region, it should complete // immediately. What efficiency! let crucible = - sled_agent.get_crucible_dataset(zpool_id, dataset_id).await; - crucible - .set_create_callback(Box::new(|_| RegionState::Created)) - .await; + sled_agent.get_crucible_dataset(zpool_id, dataset_id); + crucible.set_create_callback(Box::new(|_| RegionState::Created)) } Ok(Server { @@ -240,8 +234,7 @@ impl Server { self.log.new(o!("kind" => "pantry")), self.config.storage.ip, self.sled_agent.clone(), - ) - .await; + ); self.pantry_server = Some(pantry_server); self.pantry_server.as_ref().unwrap() } @@ -370,7 +363,7 @@ pub async fn run_standalone_server( dns.initialize_with_config(&log, &dns_config).await?; let internal_dns_version = dns_config.generation; - let all_u2_zpools = server.sled_agent.get_zpools().await; + let all_u2_zpools = server.sled_agent.get_zpools(); let get_random_zpool = || { use rand::seq::SliceRandom; let pool = all_u2_zpools @@ -516,12 +509,12 @@ pub async fn run_standalone_server( }; let mut datasets = vec![]; - let physical_disks = server.sled_agent.get_all_physical_disks().await; - let zpools = server.sled_agent.get_zpools().await; + let physical_disks = server.sled_agent.get_all_physical_disks(); + let zpools = server.sled_agent.get_zpools(); for zpool in &zpools { let zpool_id = ZpoolUuid::from_untyped_uuid(zpool.id); for (dataset_id, address) in - server.sled_agent.get_crucible_datasets(zpool_id).await + server.sled_agent.get_crucible_datasets(zpool_id) { datasets.push(NexusTypes::DatasetCreateRequest { zpool_id: zpool.id, @@ -540,7 +533,7 @@ pub async fn run_standalone_server( }; let omicron_physical_disks_config = - server.sled_agent.omicron_physical_disks_list().await?; + server.sled_agent.omicron_physical_disks_list()?; let mut sled_configs = BTreeMap::new(); sled_configs.insert( config.id, @@ -559,7 +552,7 @@ pub async fn run_standalone_server( }) .collect(), }, - datasets: server.sled_agent.datasets_config_list().await?, + datasets: server.sled_agent.datasets_config_list()?, zones, }, ); diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index 1f099fc0366..b0846c72163 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -14,11 +14,14 @@ use super::storage::Storage; use crate::artifact_store::ArtifactStore; use crate::nexus::NexusClient; use crate::sim::simulatable::Simulatable; +use 
crate::support_bundle::storage::SupportBundleQueryType; use crate::updates::UpdateManager; use anyhow::bail; use anyhow::Context; +use bytes::Bytes; +use dropshot::Body; use dropshot::HttpError; -use futures::lock::Mutex; +use futures::Stream; use nexus_sled_agent_shared::inventory::{ Inventory, InventoryDataset, InventoryDisk, InventoryZpool, OmicronZonesConfig, SledRole, @@ -47,8 +50,8 @@ use oxnet::Ipv6Net; use propolis_client::{ types::VolumeConstructionRequest, Client as PropolisClient, }; +use range_requests::PotentialRange; use sled_agent_api::SupportBundleMetadata; -use sled_agent_api::SupportBundleState; use sled_agent_types::disk::DiskStateRequested; use sled_agent_types::early_networking::{ EarlyNetworkConfig, EarlyNetworkConfigBody, @@ -62,6 +65,7 @@ use std::collections::{HashMap, HashSet, VecDeque}; use std::net::{IpAddr, Ipv4Addr, Ipv6Addr, SocketAddr}; use std::str::FromStr; use std::sync::Arc; +use std::sync::Mutex; use std::time::Duration; use uuid::Uuid; @@ -79,14 +83,15 @@ pub struct SledAgent { vmms: Arc>, /// collection of simulated disks, indexed by disk uuid disks: Arc>, - storage: Arc>, + storage: Storage, updates: UpdateManager, nexus_address: SocketAddr, pub nexus_client: Arc, disk_id_to_region_ids: Mutex>>, pub v2p_mappings: Mutex>, - mock_propolis: - Mutex>, + mock_propolis: futures::lock::Mutex< + Option<(propolis_mock_server::Server, PropolisClient)>, + >, /// lists of external IPs assigned to instances pub external_ips: Mutex>>, @@ -173,11 +178,11 @@ impl SledAgent { }, }); - let storage = Arc::new(Mutex::new(Storage::new( + let storage = Storage::new( id.into_untyped_uuid(), config.storage.ip, storage_log, - ))); + ); let artifacts = ArtifactStore::new(&log, SimArtifactStorage::new(storage.clone())); @@ -202,7 +207,7 @@ impl SledAgent { v2p_mappings: Mutex::new(HashSet::new()), external_ips: Mutex::new(HashMap::new()), vpc_routes: Mutex::new(HashMap::new()), - mock_propolis: Mutex::new(None), + mock_propolis: futures::lock::Mutex::new(None), config: config.clone(), fake_zones: Mutex::new(OmicronZonesConfig { generation: Generation::new(), @@ -222,7 +227,7 @@ impl SledAgent { /// three crucible regions). Extract the region addresses, lookup the region /// from the port and pair disk id with region ids. This map is referred to /// later when making snapshots. 
- pub async fn map_disk_ids_to_region_ids( + pub fn map_disk_ids_to_region_ids( &self, volume_construction_request: &VolumeConstructionRequest, ) -> Result<(), Error> { @@ -243,11 +248,10 @@ impl SledAgent { let mut region_ids = Vec::new(); - let storage = self.storage.lock().await; + let storage = self.storage.lock(); for target in targets { let region = storage .get_region_for_port(target.port()) - .await .ok_or_else(|| { Error::internal_error(&format!( "no region for port {}", @@ -259,7 +263,8 @@ impl SledAgent { region_ids.push(region_id); } - let mut disk_id_to_region_ids = self.disk_id_to_region_ids.lock().await; + let mut disk_id_to_region_ids = + self.disk_id_to_region_ids.lock().unwrap(); disk_id_to_region_ids.insert(disk_id.to_string(), region_ids.clone()); Ok(()) @@ -398,10 +403,10 @@ impl SledAgent { for disk_request in &hardware.disks { let vcr = &disk_request.volume_construction_request; - self.map_disk_ids_to_region_ids(&vcr).await?; + self.map_disk_ids_to_region_ids(&vcr)?; } - let mut routes = self.vpc_routes.lock().await; + let mut routes = self.vpc_routes.lock().unwrap(); for nic in &hardware.nics { let my_routers = [ RouterId { vni: nic.vni, kind: RouterKind::System }, @@ -449,7 +454,8 @@ impl SledAgent { propolis_id: PropolisUuid, state: VmmStateRequested, ) -> Result { - if let Some(e) = self.instance_ensure_state_error.lock().await.as_ref() + if let Some(e) = + self.instance_ensure_state_error.lock().unwrap().as_ref() { return Err(e.clone().into()); } @@ -552,7 +558,7 @@ impl SledAgent { } pub async fn set_instance_ensure_state_error(&self, error: Option) { - *self.instance_ensure_state_error.lock().await = error; + *self.instance_ensure_state_error.lock().unwrap() = error; } /// Idempotently ensures that the given API Disk (described by `api_disk`) @@ -592,89 +598,58 @@ impl SledAgent { } /// Adds a Physical Disk to the simulated sled agent. - pub async fn create_external_physical_disk( + pub fn create_external_physical_disk( &self, id: PhysicalDiskUuid, identity: DiskIdentity, ) { let variant = DiskVariant::U2; - self.storage - .lock() - .await - .insert_physical_disk(id, identity, variant) - .await; + self.storage.lock().insert_physical_disk(id, identity, variant); } - pub async fn get_all_physical_disks( + pub fn get_all_physical_disks( &self, ) -> Vec { - self.storage.lock().await.get_all_physical_disks() + self.storage.lock().get_all_physical_disks() } - pub async fn get_zpools( - &self, - ) -> Vec { - self.storage.lock().await.get_all_zpools() + pub fn get_zpools(&self) -> Vec { + self.storage.lock().get_all_zpools() } - pub async fn get_crucible_datasets( + pub fn get_crucible_datasets( &self, zpool_id: ZpoolUuid, ) -> Vec<(DatasetUuid, SocketAddr)> { - self.storage.lock().await.get_all_crucible_datasets(zpool_id) + self.storage.lock().get_all_crucible_datasets(zpool_id) } /// Adds a Zpool to the simulated sled agent. - pub async fn create_zpool( + pub fn create_zpool( &self, id: ZpoolUuid, physical_disk_id: PhysicalDiskUuid, size: u64, ) { - self.storage - .lock() - .await - .insert_zpool(id, physical_disk_id, size) - .await; - } - - /// Adds a debug dataset within a zpool - pub async fn create_debug_dataset( - &self, - zpool_id: ZpoolUuid, - dataset_id: DatasetUuid, - ) { - self.storage - .lock() - .await - .insert_debug_dataset(zpool_id, dataset_id) - .await + self.storage.lock().insert_zpool(id, physical_disk_id, size); } /// Adds a Crucible Dataset within a zpool. 
- pub async fn create_crucible_dataset( + pub fn create_crucible_dataset( &self, zpool_id: ZpoolUuid, dataset_id: DatasetUuid, ) -> SocketAddr { - self.storage - .lock() - .await - .insert_crucible_dataset(zpool_id, dataset_id) - .await + self.storage.lock().insert_crucible_dataset(zpool_id, dataset_id) } /// Returns a crucible dataset within a particular zpool. - pub async fn get_crucible_dataset( + pub fn get_crucible_dataset( &self, zpool_id: ZpoolUuid, dataset_id: DatasetUuid, ) -> Arc { - self.storage - .lock() - .await - .get_crucible_dataset(zpool_id, dataset_id) - .await + self.storage.lock().get_crucible_dataset(zpool_id, dataset_id) } /// Issue a snapshot request for a Crucible disk attached to an instance. @@ -686,7 +661,7 @@ impl SledAgent { /// /// We're not simulating the propolis server, so directly create a /// snapshot here. - pub async fn instance_issue_disk_snapshot_request( + pub fn instance_issue_disk_snapshot_request( &self, _propolis_id: PropolisUuid, disk_id: Uuid, @@ -696,7 +671,7 @@ impl SledAgent { // for each region that makes up the disk. Use the disk_id_to_region_ids // map to perform lookup based on this function's disk id argument. - let disk_id_to_region_ids = self.disk_id_to_region_ids.lock().await; + let disk_id_to_region_ids = self.disk_id_to_region_ids.lock().unwrap(); let region_ids = disk_id_to_region_ids.get(&disk_id.to_string()); let region_ids = region_ids.ok_or_else(|| { @@ -705,16 +680,14 @@ impl SledAgent { info!(self.log, "disk id {} region ids are {:?}", disk_id, region_ids); - let storage = self.storage.lock().await; + let storage = self.storage.lock(); for region_id in region_ids { - let crucible_data = - storage.get_dataset_for_region(*region_id).await; + let crucible_data = storage.get_dataset_for_region(*region_id); if let Some(crucible_data) = crucible_data { crucible_data .create_snapshot(*region_id, snapshot_id) - .await .map_err(|e| Error::internal_error(&e.to_string()))?; } else { return Err(Error::not_found_by_id( @@ -727,28 +700,28 @@ impl SledAgent { Ok(()) } - pub async fn set_virtual_nic_host( + pub fn set_virtual_nic_host( &self, mapping: &VirtualNetworkInterfaceHost, ) -> Result<(), Error> { - let mut v2p_mappings = self.v2p_mappings.lock().await; + let mut v2p_mappings = self.v2p_mappings.lock().unwrap(); v2p_mappings.insert(mapping.clone()); Ok(()) } - pub async fn unset_virtual_nic_host( + pub fn unset_virtual_nic_host( &self, mapping: &VirtualNetworkInterfaceHost, ) -> Result<(), Error> { - let mut v2p_mappings = self.v2p_mappings.lock().await; + let mut v2p_mappings = self.v2p_mappings.lock().unwrap(); v2p_mappings.remove(mapping); Ok(()) } - pub async fn list_virtual_nics( + pub fn list_virtual_nics( &self, ) -> Result, Error> { - let v2p_mappings = self.v2p_mappings.lock().await; + let v2p_mappings = self.v2p_mappings.lock().unwrap(); Ok(Vec::from_iter(v2p_mappings.clone())) } @@ -763,7 +736,7 @@ impl SledAgent { )); } - let mut eips = self.external_ips.lock().await; + let mut eips = self.external_ips.lock().unwrap(); let my_eips = eips.entry(propolis_id).or_default(); // High-level behaviour: this should always succeed UNLESS @@ -796,7 +769,7 @@ impl SledAgent { )); } - let mut eips = self.external_ips.lock().await; + let mut eips = self.external_ips.lock().unwrap(); let my_eips = eips.entry(propolis_id).or_default(); my_eips.remove(&body_args); @@ -841,10 +814,7 @@ impl SledAgent { Ok(addr) } - pub async fn inventory( - &self, - addr: SocketAddr, - ) -> anyhow::Result { + pub fn inventory(&self, addr: SocketAddr) -> 
anyhow::Result { let sled_agent_address = match addr { SocketAddr::V4(_) => { bail!("sled_agent_ip must be v6 for inventory") @@ -852,7 +822,7 @@ impl SledAgent { SocketAddr::V6(v6) => v6, }; - let storage = self.storage.lock().await; + let storage = self.storage.lock(); Ok(Inventory { sled_id: self.id, sled_agent_address, @@ -867,7 +837,7 @@ impl SledAgent { self.config.hardware.reservoir_ram, ) .context("reservoir_size")?, - omicron_zones: self.fake_zones.lock().await.clone(), + omicron_zones: self.fake_zones.lock().unwrap().clone(), disks: storage .physical_disks() .values() @@ -899,7 +869,6 @@ impl SledAgent { // represent the "real" datasets the sled agent can observe. datasets: storage .datasets_config_list() - .await .map(|config| { config .datasets @@ -926,10 +895,10 @@ impl SledAgent { dataset_id: DatasetUuid, ) -> Result, HttpError> { self.storage - .lock() - .await - .support_bundle_list(zpool_id, dataset_id) + .as_support_bundle_storage(&self.log) + .list(zpool_id, dataset_id) .await + .map_err(|err| err.into()) } pub async fn support_bundle_create( @@ -938,35 +907,49 @@ impl SledAgent { dataset_id: DatasetUuid, support_bundle_id: SupportBundleUuid, expected_hash: ArtifactHash, + stream: impl Stream>, ) -> Result { self.storage - .lock() - .await - .support_bundle_create( + .as_support_bundle_storage(&self.log) + .create( zpool_id, dataset_id, support_bundle_id, expected_hash, + stream, ) - .await?; - - Ok(SupportBundleMetadata { - support_bundle_id, - state: SupportBundleState::Complete, - }) + .await + .map_err(|err| err.into()) } - pub async fn support_bundle_get( + pub(crate) async fn support_bundle_get( &self, zpool_id: ZpoolUuid, dataset_id: DatasetUuid, support_bundle_id: SupportBundleUuid, - ) -> Result<(), HttpError> { + range: Option, + query: SupportBundleQueryType, + ) -> Result, HttpError> { self.storage - .lock() + .as_support_bundle_storage(&self.log) + .get(zpool_id, dataset_id, support_bundle_id, range, query) .await - .support_bundle_exists(zpool_id, dataset_id, support_bundle_id) + .map_err(|err| err.into()) + } + + pub(crate) async fn support_bundle_head( + &self, + zpool_id: ZpoolUuid, + dataset_id: DatasetUuid, + support_bundle_id: SupportBundleUuid, + range: Option, + query: SupportBundleQueryType, + ) -> Result, HttpError> { + self.storage + .as_support_bundle_storage(&self.log) + .head(zpool_id, dataset_id, support_bundle_id, range, query) .await + .map_err(|err| err.into()) } pub async fn support_bundle_delete( @@ -976,67 +959,58 @@ impl SledAgent { support_bundle_id: SupportBundleUuid, ) -> Result<(), HttpError> { self.storage - .lock() - .await - .support_bundle_delete(zpool_id, dataset_id, support_bundle_id) + .as_support_bundle_storage(&self.log) + .delete(zpool_id, dataset_id, support_bundle_id) .await + .map_err(|err| err.into()) } - pub async fn datasets_ensure( + pub fn datasets_ensure( &self, config: DatasetsConfig, ) -> Result { - self.storage.lock().await.datasets_ensure(config).await + self.storage.lock().datasets_ensure(config) } - pub async fn datasets_config_list( - &self, - ) -> Result { - self.storage.lock().await.datasets_config_list().await + pub fn datasets_config_list(&self) -> Result { + self.storage.lock().datasets_config_list() } - pub async fn omicron_physical_disks_list( + pub fn omicron_physical_disks_list( &self, ) -> Result { - self.storage.lock().await.omicron_physical_disks_list().await + self.storage.lock().omicron_physical_disks_list() } - pub async fn omicron_physical_disks_ensure( + pub fn 
omicron_physical_disks_ensure( &self, config: OmicronPhysicalDisksConfig, ) -> Result { - self.storage.lock().await.omicron_physical_disks_ensure(config).await + self.storage.lock().omicron_physical_disks_ensure(config) } - pub async fn omicron_zones_list(&self) -> OmicronZonesConfig { - self.fake_zones.lock().await.clone() + pub fn omicron_zones_list(&self) -> OmicronZonesConfig { + self.fake_zones.lock().unwrap().clone() } - pub async fn omicron_zones_ensure( - &self, - requested_zones: OmicronZonesConfig, - ) { - *self.fake_zones.lock().await = requested_zones; + pub fn omicron_zones_ensure(&self, requested_zones: OmicronZonesConfig) { + *self.fake_zones.lock().unwrap() = requested_zones; } - pub async fn drop_dataset( - &self, - zpool_id: ZpoolUuid, - dataset_id: DatasetUuid, - ) { - self.storage.lock().await.drop_dataset(zpool_id, dataset_id) + pub fn drop_dataset(&self, zpool_id: ZpoolUuid, dataset_id: DatasetUuid) { + self.storage.lock().drop_dataset(zpool_id, dataset_id) } - pub async fn list_vpc_routes(&self) -> Vec { - let routes = self.vpc_routes.lock().await; + pub fn list_vpc_routes(&self) -> Vec { + let routes = self.vpc_routes.lock().unwrap(); routes .iter() .map(|(k, v)| ResolvedVpcRouteState { id: *k, version: v.version }) .collect() } - pub async fn set_vpc_routes(&self, new_routes: Vec) { - let mut routes = self.vpc_routes.lock().await; + pub fn set_vpc_routes(&self, new_routes: Vec) { + let mut routes = self.vpc_routes.lock().unwrap(); for new in new_routes { // Disregard any route information for a subnet we don't have. let Some(old) = routes.get(&new.id) else { diff --git a/sled-agent/src/sim/storage.rs b/sled-agent/src/sim/storage.rs index dc8cf63fe41..e6d999e7e8b 100644 --- a/sled-agent/src/sim/storage.rs +++ b/sled-agent/src/sim/storage.rs @@ -12,15 +12,18 @@ use crate::sim::http_entrypoints_pantry::ExpectedDigest; use crate::sim::http_entrypoints_pantry::PantryStatus; use crate::sim::http_entrypoints_pantry::VolumeStatus; use crate::sim::SledAgent; +use crate::support_bundle::storage::SupportBundleManager; use anyhow::{self, bail, Result}; +use camino::Utf8Path; +use camino_tempfile::Utf8TempDir; use chrono::prelude::*; use crucible_agent_client::types::{ CreateRegion, Region, RegionId, RunningSnapshot, Snapshot, State, }; use dropshot::HandlerTaskMode; use dropshot::HttpError; -use futures::lock::Mutex; use omicron_common::disk::DatasetManagementStatus; +use omicron_common::disk::DatasetName; use omicron_common::disk::DatasetsConfig; use omicron_common::disk::DatasetsManagementResult; use omicron_common::disk::DiskIdentity; @@ -28,24 +31,26 @@ use omicron_common::disk::DiskManagementStatus; use omicron_common::disk::DiskVariant; use omicron_common::disk::DisksManagementResult; use omicron_common::disk::OmicronPhysicalDisksConfig; -use omicron_common::update::ArtifactHash; +use omicron_common::disk::SharedDatasetConfig; use omicron_uuid_kinds::DatasetUuid; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::OmicronZoneUuid; use omicron_uuid_kinds::PhysicalDiskUuid; use omicron_uuid_kinds::PropolisUuid; -use omicron_uuid_kinds::SupportBundleUuid; use omicron_uuid_kinds::ZpoolUuid; use propolis_client::types::VolumeConstructionRequest; use serde::Serialize; -use sled_agent_api::SupportBundleMetadata; -use sled_agent_api::SupportBundleState; +use sled_storage::manager::NestedDatasetConfig; +use sled_storage::manager::NestedDatasetListOptions; +use sled_storage::manager::NestedDatasetLocation; use slog::Logger; +use std::collections::BTreeMap; use 
std::collections::HashMap; use std::collections::HashSet; use std::net::{IpAddr, SocketAddr}; use std::str::FromStr; use std::sync::Arc; +use std::sync::Mutex; use uuid::Uuid; type CreateCallback = Box State + Send + 'static>; @@ -676,91 +681,90 @@ impl CrucibleData { } } - pub async fn set_create_callback(&self, callback: CreateCallback) { - self.inner.lock().await.set_create_callback(callback); + pub fn set_create_callback(&self, callback: CreateCallback) { + self.inner.lock().unwrap().set_create_callback(callback); } - pub async fn list(&self) -> Vec { - self.inner.lock().await.list() + pub fn list(&self) -> Vec { + self.inner.lock().unwrap().list() } - pub async fn create(&self, params: CreateRegion) -> Result { - self.inner.lock().await.create(params) + pub fn create(&self, params: CreateRegion) -> Result { + self.inner.lock().unwrap().create(params) } - pub async fn get(&self, id: RegionId) -> Option { - self.inner.lock().await.get(id) + pub fn get(&self, id: RegionId) -> Option { + self.inner.lock().unwrap().get(id) } - pub async fn delete(&self, id: RegionId) -> Result> { - self.inner.lock().await.delete(id) + pub fn delete(&self, id: RegionId) -> Result> { + self.inner.lock().unwrap().delete(id) } - pub async fn create_snapshot( + pub fn create_snapshot( &self, id: Uuid, snapshot_id: Uuid, ) -> Result { - self.inner.lock().await.create_snapshot(id, snapshot_id) + self.inner.lock().unwrap().create_snapshot(id, snapshot_id) } - pub async fn snapshots_for_region(&self, id: &RegionId) -> Vec { - self.inner.lock().await.snapshots_for_region(id) + pub fn snapshots_for_region(&self, id: &RegionId) -> Vec { + self.inner.lock().unwrap().snapshots_for_region(id) } - pub async fn get_snapshot_for_region( + pub fn get_snapshot_for_region( &self, id: &RegionId, snapshot_id: &str, ) -> Option { - self.inner.lock().await.get_snapshot_for_region(id, snapshot_id) + self.inner.lock().unwrap().get_snapshot_for_region(id, snapshot_id) } - pub async fn running_snapshots_for_id( + pub fn running_snapshots_for_id( &self, id: &RegionId, ) -> HashMap { - self.inner.lock().await.running_snapshots_for_id(id) + self.inner.lock().unwrap().running_snapshots_for_id(id) } - pub async fn delete_snapshot( - &self, - id: &RegionId, - name: &str, - ) -> Result<()> { - self.inner.lock().await.delete_snapshot(id, name) + pub fn delete_snapshot(&self, id: &RegionId, name: &str) -> Result<()> { + self.inner.lock().unwrap().delete_snapshot(id, name) } - pub async fn set_creating_a_running_snapshot_should_fail(&self) { - self.inner.lock().await.set_creating_a_running_snapshot_should_fail(); + pub fn set_creating_a_running_snapshot_should_fail(&self) { + self.inner + .lock() + .unwrap() + .set_creating_a_running_snapshot_should_fail(); } - pub async fn set_region_creation_error(&self, value: bool) { - self.inner.lock().await.set_region_creation_error(value); + pub fn set_region_creation_error(&self, value: bool) { + self.inner.lock().unwrap().set_region_creation_error(value); } - pub async fn set_region_deletion_error(&self, value: bool) { - self.inner.lock().await.set_region_deletion_error(value); + pub fn set_region_deletion_error(&self, value: bool) { + self.inner.lock().unwrap().set_region_deletion_error(value); } - pub async fn create_running_snapshot( + pub fn create_running_snapshot( &self, id: &RegionId, name: &str, ) -> Result { - self.inner.lock().await.create_running_snapshot(id, name) + self.inner.lock().unwrap().create_running_snapshot(id, name) } - pub async fn delete_running_snapshot( + pub fn 
delete_running_snapshot( &self, id: &RegionId, name: &str, ) -> Result<()> { - self.inner.lock().await.delete_running_snapshot(id, name) + self.inner.lock().unwrap().delete_running_snapshot(id, name) } - pub async fn is_empty(&self) -> bool { - self.inner.lock().await.is_empty() + pub fn is_empty(&self) -> bool { + self.inner.lock().unwrap().is_empty() } } @@ -814,11 +818,6 @@ impl CrucibleServer { } } -#[derive(Default)] -pub(crate) struct DebugData { - bundles: HashMap, -} - pub(crate) struct PhysicalDisk { pub(crate) identity: DiskIdentity, pub(crate) variant: DiskVariant, @@ -828,7 +827,6 @@ pub(crate) struct PhysicalDisk { /// Describes data being simulated within a dataset. pub(crate) enum DatasetContents { Crucible(CrucibleServer), - Debug(DebugData), } pub(crate) struct Zpool { @@ -847,10 +845,6 @@ impl Zpool { Zpool { id, physical_disk_id, total_size, datasets: HashMap::new() } } - fn insert_debug_dataset(&mut self, id: DatasetUuid) { - self.datasets.insert(id, DatasetContents::Debug(DebugData::default())); - } - fn insert_crucible_dataset( &mut self, log: &Logger, @@ -871,10 +865,7 @@ impl Zpool { let DatasetContents::Crucible(crucible) = self .datasets .get(&id) - .expect("Failed to get the dataset we just inserted") - else { - panic!("Should have just inserted Crucible dataset"); - }; + .expect("Failed to get the dataset we just inserted"); crucible } @@ -882,17 +873,16 @@ impl Zpool { self.total_size } - pub async fn get_dataset_for_region( + pub fn get_dataset_for_region( &self, region_id: Uuid, ) -> Option> { for dataset in self.datasets.values() { - if let DatasetContents::Crucible(dataset) = dataset { - for region in &dataset.data().list().await { - let id = Uuid::from_str(®ion.id.0).unwrap(); - if id == region_id { - return Some(dataset.data()); - } + let DatasetContents::Crucible(dataset) = dataset; + for region in &dataset.data().list() { + let id = Uuid::from_str(®ion.id.0).unwrap(); + if id == region_id { + return Some(dataset.data()); } } } @@ -900,19 +890,18 @@ impl Zpool { None } - pub async fn get_region_for_port(&self, port: u16) -> Option { + pub fn get_region_for_port(&self, port: u16) -> Option { let mut regions = vec![]; for dataset in self.datasets.values() { - if let DatasetContents::Crucible(dataset) = dataset { - for region in &dataset.data().list().await { - if region.state == State::Destroyed { - continue; - } + let DatasetContents::Crucible(dataset) = dataset; + for region in &dataset.data().list() { + if region.state == State::Destroyed { + continue; + } - if port == region.port_number { - regions.push(region.clone()); - } + if port == region.port_number { + regions.push(region.clone()); } } } @@ -928,12 +917,90 @@ impl Zpool { } } +/// Represents a nested dataset +pub struct NestedDatasetStorage { + config: NestedDatasetConfig, + // We intentionally store the children before the mountpoint, + // so they are deleted first. + children: BTreeMap, + // We store this directory as a temporary directory so it gets + // removed when this struct is dropped. + #[allow(dead_code)] + mountpoint: Utf8TempDir, +} + +impl NestedDatasetStorage { + fn new( + zpool_root: &Utf8Path, + dataset_root: DatasetName, + path: String, + shared_config: SharedDatasetConfig, + ) -> Self { + let name = NestedDatasetLocation { path, root: dataset_root }; + + // Create a mountpoint for the nested dataset storage that lasts + // as long as the nested dataset does. 
+ let mountpoint = name.mountpoint(zpool_root); + println!("NestedDatasetStorage: Mountpoint {mountpoint}"); + let parent = mountpoint.as_path().parent().unwrap(); + println!("NestedDatasetStorage: Creating parent dir: {parent}"); + std::fs::create_dir_all(&parent).unwrap(); + + let new_dir_name = mountpoint.as_path().file_name().unwrap(); + println!("NestedDatasetStorage: New dir name: {new_dir_name}"); + let mountpoint = camino_tempfile::Builder::new() + .rand_bytes(0) + .prefix(new_dir_name) + .tempdir_in(parent) + .unwrap(); + + Self { + config: NestedDatasetConfig { name, inner: shared_config }, + children: BTreeMap::new(), + mountpoint, + } + } +} + /// Simulated representation of all storage on a sled. +#[derive(Clone)] pub struct Storage { - sled_id: Uuid, + inner: Arc>, +} + +impl Storage { + pub fn new(sled_id: Uuid, crucible_ip: IpAddr, log: Logger) -> Self { + Self { + inner: Arc::new(Mutex::new(StorageInner::new( + sled_id, + crucible_ip, + log, + ))), + } + } + + pub fn lock(&self) -> std::sync::MutexGuard { + self.inner.lock().unwrap() + } + + pub fn as_support_bundle_storage<'a>( + &'a self, + log: &'a Logger, + ) -> SupportBundleManager<'a> { + SupportBundleManager::new(log, self) + } +} + +/// Simulated representation of all storage on a sled. +/// +/// Guarded by a mutex from [Storage]. +pub struct StorageInner { log: Logger, + sled_id: Uuid, + root: Utf8TempDir, config: Option, dataset_config: Option, + nested_datasets: HashMap, physical_disks: HashMap, next_disk_slot: i64, zpools: HashMap, @@ -941,13 +1008,15 @@ pub struct Storage { next_crucible_port: u16, } -impl Storage { +impl StorageInner { pub fn new(sled_id: Uuid, crucible_ip: IpAddr, log: Logger) -> Self { Self { sled_id, log, + root: camino_tempfile::tempdir().unwrap(), config: None, dataset_config: None, + nested_datasets: HashMap::new(), physical_disks: HashMap::new(), next_disk_slot: 0, zpools: HashMap::new(), @@ -956,14 +1025,17 @@ impl Storage { } } + /// Returns a path to the "zpool root" for storage. + pub fn root(&self) -> &Utf8Path { + self.root.path() + } + /// Returns an immutable reference to all (currently known) physical disks pub fn physical_disks(&self) -> &HashMap { &self.physical_disks } - pub async fn datasets_config_list( - &self, - ) -> Result { + pub fn datasets_config_list(&self) -> Result { let Some(config) = self.dataset_config.as_ref() else { return Err(HttpError::for_not_found( None, @@ -973,7 +1045,7 @@ impl Storage { Ok(config.clone()) } - pub async fn datasets_ensure( + pub fn datasets_ensure( &mut self, config: DatasetsConfig, ) -> Result { @@ -997,6 +1069,27 @@ impl Storage { } self.dataset_config.replace(config.clone()); + // Add a "nested dataset" entry for all datasets that should exist, + // and remove it for all datasets that have been removed. 
+ let dataset_names: HashSet<_> = config + .datasets + .values() + .map(|config| config.name.clone()) + .collect(); + for dataset in &dataset_names { + let root = self.root().to_path_buf(); + self.nested_datasets.entry(dataset.clone()).or_insert_with(|| { + NestedDatasetStorage::new( + &root, + dataset.clone(), + String::new(), + SharedDatasetConfig::default(), + ) + }); + } + self.nested_datasets + .retain(|dataset, _| dataset_names.contains(&dataset)); + Ok(DatasetsManagementResult { status: config .datasets @@ -1009,7 +1102,149 @@ impl Storage { }) } - pub async fn omicron_physical_disks_list( + pub fn nested_dataset_list( + &self, + name: NestedDatasetLocation, + options: NestedDatasetListOptions, + ) -> Result, HttpError> { + let Some(mut nested_dataset) = self.nested_datasets.get(&name.root) + else { + return Err(HttpError::for_not_found( + None, + "Dataset not found".to_string(), + )); + }; + + for path_component in name.path.split('/') { + if path_component.is_empty() { + continue; + } + match nested_dataset.children.get(path_component) { + Some(dataset) => nested_dataset = dataset, + None => { + return Err(HttpError::for_not_found( + None, + "Dataset not found".to_string(), + )) + } + }; + } + + let mut children: Vec<_> = nested_dataset + .children + .values() + .map(|storage| storage.config.clone()) + .collect(); + + match options { + NestedDatasetListOptions::ChildrenOnly => return Ok(children), + NestedDatasetListOptions::SelfAndChildren => { + children.insert(0, nested_dataset.config.clone()); + return Ok(children); + } + } + } + + pub fn nested_dataset_ensure( + &mut self, + config: NestedDatasetConfig, + ) -> Result<(), HttpError> { + let name = &config.name; + let nested_path = name.path.to_string(); + let zpool_root = self.root().to_path_buf(); + let Some(mut nested_dataset) = self.nested_datasets.get_mut(&name.root) + else { + return Err(HttpError::for_not_found( + None, + "Dataset not found".to_string(), + )); + }; + + for path_component in nested_path.split('/') { + if path_component.is_empty() { + continue; + } + + // Final component of path -- insert it here if it doesn't exist + // already. + if !path_component.contains('/') { + let entry = + nested_dataset.children.entry(path_component.to_string()); + entry + .and_modify(|storage| { + storage.config = config.clone(); + }) + .or_insert_with(|| { + NestedDatasetStorage::new( + &zpool_root, + config.name.root, + nested_path, + config.inner, + ) + }); + return Ok(()); + } + + match nested_dataset.children.get_mut(path_component) { + Some(dataset) => nested_dataset = dataset, + None => { + return Err(HttpError::for_not_found( + None, + "Dataset not found".to_string(), + )) + } + }; + } + return Err(HttpError::for_not_found( + None, + "Nested Dataset not found".to_string(), + )); + } + + pub fn nested_dataset_destroy( + &mut self, + name: NestedDatasetLocation, + ) -> Result<(), HttpError> { + let Some(mut nested_dataset) = self.nested_datasets.get_mut(&name.root) + else { + return Err(HttpError::for_not_found( + None, + "Dataset not found".to_string(), + )); + }; + + for path_component in name.path.split('/') { + if path_component.is_empty() { + continue; + } + + // Final component of path -- remove it if it exists. 
+ if !path_component.contains('/') { + if nested_dataset.children.remove(path_component).is_none() { + return Err(HttpError::for_not_found( + None, + "Nested Dataset not found".to_string(), + )); + }; + return Ok(()); + } + match nested_dataset.children.get_mut(path_component) { + Some(dataset) => nested_dataset = dataset, + None => { + return Err(HttpError::for_not_found( + None, + "Dataset not found".to_string(), + )) + } + }; + } + return Err(HttpError::for_not_found( + None, + "Nested Dataset not found".to_string(), + )); + } + + pub fn omicron_physical_disks_list( &mut self, ) -> Result { let Some(config) = self.config.as_ref() else { @@ -1021,7 +1256,7 @@ impl Storage { Ok(config.clone()) } - pub async fn omicron_physical_disks_ensure( + pub fn omicron_physical_disks_ensure( &mut self, config: OmicronPhysicalDisksConfig, ) -> Result { @@ -1057,7 +1292,7 @@ impl Storage { }) } - pub async fn insert_physical_disk( + pub fn insert_physical_disk( &mut self, id: PhysicalDiskUuid, identity: DiskIdentity, @@ -1070,7 +1305,7 @@ impl Storage { } /// Adds a Zpool to the sled's simulated storage. - pub async fn insert_zpool( + pub fn insert_zpool( &mut self, zpool_id: ZpoolUuid, disk_id: PhysicalDiskUuid, @@ -1085,143 +1320,8 @@ impl Storage { &self.zpools } - fn get_debug_dataset( - &self, - zpool_id: ZpoolUuid, - dataset_id: DatasetUuid, - ) -> Result<&DebugData, HttpError> { - let Some(zpool) = self.zpools.get(&zpool_id) else { - return Err(HttpError::for_not_found( - None, - format!("zpool does not exist {zpool_id}"), - )); - }; - let Some(dataset) = zpool.datasets.get(&dataset_id) else { - return Err(HttpError::for_not_found( - None, - format!("dataset does not exist {dataset_id}"), - )); - }; - - let DatasetContents::Debug(debug) = dataset else { - return Err(HttpError::for_bad_request( - None, - format!("Not a debug dataset: {zpool_id} / {dataset_id}"), - )); - }; - - Ok(debug) - } - - fn get_debug_dataset_mut( - &mut self, - zpool_id: ZpoolUuid, - dataset_id: DatasetUuid, - ) -> Result<&mut DebugData, HttpError> { - let Some(zpool) = self.zpools.get_mut(&zpool_id) else { - return Err(HttpError::for_not_found( - None, - format!("zpool does not exist {zpool_id}"), - )); - }; - let Some(dataset) = zpool.datasets.get_mut(&dataset_id) else { - return Err(HttpError::for_not_found( - None, - format!("dataset does not exist {dataset_id}"), - )); - }; - - let DatasetContents::Debug(debug) = dataset else { - return Err(HttpError::for_bad_request( - None, - format!("Not a debug dataset: {zpool_id} / {dataset_id}"), - )); - }; - - Ok(debug) - } - - pub async fn support_bundle_list( - &self, - zpool_id: ZpoolUuid, - dataset_id: DatasetUuid, - ) -> Result, HttpError> { - let debug = self.get_debug_dataset(zpool_id, dataset_id)?; - - Ok(debug - .bundles - .keys() - .map(|id| SupportBundleMetadata { - support_bundle_id: *id, - state: SupportBundleState::Complete, - }) - .collect()) - } - - pub async fn support_bundle_create( - &mut self, - zpool_id: ZpoolUuid, - dataset_id: DatasetUuid, - support_bundle_id: SupportBundleUuid, - hash: ArtifactHash, - ) -> Result<(), HttpError> { - let debug = self.get_debug_dataset_mut(zpool_id, dataset_id)?; - - // This is for the simulated server, so we totally ignore the "contents" - // of the bundle and just accept that it should exist. 
- debug.bundles.insert(support_bundle_id, hash); - - Ok(()) - } - - pub async fn support_bundle_exists( - &self, - zpool_id: ZpoolUuid, - dataset_id: DatasetUuid, - support_bundle_id: SupportBundleUuid, - ) -> Result<(), HttpError> { - let debug = self.get_debug_dataset(zpool_id, dataset_id)?; - - if !debug.bundles.contains_key(&support_bundle_id) { - return Err(HttpError::for_not_found( - None, - format!("Support bundle not found {support_bundle_id}"), - )); - } - Ok(()) - } - - pub async fn support_bundle_delete( - &mut self, - zpool_id: ZpoolUuid, - dataset_id: DatasetUuid, - support_bundle_id: SupportBundleUuid, - ) -> Result<(), HttpError> { - let debug = self.get_debug_dataset_mut(zpool_id, dataset_id)?; - - if debug.bundles.remove(&support_bundle_id).is_none() { - return Err(HttpError::for_not_found( - None, - format!("Support bundle not found {support_bundle_id}"), - )); - } - Ok(()) - } - - /// Adds a debug dataset to the sled's simulated storage - pub async fn insert_debug_dataset( - &mut self, - zpool_id: ZpoolUuid, - dataset_id: DatasetUuid, - ) { - self.zpools - .get_mut(&zpool_id) - .expect("Zpool does not exist") - .insert_debug_dataset(dataset_id); - } - /// Adds a Crucible dataset to the sled's simulated storage. - pub async fn insert_crucible_dataset( + pub fn insert_crucible_dataset( &mut self, zpool_id: ZpoolUuid, dataset_id: DatasetUuid, @@ -1291,16 +1391,13 @@ impl Storage { zpool .datasets .iter() - .filter_map(|(id, dataset)| match dataset { - DatasetContents::Crucible(server) => { - Some((*id, server.address())) - } - _ => None, + .map(|(id, dataset)| match dataset { + DatasetContents::Crucible(server) => (*id, server.address()), }) .collect() } - pub async fn get_dataset( + pub fn get_dataset( &self, zpool_id: ZpoolUuid, dataset_id: DatasetUuid, @@ -1313,24 +1410,22 @@ impl Storage { .expect("Dataset does not exist") } - pub async fn get_crucible_dataset( + pub fn get_crucible_dataset( &self, zpool_id: ZpoolUuid, dataset_id: DatasetUuid, ) -> Arc { - match self.get_dataset(zpool_id, dataset_id).await { + match self.get_dataset(zpool_id, dataset_id) { DatasetContents::Crucible(crucible) => crucible.data.clone(), - _ => panic!("{zpool_id} / {dataset_id} is not a crucible dataset"), } } - pub async fn get_dataset_for_region( + pub fn get_dataset_for_region( &self, region_id: Uuid, ) -> Option> { for zpool in self.zpools.values() { - if let Some(dataset) = zpool.get_dataset_for_region(region_id).await - { + if let Some(dataset) = zpool.get_dataset_for_region(region_id) { return Some(dataset); } } @@ -1338,10 +1433,10 @@ impl Storage { None } - pub async fn get_region_for_port(&self, port: u16) -> Option { + pub fn get_region_for_port(&self, port: u16) -> Option { let mut regions = vec![]; for zpool in self.zpools.values() { - if let Some(region) = zpool.get_region_for_port(port).await { + if let Some(region) = zpool.get_region_for_port(port) { regions.push(region); } } @@ -1389,18 +1484,18 @@ impl Pantry { } } - pub async fn status(&self) -> Result { + pub fn status(&self) -> Result { Ok(PantryStatus { - volumes: self.volumes.lock().await.keys().cloned().collect(), - num_job_handles: self.jobs.lock().await.len(), + volumes: self.volumes.lock().unwrap().keys().cloned().collect(), + num_job_handles: self.jobs.lock().unwrap().len(), }) } - pub async fn entry( + pub fn entry( &self, volume_id: String, ) -> Result { - let volumes = self.volumes.lock().await; + let volumes = self.volumes.lock().unwrap(); match volumes.get(&volume_id) { Some(entry) => Ok(entry.vcr.clone()), 
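The conversions in these hunks replace `futures::lock::Mutex` with `std::sync::Mutex` throughout the simulated sled agent, so the accessors become plain synchronous functions. A minimal sketch of the pattern follows; the type and method names are illustrative and not part of this patch. The key constraint is that a `std::sync::MutexGuard` is never held across an `.await`, which is why these methods drop their `async` qualifier entirely rather than awaiting a lock.

    use std::sync::{Arc, Mutex};

    // Illustrative only: mirrors how the simulated storage wraps shared state
    // in Arc<Mutex<...>> and exposes synchronous accessors.
    #[derive(Clone)]
    struct SimRegions {
        inner: Arc<Mutex<Vec<String>>>,
    }

    impl SimRegions {
        fn new() -> Self {
            Self { inner: Arc::new(Mutex::new(Vec::new())) }
        }

        // The guard is acquired and released within the call, so callers in
        // async handlers never hold it across an await point.
        fn insert(&self, region: String) {
            self.inner.lock().unwrap().push(region);
        }

        fn list(&self) -> Vec<String> {
            self.inner.lock().unwrap().clone()
        }
    }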
@@ -1408,12 +1503,12 @@ impl Pantry { } } - pub async fn attach( + pub fn attach( &self, volume_id: String, volume_construction_request: VolumeConstructionRequest, ) -> Result<()> { - let mut volumes = self.volumes.lock().await; + let mut volumes = self.volumes.lock().unwrap(); volumes.insert( volume_id, @@ -1431,14 +1526,14 @@ impl Pantry { Ok(()) } - pub async fn attach_activate_background( + pub fn attach_activate_background( &self, volume_id: String, activate_job_id: String, volume_construction_request: VolumeConstructionRequest, ) -> Result<(), HttpError> { - let mut volumes = self.volumes.lock().await; - let mut jobs = self.jobs.lock().await; + let mut volumes = self.volumes.lock().unwrap(); + let mut jobs = self.jobs.lock().unwrap(); volumes.insert( volume_id, @@ -1458,30 +1553,30 @@ impl Pantry { Ok(()) } - pub async fn activate_background_attachment( + pub fn activate_background_attachment( &self, volume_id: String, ) -> Result { let activate_job = { - let volumes = self.volumes.lock().await; + let volumes = self.volumes.lock().unwrap(); volumes.get(&volume_id).unwrap().activate_job.clone().unwrap() }; - let mut status = self.volume_status(volume_id.clone()).await?; + let mut status = self.volume_status(volume_id.clone())?; status.active = true; status.seen_active = true; - self.update_volume_status(volume_id, status).await?; + self.update_volume_status(volume_id, status)?; Ok(activate_job) } - pub async fn volume_status( + pub fn volume_status( &self, volume_id: String, ) -> Result { - let volumes = self.volumes.lock().await; + let volumes = self.volumes.lock().unwrap(); match volumes.get(&volume_id) { Some(pantry_volume) => Ok(pantry_volume.status.clone()), @@ -1490,12 +1585,12 @@ impl Pantry { } } - pub async fn update_volume_status( + pub fn update_volume_status( &self, volume_id: String, status: VolumeStatus, ) -> Result<(), HttpError> { - let mut volumes = self.volumes.lock().await; + let mut volumes = self.volumes.lock().unwrap(); match volumes.get_mut(&volume_id) { Some(pantry_volume) => { @@ -1507,22 +1602,19 @@ impl Pantry { } } - pub async fn is_job_finished( - &self, - job_id: String, - ) -> Result { - let jobs = self.jobs.lock().await; + pub fn is_job_finished(&self, job_id: String) -> Result { + let jobs = self.jobs.lock().unwrap(); if !jobs.contains(&job_id) { return Err(HttpError::for_not_found(None, job_id)); } Ok(true) } - pub async fn get_job_result( + pub fn get_job_result( &self, job_id: String, ) -> Result, HttpError> { - let mut jobs = self.jobs.lock().await; + let mut jobs = self.jobs.lock().unwrap(); if !jobs.contains(&job_id) { return Err(HttpError::for_not_found(None, job_id)); } @@ -1530,23 +1622,23 @@ impl Pantry { Ok(Ok(true)) } - pub async fn import_from_url( + pub fn import_from_url( &self, volume_id: String, _url: String, _expected_digest: Option, ) -> Result { - self.entry(volume_id).await?; + self.entry(volume_id)?; // Make up job - let mut jobs = self.jobs.lock().await; + let mut jobs = self.jobs.lock().unwrap(); let job_id = Uuid::new_v4().to_string(); jobs.insert(job_id.clone()); Ok(job_id) } - pub async fn snapshot( + pub fn snapshot( &self, volume_id: String, snapshot_id: String, @@ -1555,12 +1647,11 @@ impl Pantry { // the simulated instance ensure, then call // [`instance_issue_disk_snapshot_request`] as the snapshot logic is the // same. 
- let volumes = self.volumes.lock().await; + let volumes = self.volumes.lock().unwrap(); let volume_construction_request = &volumes.get(&volume_id).unwrap().vcr; self.sled_agent - .map_disk_ids_to_region_ids(volume_construction_request) - .await?; + .map_disk_ids_to_region_ids(volume_construction_request)?; self.sled_agent .instance_issue_disk_snapshot_request( @@ -1568,17 +1659,16 @@ impl Pantry { volume_id.parse().unwrap(), snapshot_id.parse().unwrap(), ) - .await .map_err(|e| HttpError::for_internal_error(e.to_string())) } - pub async fn bulk_write( + pub fn bulk_write( &self, volume_id: String, offset: u64, data: Vec, ) -> Result<(), HttpError> { - let vcr = self.entry(volume_id).await?; + let vcr = self.entry(volume_id)?; // Currently, Nexus will only make volumes where the first subvolume is // a Region. This will change in the future! @@ -1632,19 +1722,19 @@ impl Pantry { Ok(()) } - pub async fn scrub(&self, volume_id: String) -> Result { - self.entry(volume_id).await?; + pub fn scrub(&self, volume_id: String) -> Result { + self.entry(volume_id)?; // Make up job - let mut jobs = self.jobs.lock().await; + let mut jobs = self.jobs.lock().unwrap(); let job_id = Uuid::new_v4().to_string(); jobs.insert(job_id.clone()); Ok(job_id) } - pub async fn detach(&self, volume_id: String) -> Result<()> { - let mut volumes = self.volumes.lock().await; + pub fn detach(&self, volume_id: String) -> Result<()> { + let mut volumes = self.volumes.lock().unwrap(); volumes.remove(&volume_id); Ok(()) } @@ -1656,11 +1746,7 @@ pub struct PantryServer { } impl PantryServer { - pub async fn new( - log: Logger, - ip: IpAddr, - sled_agent: Arc, - ) -> Self { + pub fn new(log: Logger, ip: IpAddr, sled_agent: Arc) -> Self { let pantry = Arc::new(Pantry::new(sled_agent)); let server = dropshot::ServerBuilder::new( diff --git a/sled-agent/src/support_bundle/storage.rs b/sled-agent/src/support_bundle/storage.rs index 97d345a8d2f..c1b0cfe42f6 100644 --- a/sled-agent/src/support_bundle/storage.rs +++ b/sled-agent/src/support_bundle/storage.rs @@ -4,6 +4,7 @@ //! Management of and access to Support Bundles +use async_trait::async_trait; use bytes::Bytes; use camino::Utf8Path; use dropshot::Body; @@ -13,6 +14,7 @@ use futures::StreamExt; use omicron_common::api::external::Error as ExternalError; use omicron_common::disk::CompressionAlgorithm; use omicron_common::disk::DatasetConfig; +use omicron_common::disk::DatasetsConfig; use omicron_common::disk::SharedDatasetConfig; use omicron_common::update::ArtifactHash; use omicron_uuid_kinds::DatasetUuid; @@ -30,6 +32,7 @@ use sled_storage::manager::NestedDatasetLocation; use sled_storage::manager::StorageHandle; use slog::Logger; use slog_error_chain::InlineErrorChain; +use std::borrow::Cow; use std::io::Write; use tokio::io::AsyncReadExt; use tokio::io::AsyncSeekExt; @@ -37,6 +40,16 @@ use tokio::io::AsyncWriteExt; use tokio_util::io::ReaderStream; use zip::result::ZipError; +// The final name of the bundle, as it is stored within the dedicated +// datasets. +// +// The full path is of the form: +// +// /pool/ext/$(POOL_UUID)/crypt/$(DATASET_TYPE)/$(BUNDLE_UUID)/bundle.zip +// | | This is a per-bundle nested dataset +// | This is a Debug dataset +const BUNDLE_FILE_NAME: &str = "bundle.zip"; + #[derive(thiserror::Error, Debug)] pub enum Error { #[error(transparent)] @@ -100,6 +113,116 @@ impl From for HttpError { } } +/// Abstracts the storage APIs for accessing datasets. +/// +/// Allows support bundle storage to work on both simulated and non-simulated +/// sled agents. 
+#[async_trait] +pub trait LocalStorage: Sync { + // These methods are all prefixed as "dyn_" to avoid duplicating the name + // with the real implementations. + // + // Dispatch is a little silly; if I use the same name as the real + // implementation, then a "missing function" dispatches to the trait instead + // and results in infinite recursion. + + /// Returns all configured datasets + async fn dyn_datasets_config_list(&self) -> Result; + + /// Returns all nested datasets within an existing dataset + async fn dyn_nested_dataset_list( + &self, + name: NestedDatasetLocation, + options: NestedDatasetListOptions, + ) -> Result, Error>; + + /// Ensures a nested dataset exists + async fn dyn_nested_dataset_ensure( + &self, + config: NestedDatasetConfig, + ) -> Result<(), Error>; + + /// Destroys a nested dataset + async fn dyn_nested_dataset_destroy( + &self, + name: NestedDatasetLocation, + ) -> Result<(), Error>; + + /// Returns the root filesystem path where datasets are mounted. + /// + /// This is typically "/" in prod, but can be a temporary directory + /// for tests to isolate storage that typically appears globally. + fn zpool_mountpoint_root(&self) -> Cow; +} + +/// This implementation is effectively a pass-through to the real methods +#[async_trait] +impl LocalStorage for StorageHandle { + async fn dyn_datasets_config_list(&self) -> Result { + self.datasets_config_list().await.map_err(|err| err.into()) + } + + async fn dyn_nested_dataset_list( + &self, + name: NestedDatasetLocation, + options: NestedDatasetListOptions, + ) -> Result, Error> { + self.nested_dataset_list(name, options).await.map_err(|err| err.into()) + } + + async fn dyn_nested_dataset_ensure( + &self, + config: NestedDatasetConfig, + ) -> Result<(), Error> { + self.nested_dataset_ensure(config).await.map_err(|err| err.into()) + } + + async fn dyn_nested_dataset_destroy( + &self, + name: NestedDatasetLocation, + ) -> Result<(), Error> { + self.nested_dataset_destroy(name).await.map_err(|err| err.into()) + } + + fn zpool_mountpoint_root(&self) -> Cow { + Cow::Borrowed(illumos_utils::zpool::ZPOOL_MOUNTPOINT_ROOT.into()) + } +} + +/// This implementation allows storage bundles to be stored on simulated storage +#[async_trait] +impl LocalStorage for crate::sim::Storage { + async fn dyn_datasets_config_list(&self) -> Result { + self.lock().datasets_config_list().map_err(|err| err.into()) + } + + async fn dyn_nested_dataset_list( + &self, + name: NestedDatasetLocation, + options: NestedDatasetListOptions, + ) -> Result, Error> { + self.lock().nested_dataset_list(name, options).map_err(|err| err.into()) + } + + async fn dyn_nested_dataset_ensure( + &self, + config: NestedDatasetConfig, + ) -> Result<(), Error> { + self.lock().nested_dataset_ensure(config).map_err(|err| err.into()) + } + + async fn dyn_nested_dataset_destroy( + &self, + name: NestedDatasetLocation, + ) -> Result<(), Error> { + self.lock().nested_dataset_destroy(name).map_err(|err| err.into()) + } + + fn zpool_mountpoint_root(&self) -> Cow { + Cow::Owned(self.lock().root().to_path_buf()) + } +} + /// Describes the type of access to the support bundle #[derive(Clone, Debug)] pub(crate) enum SupportBundleQueryType { @@ -237,7 +360,7 @@ fn stream_zip_entry( /// APIs to manage support bundle storage. pub struct SupportBundleManager<'a> { log: &'a Logger, - storage: &'a StorageHandle, + storage: &'a dyn LocalStorage, } impl<'a> SupportBundleManager<'a> { @@ -245,7 +368,7 @@ impl<'a> SupportBundleManager<'a> { /// to support bundle CRUD APIs. 
pub fn new( log: &'a Logger, - storage: &'a StorageHandle, + storage: &'a dyn LocalStorage, ) -> SupportBundleManager<'a> { Self { log, storage } } @@ -256,7 +379,7 @@ impl<'a> SupportBundleManager<'a> { zpool_id: ZpoolUuid, dataset_id: DatasetUuid, ) -> Result { - let datasets_config = self.storage.datasets_config_list().await?; + let datasets_config = self.storage.dyn_datasets_config_list().await?; let dataset = datasets_config .datasets .get(&dataset_id) @@ -290,7 +413,7 @@ impl<'a> SupportBundleManager<'a> { NestedDatasetLocation { path: String::from(""), root }; let datasets = self .storage - .nested_dataset_list( + .dyn_nested_dataset_list( dataset_location, NestedDatasetListOptions::ChildrenOnly, ) @@ -309,8 +432,8 @@ impl<'a> SupportBundleManager<'a> { // The dataset for a support bundle exists. let support_bundle_path = dataset .name - .mountpoint(illumos_utils::zpool::ZPOOL_MOUNTPOINT_ROOT.into()) - .join("bundle"); + .mountpoint(&self.storage.zpool_mountpoint_root()) + .join(BUNDLE_FILE_NAME); // Identify whether or not the final "bundle" file exists. // @@ -399,9 +522,9 @@ impl<'a> SupportBundleManager<'a> { let dataset = NestedDatasetLocation { path: support_bundle_id.to_string(), root }; // The mounted root of the support bundle dataset - let support_bundle_dir = dataset - .mountpoint(illumos_utils::zpool::ZPOOL_MOUNTPOINT_ROOT.into()); - let support_bundle_path = support_bundle_dir.join("bundle"); + let support_bundle_dir = + dataset.mountpoint(&self.storage.zpool_mountpoint_root()); + let support_bundle_path = support_bundle_dir.join(BUNDLE_FILE_NAME); let support_bundle_path_tmp = support_bundle_dir.join(format!( "bundle-{}.tmp", thread_rng() @@ -414,7 +537,7 @@ impl<'a> SupportBundleManager<'a> { // Ensure that the dataset exists. info!(log, "Ensuring dataset exists for bundle"); self.storage - .nested_dataset_ensure(NestedDatasetConfig { + .dyn_nested_dataset_ensure(NestedDatasetConfig { name: dataset, inner: SharedDatasetConfig { compression: CompressionAlgorithm::On, @@ -423,6 +546,7 @@ impl<'a> SupportBundleManager<'a> { }, }) .await?; + info!(log, "Dataset does exist for bundle"); // Exit early if the support bundle already exists if tokio::fs::try_exists(&support_bundle_path).await? { @@ -446,9 +570,14 @@ impl<'a> SupportBundleManager<'a> { // Stream the file into the dataset, first as a temporary file, // and then renaming to the final location. 
- info!(log, "Streaming bundle to storage"); + info!( + log, + "Streaming bundle to storage"; + "path" => ?support_bundle_path_tmp, + ); let tmp_file = tokio::fs::File::create(&support_bundle_path_tmp).await?; + if let Err(err) = Self::write_and_finalize_bundle( tmp_file, &support_bundle_path_tmp, @@ -492,7 +621,7 @@ impl<'a> SupportBundleManager<'a> { let root = self.get_configured_dataset(zpool_id, dataset_id).await?.name; self.storage - .nested_dataset_destroy(NestedDatasetLocation { + .dyn_nested_dataset_destroy(NestedDatasetLocation { path: support_bundle_id.to_string(), root, }) @@ -512,9 +641,9 @@ impl<'a> SupportBundleManager<'a> { let dataset = NestedDatasetLocation { path: support_bundle_id.to_string(), root }; // The mounted root of the support bundle dataset - let support_bundle_dir = dataset - .mountpoint(illumos_utils::zpool::ZPOOL_MOUNTPOINT_ROOT.into()); - let path = support_bundle_dir.join("bundle"); + let support_bundle_dir = + dataset.mountpoint(&self.storage.zpool_mountpoint_root()); + let path = support_bundle_dir.join(BUNDLE_FILE_NAME); let f = tokio::fs::File::open(&path).await?; Ok(f) From 60c27d32e5daf1e454ecee6d1998aad565435471 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 17 Dec 2024 13:56:05 -0800 Subject: [PATCH 39/61] cleanup println --- sled-agent/src/sim/storage.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/sled-agent/src/sim/storage.rs b/sled-agent/src/sim/storage.rs index e6d999e7e8b..07cb4e1ccf4 100644 --- a/sled-agent/src/sim/storage.rs +++ b/sled-agent/src/sim/storage.rs @@ -941,13 +941,10 @@ impl NestedDatasetStorage { // Create a mountpoint for the nested dataset storage that lasts // as long as the nested dataset does. let mountpoint = name.mountpoint(zpool_root); - println!("NestedDatasetStorage: Mountpoint {mountpoint}"); let parent = mountpoint.as_path().parent().unwrap(); - println!("NestedDatasetStorage: Creating parent dir: {parent}"); std::fs::create_dir_all(&parent).unwrap(); let new_dir_name = mountpoint.as_path().file_name().unwrap(); - println!("NestedDatasetStorage: New dir name: {new_dir_name}"); let mountpoint = camino_tempfile::Builder::new() .rand_bytes(0) .prefix(new_dir_name) From 759a9f818b02bf7f40147761441b840cd8a4f2bc Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 17 Dec 2024 14:07:13 -0800 Subject: [PATCH 40/61] Fix merge (renamed method) --- nexus/src/app/support_bundles.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/src/app/support_bundles.rs b/nexus/src/app/support_bundles.rs index 784a1596b2f..8461ffb209d 100644 --- a/nexus/src/app/support_bundles.rs +++ b/nexus/src/app/support_bundles.rs @@ -77,7 +77,7 @@ impl super::Nexus { // Lookup the sled holding the bundle, forward the request there let sled_id = self .db_datastore - .zpool_get_sled(&opctx, bundle.zpool_id.into()) + .zpool_get_sled_if_in_service(&opctx, bundle.zpool_id.into()) .await?; let client = self.sled_client(&sled_id).await?; From 053fddcf66800b4c99ebad8d6c3062fbdb3e7a51 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 17 Dec 2024 16:06:33 -0800 Subject: [PATCH 41/61] Starting to wire up authz for SupportBundle --- common/src/api/external/mod.rs | 1 + nexus/auth/src/authz/api_resources.rs | 8 ++++++++ nexus/auth/src/authz/oso_generic.rs | 1 + nexus/db-model/src/support_bundle.rs | 4 ++++ nexus/db-queries/src/db/lookup.rs | 15 +++++++++++++++ .../src/policy_test/resource_builder.rs | 1 + nexus/db-queries/src/policy_test/resources.rs | 9 +++++++++ nexus/tests/output/uncovered-authz-endpoints.txt | 9 --------- 
8 files changed, 39 insertions(+), 9 deletions(-) diff --git a/common/src/api/external/mod.rs b/common/src/api/external/mod.rs index 38a9de05648..b8dfec4e802 100644 --- a/common/src/api/external/mod.rs +++ b/common/src/api/external/mod.rs @@ -975,6 +975,7 @@ pub enum ResourceType { Instance, LoopbackAddress, SwitchPortSettings, + SupportBundle, IpPool, IpPoolResource, InstanceNetworkInterface, diff --git a/nexus/auth/src/authz/api_resources.rs b/nexus/auth/src/authz/api_resources.rs index bfde3d3b972..fa2e7bb1f35 100644 --- a/nexus/auth/src/authz/api_resources.rs +++ b/nexus/auth/src/authz/api_resources.rs @@ -995,6 +995,14 @@ authz_resource! { polar_snippet = FleetChild, } +authz_resource! { + name = "SupportBundle", + parent = "Fleet", + primary_key = { uuid_kind = SupportBundleKind }, + roles_allowed = false, + polar_snippet = FleetChild, +} + authz_resource! { name = "PhysicalDisk", parent = "Fleet", diff --git a/nexus/auth/src/authz/oso_generic.rs b/nexus/auth/src/authz/oso_generic.rs index acd74b2167f..321bb98b1c6 100644 --- a/nexus/auth/src/authz/oso_generic.rs +++ b/nexus/auth/src/authz/oso_generic.rs @@ -154,6 +154,7 @@ pub fn make_omicron_oso(log: &slog::Logger) -> Result { Silo::init(), SiloUser::init(), SiloGroup::init(), + SupportBundle::init(), IdentityProvider::init(), SamlIdentityProvider::init(), Sled::init(), diff --git a/nexus/db-model/src/support_bundle.rs b/nexus/db-model/src/support_bundle.rs index a4b14d363b2..cb125362e11 100644 --- a/nexus/db-model/src/support_bundle.rs +++ b/nexus/db-model/src/support_bundle.rs @@ -118,6 +118,10 @@ impl SupportBundle { assigned_nexus: Some(nexus_id.into()), } } + + pub fn id(&self) -> SupportBundleUuid { + self.id.into() + } } impl From for SupportBundleView { diff --git a/nexus/db-queries/src/db/lookup.rs b/nexus/db-queries/src/db/lookup.rs index e3b118f94fe..d8445e7c01a 100644 --- a/nexus/db-queries/src/db/lookup.rs +++ b/nexus/db-queries/src/db/lookup.rs @@ -22,6 +22,7 @@ use omicron_common::api::external::Error; use omicron_common::api::external::InternalContext; use omicron_common::api::external::{LookupResult, LookupType, ResourceType}; use omicron_uuid_kinds::PhysicalDiskUuid; +use omicron_uuid_kinds::SupportBundleUuid; use omicron_uuid_kinds::TufRepoKind; use omicron_uuid_kinds::TypedUuid; use uuid::Uuid; @@ -390,6 +391,11 @@ impl<'a> LookupPath<'a> { PhysicalDisk::PrimaryKey(Root { lookup_root: self }, id) } + /// Select a resource of type SupportBundle, identified by its id + pub fn support_bundle(self, id: SupportBundleUuid) -> SupportBundle<'a> { + SupportBundle::PrimaryKey(Root { lookup_root: self }, id) + } + pub fn silo_image_id(self, id: Uuid) -> SiloImage<'a> { SiloImage::PrimaryKey(Root { lookup_root: self }, id) } @@ -878,6 +884,15 @@ lookup_resource! { primary_key_columns = [ { column_name = "id", uuid_kind = PhysicalDiskKind } ] } +lookup_resource! { + name = "SupportBundle", + ancestors = [], + children = [], + lookup_by_name = false, + soft_deletes = false, + primary_key_columns = [ { column_name = "id", uuid_kind = SupportBundleKind } ] +} + lookup_resource! 
{ name = "TufRepo", ancestors = [], diff --git a/nexus/db-queries/src/policy_test/resource_builder.rs b/nexus/db-queries/src/policy_test/resource_builder.rs index 304b9703778..b6d7d97553e 100644 --- a/nexus/db-queries/src/policy_test/resource_builder.rs +++ b/nexus/db-queries/src/policy_test/resource_builder.rs @@ -271,6 +271,7 @@ impl_dyn_authorized_resource_for_resource!(authz::SiloUser); impl_dyn_authorized_resource_for_resource!(authz::Sled); impl_dyn_authorized_resource_for_resource!(authz::Snapshot); impl_dyn_authorized_resource_for_resource!(authz::SshKey); +impl_dyn_authorized_resource_for_resource!(authz::SupportBundle); impl_dyn_authorized_resource_for_resource!(authz::TufArtifact); impl_dyn_authorized_resource_for_resource!(authz::TufRepo); impl_dyn_authorized_resource_for_resource!(authz::Vpc); diff --git a/nexus/db-queries/src/policy_test/resources.rs b/nexus/db-queries/src/policy_test/resources.rs index 8075908d05a..21779e1ee64 100644 --- a/nexus/db-queries/src/policy_test/resources.rs +++ b/nexus/db-queries/src/policy_test/resources.rs @@ -12,6 +12,7 @@ use nexus_db_model::SemverVersion; use omicron_common::api::external::LookupType; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::PhysicalDiskUuid; +use omicron_uuid_kinds::SupportBundleUuid; use oso::PolarClass; use std::collections::BTreeSet; use uuid::Uuid; @@ -111,6 +112,14 @@ pub async fn make_resources( LookupType::ById(physical_disk_id.into_untyped_uuid()), )); + let support_bundle_id: SupportBundleUuid = + "d9f923f6-caf3-4c83-96f9-8ffe8c627dd2".parse().unwrap(); + builder.new_resource(authz::SupportBundle::new( + authz::FLEET, + support_bundle_id, + LookupType::ById(support_bundle_id.into_untyped_uuid()), + )); + let device_user_code = String::from("a-device-user-code"); builder.new_resource(authz::DeviceAuthRequest::new( authz::FLEET, diff --git a/nexus/tests/output/uncovered-authz-endpoints.txt b/nexus/tests/output/uncovered-authz-endpoints.txt index 0a9e62707f0..c5091c5a3bc 100644 --- a/nexus/tests/output/uncovered-authz-endpoints.txt +++ b/nexus/tests/output/uncovered-authz-endpoints.txt @@ -1,22 +1,13 @@ API endpoints with no coverage in authz tests: probe_delete (delete "/experimental/v1/probes/{probe}") -support_bundle_delete (delete "/experimental/v1/system/support-bundles/{support_bundle}") probe_list (get "/experimental/v1/probes") probe_view (get "/experimental/v1/probes/{probe}") -support_bundle_list (get "/experimental/v1/system/support-bundles") -support_bundle_view (get "/experimental/v1/system/support-bundles/{support_bundle}") -support_bundle_download (get "/experimental/v1/system/support-bundles/{support_bundle}/download") -support_bundle_download_file (get "/experimental/v1/system/support-bundles/{support_bundle}/download/{file}") -support_bundle_index (get "/experimental/v1/system/support-bundles/{support_bundle}/index") ping (get "/v1/ping") networking_switch_port_status (get "/v1/system/hardware/switch-port/{port}/status") -support_bundle_head (head "/experimental/v1/system/support-bundles/{support_bundle}/download") -support_bundle_head_file (head "/experimental/v1/system/support-bundles/{support_bundle}/download/{file}") device_auth_request (post "/device/auth") device_auth_confirm (post "/device/confirm") device_access_token (post "/device/token") probe_create (post "/experimental/v1/probes") -support_bundle_create (post "/experimental/v1/system/support-bundles") login_saml (post "/login/{silo_name}/saml/{provider_name}") login_local (post "/v1/login/{silo_name}/local") logout 
(post "/v1/logout") From bbeb75d0e89ee189153627e237d6e5e713c8b0c7 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 20 Dec 2024 13:55:28 -0800 Subject: [PATCH 42/61] Working through authz tests --- docs/adding-an-endpoint.adoc | 2 +- .../src/db/datastore/support_bundle.rs | 67 ++++++++----- nexus/db-queries/tests/output/authz-roles.out | 14 +++ .../tasks/support_bundle_collector.rs | 33 +++++-- nexus/src/app/sled.rs | 45 +++++++-- nexus/src/app/support_bundles.rs | 26 ++++- nexus/src/lib.rs | 10 +- nexus/test-interface/src/lib.rs | 6 +- nexus/test-utils/src/resource_helpers.rs | 95 +++++++++++++++---- nexus/tests/integration_tests/endpoints.rs | 30 +++++- nexus/tests/integration_tests/unauthorized.rs | 25 ++++- sled-agent/src/sim/storage.rs | 5 + 12 files changed, 282 insertions(+), 76 deletions(-) diff --git a/docs/adding-an-endpoint.adoc b/docs/adding-an-endpoint.adoc index d9e5c559b40..e2f19c1fe70 100644 --- a/docs/adding-an-endpoint.adoc +++ b/docs/adding-an-endpoint.adoc @@ -53,7 +53,7 @@ this document should act as a jumping-off point. === **Testing** * Authorization -** There exists a https://github.com/oxidecomputer/omicron/blob/main/nexus/src/authz/policy_test[policy test] which compares all Oso objects against an expected policy. New resources are usually added to https://github.com/oxidecomputer/omicron/blob/main/nexus/src/authz/policy_test/resources.rs[resources.rs] to get coverage. +** There exists a https://github.com/oxidecomputer/omicron/blob/main/nexus/db-queries/src/policy_test[policy test] which compares all Oso objects against an expected policy. New resources are usually added to https://github.com/oxidecomputer/omicron/blob/main/nexus/db-queries/src/policy_test/resources.rs[resources.rs] to get coverage. * OpenAPI ** Once you've added or changed endpoint definitions in `nexus-external-api` or `nexus-internal-api`, you'll need to update the corresponding OpenAPI documents (the JSON files in `openapi/`). ** To update all OpenAPI documents, run `cargo xtask openapi generate`. 
diff --git a/nexus/db-queries/src/db/datastore/support_bundle.rs b/nexus/db-queries/src/db/datastore/support_bundle.rs index 3f2af6d65a3..9d28f461deb 100644 --- a/nexus/db-queries/src/db/datastore/support_bundle.rs +++ b/nexus/db-queries/src/db/datastore/support_bundle.rs @@ -10,6 +10,7 @@ use crate::context::OpContext; use crate::db; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; +use crate::db::lookup::LookupPath; use crate::db::model::Dataset; use crate::db::model::DatasetKind; use crate::db::model::SupportBundle; @@ -164,16 +165,10 @@ impl DataStore { opctx: &OpContext, id: SupportBundleUuid, ) -> LookupResult { - opctx.authorize(authz::Action::Read, &authz::FLEET).await?; - use db::schema::support_bundle::dsl; + let (.., db_bundle) = + LookupPath::new(opctx, self).support_bundle(id).fetch().await?; - let conn = self.pool_connection_authorized(opctx).await?; - dsl::support_bundle - .filter(dsl::id.eq(id.into_untyped_uuid())) - .select(SupportBundle::as_select()) - .first_async::(&*conn) - .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + Ok(db_bundle) } /// Lists one page of support bundles @@ -420,18 +415,20 @@ impl DataStore { pub async fn support_bundle_update( &self, opctx: &OpContext, - id: SupportBundleUuid, + authz_bundle: &authz::SupportBundle, state: SupportBundleState, ) -> Result<(), Error> { - opctx.authorize(authz::Action::Modify, &authz::FLEET).await?; + opctx.authorize(authz::Action::Modify, authz_bundle).await?; use db::schema::support_bundle::dsl; + + let id = authz_bundle.id().into_untyped_uuid(); let conn = self.pool_connection_authorized(opctx).await?; let result = diesel::update(dsl::support_bundle) - .filter(dsl::id.eq(id.into_untyped_uuid())) + .filter(dsl::id.eq(id)) .filter(dsl::state.eq_any(state.valid_old_states())) .set(dsl::state.eq(state)) - .check_if_exists::(id.into_untyped_uuid()) + .check_if_exists::(id) .execute_and_check(&conn) .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; @@ -454,12 +451,13 @@ impl DataStore { pub async fn support_bundle_delete( &self, opctx: &OpContext, - id: SupportBundleUuid, + authz_bundle: &authz::SupportBundle, ) -> Result<(), Error> { - opctx.authorize(authz::Action::Modify, &authz::FLEET).await?; + opctx.authorize(authz::Action::Delete, authz_bundle).await?; use db::schema::support_bundle::dsl; + let id = authz_bundle.id().into_untyped_uuid(); let conn = self.pool_connection_authorized(opctx).await?; diesel::delete(dsl::support_bundle) .filter( @@ -467,7 +465,7 @@ impl DataStore { .eq(SupportBundleState::Destroying) .or(dsl::state.eq(SupportBundleState::Failed)), ) - .filter(dsl::id.eq(id.into_untyped_uuid())) + .filter(dsl::id.eq(id)) .execute_async(&*conn) .await .map(|_rows_modified| ()) @@ -495,6 +493,7 @@ mod test { use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneFilter; use nexus_types::deployment::BlueprintZoneType; + use omicron_common::api::external::LookupType; use omicron_common::api::internal::shared::DatasetKind::Debug as DebugDatasetKind; use omicron_test_utils::dev; use omicron_uuid_kinds::DatasetUuid; @@ -502,6 +501,16 @@ mod test { use omicron_uuid_kinds::SledUuid; use rand::Rng; + fn authz_support_bundle_from_id( + id: SupportBundleUuid, + ) -> authz::SupportBundle { + authz::SupportBundle::new( + authz::FLEET, + id, + LookupType::ById(id.into_untyped_uuid()), + ) + } + // Pool/Dataset pairs, for debug datasets only. 
struct TestPool { pool: ZpoolUuid, @@ -715,10 +724,11 @@ mod test { // When we update the state of the bundles, the list results // should also be filtered. + let authz_bundle = authz_support_bundle_from_id(bundle_a1.id.into()); datastore .support_bundle_update( &opctx, - bundle_a1.id.into(), + &authz_bundle, SupportBundleState::Active, ) .await @@ -816,11 +826,11 @@ mod test { // database. // // We should still expect to hit capacity limits. - + let authz_bundle = authz_support_bundle_from_id(bundles[0].id.into()); datastore .support_bundle_update( &opctx, - bundles[0].id.into(), + &authz_bundle, SupportBundleState::Destroying, ) .await @@ -835,8 +845,9 @@ mod test { // If we delete a bundle, it should be gone. This means we can // re-allocate from that dataset which was just freed up. + let authz_bundle = authz_support_bundle_from_id(bundles[0].id.into()); datastore - .support_bundle_delete(&opctx, bundles[0].id.into()) + .support_bundle_delete(&opctx, &authz_bundle) .await .expect("Should be able to destroy this bundle"); datastore @@ -888,11 +899,11 @@ mod test { assert_eq!(bundle, observed_bundles[0]); // Destroy the bundle, observe the new state - + let authz_bundle = authz_support_bundle_from_id(bundle.id.into()); datastore .support_bundle_update( &opctx, - bundle.id.into(), + &authz_bundle, SupportBundleState::Destroying, ) .await @@ -905,8 +916,9 @@ mod test { // Delete the bundle, observe that it's gone + let authz_bundle = authz_support_bundle_from_id(bundle.id.into()); datastore - .support_bundle_delete(&opctx, bundle.id.into()) + .support_bundle_delete(&opctx, &authz_bundle) .await .expect("Should be able to destroy our bundle"); let observed_bundles = datastore @@ -1146,10 +1158,11 @@ mod test { ); // Start the deletion of this bundle + let authz_bundle = authz_support_bundle_from_id(bundle.id.into()); datastore .support_bundle_update( &opctx, - bundle.id.into(), + &authz_bundle, SupportBundleState::Destroying, ) .await @@ -1314,8 +1327,9 @@ mod test { .unwrap() .contains(FAILURE_REASON_NO_DATASET)); + let authz_bundle = authz_support_bundle_from_id(bundle.id.into()); datastore - .support_bundle_delete(&opctx, bundle.id.into()) + .support_bundle_delete(&opctx, &authz_bundle) .await .expect("Should have been able to delete support bundle"); @@ -1377,10 +1391,11 @@ mod test { // // This is what we would do when we finish collecting, and // provisioned storage on a sled. + let authz_bundle = authz_support_bundle_from_id(bundle.id.into()); datastore .support_bundle_update( &opctx, - bundle.id.into(), + &authz_bundle, SupportBundleState::Active, ) .await diff --git a/nexus/db-queries/tests/output/authz-roles.out b/nexus/db-queries/tests/output/authz-roles.out index 36403fa700b..111bade55c7 100644 --- a/nexus/db-queries/tests/output/authz-roles.out +++ b/nexus/db-queries/tests/output/authz-roles.out @@ -1034,6 +1034,20 @@ resource: PhysicalDisk id "c9f923f6-caf3-4c83-96f9-8ffe8c627dd2" silo1-proj1-viewer ✘ ✘ ✘ ✘ ✘ ✘ ✘ ✘ unauthenticated ! ! ! ! ! ! ! ! +resource: SupportBundle id "d9f923f6-caf3-4c83-96f9-8ffe8c627dd2" + + USER Q R LC RP M MP CC D + fleet-admin ✘ ✔ ✔ ✔ ✔ ✔ ✔ ✔ + fleet-collaborator ✘ ✔ ✔ ✔ ✘ ✘ ✘ ✘ + fleet-viewer ✘ ✔ ✔ ✔ ✘ ✘ ✘ ✘ + silo1-admin ✘ ✘ ✘ ✘ ✘ ✘ ✘ ✘ + silo1-collaborator ✘ ✘ ✘ ✘ ✘ ✘ ✘ ✘ + silo1-viewer ✘ ✘ ✘ ✘ ✘ ✘ ✘ ✘ + silo1-proj1-admin ✘ ✘ ✘ ✘ ✘ ✘ ✘ ✘ + silo1-proj1-collaborator ✘ ✘ ✘ ✘ ✘ ✘ ✘ ✘ + silo1-proj1-viewer ✘ ✘ ✘ ✘ ✘ ✘ ✘ ✘ + unauthenticated ! ! ! ! ! ! ! ! 
+ resource: DeviceAuthRequest "a-device-user-code" USER Q R LC RP M MP CC D diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 5a8a114a3e5..73f8454dd4d 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -15,12 +15,14 @@ use futures::FutureExt; use futures::TryStreamExt; use nexus_db_model::SupportBundle; use nexus_db_model::SupportBundleState; +use nexus_db_queries::authz; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; use nexus_types::deployment::SledFilter; use nexus_types::identity::Asset; use omicron_common::api::external::DataPageParams; use omicron_common::api::external::Error; +use omicron_common::api::external::LookupType; use omicron_common::api::external::ResourceType; use omicron_common::update::ArtifactHash; use omicron_uuid_kinds::DatasetUuid; @@ -40,6 +42,14 @@ use tokio::io::SeekFrom; use zip::write::FullFileOptions; use zip::ZipWriter; +fn authz_support_bundle_from_id(id: SupportBundleUuid) -> authz::SupportBundle { + authz::SupportBundle::new( + authz::FLEET, + id, + LookupType::ById(id.into_untyped_uuid()), + ) +} + // Specifies the data to be collected within the Support Bundle. #[derive(Default)] struct BundleRequest { @@ -151,13 +161,14 @@ impl SupportBundleCollector { opctx: &OpContext, bundle: &SupportBundle, ) -> anyhow::Result { + let authz_bundle = authz_support_bundle_from_id(bundle.id.into()); match bundle.state { SupportBundleState::Destroying => { // Destroying is a terminal state; no one should be able to // change this state from underneath us. self.datastore.support_bundle_delete( opctx, - bundle.id.into(), + &authz_bundle, ).await.map_err(|err| { warn!( &opctx.log, @@ -176,7 +187,7 @@ impl SupportBundleCollector { .datastore .support_bundle_update( &opctx, - bundle.id.into(), + &authz_bundle, SupportBundleState::Failed, ) .await @@ -371,12 +382,13 @@ impl SupportBundleCollector { bundle: &bundle, }; + let authz_bundle = authz_support_bundle_from_id(bundle.id.into()); let mut report = collection.collect_bundle_and_store_on_sled().await?; if let Err(err) = self .datastore .support_bundle_update( &opctx, - bundle.id.into(), + &authz_bundle, SupportBundleState::Active, ) .await @@ -1128,10 +1140,11 @@ mod test { assert_eq!(bundle.state, SupportBundleState::Collecting); // Cancel the bundle immediately + let authz_bundle = authz_support_bundle_from_id(bundle.id.into()); datastore .support_bundle_update( &opctx, - bundle.id.into(), + &authz_bundle, SupportBundleState::Destroying, ) .await @@ -1193,10 +1206,11 @@ mod test { assert!(report.activated_in_db_ok); // Cancel the bundle after collection has completed + let authz_bundle = authz_support_bundle_from_id(bundle.id.into()); datastore .support_bundle_update( &opctx, - bundle.id.into(), + &authz_bundle, SupportBundleState::Destroying, ) .await @@ -1248,10 +1262,11 @@ mod test { // Mark the bundle as "failing" - this should be triggered // automatically by the blueprint executor if the corresponding // storage has been expunged. + let authz_bundle = authz_support_bundle_from_id(bundle.id.into()); datastore .support_bundle_update( &opctx, - bundle.id.into(), + &authz_bundle, SupportBundleState::Failing, ) .await @@ -1316,10 +1331,11 @@ mod test { // Mark the bundle as "failing" - this should be triggered // automatically by the blueprint executor if the corresponding // storage has been expunged. 
+ let authz_bundle = authz_support_bundle_from_id(bundle.id.into()); datastore .support_bundle_update( &opctx, - bundle.id.into(), + &authz_bundle, SupportBundleState::Failing, ) .await @@ -1389,10 +1405,11 @@ mod test { // Mark the bundle as "failing" - this should be triggered // automatically by the blueprint executor if the corresponding // storage has been expunged. + let authz_bundle = authz_support_bundle_from_id(bundle.id.into()); datastore .support_bundle_update( &opctx, - bundle.id.into(), + &authz_bundle, SupportBundleState::Failing, ) .await diff --git a/nexus/src/app/sled.rs b/nexus/src/app/sled.rs index 61a032f8bd8..45596175228 100644 --- a/nexus/src/app/sled.rs +++ b/nexus/src/app/sled.rs @@ -12,6 +12,7 @@ use nexus_db_queries::authz; use nexus_db_queries::context::OpContext; use nexus_db_queries::db; use nexus_db_queries::db::lookup; +use nexus_db_queries::db::lookup::LookupPath; use nexus_sled_agent_shared::inventory::SledRole; use nexus_types::deployment::DiskFilter; use nexus_types::deployment::SledFilter; @@ -236,20 +237,45 @@ impl super::Nexus { .await } - /// Upserts a physical disk into the database, updating it if it already exists. - pub(crate) async fn upsert_physical_disk( + /// Inserts a physical disk into the database unless it already exists. + /// + /// NOTE: I'd like to re-work this to avoid the upsert-like behavior - can + /// we restructure our tests to ensure they ask for this physical disk + /// exactly once? + pub(crate) async fn insert_test_physical_disk_if_not_exists( &self, opctx: &OpContext, request: PhysicalDiskPutRequest, ) -> Result<(), Error> { info!( - self.log, "upserting physical disk"; + self.log, "inserting test physical disk"; "physical_disk_id" => %request.id, "sled_id" => %request.sled_id, "vendor" => %request.vendor, "serial" => %request.serial, "model" => %request.model, ); + + match LookupPath::new(&opctx, &self.db_datastore) + .physical_disk(request.id) + .fetch() + .await + { + Ok((_authz_disk, existing_disk)) => { + if existing_disk.vendor != request.vendor + || existing_disk.serial != request.serial + || existing_disk.model != request.model + { + return Err(Error::internal_error( + "Invalid Physical Disk update (was: {existing_disk:?}, asking for {request:?})" + )); + } + return Ok(()); + } + Err(err) if matches!(err, Error::ObjectNotFound { .. }) => {} + Err(err) => return Err(err), + } + let disk = db::model::PhysicalDisk::new( request.id, request.vendor, @@ -309,23 +335,22 @@ impl super::Nexus { // Datasets (contained within zpools) - /// Upserts a crucible dataset into the database, updating it if it already exists. - pub(crate) async fn upsert_crucible_dataset( + /// Upserts a dataset into the database, updating it if it already exists. 
+ pub(crate) async fn upsert_dataset( &self, id: DatasetUuid, zpool_id: Uuid, - address: SocketAddrV6, + kind: DatasetKind, + address: Option, ) -> Result<(), Error> { info!( self.log, "upserting dataset"; "zpool_id" => zpool_id.to_string(), "dataset_id" => id.to_string(), - "address" => address.to_string() + "kind" => kind.to_string(), ); - let kind = DatasetKind::Crucible; - let dataset = - db::model::Dataset::new(id, zpool_id, Some(address), kind); + let dataset = db::model::Dataset::new(id, zpool_id, address, kind); self.db_datastore.dataset_upsert(dataset).await?; Ok(()) } diff --git a/nexus/src/app/support_bundles.rs b/nexus/src/app/support_bundles.rs index 8461ffb209d..2d7f3882ef1 100644 --- a/nexus/src/app/support_bundles.rs +++ b/nexus/src/app/support_bundles.rs @@ -9,7 +9,9 @@ use futures::TryStreamExt; use http::Response; use nexus_db_model::SupportBundle; use nexus_db_model::SupportBundleState; +use nexus_db_queries::authz; use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::lookup::LookupPath; use omicron_common::api::external::CreateResult; use omicron_common::api::external::DataPageParams; use omicron_common::api::external::DeleteResult; @@ -47,7 +49,12 @@ impl super::Nexus { opctx: &OpContext, id: SupportBundleUuid, ) -> LookupResult { - self.db_datastore.support_bundle_get(&opctx, id).await + let (.., db_bundle) = LookupPath::new(opctx, &self.db_datastore) + .support_bundle(id) + .fetch() + .await?; + + Ok(db_bundle) } pub async fn support_bundle_create( @@ -67,7 +74,11 @@ impl super::Nexus { _range: Option, ) -> Result, Error> { // Lookup the bundle, confirm it's accessible - let bundle = self.db_datastore.support_bundle_get(&opctx, id).await?; + let (.., bundle) = LookupPath::new(opctx, &self.db_datastore) + .support_bundle(id) + .fetch() + .await?; + if !matches!(bundle.state, SupportBundleState::Active) { return Err(Error::invalid_request( "Cannot download bundle in non-active state", @@ -166,13 +177,22 @@ impl super::Nexus { opctx: &OpContext, id: SupportBundleUuid, ) -> DeleteResult { + let (authz_bundle, ..) = LookupPath::new(opctx, &self.db_datastore) + .support_bundle(id) + .lookup_for(authz::Action::Delete) + .await?; + // NOTE: We can't necessarily delete the support bundle // immediately - it might have state that needs cleanup // by a background task - so, instead, we mark it deleting. 
// // This is a terminal state self.db_datastore - .support_bundle_update(&opctx, id, SupportBundleState::Destroying) + .support_bundle_update( + &opctx, + &authz_bundle, + SupportBundleState::Destroying, + ) .await?; Ok(()) } diff --git a/nexus/src/lib.rs b/nexus/src/lib.rs index 1a25b2e9aae..715b0c3377a 100644 --- a/nexus/src/lib.rs +++ b/nexus/src/lib.rs @@ -41,6 +41,7 @@ use omicron_common::api::internal::nexus::{ProducerEndpoint, ProducerKind}; use omicron_common::api::internal::shared::{ AllowedSourceIps, ExternalPortDiscovery, RackNetworkConfig, SwitchLocation, }; +use omicron_common::disk::DatasetKind; use omicron_common::FileKv; use omicron_uuid_kinds::DatasetUuid; use oximeter::types::ProducerRegistry; @@ -365,18 +366,19 @@ impl nexus_test_interface::NexusServer for Server { self.apictx.context.nexus.get_internal_server_address().await.unwrap() } - async fn upsert_crucible_dataset( + async fn upsert_test_dataset( &self, physical_disk: PhysicalDiskPutRequest, zpool: ZpoolPutRequest, dataset_id: DatasetUuid, - address: SocketAddrV6, + kind: DatasetKind, + address: Option, ) { let opctx = self.apictx.context.nexus.opctx_for_internal_api(); self.apictx .context .nexus - .upsert_physical_disk(&opctx, physical_disk) + .insert_test_physical_disk_if_not_exists(&opctx, physical_disk) .await .unwrap(); @@ -387,7 +389,7 @@ impl nexus_test_interface::NexusServer for Server { self.apictx .context .nexus - .upsert_crucible_dataset(dataset_id, zpool_id, address) + .upsert_dataset(dataset_id, zpool_id, kind, address) .await .unwrap(); } diff --git a/nexus/test-interface/src/lib.rs b/nexus/test-interface/src/lib.rs index 68f462551cc..07e36068992 100644 --- a/nexus/test-interface/src/lib.rs +++ b/nexus/test-interface/src/lib.rs @@ -39,6 +39,7 @@ use nexus_types::internal_api::params::{ }; use nexus_types::inventory::Collection; use omicron_common::api::external::Error; +use omicron_common::disk::DatasetKind; use omicron_uuid_kinds::DatasetUuid; use slog::Logger; use std::net::{SocketAddr, SocketAddrV6}; @@ -108,12 +109,13 @@ pub trait NexusServer: Send + Sync + 'static { // use the "RackInitializationRequest" handoff, but this would require // creating all our Zpools and Datasets before performing handoff to Nexus. // However, doing so would let us remove this test-only API. 
- async fn upsert_crucible_dataset( + async fn upsert_test_dataset( &self, physical_disk: PhysicalDiskPutRequest, zpool: ZpoolPutRequest, dataset_id: DatasetUuid, - address: SocketAddrV6, + kind: DatasetKind, + address: Option, ); async fn inventory_collect_and_get_latest_collection( diff --git a/nexus/test-utils/src/resource_helpers.rs b/nexus/test-utils/src/resource_helpers.rs index 89f453c873c..365906f8b2f 100644 --- a/nexus/test-utils/src/resource_helpers.rs +++ b/nexus/test-utils/src/resource_helpers.rs @@ -36,6 +36,7 @@ use nexus_types::internal_api::params as internal_params; use omicron_common::api::external::ByteCount; use omicron_common::api::external::Disk; use omicron_common::api::external::Error; +use omicron_common::api::external::Generation; use omicron_common::api::external::IdentityMetadataCreateParams; use omicron_common::api::external::Instance; use omicron_common::api::external::InstanceAutoRestartPolicy; @@ -46,7 +47,13 @@ use omicron_common::api::external::RouteDestination; use omicron_common::api::external::RouteTarget; use omicron_common::api::external::RouterRoute; use omicron_common::api::external::UserId; +use omicron_common::disk::DatasetConfig; +use omicron_common::disk::DatasetKind; +use omicron_common::disk::DatasetName; +use omicron_common::disk::DatasetsConfig; use omicron_common::disk::DiskIdentity; +use omicron_common::disk::SharedDatasetConfig; +use omicron_common::zpool_name::ZpoolName; use omicron_sled_agent::sim::SledAgent; use omicron_test_utils::dev::poll::wait_for_condition; use omicron_test_utils::dev::poll::CondCheckError; @@ -1012,8 +1019,10 @@ pub async fn projects_list( .collect() } +#[derive(Debug)] pub struct TestDataset { pub id: DatasetUuid, + pub kind: DatasetKind, } pub struct TestZpool { @@ -1177,7 +1186,10 @@ impl<'a, N: NexusServer> DiskTest<'a, N> { *sled_id, disk.id, disk.pool_id, - DatasetUuid::new_v4(), + vec![TestDataset { + id: DatasetUuid::new_v4(), + kind: DatasetKind::Crucible, + }], Self::DEFAULT_ZPOOL_SIZE_GIB, ) .await; @@ -1190,7 +1202,10 @@ impl<'a, N: NexusServer> DiskTest<'a, N> { sled_id, PhysicalDiskUuid::new_v4(), ZpoolUuid::new_v4(), - DatasetUuid::new_v4(), + vec![TestDataset { + id: DatasetUuid::new_v4(), + kind: DatasetKind::Crucible, + }], Self::DEFAULT_ZPOOL_SIZE_GIB, ) .await @@ -1223,7 +1238,7 @@ impl<'a, N: NexusServer> DiskTest<'a, N> { sled_id: SledUuid, physical_disk_id: PhysicalDiskUuid, zpool_id: ZpoolUuid, - dataset_id: DatasetUuid, + datasets: Vec, gibibytes: u32, ) { let cptestctx = self.cptestctx; @@ -1233,7 +1248,7 @@ impl<'a, N: NexusServer> DiskTest<'a, N> { let zpool = TestZpool { id: zpool_id, size: ByteCount::from_gibibytes_u32(gibibytes), - datasets: vec![TestDataset { id: dataset_id }], + datasets, }; let disk_identity = DiskIdentity { @@ -1293,28 +1308,34 @@ impl<'a, N: NexusServer> DiskTest<'a, N> { ); for dataset in &zpool.datasets { - // Sled Agent side: Create the Dataset, make sure regions can be - // created immediately if Nexus requests anything. - let address = - sled_agent.create_crucible_dataset(zpool.id, dataset.id); - let crucible = - sled_agent.get_crucible_dataset(zpool.id, dataset.id); - crucible.set_create_callback(Box::new(|_| RegionState::Created)); - - // Nexus side: Notify Nexus of the physical disk/zpool/dataset - // combination that exists. 
- - let address = match address { - std::net::SocketAddr::V6(addr) => addr, - _ => panic!("Unsupported address type: {address} "), + let address = if matches!(dataset.kind, DatasetKind::Crucible) { + // Sled Agent side: Create the Dataset, make sure regions can be + // created immediately if Nexus requests anything. + let address = + sled_agent.create_crucible_dataset(zpool.id, dataset.id); + let crucible = + sled_agent.get_crucible_dataset(zpool.id, dataset.id); + crucible + .set_create_callback(Box::new(|_| RegionState::Created)); + + // Nexus side: Notify Nexus of the physical disk/zpool/dataset + // combination that exists. + + match address { + std::net::SocketAddr::V6(addr) => Some(addr), + _ => panic!("Unsupported address type: {address} "), + } + } else { + None }; cptestctx .server - .upsert_crucible_dataset( + .upsert_test_dataset( physical_disk_request.clone(), zpool_request.clone(), dataset.id, + dataset.kind.clone(), address, ) .await; @@ -1365,12 +1386,46 @@ impl<'a, N: NexusServer> DiskTest<'a, N> { } }, &Duration::from_millis(50), - &Duration::from_secs(30), + // TODO: put me back to 30 before merging + &Duration::from_secs(3), ) .await .expect("expected to find inventory collection"); zpools.push(zpool); + + // Configure the Sled to use all datasets + let datasets = zpools + .iter() + .flat_map(|zpool| zpool.datasets.iter().map(|d| (zpool.id, d))) + .map(|(zpool_id, TestDataset { id, kind })| { + ( + *id, + DatasetConfig { + id: *id, + name: DatasetName::new( + ZpoolName::new_external(zpool_id), + kind.clone(), + ), + inner: SharedDatasetConfig::default(), + }, + ) + }) + .collect(); + + // If there was any prior configuration, bump the generation. + // Otherwise, set it for the first time. + let generation = { + if let Ok(config) = sled_agent.datasets_config_list() { + config.generation.next() + } else { + Generation::new() + } + }; + + let dataset_config = DatasetsConfig { generation, datasets }; + let res = sled_agent.datasets_ensure(dataset_config).unwrap(); + assert!(!res.has_error()); } pub async fn set_requested_then_created_callback(&self) { diff --git a/nexus/tests/integration_tests/endpoints.rs b/nexus/tests/integration_tests/endpoints.rs index 466cae17a81..6ed5760b883 100644 --- a/nexus/tests/integration_tests/endpoints.rs +++ b/nexus/tests/integration_tests/endpoints.rs @@ -76,6 +76,11 @@ pub static DEMO_UNINITIALIZED_SLED: Lazy = part: "demo-part".to_string(), }); +pub const SUPPORT_BUNDLES_URL: &'static str = + "/experimental/v1/system/support-bundles"; +pub static SUPPORT_BUNDLE_URL: Lazy = + Lazy::new(|| format!("{SUPPORT_BUNDLES_URL}/{{id}}")); + // Global policy pub const SYSTEM_POLICY_URL: &'static str = "/v1/system/policy"; @@ -1086,6 +1091,8 @@ pub enum AllowedMethod { /// HTTP "PUT" method, with sample input (which should be valid input for /// this endpoint) Put(serde_json::Value), + /// HTTP "HEAD" method + Head, } impl AllowedMethod { @@ -1100,6 +1107,7 @@ impl AllowedMethod { | AllowedMethod::GetWebsocket => &Method::GET, AllowedMethod::Post(_) => &Method::POST, AllowedMethod::Put(_) => &Method::PUT, + AllowedMethod::Head => &Method::HEAD, } } @@ -1114,7 +1122,8 @@ impl AllowedMethod { | AllowedMethod::GetNonexistent | AllowedMethod::GetUnimplemented | AllowedMethod::GetVolatile - | AllowedMethod::GetWebsocket => None, + | AllowedMethod::GetWebsocket + | AllowedMethod::Head => None, AllowedMethod::Post(body) => Some(&body), AllowedMethod::Put(body) => Some(&body), } @@ -2168,6 +2177,25 @@ pub static VERIFY_ENDPOINTS: Lazy> = Lazy::new(|| { 
allowed_methods: vec![AllowedMethod::Get], }, + /* Support Bundles */ + + VerifyEndpoint { + url: &SUPPORT_BUNDLES_URL, + visibility: Visibility::Public, + unprivileged_access: UnprivilegedAccess::None, + allowed_methods: vec![ + AllowedMethod::Get, + AllowedMethod::Post(serde_json::to_value(()).unwrap()) + ], + }, + + VerifyEndpoint { + url: &SUPPORT_BUNDLE_URL, + visibility: Visibility::Protected, + unprivileged_access: UnprivilegedAccess::None, + allowed_methods: vec![AllowedMethod::Get], + }, + /* Updates */ VerifyEndpoint { diff --git a/nexus/tests/integration_tests/unauthorized.rs b/nexus/tests/integration_tests/unauthorized.rs index 6581dcb9e93..abb90ff7706 100644 --- a/nexus/tests/integration_tests/unauthorized.rs +++ b/nexus/tests/integration_tests/unauthorized.rs @@ -18,7 +18,9 @@ use nexus_test_utils::http_testing::AuthnMode; use nexus_test_utils::http_testing::NexusRequest; use nexus_test_utils::http_testing::RequestBuilder; use nexus_test_utils::http_testing::TestResponse; +use nexus_test_utils::resource_helpers::TestDataset; use nexus_test_utils_macros::nexus_test; +use omicron_common::disk::DatasetKind; use omicron_uuid_kinds::DatasetUuid; use omicron_uuid_kinds::ZpoolUuid; use once_cell::sync::Lazy; @@ -57,14 +59,25 @@ type DiskTest<'a> = // 403). #[nexus_test] async fn test_unauthorized(cptestctx: &ControlPlaneTestContext) { + eprintln!("----- test unauthorized started"); let mut disk_test = DiskTest::new(cptestctx).await; + eprintln!("----- test unauthorized finished making new disk test"); let sled_id = cptestctx.first_sled(); disk_test .add_zpool_with_dataset_ext( sled_id, nexus_test_utils::PHYSICAL_DISK_UUID.parse().unwrap(), ZpoolUuid::new_v4(), - DatasetUuid::new_v4(), + vec![ + TestDataset { + id: DatasetUuid::new_v4(), + kind: DatasetKind::Crucible, + }, + TestDataset { + id: DatasetUuid::new_v4(), + kind: DatasetKind::Debug, + }, + ], DiskTest::DEFAULT_ZPOOL_SIZE_GIB, ) .await; @@ -73,6 +86,8 @@ async fn test_unauthorized(cptestctx: &ControlPlaneTestContext) { let log = &cptestctx.logctx.log; let mut setup_results = std::collections::BTreeMap::new(); + eprintln!("----- test unauthorized creating test data"); + // Create test data. info!(log, "setting up resource hierarchy"); for request in &*SETUP_REQUESTS { @@ -105,6 +120,8 @@ async fn test_unauthorized(cptestctx: &ControlPlaneTestContext) { }); } + eprintln!("----- test unauthorized verifying endpoints"); + // Verify the hardcoded endpoints. info!(log, "verifying endpoints"); print!("{}", VERIFY_HEADER); @@ -322,6 +339,12 @@ static SETUP_REQUESTS: Lazy> = Lazy::new(|| { body: serde_json::to_value(&*DEMO_CERTIFICATE_CREATE).unwrap(), id_routes: vec![], }, + // Create a Support Bundle + SetupReq::Post { + url: &SUPPORT_BUNDLES_URL, + body: serde_json::to_value(()).unwrap(), + id_routes: vec![], + }, ] }); diff --git a/sled-agent/src/sim/storage.rs b/sled-agent/src/sim/storage.rs index ed5b99b7ad8..be5deeaa164 100644 --- a/sled-agent/src/sim/storage.rs +++ b/sled-agent/src/sim/storage.rs @@ -1068,6 +1068,11 @@ impl StorageInner { .map(|config| config.name.clone()) .collect(); for dataset in &dataset_names { + // Datasets delegated to zones manage their own storage. 
+ if dataset.dataset().zoned() { + continue; + } + let root = self.root().to_path_buf(); self.nested_datasets.entry(dataset.clone()).or_insert_with(|| { NestedDatasetStorage::new( From 6a2fa5944e3e1a9a01c1434432fa57765a2258b7 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 20 Dec 2024 14:06:12 -0800 Subject: [PATCH 43/61] unauthorized tests working for subset of endpoints --- nexus/test-utils/src/resource_helpers.rs | 3 +- nexus/tests/integration_tests/endpoints.rs | 28 ++++++++++++++++++- nexus/tests/integration_tests/unauthorized.rs | 8 +----- 3 files changed, 29 insertions(+), 10 deletions(-) diff --git a/nexus/test-utils/src/resource_helpers.rs b/nexus/test-utils/src/resource_helpers.rs index 365906f8b2f..5e071c7497f 100644 --- a/nexus/test-utils/src/resource_helpers.rs +++ b/nexus/test-utils/src/resource_helpers.rs @@ -1386,8 +1386,7 @@ impl<'a, N: NexusServer> DiskTest<'a, N> { } }, &Duration::from_millis(50), - // TODO: put me back to 30 before merging - &Duration::from_secs(3), + &Duration::from_secs(30), ) .await .expect("expected to find inventory collection"); diff --git a/nexus/tests/integration_tests/endpoints.rs b/nexus/tests/integration_tests/endpoints.rs index 6ed5760b883..13cd7ddf28d 100644 --- a/nexus/tests/integration_tests/endpoints.rs +++ b/nexus/tests/integration_tests/endpoints.rs @@ -80,6 +80,10 @@ pub const SUPPORT_BUNDLES_URL: &'static str = "/experimental/v1/system/support-bundles"; pub static SUPPORT_BUNDLE_URL: Lazy = Lazy::new(|| format!("{SUPPORT_BUNDLES_URL}/{{id}}")); +pub static SUPPORT_BUNDLE_INDEX_URL: Lazy = + Lazy::new(|| format!("{SUPPORT_BUNDLES_URL}/{{id}}/index")); +pub static SUPPORT_BUNDLE_DOWNLOAD_URL: Lazy = + Lazy::new(|| format!("{SUPPORT_BUNDLES_URL}/{{id}}/download")); // Global policy pub const SYSTEM_POLICY_URL: &'static str = "/v1/system/policy"; @@ -2189,11 +2193,33 @@ pub static VERIFY_ENDPOINTS: Lazy> = Lazy::new(|| { ], }, +// VerifyEndpoint { +// url: &SUPPORT_BUNDLE_INDEX_URL, +// visibility: Visibility::Protected, +// unprivileged_access: UnprivilegedAccess::None, +// allowed_methods: vec![ +// AllowedMethod::Get, +// ], +// }, +// +// VerifyEndpoint { +// url: &SUPPORT_BUNDLE_DOWNLOAD_URL, +// visibility: Visibility::Protected, +// unprivileged_access: UnprivilegedAccess::None, +// allowed_methods: vec![ +// AllowedMethod::Get, +// AllowedMethod::Head, +// ], +// }, + VerifyEndpoint { url: &SUPPORT_BUNDLE_URL, visibility: Visibility::Protected, unprivileged_access: UnprivilegedAccess::None, - allowed_methods: vec![AllowedMethod::Get], + allowed_methods: vec![ + AllowedMethod::Get, + AllowedMethod::Delete, + ], }, /* Updates */ diff --git a/nexus/tests/integration_tests/unauthorized.rs b/nexus/tests/integration_tests/unauthorized.rs index abb90ff7706..bbbbc0df99b 100644 --- a/nexus/tests/integration_tests/unauthorized.rs +++ b/nexus/tests/integration_tests/unauthorized.rs @@ -59,9 +59,7 @@ type DiskTest<'a> = // 403). #[nexus_test] async fn test_unauthorized(cptestctx: &ControlPlaneTestContext) { - eprintln!("----- test unauthorized started"); let mut disk_test = DiskTest::new(cptestctx).await; - eprintln!("----- test unauthorized finished making new disk test"); let sled_id = cptestctx.first_sled(); disk_test .add_zpool_with_dataset_ext( @@ -86,8 +84,6 @@ async fn test_unauthorized(cptestctx: &ControlPlaneTestContext) { let log = &cptestctx.logctx.log; let mut setup_results = std::collections::BTreeMap::new(); - eprintln!("----- test unauthorized creating test data"); - // Create test data. 
info!(log, "setting up resource hierarchy"); for request in &*SETUP_REQUESTS { @@ -120,8 +116,6 @@ async fn test_unauthorized(cptestctx: &ControlPlaneTestContext) { }); } - eprintln!("----- test unauthorized verifying endpoints"); - // Verify the hardcoded endpoints. info!(log, "verifying endpoints"); print!("{}", VERIFY_HEADER); @@ -343,7 +337,7 @@ static SETUP_REQUESTS: Lazy> = Lazy::new(|| { SetupReq::Post { url: &SUPPORT_BUNDLES_URL, body: serde_json::to_value(()).unwrap(), - id_routes: vec![], + id_routes: vec!["/experimental/v1/system/support-bundles/{id}"], }, ] }); From a0e5326b4ebb616960d45d0e23b1ac0e70f5ab9d Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 20 Dec 2024 14:29:08 -0800 Subject: [PATCH 44/61] Update coverage of support bundle endpoints --- nexus/tests/integration_tests/endpoints.rs | 29 +------------------ .../output/uncovered-authz-endpoints.txt | 5 ++++ 2 files changed, 6 insertions(+), 28 deletions(-) diff --git a/nexus/tests/integration_tests/endpoints.rs b/nexus/tests/integration_tests/endpoints.rs index 13cd7ddf28d..656b4ba8266 100644 --- a/nexus/tests/integration_tests/endpoints.rs +++ b/nexus/tests/integration_tests/endpoints.rs @@ -80,10 +80,6 @@ pub const SUPPORT_BUNDLES_URL: &'static str = "/experimental/v1/system/support-bundles"; pub static SUPPORT_BUNDLE_URL: Lazy = Lazy::new(|| format!("{SUPPORT_BUNDLES_URL}/{{id}}")); -pub static SUPPORT_BUNDLE_INDEX_URL: Lazy = - Lazy::new(|| format!("{SUPPORT_BUNDLES_URL}/{{id}}/index")); -pub static SUPPORT_BUNDLE_DOWNLOAD_URL: Lazy = - Lazy::new(|| format!("{SUPPORT_BUNDLES_URL}/{{id}}/download")); // Global policy pub const SYSTEM_POLICY_URL: &'static str = "/v1/system/policy"; @@ -1095,8 +1091,6 @@ pub enum AllowedMethod { /// HTTP "PUT" method, with sample input (which should be valid input for /// this endpoint) Put(serde_json::Value), - /// HTTP "HEAD" method - Head, } impl AllowedMethod { @@ -1111,7 +1105,6 @@ impl AllowedMethod { | AllowedMethod::GetWebsocket => &Method::GET, AllowedMethod::Post(_) => &Method::POST, AllowedMethod::Put(_) => &Method::PUT, - AllowedMethod::Head => &Method::HEAD, } } @@ -1126,8 +1119,7 @@ impl AllowedMethod { | AllowedMethod::GetNonexistent | AllowedMethod::GetUnimplemented | AllowedMethod::GetVolatile - | AllowedMethod::GetWebsocket - | AllowedMethod::Head => None, + | AllowedMethod::GetWebsocket => None, AllowedMethod::Post(body) => Some(&body), AllowedMethod::Put(body) => Some(&body), } @@ -2193,25 +2185,6 @@ pub static VERIFY_ENDPOINTS: Lazy> = Lazy::new(|| { ], }, -// VerifyEndpoint { -// url: &SUPPORT_BUNDLE_INDEX_URL, -// visibility: Visibility::Protected, -// unprivileged_access: UnprivilegedAccess::None, -// allowed_methods: vec![ -// AllowedMethod::Get, -// ], -// }, -// -// VerifyEndpoint { -// url: &SUPPORT_BUNDLE_DOWNLOAD_URL, -// visibility: Visibility::Protected, -// unprivileged_access: UnprivilegedAccess::None, -// allowed_methods: vec![ -// AllowedMethod::Get, -// AllowedMethod::Head, -// ], -// }, - VerifyEndpoint { url: &SUPPORT_BUNDLE_URL, visibility: Visibility::Protected, diff --git a/nexus/tests/output/uncovered-authz-endpoints.txt b/nexus/tests/output/uncovered-authz-endpoints.txt index c5091c5a3bc..a53e94414a6 100644 --- a/nexus/tests/output/uncovered-authz-endpoints.txt +++ b/nexus/tests/output/uncovered-authz-endpoints.txt @@ -2,8 +2,13 @@ API endpoints with no coverage in authz tests: probe_delete (delete "/experimental/v1/probes/{probe}") probe_list (get "/experimental/v1/probes") probe_view (get "/experimental/v1/probes/{probe}") 
+support_bundle_download (get "/experimental/v1/system/support-bundles/{support_bundle}/download") +support_bundle_download_file (get "/experimental/v1/system/support-bundles/{support_bundle}/download/{file}") +support_bundle_index (get "/experimental/v1/system/support-bundles/{support_bundle}/index") ping (get "/v1/ping") networking_switch_port_status (get "/v1/system/hardware/switch-port/{port}/status") +support_bundle_head (head "/experimental/v1/system/support-bundles/{support_bundle}/download") +support_bundle_head_file (head "/experimental/v1/system/support-bundles/{support_bundle}/download/{file}") device_auth_request (post "/device/auth") device_auth_confirm (post "/device/confirm") device_access_token (post "/device/token") From fae39ee8cbddbffaf03c1821269b2dd4a1fb0fe2 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 20 Dec 2024 15:27:21 -0800 Subject: [PATCH 45/61] clippy --- nexus/src/app/sled.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/src/app/sled.rs b/nexus/src/app/sled.rs index 45596175228..a8383b97762 100644 --- a/nexus/src/app/sled.rs +++ b/nexus/src/app/sled.rs @@ -272,7 +272,7 @@ impl super::Nexus { } return Ok(()); } - Err(err) if matches!(err, Error::ObjectNotFound { .. }) => {} + Err(Error::ObjectNotFound { .. }) => {} Err(err) => return Err(err), } From 1506c87e7036d7233b2840b6216c1d2db9bbcdd5 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 6 Jan 2025 10:19:53 -0800 Subject: [PATCH 46/61] merging --- nexus/db-queries/src/db/datastore/support_bundle.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/db-queries/src/db/datastore/support_bundle.rs b/nexus/db-queries/src/db/datastore/support_bundle.rs index 8f96c4fd7c2..d616b4356b7 100644 --- a/nexus/db-queries/src/db/datastore/support_bundle.rs +++ b/nexus/db-queries/src/db/datastore/support_bundle.rs @@ -262,7 +262,7 @@ impl DataStore { .all_omicron_datasets( nexus_types::deployment::BlueprintDatasetFilter::Expunged, ) - .filter_map(|dataset_config| { + .filter_map(|(_sled_id, dataset_config)| { if matches!( dataset_config.kind, omicron_common::api::internal::shared::DatasetKind::Debug From 9d416b19d6765dc8d3c2ae2d28eecea2bfef991d Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 6 Jan 2025 14:32:23 -0800 Subject: [PATCH 47/61] Fix simulated nested datasets, add tests --- sled-agent/src/sim/storage.rs | 252 +++++++++++++++++++++++++++++++++- 1 file changed, 248 insertions(+), 4 deletions(-) diff --git a/sled-agent/src/sim/storage.rs b/sled-agent/src/sim/storage.rs index e0af1ce03d6..e907aaffe19 100644 --- a/sled-agent/src/sim/storage.rs +++ b/sled-agent/src/sim/storage.rs @@ -401,6 +401,11 @@ impl CrucibleDataInner { #[cfg(test)] mod test { use super::*; + use omicron_common::api::external::Generation; + use omicron_common::disk::DatasetConfig; + use omicron_common::disk::DatasetKind; + use omicron_common::disk::DatasetName; + use omicron_common::zpool_name::ZpoolName; use omicron_test_utils::dev::test_setup_log; /// Validate that the simulated Crucible agent reuses ports when regions are @@ -661,6 +666,243 @@ mod test { logctx.cleanup_successful(); } + + #[test] + fn nested_dataset_not_found_missing_dataset() { + let logctx = test_setup_log("nested_dataset_not_found_missing_dataset"); + + let storage = StorageInner::new( + Uuid::new_v4(), + std::net::Ipv4Addr::LOCALHOST.into(), + logctx.log.clone(), + ); + + let zpool_id = ZpoolUuid::new_v4(); + + let err = storage + .nested_dataset_list( + NestedDatasetLocation { + path: String::new(), + root: 
DatasetName::new( + ZpoolName::new_external(zpool_id), + DatasetKind::Debug, + ), + }, + NestedDatasetListOptions::SelfAndChildren, + ) + .expect_err("Nested dataset listing should fail on fake dataset"); + + assert_eq!(err.status_code, 404); + + logctx.cleanup_successful(); + } + + #[test] + fn nested_dataset() { + let logctx = test_setup_log("nested_dataset"); + + let mut storage = StorageInner::new( + Uuid::new_v4(), + std::net::Ipv4Addr::LOCALHOST.into(), + logctx.log.clone(), + ); + + let zpool_id = ZpoolUuid::new_v4(); + let zpool_name = ZpoolName::new_external(zpool_id); + let dataset_id = DatasetUuid::new_v4(); + let dataset_name = DatasetName::new(zpool_name, DatasetKind::Debug); + + let config = DatasetsConfig { + generation: Generation::new(), + datasets: BTreeMap::from([( + dataset_id, + DatasetConfig { + id: dataset_id, + name: dataset_name.clone(), + inner: SharedDatasetConfig::default(), + }, + )]), + }; + + // Create the debug dataset on which we'll store everything else. + let result = storage.datasets_ensure(config).unwrap(); + assert!(!result.has_error()); + + // The list of nested datasets should only contain the root dataset. + let nested_datasets = storage + .nested_dataset_list( + NestedDatasetLocation { + path: String::new(), + root: dataset_name.clone(), + }, + NestedDatasetListOptions::SelfAndChildren, + ) + .unwrap(); + assert_eq!( + nested_datasets, + vec![NestedDatasetConfig { + name: NestedDatasetLocation { + path: String::new(), + root: dataset_name.clone(), + }, + inner: SharedDatasetConfig::default(), + }] + ); + + // Or, if we're requesting children explicitly, it should be empty. + let nested_dataset_root = NestedDatasetLocation { + path: String::new(), + root: dataset_name.clone(), + }; + + let nested_datasets = storage + .nested_dataset_list( + nested_dataset_root.clone(), + NestedDatasetListOptions::ChildrenOnly, + ) + .unwrap(); + assert_eq!(nested_datasets, vec![]); + + // We can request a nested dataset explicitly. 
+ let foo_config = NestedDatasetConfig { + name: NestedDatasetLocation { + path: "foo".into(), + root: dataset_name.clone(), + }, + inner: SharedDatasetConfig::default(), + }; + storage.nested_dataset_ensure(foo_config.clone()).unwrap(); + let foobar_config = NestedDatasetConfig { + name: NestedDatasetLocation { + path: "foo/bar".into(), + root: dataset_name.clone(), + }, + inner: SharedDatasetConfig::default(), + }; + storage.nested_dataset_ensure(foobar_config.clone()).unwrap(); + + // We can observe the nested datasets we just created + let nested_datasets = storage + .nested_dataset_list( + nested_dataset_root.clone(), + NestedDatasetListOptions::ChildrenOnly, + ) + .unwrap(); + assert_eq!(nested_datasets, vec![foo_config.clone(),]); + + let nested_datasets = storage + .nested_dataset_list( + foo_config.name.clone(), + NestedDatasetListOptions::ChildrenOnly, + ) + .unwrap(); + assert_eq!(nested_datasets, vec![foobar_config.clone(),]); + + // We can destroy nested datasets too + storage.nested_dataset_destroy(foobar_config.name.clone()).unwrap(); + storage.nested_dataset_destroy(foo_config.name.clone()).unwrap(); + + logctx.cleanup_successful(); + } + + #[test] + fn nested_dataset_child_parent_relationship() { + let logctx = test_setup_log("nested_dataset_child_parent_relationship"); + + let mut storage = StorageInner::new( + Uuid::new_v4(), + std::net::Ipv4Addr::LOCALHOST.into(), + logctx.log.clone(), + ); + + let zpool_id = ZpoolUuid::new_v4(); + let zpool_name = ZpoolName::new_external(zpool_id); + let dataset_id = DatasetUuid::new_v4(); + let dataset_name = DatasetName::new(zpool_name, DatasetKind::Debug); + + let config = DatasetsConfig { + generation: Generation::new(), + datasets: BTreeMap::from([( + dataset_id, + DatasetConfig { + id: dataset_id, + name: dataset_name.clone(), + inner: SharedDatasetConfig::default(), + }, + )]), + }; + + // Create the debug dataset on which we'll store everything else. + let result = storage.datasets_ensure(config).unwrap(); + assert!(!result.has_error()); + let nested_dataset_root = NestedDatasetLocation { + path: String::new(), + root: dataset_name.clone(), + }; + let nested_datasets = storage + .nested_dataset_list( + nested_dataset_root.clone(), + NestedDatasetListOptions::ChildrenOnly, + ) + .unwrap(); + assert_eq!(nested_datasets, vec![]); + + // If we try to create a nested dataset "foo/bar" before the parent + // "foo", we expect an error. + + let foo_config = NestedDatasetConfig { + name: NestedDatasetLocation { + path: "foo".into(), + root: dataset_name.clone(), + }, + inner: SharedDatasetConfig::default(), + }; + let foobar_config = NestedDatasetConfig { + name: NestedDatasetLocation { + path: "foo/bar".into(), + root: dataset_name.clone(), + }, + inner: SharedDatasetConfig::default(), + }; + + let err = storage + .nested_dataset_ensure(foobar_config.clone()) + .expect_err("Should have failed to provision foo/bar before foo"); + assert_eq!(err.status_code, 404); + + // Try again, but creating them successfully this time. 
+ storage.nested_dataset_ensure(foo_config.clone()).unwrap(); + storage.nested_dataset_ensure(foobar_config.clone()).unwrap(); + + // We can observe the nested datasets we just created + let nested_datasets = storage + .nested_dataset_list( + nested_dataset_root.clone(), + NestedDatasetListOptions::ChildrenOnly, + ) + .unwrap(); + assert_eq!(nested_datasets, vec![foo_config.clone(),]); + let nested_datasets = storage + .nested_dataset_list( + foo_config.name.clone(), + NestedDatasetListOptions::ChildrenOnly, + ) + .unwrap(); + assert_eq!(nested_datasets, vec![foobar_config.clone(),]); + + // Destroying the nested dataset parent should destroy children. + storage.nested_dataset_destroy(foo_config.name.clone()).unwrap(); + + let nested_datasets = storage + .nested_dataset_list( + nested_dataset_root.clone(), + NestedDatasetListOptions::ChildrenOnly, + ) + .unwrap(); + assert_eq!(nested_datasets, vec![]); + + logctx.cleanup_successful(); + } } /// Represents a running Crucible Agent. Contains regions. @@ -1153,14 +1395,15 @@ impl StorageInner { )); }; - for path_component in nested_path.split('/') { + let mut path_components = nested_path.split('/').peekable(); + while let Some(path_component) = path_components.next() { if path_component.is_empty() { continue; } // Final component of path -- insert it here if it doesn't exist // already. - if !path_component.contains('/') { + if path_components.peek().is_none() { let entry = nested_dataset.children.entry(path_component.to_string()); entry @@ -1206,13 +1449,14 @@ impl StorageInner { )); }; - for path_component in name.path.split('/') { + let mut path_components = name.path.split('/').peekable(); + while let Some(path_component) = path_components.next() { if path_component.is_empty() { continue; } // Final component of path -- remove it if it exists. - if !path_component.contains('/') { + if path_components.peek().is_none() { if nested_dataset.children.remove(path_component).is_none() { return Err(HttpError::for_not_found( None, From 917759d2127d65a2fec3f092c8c9fd60ca69c0f4 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 6 Jan 2025 15:52:17 -0800 Subject: [PATCH 48/61] config.test.toml --- nexus/tests/config.test.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/nexus/tests/config.test.toml b/nexus/tests/config.test.toml index 247caafcb5b..1ef2f908e47 100644 --- a/nexus/tests/config.test.toml +++ b/nexus/tests/config.test.toml @@ -105,6 +105,7 @@ physical_disk_adoption.period_secs = 30 # Disable automatic disk adoption to avoid interfering with tests. physical_disk_adoption.disable = true support_bundle_collector.period_secs = 30 +# Disable automatic support bundle collection to avoid interfering with tests. support_bundle_collector.disable = true decommissioned_disk_cleaner.period_secs = 60 # Disable disk decommissioning cleanup to avoid interfering with tests. 
From 7d166f18f0177c3a129ddfdaec78b700eb03efd5 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 6 Jan 2025 15:54:02 -0800 Subject: [PATCH 49/61] Error logging --- nexus/src/app/background/tasks/support_bundle_collector.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 5a8a114a3e5..0c7003b7732 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -196,6 +196,11 @@ impl SupportBundleCollector { ); return Ok(DatabaseBundleCleanupResult::BadState); } else { + warn!( + &opctx.log, + "Could not delete 'failing' bundle"; + "err" => ?err, + ); anyhow::bail!( "Could not delete 'failing' bundle: {:#}", err From bcf6f8cca3d5084cd279faf208e550764862bb14 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 6 Jan 2025 16:02:31 -0800 Subject: [PATCH 50/61] Propagate unexpected errors during bundle activation --- .../tasks/support_bundle_collector.rs | 24 +++++++++++++------ 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 0c7003b7732..00a92a696ea 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -386,13 +386,23 @@ impl SupportBundleCollector { ) .await { - info!( - &opctx.log, - "SupportBundleCollector: Concurrent state change activating bundle"; - "bundle" => %bundle.id, - "err" => ?err, - ); - return Ok(Some(report)); + if matches!(err, Error::InvalidRequest { .. }) { + info!( + &opctx.log, + "SupportBundleCollector: Concurrent state change activating bundle"; + "bundle" => %bundle.id, + "err" => ?err, + ); + return Ok(Some(report)); + } else { + warn!( + &opctx.log, + "SupportBundleCollector: Unexpected error activating bundle"; + "bundle" => %bundle.id, + "err" => ?err, + ); + anyhow::bail!("failed to activate bundle: {:#}", err); + } } report.activated_in_db_ok = true; Ok(Some(report)) From 76be221b23776a32077204c3d1b95e15e65d3366 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 6 Jan 2025 16:03:06 -0800 Subject: [PATCH 51/61] remove printlns --- nexus/src/app/background/tasks/support_bundle_collector.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 00a92a696ea..2479f1630c3 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -272,8 +272,6 @@ impl SupportBundleCollector { .zpool_get_sled_if_in_service(&opctx, bundle.zpool_id.into()) .await; - println!("zpool_get_sled_if_in_service result: {result:?}"); - let delete_from_db = match result { Ok(sled_id) => { match self @@ -429,7 +427,6 @@ impl<'a> BundleCollection<'a> { // as it's being collected. 
let dir = tempdir()?; - println!("created tempdir, starting collection"); let mut collection = Box::pin(self.collect_bundle_as_file(&dir)); // We periodically check the state of the support bundle - if a user From d23826e0bd9cbbbc945ab3c95ee834b5170df0f9 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 6 Jan 2025 16:24:03 -0800 Subject: [PATCH 52/61] Use BufReader for reading entries --- .../tasks/support_bundle_collector.rs | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 2479f1630c3..124ff274799 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -32,6 +32,8 @@ use omicron_uuid_kinds::ZpoolUuid; use serde::Serialize; use serde_json::json; use sha2::{Digest, Sha256}; +use std::io::BufRead; +use std::io::BufReader; use std::io::Write; use std::sync::Arc; use tokio::io::AsyncReadExt; @@ -442,7 +444,7 @@ impl<'a> BundleCollection<'a> { tokio::select! { // Timer fired mid-collection - let's check if we should stop. _ = yield_interval.tick() => { - info!( + trace!( &self.log, "Checking if Bundle Collection cancelled"; "bundle" => %self.bundle.id @@ -697,8 +699,17 @@ fn recursively_add_directory_to_zipfile( let src = entry.path(); zip.start_file_from_path(dst, opts)?; - let buf = std::fs::read(&src)?; - zip.write_all(&buf)?; + let mut reader = BufReader::new(std::fs::File::open(&src)?); + + loop { + let buf = reader.fill_buf()?; + let len = buf.len(); + if len == 0 { + break; + } + zip.write_all(&buf)?; + reader.consume(len); + } } if file_type.is_dir() { let opts = FullFileOptions::default(); From b73f3eebebf7a6718ad8de469a24007250272125 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 6 Jan 2025 18:03:03 -0800 Subject: [PATCH 53/61] fixing tests --- nexus/test-utils/src/resource_helpers.rs | 14 ++++++++++++++ nexus/tests/integration_tests/unauthorized.rs | 1 + 2 files changed, 15 insertions(+) diff --git a/nexus/test-utils/src/resource_helpers.rs b/nexus/test-utils/src/resource_helpers.rs index 5e071c7497f..7b078464bff 100644 --- a/nexus/test-utils/src/resource_helpers.rs +++ b/nexus/test-utils/src/resource_helpers.rs @@ -1191,6 +1191,7 @@ impl<'a, N: NexusServer> DiskTest<'a, N> { kind: DatasetKind::Crucible, }], Self::DEFAULT_ZPOOL_SIZE_GIB, + false, ) .await; } @@ -1207,6 +1208,7 @@ impl<'a, N: NexusServer> DiskTest<'a, N> { kind: DatasetKind::Crucible, }], Self::DEFAULT_ZPOOL_SIZE_GIB, + false, ) .await } @@ -1240,6 +1242,7 @@ impl<'a, N: NexusServer> DiskTest<'a, N> { zpool_id: ZpoolUuid, datasets: Vec, gibibytes: u32, + ensure_datasets: bool, ) { let cptestctx = self.cptestctx; @@ -1391,6 +1394,17 @@ impl<'a, N: NexusServer> DiskTest<'a, N> { .await .expect("expected to find inventory collection"); + // TODO: This is gross. I'd really like to clean this up before merging. + // + // - Blueprint tests don't want this, they want to control the generation + // number passed to sled agents explicitly. + // - The Unauthorized test DOES want it, so that it can create a debug + // dataset, that also can get support bundles. But there's got to be a + // better way. 
+ if !ensure_datasets { + return; + } + zpools.push(zpool); // Configure the Sled to use all datasets diff --git a/nexus/tests/integration_tests/unauthorized.rs b/nexus/tests/integration_tests/unauthorized.rs index bbbbc0df99b..45a6af4af55 100644 --- a/nexus/tests/integration_tests/unauthorized.rs +++ b/nexus/tests/integration_tests/unauthorized.rs @@ -77,6 +77,7 @@ async fn test_unauthorized(cptestctx: &ControlPlaneTestContext) { }, ], DiskTest::DEFAULT_ZPOOL_SIZE_GIB, + true, ) .await; From 608f1b23399cc6021cde3919bd0260b6a49066ed Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 7 Jan 2025 09:49:44 -0800 Subject: [PATCH 54/61] ordering matters --- nexus/test-utils/src/resource_helpers.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nexus/test-utils/src/resource_helpers.rs b/nexus/test-utils/src/resource_helpers.rs index 7b078464bff..927c1a5818b 100644 --- a/nexus/test-utils/src/resource_helpers.rs +++ b/nexus/test-utils/src/resource_helpers.rs @@ -1394,6 +1394,8 @@ impl<'a, N: NexusServer> DiskTest<'a, N> { .await .expect("expected to find inventory collection"); + zpools.push(zpool); + // TODO: This is gross. I'd really like to clean this up before merging. // // - Blueprint tests don't want this, they want to control the generation @@ -1405,8 +1407,6 @@ impl<'a, N: NexusServer> DiskTest<'a, N> { return; } - zpools.push(zpool); - // Configure the Sled to use all datasets let datasets = zpools .iter() From 7d7d7fd5250c478495bf9aca043997b46cc5a8ac Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 7 Jan 2025 12:08:19 -0800 Subject: [PATCH 55/61] cleanup --- nexus/test-utils/src/resource_helpers.rs | 108 ++++++++++-------- nexus/tests/integration_tests/unauthorized.rs | 2 +- 2 files changed, 62 insertions(+), 48 deletions(-) diff --git a/nexus/test-utils/src/resource_helpers.rs b/nexus/test-utils/src/resource_helpers.rs index 927c1a5818b..4b535164bc8 100644 --- a/nexus/test-utils/src/resource_helpers.rs +++ b/nexus/test-utils/src/resource_helpers.rs @@ -1191,7 +1191,6 @@ impl<'a, N: NexusServer> DiskTest<'a, N> { kind: DatasetKind::Crucible, }], Self::DEFAULT_ZPOOL_SIZE_GIB, - false, ) .await; } @@ -1208,11 +1207,71 @@ impl<'a, N: NexusServer> DiskTest<'a, N> { kind: DatasetKind::Crucible, }], Self::DEFAULT_ZPOOL_SIZE_GIB, - false, ) .await } + // Propagate the dataset configuration to all Sled Agents. + // + // # Panics + // + // This function will panic if any of the Sled Agents have already + // applied dataset configuration. + // + // TODO: Ideally, we should do the following: + // 1. Also call a similar method to invoke the "omicron_physical_disks_ensure" API. Right now, + // we aren't calling this at all for the simulated sled agent, which only works because + // the simulated sled agent simply treats this as a stored config, rather than processing it + // to actually provide a different view of storage. + // 2. Re-work the DiskTestBuilder API to automatically deploy the "disks + datasets" config + // to sled agents exactly once. Adding new zpools / datasets after the DiskTest has been + // started will need to also make a decision about re-deploying this configuration. + pub async fn propagate_datasets_to_sleds(&mut self) { + let cptestctx = self.cptestctx; + + for (sled_id, PerSledDiskState { zpools }) in &self.sleds { + // Grab the "SledAgent" object -- we'll be contacting it shortly. 
+ let sleds = cptestctx.all_sled_agents(); + let sled_agent = sleds + .into_iter() + .find_map(|server| { + if server.sled_agent.id == *sled_id { + Some(server.sled_agent.clone()) + } else { + None + } + }) + .expect("Cannot find sled"); + + // Configure the Sled to use all datasets we created + let datasets = zpools + .iter() + .flat_map(|zpool| zpool.datasets.iter().map(|d| (zpool.id, d))) + .map(|(zpool_id, TestDataset { id, kind })| { + ( + *id, + DatasetConfig { + id: *id, + name: DatasetName::new( + ZpoolName::new_external(zpool_id), + kind.clone(), + ), + inner: SharedDatasetConfig::default(), + }, + ) + }) + .collect(); + + let generation = Generation::new(); + let dataset_config = DatasetsConfig { generation, datasets }; + let res = sled_agent.datasets_ensure(dataset_config).expect( + "Should have been able to ensure datasets, but could not. + Did someone else already attempt to ensure datasets?", + ); + assert!(!res.has_error()); + } + } + fn get_sled(&self, sled_id: SledUuid) -> Arc { let sleds = self.cptestctx.all_sled_agents(); sleds @@ -1242,7 +1301,6 @@ impl<'a, N: NexusServer> DiskTest<'a, N> { zpool_id: ZpoolUuid, datasets: Vec, gibibytes: u32, - ensure_datasets: bool, ) { let cptestctx = self.cptestctx; @@ -1395,50 +1453,6 @@ impl<'a, N: NexusServer> DiskTest<'a, N> { .expect("expected to find inventory collection"); zpools.push(zpool); - - // TODO: This is gross. I'd really like to clean this up before merging. - // - // - Blueprint tests don't want this, they want to control the generation - // number passed to sled agents explicitly. - // - The Unauthorized test DOES want it, so that it can create a debug - // dataset, that also can get support bundles. But there's got to be a - // better way. - if !ensure_datasets { - return; - } - - // Configure the Sled to use all datasets - let datasets = zpools - .iter() - .flat_map(|zpool| zpool.datasets.iter().map(|d| (zpool.id, d))) - .map(|(zpool_id, TestDataset { id, kind })| { - ( - *id, - DatasetConfig { - id: *id, - name: DatasetName::new( - ZpoolName::new_external(zpool_id), - kind.clone(), - ), - inner: SharedDatasetConfig::default(), - }, - ) - }) - .collect(); - - // If there was any prior configuration, bump the generation. - // Otherwise, set it for the first time. 
-        let generation = {
-            if let Ok(config) = sled_agent.datasets_config_list() {
-                config.generation.next()
-            } else {
-                Generation::new()
-            }
-        };
-
-        let dataset_config = DatasetsConfig { generation, datasets };
-        let res = sled_agent.datasets_ensure(dataset_config).unwrap();
-        assert!(!res.has_error());
     }
 
     pub async fn set_requested_then_created_callback(&self) {
diff --git a/nexus/tests/integration_tests/unauthorized.rs b/nexus/tests/integration_tests/unauthorized.rs
index 45a6af4af55..785277f230c 100644
--- a/nexus/tests/integration_tests/unauthorized.rs
+++ b/nexus/tests/integration_tests/unauthorized.rs
@@ -77,9 +77,9 @@ async fn test_unauthorized(cptestctx: &ControlPlaneTestContext) {
             },
         ],
         DiskTest::DEFAULT_ZPOOL_SIZE_GIB,
-        true,
     )
     .await;
+    disk_test.propagate_datasets_to_sleds().await;
 
     let client = &cptestctx.external_client;
     let log = &cptestctx.logctx.log;

From 3fec97e387200c4d24feec67188173bdd1f36ee5 Mon Sep 17 00:00:00 2001
From: Sean Klein
Date: Wed, 8 Jan 2025 13:06:27 -0800
Subject: [PATCH 56/61] Integration tests

---
 nexus/src/app/background/mod.rs               |   7 +
 .../tasks/support_bundle_collector.rs         |  25 +-
 nexus/src/app/test_interfaces.rs              |   2 +
 nexus/test-utils/src/http_testing.rs          |  17 +-
 nexus/tests/config.test.toml                  |   6 +-
 nexus/tests/integration_tests/mod.rs          |   1 +
 .../integration_tests/support_bundles.rs     | 489 ++++++++++++++++++
 nexus/types/src/external_api/shared.rs        |   4 +-
 8 files changed, 534 insertions(+), 17 deletions(-)
 create mode 100644 nexus/tests/integration_tests/support_bundles.rs

diff --git a/nexus/src/app/background/mod.rs b/nexus/src/app/background/mod.rs
index 5b24907b0f4..3241e7c7cae 100644
--- a/nexus/src/app/background/mod.rs
+++ b/nexus/src/app/background/mod.rs
@@ -140,6 +140,13 @@ pub use init::BackgroundTasksData;
 pub use init::BackgroundTasksInitializer;
 pub use tasks::saga_recovery::SagaRecoveryHelpers;
 
+// Expose background task outputs so they can be deserialized and
+// observed.
+pub mod task_output {
+    pub use super::tasks::support_bundle_collector::CleanupReport;
+    pub use super::tasks::support_bundle_collector::CollectionReport;
+}
+
 use futures::future::BoxFuture;
 use nexus_auth::context::OpContext;
 
diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs
index d3ac1c8c6d2..8f0de3d49bd 100644
--- a/nexus/src/app/background/tasks/support_bundle_collector.rs
+++ b/nexus/src/app/background/tasks/support_bundle_collector.rs
@@ -31,6 +31,7 @@ use omicron_uuid_kinds::OmicronZoneUuid;
 use omicron_uuid_kinds::SledUuid;
 use omicron_uuid_kinds::SupportBundleUuid;
 use omicron_uuid_kinds::ZpoolUuid;
+use serde::Deserialize;
 use serde::Serialize;
 use serde_json::json;
 use sha2::{Digest, Sha256};
@@ -60,16 +61,16 @@ struct BundleRequest {
 }
 
 // Describes what happened while attempting to clean up Support Bundles.
-#[derive(Debug, Default, Serialize, Eq, PartialEq)]
-struct CleanupReport {
+#[derive(Debug, Default, Deserialize, Serialize, Eq, PartialEq)]
+pub struct CleanupReport {
     // Responses from Sled Agents
-    sled_bundles_deleted_ok: usize,
-    sled_bundles_deleted_not_found: usize,
-    sled_bundles_delete_failed: usize,
+    pub sled_bundles_deleted_ok: usize,
+    pub sled_bundles_deleted_not_found: usize,
+    pub sled_bundles_delete_failed: usize,
 
     // Results from updating our database records
-    db_destroying_bundles_removed: usize,
-    db_failing_bundles_updated: usize,
+    pub db_destroying_bundles_removed: usize,
+    pub db_failing_bundles_updated: usize,
 }
 
 // Result of asking a sled agent to clean up a bundle
@@ -750,15 +751,15 @@ async fn sha2_hash(file: &mut tokio::fs::File) -> anyhow::Result {
 // Identifies what we could or could not store within this support bundle.
 //
 // This struct will get emitted as part of the background task infrastructure.
-#[derive(Debug, Serialize, PartialEq, Eq)]
-struct CollectionReport {
-    bundle: SupportBundleUuid,
+#[derive(Debug, Deserialize, Serialize, PartialEq, Eq)]
+pub struct CollectionReport {
+    pub bundle: SupportBundleUuid,
 
     // True iff we could list in-service sleds
-    listed_in_service_sleds: bool,
+    pub listed_in_service_sleds: bool,
 
     // True iff the bundle was successfully made 'active' in the database.
-    activated_in_db_ok: bool,
+    pub activated_in_db_ok: bool,
 }
 
 impl CollectionReport {
diff --git a/nexus/src/app/test_interfaces.rs b/nexus/src/app/test_interfaces.rs
index 9852225e8cc..38d9b17a8de 100644
--- a/nexus/src/app/test_interfaces.rs
+++ b/nexus/src/app/test_interfaces.rs
@@ -18,6 +18,8 @@ pub use super::update::SpUpdater;
 pub use super::update::UpdateProgress;
 pub use gateway_client::types::SpType;
 
+pub use crate::app::background::task_output;
+
 /// The information needed to talk to a sled agent about an instance that is
 /// active on that sled.
 pub struct InstanceSledAgentInfo {
diff --git a/nexus/test-utils/src/http_testing.rs b/nexus/test-utils/src/http_testing.rs
index afa0a8af034..aff17b541f9 100644
--- a/nexus/test-utils/src/http_testing.rs
+++ b/nexus/test-utils/src/http_testing.rs
@@ -246,6 +246,20 @@ impl<'a> RequestBuilder<'a> {
         self
     }
 
+    /// Tells the request to expect headers related to range requests
+    pub fn expect_range_requestable(mut self) -> Self {
+        self.allowed_headers.as_mut().unwrap().extend([
+            http::header::CONTENT_LENGTH,
+            http::header::CONTENT_TYPE,
+            http::header::ACCEPT_RANGES,
+        ]);
+        self.expect_response_header(
+            http::header::CONTENT_TYPE,
+            "application/zip",
+        )
+        .expect_response_header(http::header::ACCEPT_RANGES, "bytes")
+    }
+
     /// Tells the request to initiate and expect a WebSocket upgrade handshake.
     /// This also sets the request method to GET.
pub fn expect_websocket_handshake(mut self) -> Self { @@ -306,7 +320,7 @@ impl<'a> RequestBuilder<'a> { } let mut builder = - http::Request::builder().method(self.method).uri(self.uri); + http::Request::builder().method(self.method.clone()).uri(self.uri); for (header_name, header_value) in &self.headers { builder = builder.header(header_name, header_value); } @@ -452,6 +466,7 @@ impl<'a> RequestBuilder<'a> { }; if (status.is_client_error() || status.is_server_error()) && !self.allow_non_dropshot_errors + && self.method != http::Method::HEAD { let error_body = test_response .parsed_body::() diff --git a/nexus/tests/config.test.toml b/nexus/tests/config.test.toml index 1ef2f908e47..c7e25081cf6 100644 --- a/nexus/tests/config.test.toml +++ b/nexus/tests/config.test.toml @@ -104,9 +104,9 @@ phantom_disks.period_secs = 30 physical_disk_adoption.period_secs = 30 # Disable automatic disk adoption to avoid interfering with tests. physical_disk_adoption.disable = true -support_bundle_collector.period_secs = 30 -# Disable automatic support bundle collection to avoid interfering with tests. -support_bundle_collector.disable = true +# Set this to a value that's so high as to not fire during tests unless +# explicitly requested. +support_bundle_collector.period_secs = 999999 decommissioned_disk_cleaner.period_secs = 60 # Disable disk decommissioning cleanup to avoid interfering with tests. decommissioned_disk_cleaner.disable = true diff --git a/nexus/tests/integration_tests/mod.rs b/nexus/tests/integration_tests/mod.rs index dfcea796076..dc404736cd1 100644 --- a/nexus/tests/integration_tests/mod.rs +++ b/nexus/tests/integration_tests/mod.rs @@ -43,6 +43,7 @@ mod snapshots; mod sp_updater; mod ssh_keys; mod subnet_allocation; +mod support_bundles; mod switch_port; mod unauthorized; mod unauthorized_coverage; diff --git a/nexus/tests/integration_tests/support_bundles.rs b/nexus/tests/integration_tests/support_bundles.rs new file mode 100644 index 00000000000..5c1c036a86b --- /dev/null +++ b/nexus/tests/integration_tests/support_bundles.rs @@ -0,0 +1,489 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Tests Support Bundles + +use anyhow::Context; +use anyhow::Result; +use dropshot::test_util::ClientTestContext; +use dropshot::HttpErrorResponseBody; +use http::method::Method; +use http::StatusCode; +use nexus_client::types::LastResult; +use nexus_test_utils::http_testing::AuthnMode; +use nexus_test_utils::http_testing::NexusRequest; +use nexus_test_utils::http_testing::RequestBuilder; +use nexus_test_utils::resource_helpers::TestDataset; +use nexus_test_utils_macros::nexus_test; +use nexus_types::external_api::shared::SupportBundleInfo; +use nexus_types::external_api::shared::SupportBundleState; +use omicron_common::api::internal::shared::DatasetKind; +use omicron_nexus::app::test_interfaces::task_output; +use omicron_uuid_kinds::{DatasetUuid, SupportBundleUuid, ZpoolUuid}; +use serde::Deserialize; +use std::io::Cursor; +use zip::read::ZipArchive; + +type ControlPlaneTestContext = + nexus_test_utils::ControlPlaneTestContext; +type DiskTest<'a> = + nexus_test_utils::resource_helpers::DiskTest<'a, omicron_nexus::Server>; + +// -- HTTP methods -- +// +// The following are a set of helper functions to access Support Bundle APIs +// through the public interface. 
+ +const BUNDLES_URL: &str = "/experimental/v1/system/support-bundles"; + +async fn expect_not_found( + client: &ClientTestContext, + bundle_id: SupportBundleUuid, + bundle_url: &str, + method: Method, +) -> Result<()> { + let response = NexusRequest::expect_failure( + client, + StatusCode::NOT_FOUND, + method.clone(), + &bundle_url, + ) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .context("Failed to execute request and get response")?; + + // HEAD requests should not return bodies + if method == Method::HEAD { + return Ok(()); + } + + let error: HttpErrorResponseBody = + response.parsed_body().context("Failed to parse response")?; + + let expected = + format!("not found: support-bundle with id \"{}\"", bundle_id); + if error.message != expected { + anyhow::bail!( + "Unexpected error: {} (wanted {})", + error.message, + expected + ); + } + Ok(()) +} + +async fn bundles_list( + client: &ClientTestContext, +) -> Result> { + Ok(NexusRequest::iter_collection_authn(client, BUNDLES_URL, "", None) + .await + .context("failed to list bundles")? + .all_items) +} + +async fn bundle_get( + client: &ClientTestContext, + id: SupportBundleUuid, +) -> Result { + let url = format!("{BUNDLES_URL}/{id}"); + NexusRequest::object_get(client, &url) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .with_context(|| format!("failed to make \"GET\" request to {url}"))? + .parsed_body() +} + +async fn bundle_get_expect_fail( + client: &ClientTestContext, + id: SupportBundleUuid, + expected_status: StatusCode, + expected_message: &str, +) -> Result<()> { + let url = format!("{BUNDLES_URL}/{id}"); + let error = NexusRequest::new( + RequestBuilder::new(client, Method::GET, &url) + .expect_status(Some(expected_status)), + ) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .context("should have failed to GET bundle")? + .parsed_body::() + .context("failed to response error from bundle GET")?; + + if error.message != expected_message { + anyhow::bail!( + "Unexpected error: {} (wanted {})", + error.message, + expected_message + ); + } + Ok(()) +} + +async fn bundle_delete( + client: &ClientTestContext, + id: SupportBundleUuid, +) -> Result<()> { + let url = format!("{BUNDLES_URL}/{id}"); + NexusRequest::object_delete(client, &url) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .with_context(|| { + format!("failed to make \"DELETE\" request to {url}") + })?; + Ok(()) +} + +async fn bundle_create( + client: &ClientTestContext, +) -> Result { + NexusRequest::new( + RequestBuilder::new(client, Method::POST, BUNDLES_URL) + .expect_status(Some(StatusCode::CREATED)), + ) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .context("failed to request bundle creation")? + .parsed_body() + .context("failed to parse 'create bundle' response") +} + +async fn bundle_create_expect_fail( + client: &ClientTestContext, + expected_status: StatusCode, + expected_message: &str, +) -> Result<()> { + let error = NexusRequest::new( + RequestBuilder::new(client, Method::POST, BUNDLES_URL) + .expect_status(Some(expected_status)), + ) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .context("should have failed to create bundle")? 
+ .parsed_body::() + .context("failed to response error from bundle creation")?; + + if error.message != expected_message { + anyhow::bail!( + "Unexpected error: {} (wanted {})", + error.message, + expected_message + ); + } + Ok(()) +} + +async fn bundle_download( + client: &ClientTestContext, + id: SupportBundleUuid, +) -> Result { + let url = format!("{BUNDLES_URL}/{id}/download"); + let body = NexusRequest::new( + RequestBuilder::new(client, Method::GET, &url) + .expect_status(Some(StatusCode::OK)) + .expect_range_requestable(), + ) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .context("failed to request bundle download")? + .body; + + Ok(body) +} + +async fn bundle_download_expect_fail( + client: &ClientTestContext, + id: SupportBundleUuid, + expected_status: StatusCode, + expected_message: &str, +) -> Result<()> { + let url = format!("{BUNDLES_URL}/{id}/download"); + let error = NexusRequest::new( + RequestBuilder::new(client, Method::GET, &url) + .expect_status(Some(expected_status)), + ) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .context("failed to request bundle download")? + .parsed_body::() + .context("failed to response error from bundle download")?; + + if error.message != expected_message { + anyhow::bail!( + "Unexpected error: {} (wanted {})", + error.message, + expected_message + ); + } + Ok(()) +} + +// -- Background Task -- +// +// The following logic helps us trigger and observe the output of the support +// bundle background task. + +#[derive(Deserialize)] +struct TaskOutput { + cleanup_err: Option, + collection_err: Option, + cleanup_report: Option, + collection_report: Option, +} + +async fn activate_bundle_collection_background_task( + cptestctx: &ControlPlaneTestContext, +) -> TaskOutput { + use nexus_test_utils::background::activate_background_task; + + let task = activate_background_task( + &cptestctx.internal_client, + "support_bundle_collector", + ) + .await; + + let LastResult::Completed(result) = task.last else { + panic!("Task did not complete"); + }; + serde_json::from_value(result.details).expect( + "Should have been able to deserialize TaskOutput from background task", + ) +} + +// Test accessing support bundle interfaces when the bundle does not exist, +// and when no U.2s exist on which to store support bundles. 
+#[nexus_test] +async fn test_support_bundle_not_found(cptestctx: &ControlPlaneTestContext) { + let client = &cptestctx.external_client; + + let id = SupportBundleUuid::new_v4(); + + expect_not_found( + &client, + id, + &format!("{BUNDLES_URL}/{id}"), + Method::DELETE, + ) + .await + .unwrap(); + + expect_not_found(&client, id, &format!("{BUNDLES_URL}/{id}"), Method::GET) + .await + .unwrap(); + + expect_not_found( + &client, + id, + &format!("{BUNDLES_URL}/{id}/download"), + Method::GET, + ) + .await + .unwrap(); + + expect_not_found( + &client, + id, + &format!("{BUNDLES_URL}/{id}/download/single-file"), + Method::GET, + ) + .await + .unwrap(); + + expect_not_found( + &client, + id, + &format!("{BUNDLES_URL}/{id}/download"), + Method::HEAD, + ) + .await + .unwrap(); + + expect_not_found( + &client, + id, + &format!("{BUNDLES_URL}/{id}/download/single-file"), + Method::HEAD, + ) + .await + .unwrap(); + + expect_not_found( + &client, + id, + &format!("{BUNDLES_URL}/{id}/download/index"), + Method::HEAD, + ) + .await + .unwrap(); + + assert!(bundles_list(&client).await.unwrap().is_empty()); + + bundle_create_expect_fail( + &client, + StatusCode::INSUFFICIENT_STORAGE, + "Insufficient capacity: Current policy limits support bundle creation to 'one per external disk', and no disks are available. You must delete old support bundles before new ones can be created", + ).await.unwrap(); +} + +// Test the create, read, and deletion operations on a bundle. +#[nexus_test] +async fn test_support_bundle_lifecycle(cptestctx: &ControlPlaneTestContext) { + let client = &cptestctx.external_client; + + let mut disk_test = DiskTest::new(&cptestctx).await; + + // We really care about just "getting a debug dataset" here. + let sled_id = cptestctx.first_sled(); + disk_test + .add_zpool_with_dataset_ext( + sled_id, + nexus_test_utils::PHYSICAL_DISK_UUID.parse().unwrap(), + ZpoolUuid::new_v4(), + vec![ + TestDataset { + id: DatasetUuid::new_v4(), + kind: DatasetKind::Crucible, + }, + TestDataset { + id: DatasetUuid::new_v4(), + kind: DatasetKind::Debug, + }, + ], + DiskTest::DEFAULT_ZPOOL_SIZE_GIB, + ) + .await; + disk_test.propagate_datasets_to_sleds().await; + + // Validate our test setup: We should see a single Debug dataset + // in our disk test. + let mut debug_dataset_count = 0; + for zpool in disk_test.zpools() { + for dataset in &zpool.datasets { + if matches!(dataset.kind, DatasetKind::Debug) { + debug_dataset_count += 1; + } + } + } + assert_eq!(debug_dataset_count, 1); + + // We should see no bundles before we start creation. + assert!(bundles_list(&client).await.unwrap().is_empty()); + let bundle = bundle_create(&client).await.unwrap(); + + assert_eq!(bundle.reason_for_creation, "Created by external API"); + assert_eq!(bundle.reason_for_failure, None); + assert_eq!(bundle.state, SupportBundleState::Collecting); + + let bundles = bundles_list(&client).await.unwrap(); + assert_eq!(bundles.len(), 1); + assert_eq!(bundles[0].id, bundle.id); + assert_eq!(bundle_get(&client, bundle.id).await.unwrap().id, bundle.id); + + // We can't collect a second bundle because the debug dataset already fully + // occupied. + // + // We'll retry this at the end of the test, and see that we can create + // another bundle when the first is cleared. + bundle_create_expect_fail( + &client, + StatusCode::INSUFFICIENT_STORAGE, + "Insufficient capacity: Current policy limits support bundle creation to 'one per external disk', and no disks are available. 
You must delete old support bundles before new ones can be created", + ).await.unwrap(); + + // The bundle is "Collecting", not "Active", so we can't download it yet. + bundle_download_expect_fail( + &client, + bundle.id, + StatusCode::BAD_REQUEST, + "Cannot download bundle in non-active state", + ) + .await + .unwrap(); + + // If we prompt the background task to run, the bundle should transition to + // "Active". + let output = activate_bundle_collection_background_task(&cptestctx).await; + assert_eq!(output.cleanup_err, None); + assert_eq!(output.collection_err, None); + assert_eq!( + output.cleanup_report, + Some(task_output::CleanupReport { ..Default::default() }) + ); + assert_eq!( + output.collection_report, + Some(task_output::CollectionReport { + bundle: bundle.id, + listed_in_service_sleds: true, + activated_in_db_ok: true, + }) + ); + let bundle = bundle_get(&client, bundle.id).await.unwrap(); + assert_eq!(bundle.state, SupportBundleState::Active); + + // Now we should be able to download the bundle + let contents = bundle_download(&client, bundle.id).await.unwrap(); + let archive = ZipArchive::new(Cursor::new(&contents)).unwrap(); + let mut names = archive.file_names(); + assert_eq!(names.next(), Some("bundle_id.txt")); + assert_eq!(names.next(), Some("rack/")); + // There's much more data in the bundle, but validating it isn't the point + // of this test, which cares more about bundle lifecycle. + + // We are also able to delete the bundle + bundle_delete(&client, bundle.id).await.unwrap(); + let observed = bundle_get(&client, bundle.id).await.unwrap(); + assert_eq!(observed.state, SupportBundleState::Destroying); + + // We cannot download anything after bundle deletion starts + bundle_download_expect_fail( + &client, + bundle.id, + StatusCode::BAD_REQUEST, + "Cannot download bundle in non-active state", + ) + .await + .unwrap(); + + // If we prompt the background task to run, the bundle will be cleaned up. + let output = activate_bundle_collection_background_task(&cptestctx).await; + assert_eq!(output.cleanup_err, None); + assert_eq!(output.collection_err, None); + assert_eq!( + output.cleanup_report, + Some(task_output::CleanupReport { + sled_bundles_deleted_ok: 1, + db_destroying_bundles_removed: 1, + ..Default::default() + }) + ); + assert_eq!(output.collection_report, None); + + // The bundle is now fully deleted, so it should no longer appear. + bundle_get_expect_fail( + &client, + bundle.id, + StatusCode::NOT_FOUND, + &format!("not found: support-bundle with id \"{}\"", bundle.id), + ) + .await + .unwrap(); + + // We can now create a second bundle, as the first has been freed. + let second_bundle = bundle_create(&client).await.unwrap(); + + assert_ne!( + second_bundle.id, bundle.id, + "We should have made a distinct bundle" + ); + assert_eq!(second_bundle.reason_for_creation, "Created by external API"); + assert_eq!(second_bundle.state, SupportBundleState::Collecting); +} diff --git a/nexus/types/src/external_api/shared.rs b/nexus/types/src/external_api/shared.rs index 9e9bc26ffe8..e61b74e527c 100644 --- a/nexus/types/src/external_api/shared.rs +++ b/nexus/types/src/external_api/shared.rs @@ -417,7 +417,9 @@ mod test { } } -#[derive(Debug, Clone, Copy, JsonSchema, Serialize, Deserialize)] +#[derive( + Debug, Clone, Copy, JsonSchema, Serialize, Deserialize, Eq, PartialEq, +)] #[serde(rename_all = "snake_case")] pub enum SupportBundleState { /// Support Bundle still actively being collected. 
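The lifecycle test above drives a bundle through Collecting -> Active -> Destroying by explicitly activating the collector task; config.test.toml raises support_bundle_collector.period_secs so the timer never fires on its own during tests. Outside the test environment, where the collector runs on its periodic timer, a client would instead poll the bundle until it reaches the desired state, and the Eq/PartialEq derives added to SupportBundleState above make that comparison direct. A minimal sketch reusing the bundle_get helper defined earlier (the wait_for_bundle_state name, retry count, and delay are illustrative, not part of this patch):

// Hypothetical helper (not in this patch): poll a bundle until it reaches
// `wanted`, relying on the new PartialEq derive on SupportBundleState.
async fn wait_for_bundle_state(
    client: &ClientTestContext,
    id: SupportBundleUuid,
    wanted: SupportBundleState,
) -> Result<()> {
    for _ in 0..30 {
        if bundle_get(client, id).await?.state == wanted {
            return Ok(());
        }
        tokio::time::sleep(std::time::Duration::from_secs(1)).await;
    }
    anyhow::bail!("bundle {id} did not reach state {wanted:?}")
}
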
From 39394ff72199976b40c630c4e741413175de5f29 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 8 Jan 2025 15:56:33 -0800 Subject: [PATCH 57/61] Update to deal with structured support bundle APIs from sled agent --- .../tasks/support_bundle_collector.rs | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 124ff274799..d86b92d1299 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -12,7 +12,6 @@ use camino_tempfile::tempfile; use camino_tempfile::Utf8TempDir; use futures::future::BoxFuture; use futures::FutureExt; -use futures::TryStreamExt; use nexus_db_model::SupportBundle; use nexus_db_model::SupportBundleState; use nexus_db_queries::context::OpContext; @@ -759,22 +758,22 @@ impl CollectionReport { } } -async fn write_command_result_or_error( +async fn write_command_result_or_error( path: &Utf8Path, command: &str, result: Result< - sled_agent_client::ResponseValue, + sled_agent_client::ResponseValue, sled_agent_client::Error, >, ) -> anyhow::Result<()> { match result { Ok(result) => { - let output = result - .into_inner_stream() - .try_collect::() - .await?; - tokio::fs::write(path.join(format!("{command}.txt")), output) - .await?; + let output = result.into_inner(); + tokio::fs::write( + path.join(format!("{command}.txt")), + format!("{output:?}"), + ) + .await?; } Err(err) => { tokio::fs::write( From 53dcbe6ae8e75c2386e9b0ec86ac28c6535d59e8 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 9 Jan 2025 10:29:40 -0800 Subject: [PATCH 58/61] Use /var/tmp --- .../background/tasks/support_bundle_collector.rs | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index d86b92d1299..88e5d824b80 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -7,8 +7,8 @@ use crate::app::background::BackgroundTask; use camino::Utf8DirEntry; use camino::Utf8Path; -use camino_tempfile::tempdir; -use camino_tempfile::tempfile; +use camino_tempfile::tempdir_in; +use camino_tempfile::tempfile_in; use camino_tempfile::Utf8TempDir; use futures::future::BoxFuture; use futures::FutureExt; @@ -41,6 +41,10 @@ use tokio::io::SeekFrom; use zip::write::FullFileOptions; use zip::ZipWriter; +// We use "/var/tmp" to use Nexus' filesystem for temporary storage, +// rather than "/tmp", which would keep this collected data in-memory. +const TEMPDIR: &str = "/var/tmp"; + // Specifies the data to be collected within the Support Bundle. #[derive(Default)] struct BundleRequest { @@ -426,7 +430,7 @@ impl<'a> BundleCollection<'a> { ) -> anyhow::Result { // Create a temporary directory where we'll store the support bundle // as it's being collected. - let dir = tempdir()?; + let dir = tempdir_in(TEMPDIR)?; let mut collection = Box::pin(self.collect_bundle_as_file(&dir)); @@ -666,7 +670,7 @@ impl BackgroundTask for SupportBundleCollector { // Takes a directory "dir", and zips the contents into a single zipfile. 
fn bundle_to_zipfile(dir: &Utf8TempDir) -> anyhow::Result { - let tempfile = tempfile()?; + let tempfile = tempfile_in(TEMPDIR)?; let mut zip = ZipWriter::new(tempfile); recursively_add_directory_to_zipfile(&mut zip, dir.path(), dir.path())?; @@ -790,6 +794,7 @@ async fn write_command_result_or_error( mod test { use super::*; + use camino_tempfile::tempdir; use nexus_db_model::Dataset; use nexus_db_model::PhysicalDisk; use nexus_db_model::PhysicalDiskKind; From fb5cae95f5e5830ff7161d33d51f15c178f31168 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 9 Jan 2025 10:58:50 -0800 Subject: [PATCH 59/61] Fix merge --- sled-agent/src/sim/storage.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sled-agent/src/sim/storage.rs b/sled-agent/src/sim/storage.rs index f7fc4611db8..e93de3730e1 100644 --- a/sled-agent/src/sim/storage.rs +++ b/sled-agent/src/sim/storage.rs @@ -1313,7 +1313,7 @@ impl StorageInner { .collect(); for dataset in &dataset_names { // Datasets delegated to zones manage their own storage. - if dataset.dataset().zoned() { + if dataset.kind().zoned() { continue; } From f3144c37e161701569899aa3a42fb4fab25d91c8 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 9 Jan 2025 11:48:02 -0800 Subject: [PATCH 60/61] omdb tests --- dev-tools/omdb/tests/successes.out | 8 ++++---- test-utils/src/dev/test_cmds.rs | 12 ++++++++++++ 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index 742ed1e840e..6c3b69f2657 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -698,11 +698,11 @@ task: "service_zone_nat_tracker" last completion reported error: inventory collection is None task: "support_bundle_collector" - configured period: every s + configured period: every days h m s currently executing: no last completed activation: , triggered by a periodic timer firing started at (s ago) and ran for ms - last completion reported error: task disabled +warning: unknown background task: "support_bundle_collector" (don't know how to interpret details: Object {"cleanup_err": Null, "cleanup_report": Object {"db_destroying_bundles_removed": Number(0), "db_failing_bundles_updated": Number(0), "sled_bundles_delete_failed": Number(0), "sled_bundles_deleted_not_found": Number(0), "sled_bundles_deleted_ok": Number(0)}, "collection_err": Null, "collection_report": Null}) task: "switch_port_config_manager" configured period: every s @@ -1150,11 +1150,11 @@ task: "service_zone_nat_tracker" last completion reported error: inventory collection is None task: "support_bundle_collector" - configured period: every s + configured period: every days h m s currently executing: no last completed activation: , triggered by a periodic timer firing started at (s ago) and ran for ms - last completion reported error: task disabled +warning: unknown background task: "support_bundle_collector" (don't know how to interpret details: Object {"cleanup_err": Null, "cleanup_report": Object {"db_destroying_bundles_removed": Number(0), "db_failing_bundles_updated": Number(0), "sled_bundles_delete_failed": Number(0), "sled_bundles_deleted_not_found": Number(0), "sled_bundles_deleted_ok": Number(0)}, "collection_err": Null, "collection_report": Null}) task: "switch_port_config_manager" configured period: every s diff --git a/test-utils/src/dev/test_cmds.rs b/test-utils/src/dev/test_cmds.rs index 220efa46288..d2662abb1ce 100644 --- a/test-utils/src/dev/test_cmds.rs +++ b/test-utils/src/dev/test_cmds.rs 
@@ -257,6 +257,18 @@ fn redact_basic(input: &str) -> String { .replace_all(&s, "m") .to_string(); + // Replace interval (h). + let s = regex::Regex::new(r"\d+h") + .unwrap() + .replace_all(&s, "h") + .to_string(); + + // Replace interval (days). + let s = regex::Regex::new(r"\d+days") + .unwrap() + .replace_all(&s, "days") + .to_string(); + let s = regex::Regex::new( r"note: database schema version matches expected \(\d+\.\d+\.\d+\)", ) From a54097c6abb36b6b43187b728a4a866aba2b653c Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 13 Jan 2025 11:14:20 -0800 Subject: [PATCH 61/61] simpler buffering --- .../background/tasks/support_bundle_collector.rs | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 88e5d824b80..89becc5320c 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -31,8 +31,6 @@ use omicron_uuid_kinds::ZpoolUuid; use serde::Serialize; use serde_json::json; use sha2::{Digest, Sha256}; -use std::io::BufRead; -use std::io::BufReader; use std::io::Write; use std::sync::Arc; use tokio::io::AsyncReadExt; @@ -702,17 +700,8 @@ fn recursively_add_directory_to_zipfile( let src = entry.path(); zip.start_file_from_path(dst, opts)?; - let mut reader = BufReader::new(std::fs::File::open(&src)?); - - loop { - let buf = reader.fill_buf()?; - let len = buf.len(); - if len == 0 { - break; - } - zip.write_all(&buf)?; - reader.consume(len); - } + let mut file = std::fs::File::open(&src)?; + std::io::copy(&mut file, zip)?; } if file_type.is_dir() { let opts = FullFileOptions::default();
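
Patch 61 replaces the manual BufReader fill_buf/consume loop with std::io::copy, which performs the same chunked read and write using its own internal buffer. A minimal sketch of the resulting copy path, using the same zip APIs as the hunk above (the add_file_to_zip name and signature are illustrative, not the function in the patch):

use camino::Utf8Path;
use zip::write::FullFileOptions;
use zip::ZipWriter;

// Illustrative only: copy one file into an open zip archive. std::io::copy
// owns the buffering, so no explicit fill_buf/consume loop is needed.
fn add_file_to_zip<W: std::io::Write + std::io::Seek>(
    zip: &mut ZipWriter<W>,
    src: &Utf8Path,
    dst: &Utf8Path,
) -> anyhow::Result<()> {
    zip.start_file_from_path(dst, FullFileOptions::default())?;
    let mut file = std::fs::File::open(src)?;
    std::io::copy(&mut file, zip)?;
    Ok(())
}

Combined with the tempdir_in/tempfile_in change from patch 58, both the staging directory and the zip output land on /var/tmp rather than in memory.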