Skip to content

Commit

Permalink
Flaky CI related investigation and fixes (#1280)
Browse files Browse the repository at this point in the history
* Timeout, updated test-r, external epoch loop

* Re-enabled parallel tests

* Increase timeout, reduce parallelism

* Removed huge debug print

* Always collect JUnit reports even if cancelled (by timeout)
  • Loading branch information
vigoo authored Jan 30, 2025
1 parent de381da commit 82bd98e
Show file tree
Hide file tree
Showing 7 changed files with 41 additions and 40 deletions.
14 changes: 8 additions & 6 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ jobs:
run: cargo make --profile ci check
- name: Publish Test Report
uses: mikepenz/action-junit-report@v4
if: success() || failure() # always run even if the previous step fails
if: always()
with:
report_paths: '**/target/report-*.xml'
detailed_summary: true
Expand Down Expand Up @@ -310,7 +310,7 @@ jobs:
timeout-minutes: 20
- name: Publish Test Report
uses: mikepenz/action-junit-report@v4
if: success() || failure() # always run even if the previous step fails
if: always()
with:
report_paths: '**/target/report-*.xml'
detailed_summary: true
Expand Down Expand Up @@ -360,7 +360,7 @@ jobs:
timeout-minutes: 40
- name: Publish Test Report
uses: mikepenz/action-junit-report@v4
if: success() || failure() # always run even if the previous step fails
if: always()
with:
report_paths: '**/target/report-*.xml'
detailed_summary: true
Expand Down Expand Up @@ -404,13 +404,15 @@ jobs:
timeout-minutes: 30
- name: Publish Test Report
uses: mikepenz/action-junit-report@v4
if: success() || failure() # always run even if the previous step fails
if: always()
with:
report_paths: '**/target/report-*.xml'
detailed_summary: true
include_passed: true
cli-tests:
runs-on: ubuntu-latest-xlarge
env:
CARGO_BUILD_JOBS: 6
steps:
- name: Checkout
uses: actions/checkout@v4
Expand Down Expand Up @@ -443,7 +445,7 @@ jobs:
timeout-minutes: 35
- name: Publish Test Report
uses: mikepenz/action-junit-report@v4
if: success() || failure() # always run even if the previous step fails
if: always()
with:
report_paths: '**/target/report-*.xml'
detailed_summary: true
Expand Down Expand Up @@ -484,7 +486,7 @@ jobs:
timeout-minutes: 40
- name: Publish Test Report
uses: mikepenz/action-junit-report@v4
if: success() || failure() # always run even if the previous step fails
if: always()
with:
report_paths: '**/target/report-*.xml'
detailed_summary: true
Expand Down
12 changes: 6 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ testcontainers-modules = { version = "0.11.4", features = [
"redis",
"minio",
] }
test-r = { version = "2.0.1", default-features = true }
test-r = { version = "2.1.0", default-features = true }
thiserror = "2.0.6"
tokio = { version = "1.42", features = [
"macros",
Expand Down
18 changes: 9 additions & 9 deletions Makefile.toml
Original file line number Diff line number Diff line change
Expand Up @@ -221,64 +221,64 @@ dependencies = ["wit"]
description = "Runs worker executor tests only (group 1/8)"
env = { "RUST_BACKTRACE" = "1", "WASMTIME_BACKTRACE_DETAILS" = "1", "RUST_LOG" = "info", "RUST_TEST_TIME_INTEGRATION" = "5000,30000" }
script = '''
cargo test --package golem-worker-executor-base --test integration :tag: -- --report-time
cargo test --package golem-worker-executor-base --test integration :tag:group1 -- --report-time
cargo test --package golem-worker-executor-base --test integration :tag: -- --report-time --nocapture $JUNIT_OPTS
cargo test --package golem-worker-executor-base --test integration :tag:group1 -- --report-time --nocapture $JUNIT_OPTS
'''

[tasks.worker-executor-tests-group2]
dependencies = ["wit"]
description = "Runs worker executor tests only (group 2/8)"
env = { "RUST_BACKTRACE" = "1", "WASMTIME_BACKTRACE_DETAILS" = "1", "RUST_LOG" = "info", "RUST_TEST_TIME_INTEGRATION" = "5000,30000" }
script = '''
cargo test --package golem-worker-executor-base --test integration :tag:group2 -- --report-time
cargo test --package golem-worker-executor-base --test integration :tag:group2 -- --report-time --nocapture $JUNIT_OPTS
'''

[tasks.worker-executor-tests-group3]
dependencies = ["wit"]
description = "Runs worker executor tests only (group 3/8)"
env = { "RUST_BACKTRACE" = "1", "WASMTIME_BACKTRACE_DETAILS" = "1", "RUST_LOG" = "info", "RUST_TEST_TIME_INTEGRATION" = "5000,30000" }
script = '''
cargo test --package golem-worker-executor-base --test integration :tag:group3 -- --report-time
cargo test --package golem-worker-executor-base --test integration :tag:group3 -- --report-time --nocapture $JUNIT_OPTS
'''

[tasks.worker-executor-tests-group4]
dependencies = ["wit"]
description = "Runs worker executor tests only (group 4/8)"
env = { "RUST_BACKTRACE" = "1", "WASMTIME_BACKTRACE_DETAILS" = "1", "RUST_LOG" = "info", "RUST_TEST_TIME_INTEGRATION" = "5000,30000" }
script = '''
cargo test --package golem-worker-executor-base --test integration :tag:group4 -- --report-time $JUNIT_OPTS
cargo test --package golem-worker-executor-base --test integration :tag:group4 -- --report-time --nocapture $JUNIT_OPTS
'''

[tasks.worker-executor-tests-group5]
dependencies = ["wit"]
description = "Runs worker executor tests only (group 5/8)"
env = { "RUST_BACKTRACE" = "1", "WASMTIME_BACKTRACE_DETAILS" = "1", "RUST_LOG" = "info", "RUST_TEST_TIME_INTEGRATION" = "5000,30000" }
script = '''
cargo test --package golem-worker-executor-base --test integration :tag:group5 -- --report-time
cargo test --package golem-worker-executor-base --test integration :tag:group5 -- --report-time --nocapture $JUNIT_OPTS
'''

[tasks.worker-executor-tests-group6]
dependencies = ["wit"]
description = "Runs worker executor tests only (group 6/8)"
env = { "RUST_BACKTRACE" = "1", "WASMTIME_BACKTRACE_DETAILS" = "1", "RUST_LOG" = "info", "RUST_TEST_TIME_INTEGRATION" = "5000,30000" }
script = '''
cargo test --package golem-worker-executor-base --test integration :tag:group6 -- --report-time
cargo test --package golem-worker-executor-base --test integration :tag:group6 -- --report-time --nocapture $JUNIT_OPTS
'''

[tasks.worker-executor-tests-group7]
dependencies = ["wit"]
description = "Runs worker executor tests only (group 7/8)"
env = { "RUST_BACKTRACE" = "1", "WASMTIME_BACKTRACE_DETAILS" = "1", "RUST_LOG" = "info", "RUST_TEST_TIME_INTEGRATION" = "5000,30000" }
script = '''
cargo test --package golem-worker-executor-base --test integration :tag:group7 -- --report-time
cargo test --package golem-worker-executor-base --test integration :tag:group7 -- --report-time --nocapture $JUNIT_OPTS
'''

[tasks.worker-executor-tests-group8]
dependencies = ["wit"]
description = "Runs worker executor tests only (group 8/8)"
env = { "RUST_BACKTRACE" = "1", "WASMTIME_BACKTRACE_DETAILS" = "1", "RUST_LOG" = "info", "RUST_TEST_TIME_INTEGRATION" = "5000,30000" }
script = '''
cargo test --package golem-worker-executor-base --test integration :tag:group8 -- --report-time
cargo test --package golem-worker-executor-base --test integration :tag:group8 -- --report-time --nocapture $JUNIT_OPTS
'''

[tasks.integration-tests]
Expand Down
30 changes: 14 additions & 16 deletions golem-worker-executor-base/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ const VERSION: &str = golem_version!();
pub struct RunDetails {
pub http_port: u16,
pub grpc_port: u16,
pub epoch_thread: std::thread::JoinHandle<()>,
}

/// The Bootstrap trait should be implemented by all Worker Executors to customize the initialization
Expand Down Expand Up @@ -249,12 +250,11 @@ pub trait Bootstrap<Ctx: WorkerCtx> {

let lazy_worker_activator = Arc::new(LazyWorkerActivator::new());

let worker_executor_impl = create_worker_executor_impl::<Ctx, Self>(
let (worker_executor_impl, epoch_thread) = create_worker_executor_impl::<Ctx, Self>(
golem_config.clone(),
self,
runtime.clone(),
&lazy_worker_activator,
join_set,
)
.await?;

Expand All @@ -272,6 +272,7 @@ pub trait Bootstrap<Ctx: WorkerCtx> {
Ok(RunDetails {
http_port,
grpc_port: addr.port(),
epoch_thread,
})
}
}
Expand All @@ -281,8 +282,7 @@ async fn create_worker_executor_impl<Ctx: WorkerCtx, A: Bootstrap<Ctx> + ?Sized>
bootstrap: &A,
runtime: Handle,
lazy_worker_activator: &Arc<LazyWorkerActivator<Ctx>>,
join_set: &mut JoinSet<Result<(), anyhow::Error>>,
) -> Result<All<Ctx>, anyhow::Error> {
) -> Result<(All<Ctx>, std::thread::JoinHandle<()>), anyhow::Error> {
let (redis, sqlite, key_value_storage): (
Option<RedisPool>,
Option<SqlitePool>,
Expand Down Expand Up @@ -473,17 +473,13 @@ async fn create_worker_executor_impl<Ctx: WorkerCtx, A: Bootstrap<Ctx> + ?Sized>
let engine = Arc::new(Engine::new(&config)?);
let linker = bootstrap.create_wasmtime_linker(&engine)?;

let mut epoch_interval = tokio::time::interval(golem_config.limits.epoch_interval);
let engine_ref: Arc<Engine> = engine.clone();
join_set.spawn(
async move {
loop {
epoch_interval.tick().await;
engine_ref.increment_epoch();
}
}
.in_current_span(),
);

let epoch_interval = golem_config.limits.epoch_interval;
let epoch_thread = std::thread::spawn(move || loop {
std::thread::sleep(epoch_interval);
engine_ref.increment_epoch();
});

let linker = Arc::new(linker);

Expand Down Expand Up @@ -539,7 +535,7 @@ async fn create_worker_executor_impl<Ctx: WorkerCtx, A: Bootstrap<Ctx> + ?Sized>
golem_config.scheduler.refresh_interval,
);

bootstrap
let all = bootstrap
.create_services(
active_workers,
engine,
Expand All @@ -564,5 +560,7 @@ async fn create_worker_executor_impl<Ctx: WorkerCtx, A: Bootstrap<Ctx> + ?Sized>
plugins,
oplog_processor_plugin,
)
.await
.await?;

Ok((all, epoch_thread))
}
3 changes: 2 additions & 1 deletion golem-worker-executor-base/tests/guest_languages2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.

use test_r::{inherit_test_dep, test};
use test_r::{inherit_test_dep, test, timeout};

use crate::common::{start, TestContext};
use crate::{LastUniqueId, Tracing, WorkerExecutorTestDependencies};
Expand All @@ -28,6 +28,7 @@ inherit_test_dep!(Tracing);

#[test]
#[tracing::instrument]
#[timeout(300_000)]
async fn javascript_example_3(
last_unique_id: &LastUniqueId,
deps: &WorkerExecutorTestDependencies,
Expand Down
2 changes: 1 addition & 1 deletion golem-worker-executor-base/tests/observability.rs
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ async fn get_oplog_with_api_changing_updates(
.filter(|entry| !matches!(entry, PublicOplogEntry::PendingWorkerInvocation(_)))
.collect::<Vec<_>>();

println!("oplog\n{:#?}", oplog);
// println!("oplog\n{:#?}", oplog);

check!(result[0] == Value::U64(11));
assert_eq!(oplog.len(), 17);
Expand Down

0 comments on commit 82bd98e

Please sign in to comment.