From 401f1a3c38f3be634aa2ded11658a3155c1a1210 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Grzegorz=20=C5=9Awirski?= Date: Tue, 7 Jan 2025 18:06:35 +0400 Subject: [PATCH 1/3] cuda acceleration --- Cargo.lock | 126 ++++++++++++++++++- air/src/options.rs | 15 +++ miden/Cargo.toml | 1 + miden/src/cli/prove.rs | 10 +- prover/Cargo.toml | 11 +- prover/README.md | 1 + prover/src/gpu/cuda/mod.rs | 135 ++++++++++++++++++++ prover/src/gpu/cuda/tests.rs | 234 +++++++++++++++++++++++++++++++++++ prover/src/gpu/metal/mod.rs | 26 +++- prover/src/gpu/mod.rs | 3 + prover/src/lib.rs | 11 +- verifier/src/lib.rs | 16 +++ 12 files changed, 582 insertions(+), 7 deletions(-) create mode 100644 prover/src/gpu/cuda/mod.rs create mode 100644 prover/src/gpu/cuda/tests.rs diff --git a/Cargo.lock b/Cargo.lock index dbe6e24e5e..174e367351 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -26,6 +26,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "allocator-api2" +version = "0.2.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45862d1c77f2228b9e10bc609d5bc203d86ebc9b87ad8d5d5167a6c9abf739d9" + [[package]] name = "anes" version = "0.1.6" @@ -634,6 +640,7 @@ checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" dependencies = [ "futures-channel", "futures-core", + "futures-executor", "futures-io", "futures-sink", "futures-task", @@ -656,6 +663,17 @@ version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" +[[package]] +name = "futures-executor" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + [[package]] name = "futures-io" version = "0.3.31" @@ -680,11 +698,15 @@ version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" dependencies = [ + "futures-channel", "futures-core", + "futures-io", "futures-sink", "futures-task", + "memchr", "pin-project-lite", "pin-utils", + "slab", ] [[package]] @@ -767,6 +789,15 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" +[[package]] +name = "home" +version = "0.5.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5" +dependencies = [ + "windows-sys 0.52.0", +] + [[package]] name = "indenter" version = "0.3.3" @@ -1082,12 +1113,20 @@ dependencies = [ [[package]] name = "miden-gpu" version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04742d5184bd76e7b92afcd9ca36b7c587a555d5fc4b27bb8f9d64ac57151f9c" dependencies = [ + "allocator-api2", + "cc", "metal", + "miden-crypto", "once_cell", + "rayon", + "sppark", + "thiserror 2.0.3", + "tracing", + "winter-air", "winter-math", + "winter-prover", + "winterfell", ] [[package]] @@ -1157,6 +1196,7 @@ dependencies = [ "miden-gpu", "miden-processor", "pollster", + "serial_test", "tracing", "winter-maybe-async", "winter-prover", @@ -1817,6 +1857,15 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "scc" +version = "2.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66b202022bb57c049555430e11fc22fea12909276a80a4c3d368da36ac1d88ed" +dependencies = [ + "sdd", +] + [[package]] name = "scoped-tls" version = "1.0.1" @@ -1829,6 +1878,12 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" +[[package]] +name = "sdd" +version = "3.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49c1eeaf4b6a87c7479688c6d52b9f1153cedd3c489300564f932b065c6eab95" + [[package]] name = "semver" version = "0.9.0" @@ -1891,6 +1946,31 @@ dependencies = [ "serde", ] +[[package]] +name = "serial_test" +version = "3.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b258109f244e1d6891bf1053a55d63a5cd4f8f4c30cf9a1280989f80e7a1fa9" +dependencies = [ + "futures", + "log", + "once_cell", + "parking_lot", + "scc", + "serial_test_derive", +] + +[[package]] +name = "serial_test_derive" +version = "3.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d69265a08751de7844521fd15003ae0a888e035773ba05695c5c759a6f89eef" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "sha2" version = "0.10.8" @@ -1933,6 +2013,15 @@ version = "0.3.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" +[[package]] +name = "slab" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" +dependencies = [ + "autocfg", +] + [[package]] name = "smallvec" version = "1.13.2" @@ -1951,6 +2040,16 @@ version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" +[[package]] +name = "sppark" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac5090642d9ae844edd9a5c23cb1fce6724ca88d50c55178ee8d1f656df5826b" +dependencies = [ + "cc", + "which", +] + [[package]] name = "stable_deref_trait" version = "1.2.0" @@ -2491,6 +2590,18 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "which" +version = "4.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" +dependencies = [ + "either", + "home", + "once_cell", + "rustix", +] + [[package]] name = "winapi" version = "0.3.9" @@ -2845,6 +2956,17 @@ dependencies = [ "winter-utils", ] +[[package]] +name = "winterfell" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6bdcd01333bbf4a349d8d13f269281524bd6d1a36ae3a853187f0665bf1cfd4" +dependencies = [ + "winter-air", + "winter-prover", + "winter-verifier", +] + [[package]] name = "yansi" version = "1.0.1" diff --git a/air/src/options.rs b/air/src/options.rs index bd1a5e6e56..70122d0a2a 100644 --- a/air/src/options.rs +++ b/air/src/options.rs @@ -129,6 +129,21 @@ impl ProvingOptions { self } + /// Sets partitions for this [ProvingOptions]. + /// + /// Partitions can be provided to split traces during proving and distribute work across + /// multiple devices. The number of partitions should be equal to the number of devices. + pub const fn with_partitions(mut self, num_partitions: usize) -> Self { + let hash_rate = match self.hash_fn { + HashFunction::Blake3_192 => 6, + HashFunction::Blake3_256 => 8, + HashFunction::Rpo256 => 8, + HashFunction::Rpx256 => 8, + }; + self.proof_options = self.proof_options.with_partitions(num_partitions, hash_rate); + self + } + // PUBLIC ACCESSORS // -------------------------------------------------------------------------------------------- diff --git a/miden/Cargo.toml b/miden/Cargo.toml index 0c55a8a862..af74397bce 100644 --- a/miden/Cargo.toml +++ b/miden/Cargo.toml @@ -55,6 +55,7 @@ executable = [ "dep:rustyline", "dep:tracing-subscriber", ] +cuda = ["prover/cuda", "std"] metal = ["prover/metal", "std"] std = ["assembly/std", "processor/std", "prover/std", "verifier/std"] # For internal use, not meant to be used by users diff --git a/miden/src/cli/prove.rs b/miden/src/cli/prove.rs index 1a0d5a59ea..43f5bc2127 100644 --- a/miden/src/cli/prove.rs +++ b/miden/src/cli/prove.rs @@ -4,6 +4,8 @@ use assembly::diagnostics::{IntoDiagnostic, Report, WrapErr}; use clap::Parser; use miden_vm::{internal::InputFile, ProvingOptions}; use processor::{DefaultHost, ExecutionOptions, ExecutionOptionsError, Program}; +#[cfg(all(target_arch = "x86_64", feature = "cuda"))] +use prover::get_num_of_gpus; use stdlib::StdLibrary; use tracing::instrument; @@ -65,6 +67,11 @@ impl ProveCmd { pub fn get_proof_options(&self) -> Result { let exec_options = ExecutionOptions::new(Some(self.max_cycles), self.expected_cycles, self.trace, false)?; + + let partitions = 1; + #[cfg(all(target_arch = "x86_64", feature = "cuda"))] + let partitions = get_num_of_gpus(); + Ok(match self.security.as_str() { "96bits" => { if self.rpx { @@ -82,7 +89,8 @@ impl ProveCmd { }, other => panic!("{} is not a valid security setting", other), } - .with_execution_options(exec_options)) + .with_execution_options(exec_options) + .with_partitions(partitions)) } pub fn execute(&self) -> Result<(), Report> { diff --git a/prover/Cargo.toml b/prover/Cargo.toml index 2dc9f783a5..61005a9d9d 100644 --- a/prover/Cargo.toml +++ b/prover/Cargo.toml @@ -17,7 +17,8 @@ edition.workspace = true async = ["winter-maybe-async/async"] concurrent = ["processor/concurrent", "std", "winter-prover/concurrent"] default = ["std"] -metal = ["dep:miden-gpu", "dep:elsa", "dep:pollster", "concurrent", "std"] +metal = ["dep:miden-gpu", "miden-gpu/metal", "dep:elsa", "dep:pollster", "concurrent", "std"] +cuda = ["dep:miden-gpu", "miden-gpu/cuda", "std"] std = ["air/std", "processor/std", "winter-prover/std"] [dependencies] @@ -29,5 +30,11 @@ winter-prover = { package = "winter-prover", version = "0.11", default-features [target.'cfg(all(target_arch = "aarch64", target_os = "macos"))'.dependencies] elsa = { version = "1.9", optional = true } -miden-gpu = { version = "0.4", optional = true } +miden-gpu = { path = "../../miden-gpu", optional = true } pollster = { version = "0.4", optional = true } + +[target.'cfg(target_arch = "x86_64")'.dependencies] +miden-gpu = { path = "../../miden-gpu", optional = true } + +[dev-dependencies] +serial_test = "3.1" diff --git a/prover/README.md b/prover/README.md index b1465b21ac..e4864d893f 100644 --- a/prover/README.md +++ b/prover/README.md @@ -45,6 +45,7 @@ Miden prover can be compiled with the following features: * `std` - enabled by default and relies on the Rust standard library. * `concurrent` - implies `std` and also enables multi-threaded proof generation. * `metal` - enables [Metal](https://en.wikipedia.org/wiki/Metal_(API))-based acceleration of proof generation (for recursive proofs) on supported platforms (e.g., Apple silicon). +* `cuda` - enables [Cuda](https://en.wikipedia.org/wiki/CUDA)-based acceleration of proof generation (for recursive proofs) on supported platforms. * `no_std` does not rely on the Rust standard library and enables compilation to WebAssembly. * Only the `wasm32-unknown-unknown` and `wasm32-wasip1` targets are officially supported. diff --git a/prover/src/gpu/cuda/mod.rs b/prover/src/gpu/cuda/mod.rs new file mode 100644 index 0000000000..8bdee9d8bd --- /dev/null +++ b/prover/src/gpu/cuda/mod.rs @@ -0,0 +1,135 @@ +//! This module contains GPU acceleration logic for Nvidia CUDA devices. + +use std::marker::PhantomData; + +use air::{AuxRandElements, PartitionOptions}; +use miden_gpu::{ + cuda::{constraints::CudaConstraintCommitment, merkle::MerkleTree, trace_lde::CudaTraceLde}, + HashFn, +}; +use processor::crypto::{ElementHasher, Hasher}; +use winter_prover::{ + crypto::Digest, matrix::ColMatrix, CompositionPoly, CompositionPolyTrace, + ConstraintCompositionCoefficients, DefaultConstraintEvaluator, Prover, StarkDomain, TraceInfo, + TracePolyTable, +}; + +use crate::{ + crypto::{RandomCoin, Rpo256}, + ExecutionProver, ExecutionTrace, Felt, FieldElement, ProcessorAir, PublicInputs, + WinterProofOptions, +}; + +#[cfg(test)] +mod tests; + +// CONSTANTS +// ================================================================================================ + +const DIGEST_SIZE: usize = Rpo256::DIGEST_RANGE.end - Rpo256::DIGEST_RANGE.start; + +// CUDA RPO/RPX PROVER +// ================================================================================================ + +/// Wraps an [ExecutionProver] and provides GPU acceleration for building trace commitments. +pub(crate) struct CudaExecutionProver +where + H: Hasher + ElementHasher, + D: Digest + From<[Felt; DIGEST_SIZE]> + Into<[Felt; DIGEST_SIZE]>, + R: RandomCoin + Send, +{ + pub execution_prover: ExecutionProver, + pub hash_fn: HashFn, + phantom_data: PhantomData, +} + +impl CudaExecutionProver +where + H: Hasher + ElementHasher, + D: Digest + From<[Felt; DIGEST_SIZE]> + Into<[Felt; DIGEST_SIZE]>, + R: RandomCoin + Send, +{ + pub fn new(execution_prover: ExecutionProver, hash_fn: HashFn) -> Self { + CudaExecutionProver { + execution_prover, + hash_fn, + phantom_data: PhantomData, + } + } +} + +impl Prover for CudaExecutionProver +where + H: Hasher + ElementHasher + Send + Sync, + D: Digest + From<[Felt; DIGEST_SIZE]> + Into<[Felt; DIGEST_SIZE]>, + R: RandomCoin + Send, +{ + type BaseField = Felt; + type Air = ProcessorAir; + type Trace = ExecutionTrace; + type VC = MerkleTree; + type HashFn = H; + type RandomCoin = R; + type TraceLde> = CudaTraceLde; + type ConstraintCommitment> = CudaConstraintCommitment; + type ConstraintEvaluator<'a, E: FieldElement> = + DefaultConstraintEvaluator<'a, ProcessorAir, E>; + + fn get_pub_inputs(&self, trace: &ExecutionTrace) -> PublicInputs { + self.execution_prover.get_pub_inputs(trace) + } + + fn options(&self) -> &WinterProofOptions { + self.execution_prover.options() + } + + fn new_trace_lde>( + &self, + trace_info: &TraceInfo, + main_trace: &ColMatrix, + domain: &StarkDomain, + partition_options: PartitionOptions, + ) -> (Self::TraceLde, TracePolyTable) { + CudaTraceLde::new(trace_info, main_trace, domain, partition_options, self.hash_fn) + } + + fn new_evaluator<'a, E: FieldElement>( + &self, + air: &'a ProcessorAir, + aux_rand_elements: Option>, + composition_coefficients: ConstraintCompositionCoefficients, + ) -> Self::ConstraintEvaluator<'a, E> { + self.execution_prover + .new_evaluator(air, aux_rand_elements, composition_coefficients) + } + + fn build_aux_trace( + &self, + main_trace: &Self::Trace, + aux_rand_elements: &AuxRandElements, + ) -> ColMatrix + where + E: FieldElement, + { + self.execution_prover.build_aux_trace(main_trace, aux_rand_elements) + } + + fn build_constraint_commitment( + &self, + composition_poly_trace: CompositionPolyTrace, + num_constraint_composition_columns: usize, + domain: &StarkDomain, + partition_options: PartitionOptions, + ) -> (Self::ConstraintCommitment, CompositionPoly) + where + E: FieldElement, + { + CudaConstraintCommitment::new( + composition_poly_trace, + num_constraint_composition_columns, + domain, + partition_options, + self.hash_fn, + ) + } +} diff --git a/prover/src/gpu/cuda/tests.rs b/prover/src/gpu/cuda/tests.rs new file mode 100644 index 0000000000..640f08b71c --- /dev/null +++ b/prover/src/gpu/cuda/tests.rs @@ -0,0 +1,234 @@ +use alloc::vec::Vec; +use std::vec; + +use air::{ProvingOptions, StarkField}; +use miden_gpu::{ + cuda::util::{DIGEST_SIZE, RATE}, + HashFn, +}; +use processor::{ + crypto::{Hasher, RpoDigest, RpoRandomCoin, Rpx256, RpxDigest, RpxRandomCoin}, + math::fft, + StackInputs, StackOutputs, +}; +use serial_test::serial; +use winter_prover::{crypto::Digest, CompositionPolyTrace, TraceLde}; + +use crate::{gpu::cuda::CudaExecutionProver, *}; + +fn build_trace_commitment_on_gpu_with_padding_matches_cpu< + R: RandomCoin + Send, + H: ElementHasher + Hasher + Send + Sync, + D: Digest + From<[Felt; DIGEST_SIZE]>, +>( + hash_fn: HashFn, +) { + let is_rpx = matches!(hash_fn, HashFn::Rpx256); + + let cpu_prover = create_test_prover::(is_rpx); + let gpu_prover = CudaExecutionProver::new(create_test_prover::(is_rpx), hash_fn); + let num_rows = 1 << 8; + let trace_info = get_trace_info(1, num_rows); + let trace = gen_random_trace(num_rows, RATE + 1); + let domain = StarkDomain::from_twiddles(fft::get_twiddles(num_rows), 8, Felt::GENERATOR); + + let (cpu_trace_lde, cpu_polys) = cpu_prover.new_trace_lde::(&trace_info, &trace, &domain); + let (gpu_trace_lde, gpu_polys) = gpu_prover.new_trace_lde::(&trace_info, &trace, &domain); + + assert_eq!( + cpu_trace_lde.get_main_trace_commitment(), + gpu_trace_lde.get_main_trace_commitment() + ); + assert_eq!( + cpu_polys.main_trace_polys().collect::>(), + gpu_polys.main_trace_polys().collect::>() + ); +} + +fn build_trace_commitment_on_gpu_without_padding_matches_cpu< + R: RandomCoin + Send, + H: ElementHasher + Hasher + Send + Sync, + D: Digest + From<[Felt; DIGEST_SIZE]>, +>( + hash_fn: HashFn, +) { + let is_rpx = matches!(hash_fn, HashFn::Rpx256); + + let cpu_prover = create_test_prover::(is_rpx); + let gpu_prover = CudaExecutionProver::new(create_test_prover::(is_rpx), hash_fn); + let num_rows = 1 << 8; + let trace_info = get_trace_info(8, num_rows); + let trace = gen_random_trace(num_rows, RATE); + let domain = StarkDomain::from_twiddles(fft::get_twiddles(num_rows), 8, Felt::GENERATOR); + + let (cpu_trace_lde, cpu_polys) = cpu_prover.new_trace_lde::(&trace_info, &trace, &domain); + let (gpu_trace_lde, gpu_polys) = gpu_prover.new_trace_lde::(&trace_info, &trace, &domain); + + assert_eq!( + cpu_trace_lde.get_main_trace_commitment(), + gpu_trace_lde.get_main_trace_commitment() + ); + assert_eq!( + cpu_polys.main_trace_polys().collect::>(), + gpu_polys.main_trace_polys().collect::>() + ); +} + +fn build_constraint_commitment_on_gpu_with_padding_matches_cpu< + R: RandomCoin + Send, + H: ElementHasher + Hasher + Send + Sync, + D: Digest + From<[Felt; DIGEST_SIZE]>, +>( + hash_fn: HashFn, +) { + let is_rpx = matches!(hash_fn, HashFn::Rpx256); + + let cpu_prover = create_test_prover::(is_rpx); + let gpu_prover = CudaExecutionProver::new(create_test_prover::(is_rpx), hash_fn); + let num_rows = 1 << 8; + let ce_blowup_factor = 2; + let values = get_random_values::(num_rows * ce_blowup_factor); + let domain = StarkDomain::from_twiddles(fft::get_twiddles(num_rows), 8, Felt::GENERATOR); + + let (commitment_cpu, composition_poly_cpu) = cpu_prover.build_constraint_commitment( + CompositionPolyTrace::new(values.clone()), + 2, + &domain, + ); + let (commitment_gpu, composition_poly_gpu) = + gpu_prover.build_constraint_commitment(CompositionPolyTrace::new(values), 2, &domain); + + assert_eq!(commitment_cpu.root(), commitment_gpu.root()); + assert_ne!(0, composition_poly_cpu.data().num_base_cols() % RATE); + assert_eq!(composition_poly_cpu.into_columns(), composition_poly_gpu.into_columns()); +} + +fn build_constraint_commitment_on_gpu_without_padding_matches_cpu< + R: RandomCoin + Send, + H: ElementHasher + Hasher + Send + Sync, + D: Digest + From<[Felt; DIGEST_SIZE]>, +>( + hash_fn: HashFn, +) { + let is_rpx = matches!(hash_fn, HashFn::Rpx256); + + let cpu_prover = create_test_prover::(is_rpx); + let gpu_prover = CudaExecutionProver::new(create_test_prover::(is_rpx), hash_fn); + let num_rows = 1 << 8; + let ce_blowup_factor = 8; + let values = get_random_values::(num_rows * ce_blowup_factor); + let domain = StarkDomain::from_twiddles(fft::get_twiddles(num_rows), 8, Felt::GENERATOR); + + let (commitment_cpu, composition_poly_cpu) = cpu_prover.build_constraint_commitment( + CompositionPolyTrace::new(values.clone()), + 8, + &domain, + ); + let (commitment_gpu, composition_poly_gpu) = + gpu_prover.build_constraint_commitment(CompositionPolyTrace::new(values), 8, &domain); + + assert_eq!(commitment_cpu.root(), commitment_gpu.root()); + assert_eq!(0, composition_poly_cpu.data().num_base_cols() % RATE); + assert_eq!(composition_poly_cpu.into_columns(), composition_poly_gpu.into_columns()); +} + +#[test] +#[serial] +fn rpo_build_trace_commitment_on_gpu_with_padding_matches_cpu() { + build_trace_commitment_on_gpu_with_padding_matches_cpu::( + HashFn::Rpo256, + ); +} + +#[test] +#[serial] +fn rpx_build_trace_commitment_on_gpu_with_padding_matches_cpu() { + build_trace_commitment_on_gpu_with_padding_matches_cpu::( + HashFn::Rpx256, + ); +} + +#[test] +#[serial] +fn rpo_build_trace_commitment_on_gpu_without_padding_matches_cpu() { + build_trace_commitment_on_gpu_without_padding_matches_cpu::( + HashFn::Rpo256, + ); +} + +#[test] +#[serial] +fn rpx_build_trace_commitment_on_gpu_without_padding_matches_cpu() { + build_trace_commitment_on_gpu_without_padding_matches_cpu::( + HashFn::Rpx256, + ); +} + +#[test] +#[serial] +fn rpo_build_constraint_commitment_on_gpu_with_padding_matches_cpu() { + build_constraint_commitment_on_gpu_with_padding_matches_cpu::( + HashFn::Rpo256, + ); +} + +#[test] +#[serial] +fn rpx_build_constraint_commitment_on_gpu_with_padding_matches_cpu() { + build_constraint_commitment_on_gpu_with_padding_matches_cpu::( + HashFn::Rpx256, + ); +} + +#[test] +#[serial] +fn rpo_build_constraint_commitment_on_gpu_without_padding_matches_cpu() { + build_constraint_commitment_on_gpu_without_padding_matches_cpu::< + RpoRandomCoin, + Rpo256, + RpoDigest, + >(HashFn::Rpo256); +} + +#[test] +#[serial] +fn rpx_build_constraint_commitment_on_gpu_without_padding_matches_cpu() { + build_constraint_commitment_on_gpu_without_padding_matches_cpu::< + RpxRandomCoin, + Rpx256, + RpxDigest, + >(HashFn::Rpx256); +} + +fn gen_random_trace(num_rows: usize, num_cols: usize) -> ColMatrix { + ColMatrix::new((0..num_cols as u64).map(|col| vec![Felt::new(col); num_rows]).collect()) +} + +fn get_random_values(num_rows: usize) -> Vec { + (0..num_rows).map(|i| E::from(i as u32)).collect() +} + +fn get_trace_info(num_cols: usize, num_rows: usize) -> TraceInfo { + TraceInfo::new(num_cols, num_rows) +} + +fn create_test_prover< + R: RandomCoin + Send, + H: ElementHasher, +>( + use_rpx: bool, +) -> ExecutionProver { + if use_rpx { + ExecutionProver::new( + ProvingOptions::with_128_bit_security_rpx(), + StackInputs::default(), + StackOutputs::default(), + ) + } else { + ExecutionProver::new( + ProvingOptions::with_128_bit_security(true), + StackInputs::default(), + StackOutputs::default(), + ) + } +} diff --git a/prover/src/gpu/metal/mod.rs b/prover/src/gpu/metal/mod.rs index c01b13ee85..11ce872ea2 100644 --- a/prover/src/gpu/metal/mod.rs +++ b/prover/src/gpu/metal/mod.rs @@ -120,7 +120,31 @@ where .new_evaluator(air, aux_rand_elements, composition_coefficients) } - fn build_constraint_commitment>( + /// Evaluates constraint composition polynomial over the LDE domain and builds a commitment + /// to these evaluations. + /// + /// The evaluation is done by evaluating each composition polynomial column over the LDE + /// domain. + /// + /// The commitment is computed by hashing each row in the evaluation matrix, and then building + /// a Merkle tree from the resulting hashes. + /// + /// The composition polynomial columns are evaluated on the CPU. Afterwards the commitment + /// is computed on the GPU. + /// + /// ```text + /// ───────────────────────────────────────────────────── + /// ┌───┐ ┌───┐ + /// CPU: ... ─┤fft├─┤fft├─┐ ┌─ ... + /// └───┘ └───┘ │ │ + /// ╴╴╴╴╴╴╴╴╴╴╴╴╴╴╴╴╴╴┼╴╴╴╴╴╴╴╴╴╴╴╴╴╴╴╴╴╴╴╴╴╴╴╴╴╴╴┼╴╴╴╴╴╴ + /// │ ┌──────────┐ ┌──────────┐ │ + /// GPU: └─┤ hash ├─┤ hash ├─┘ + /// └──────────┘ └──────────┘ + /// ────┼────────┼────────┼────────┼────────┼────────┼─── + /// t=n t=n+1 t=n+2 t=n+3 t=n+4 t=n+5 + /// ``` + fn build_constraint_commitment>( &self, composition_poly_trace: CompositionPolyTrace, num_constraint_composition_columns: usize, diff --git a/prover/src/gpu/mod.rs b/prover/src/gpu/mod.rs index 253db30f7c..4eba753953 100644 --- a/prover/src/gpu/mod.rs +++ b/prover/src/gpu/mod.rs @@ -1,2 +1,5 @@ #[cfg(all(feature = "metal", target_arch = "aarch64", target_os = "macos"))] pub mod metal; + +#[cfg(all(feature = "cuda", target_arch = "x86_64"))] +pub mod cuda; diff --git a/prover/src/lib.rs b/prover/src/lib.rs index 25fb837dbd..5520b1d943 100644 --- a/prover/src/lib.rs +++ b/prover/src/lib.rs @@ -9,7 +9,10 @@ extern crate std; use core::marker::PhantomData; use air::{AuxRandElements, PartitionOptions, ProcessorAir, PublicInputs}; -#[cfg(all(feature = "metal", target_arch = "aarch64", target_os = "macos"))] +#[cfg(any( + all(feature = "metal", target_arch = "aarch64", target_os = "macos"), + all(feature = "cuda", target_arch = "x86_64") +))] use miden_gpu::HashFn; use processor::{ crypto::{ @@ -34,6 +37,8 @@ mod gpu; // ================================================================================================ pub use air::{DeserializationError, ExecutionProof, FieldExtension, HashFunction, ProvingOptions}; +#[cfg(all(target_arch = "x86_64", feature = "cuda"))] +pub use miden_gpu::cuda::get_num_of_gpus; pub use processor::{ crypto, math, utils, AdviceInputs, Digest, ExecutionError, Host, InputError, MemAdviceProvider, StackInputs, StackOutputs, Word, @@ -105,6 +110,8 @@ pub fn prove( ); #[cfg(all(feature = "metal", target_arch = "aarch64", target_os = "macos"))] let prover = gpu::metal::MetalExecutionProver::new(prover, HashFn::Rpo256); + #[cfg(all(feature = "cuda", target_arch = "x86_64"))] + let prover = gpu::cuda::CudaExecutionProver::new(prover, HashFn::Rpo256); maybe_await!(prover.prove(trace)) }, HashFunction::Rpx256 => { @@ -115,6 +122,8 @@ pub fn prove( ); #[cfg(all(feature = "metal", target_arch = "aarch64", target_os = "macos"))] let prover = gpu::metal::MetalExecutionProver::new(prover, HashFn::Rpx256); + #[cfg(all(feature = "cuda", target_arch = "x86_64"))] + let prover = gpu::cuda::CudaExecutionProver::new(prover, HashFn::Rpx256); maybe_await!(prover.prove(trace)) }, } diff --git a/verifier/src/lib.rs b/verifier/src/lib.rs index 03962b6070..ff91a5668f 100644 --- a/verifier/src/lib.rs +++ b/verifier/src/lib.rs @@ -83,6 +83,14 @@ pub fn verify( let opts = AcceptableOptions::OptionSet(vec![ ProvingOptions::RECURSIVE_96_BITS, ProvingOptions::RECURSIVE_128_BITS, + ProvingOptions::RECURSIVE_96_BITS.with_partitions(1, 8), + ProvingOptions::RECURSIVE_128_BITS.with_partitions(1, 8), + ProvingOptions::RECURSIVE_96_BITS.with_partitions(2, 8), + ProvingOptions::RECURSIVE_128_BITS.with_partitions(2, 8), + ProvingOptions::RECURSIVE_96_BITS.with_partitions(3, 8), + ProvingOptions::RECURSIVE_128_BITS.with_partitions(3, 8), + ProvingOptions::RECURSIVE_96_BITS.with_partitions(4, 8), + ProvingOptions::RECURSIVE_128_BITS.with_partitions(4, 8), ]); verify_proof::>( proof, pub_inputs, &opts, @@ -92,6 +100,14 @@ pub fn verify( let opts = AcceptableOptions::OptionSet(vec![ ProvingOptions::RECURSIVE_96_BITS, ProvingOptions::RECURSIVE_128_BITS, + ProvingOptions::RECURSIVE_96_BITS.with_partitions(1, 8), + ProvingOptions::RECURSIVE_128_BITS.with_partitions(1, 8), + ProvingOptions::RECURSIVE_96_BITS.with_partitions(2, 8), + ProvingOptions::RECURSIVE_128_BITS.with_partitions(2, 8), + ProvingOptions::RECURSIVE_96_BITS.with_partitions(3, 8), + ProvingOptions::RECURSIVE_128_BITS.with_partitions(3, 8), + ProvingOptions::RECURSIVE_96_BITS.with_partitions(4, 8), + ProvingOptions::RECURSIVE_128_BITS.with_partitions(4, 8), ]); verify_proof::>( proof, pub_inputs, &opts, From 60db7a48cc2983d6d35ba287d9d03b04f2664114 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Grzegorz=20=C5=9Awirski?= Date: Tue, 28 Jan 2025 13:09:49 +0100 Subject: [PATCH 2/3] feedback --- air/src/options.rs | 4 +++- prover/Cargo.toml | 4 ++-- prover/src/gpu/metal/mod.rs | 26 +------------------------- verifier/src/lib.rs | 4 ---- 4 files changed, 6 insertions(+), 32 deletions(-) diff --git a/air/src/options.rs b/air/src/options.rs index 70122d0a2a..e38f7a0bc9 100644 --- a/air/src/options.rs +++ b/air/src/options.rs @@ -134,8 +134,10 @@ impl ProvingOptions { /// Partitions can be provided to split traces during proving and distribute work across /// multiple devices. The number of partitions should be equal to the number of devices. pub const fn with_partitions(mut self, num_partitions: usize) -> Self { + // All currently supported hash functions consume 8 felts per iteration. + // Match statement ensures that future changes to available hashes are reflected here. let hash_rate = match self.hash_fn { - HashFunction::Blake3_192 => 6, + HashFunction::Blake3_192 => 8, HashFunction::Blake3_256 => 8, HashFunction::Rpo256 => 8, HashFunction::Rpx256 => 8, diff --git a/prover/Cargo.toml b/prover/Cargo.toml index 61005a9d9d..16a5ac8f5f 100644 --- a/prover/Cargo.toml +++ b/prover/Cargo.toml @@ -17,8 +17,8 @@ edition.workspace = true async = ["winter-maybe-async/async"] concurrent = ["processor/concurrent", "std", "winter-prover/concurrent"] default = ["std"] -metal = ["dep:miden-gpu", "miden-gpu/metal", "dep:elsa", "dep:pollster", "concurrent", "std"] -cuda = ["dep:miden-gpu", "miden-gpu/cuda", "std"] +metal = ["dep:miden-gpu", "miden-gpu/metal", "dep:elsa", "dep:pollster", "concurrent"] +cuda = ["dep:miden-gpu", "miden-gpu/cuda", "concurrent"] std = ["air/std", "processor/std", "winter-prover/std"] [dependencies] diff --git a/prover/src/gpu/metal/mod.rs b/prover/src/gpu/metal/mod.rs index 11ce872ea2..c01b13ee85 100644 --- a/prover/src/gpu/metal/mod.rs +++ b/prover/src/gpu/metal/mod.rs @@ -120,31 +120,7 @@ where .new_evaluator(air, aux_rand_elements, composition_coefficients) } - /// Evaluates constraint composition polynomial over the LDE domain and builds a commitment - /// to these evaluations. - /// - /// The evaluation is done by evaluating each composition polynomial column over the LDE - /// domain. - /// - /// The commitment is computed by hashing each row in the evaluation matrix, and then building - /// a Merkle tree from the resulting hashes. - /// - /// The composition polynomial columns are evaluated on the CPU. Afterwards the commitment - /// is computed on the GPU. - /// - /// ```text - /// ───────────────────────────────────────────────────── - /// ┌───┐ ┌───┐ - /// CPU: ... ─┤fft├─┤fft├─┐ ┌─ ... - /// └───┘ └───┘ │ │ - /// ╴╴╴╴╴╴╴╴╴╴╴╴╴╴╴╴╴╴┼╴╴╴╴╴╴╴╴╴╴╴╴╴╴╴╴╴╴╴╴╴╴╴╴╴╴╴┼╴╴╴╴╴╴ - /// │ ┌──────────┐ ┌──────────┐ │ - /// GPU: └─┤ hash ├─┤ hash ├─┘ - /// └──────────┘ └──────────┘ - /// ────┼────────┼────────┼────────┼────────┼────────┼─── - /// t=n t=n+1 t=n+2 t=n+3 t=n+4 t=n+5 - /// ``` - fn build_constraint_commitment>( + fn build_constraint_commitment>( &self, composition_poly_trace: CompositionPolyTrace, num_constraint_composition_columns: usize, diff --git a/verifier/src/lib.rs b/verifier/src/lib.rs index ff91a5668f..7b367f844d 100644 --- a/verifier/src/lib.rs +++ b/verifier/src/lib.rs @@ -87,8 +87,6 @@ pub fn verify( ProvingOptions::RECURSIVE_128_BITS.with_partitions(1, 8), ProvingOptions::RECURSIVE_96_BITS.with_partitions(2, 8), ProvingOptions::RECURSIVE_128_BITS.with_partitions(2, 8), - ProvingOptions::RECURSIVE_96_BITS.with_partitions(3, 8), - ProvingOptions::RECURSIVE_128_BITS.with_partitions(3, 8), ProvingOptions::RECURSIVE_96_BITS.with_partitions(4, 8), ProvingOptions::RECURSIVE_128_BITS.with_partitions(4, 8), ]); @@ -104,8 +102,6 @@ pub fn verify( ProvingOptions::RECURSIVE_128_BITS.with_partitions(1, 8), ProvingOptions::RECURSIVE_96_BITS.with_partitions(2, 8), ProvingOptions::RECURSIVE_128_BITS.with_partitions(2, 8), - ProvingOptions::RECURSIVE_96_BITS.with_partitions(3, 8), - ProvingOptions::RECURSIVE_128_BITS.with_partitions(3, 8), ProvingOptions::RECURSIVE_96_BITS.with_partitions(4, 8), ProvingOptions::RECURSIVE_128_BITS.with_partitions(4, 8), ]); From 94775a43eb2c12ae313413e0c255ec73192fdefe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Grzegorz=20=C5=9Awirski?= Date: Tue, 28 Jan 2025 13:25:52 +0100 Subject: [PATCH 3/3] feedback --- miden/src/cli/prove.rs | 2 +- prover/Cargo.toml | 4 ++-- prover/src/lib.rs | 4 +++- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/miden/src/cli/prove.rs b/miden/src/cli/prove.rs index 43f5bc2127..92a58b478c 100644 --- a/miden/src/cli/prove.rs +++ b/miden/src/cli/prove.rs @@ -5,7 +5,7 @@ use clap::Parser; use miden_vm::{internal::InputFile, ProvingOptions}; use processor::{DefaultHost, ExecutionOptions, ExecutionOptionsError, Program}; #[cfg(all(target_arch = "x86_64", feature = "cuda"))] -use prover::get_num_of_gpus; +use prover::cuda::get_num_of_gpus; use stdlib::StdLibrary; use tracing::instrument; diff --git a/prover/Cargo.toml b/prover/Cargo.toml index 16a5ac8f5f..4320b8097d 100644 --- a/prover/Cargo.toml +++ b/prover/Cargo.toml @@ -17,8 +17,8 @@ edition.workspace = true async = ["winter-maybe-async/async"] concurrent = ["processor/concurrent", "std", "winter-prover/concurrent"] default = ["std"] -metal = ["dep:miden-gpu", "miden-gpu/metal", "dep:elsa", "dep:pollster", "concurrent"] -cuda = ["dep:miden-gpu", "miden-gpu/cuda", "concurrent"] +metal = ["miden-gpu/metal", "dep:elsa", "dep:pollster", "concurrent"] +cuda = ["miden-gpu/cuda", "concurrent"] std = ["air/std", "processor/std", "winter-prover/std"] [dependencies] diff --git a/prover/src/lib.rs b/prover/src/lib.rs index 5520b1d943..fb8a490fc6 100644 --- a/prover/src/lib.rs +++ b/prover/src/lib.rs @@ -38,7 +38,9 @@ mod gpu; pub use air::{DeserializationError, ExecutionProof, FieldExtension, HashFunction, ProvingOptions}; #[cfg(all(target_arch = "x86_64", feature = "cuda"))] -pub use miden_gpu::cuda::get_num_of_gpus; +pub mod cuda { + pub use miden_gpu::cuda::get_num_of_gpus; +} pub use processor::{ crypto, math, utils, AdviceInputs, Digest, ExecutionError, Host, InputError, MemAdviceProvider, StackInputs, StackOutputs, Word,