From 4c7e11578a62b3e3cdfa6cdd5ffdad96fa515d59 Mon Sep 17 00:00:00 2001 From: Adam Reichold Date: Thu, 30 Sep 2021 09:57:22 +0200 Subject: [PATCH 1/5] Generate larger problems for benchmarks, presumably to increase realism at the cost of runtime. --- benches/gen_benchmark_inputs.py | 51 ++++++++------------------------- benches/main.rs | 18 ++++++------ 2 files changed, 21 insertions(+), 48 deletions(-) diff --git a/benches/gen_benchmark_inputs.py b/benches/gen_benchmark_inputs.py index eacc33d..142581d 100755 --- a/benches/gen_benchmark_inputs.py +++ b/benches/gen_benchmark_inputs.py @@ -8,8 +8,8 @@ def gen_field_summate(path, seed): - pos_no = 1000 - mode_no = 100 + pos_no = 100_000 + mode_no = 1_000 x = np.linspace(0.0, 10.0, pos_no) y = np.linspace(-5.0, 5.0, pos_no) z = np.linspace(-6.0, 8.0, pos_no) @@ -17,29 +17,12 @@ def gen_field_summate(path, seed): model = gs.Gaussian(dim=3, var=1.0, len_scale=1.0) rm = RandMeth(model, mode_no, seed) - np.savetxt(path / 'field_bench_x.txt', x) - np.savetxt(path / 'field_bench_y.txt', y) - np.savetxt(path / 'field_bench_z.txt', z) - np.savetxt(path / 'field_bench_cov_samples.txt', rm._cov_sample) - np.savetxt(path / 'field_bench_z_1.txt', rm._z_1) - np.savetxt(path / 'field_bench_z_2.txt', rm._z_2) - - pos_no = 100000 - mode_no = 1000 - x = np.linspace(0.0, 10.0, pos_no) - y = np.linspace(-5.0, 5.0, pos_no) - z = np.linspace(-6.0, 8.0, pos_no) - - model = gs.Gaussian(dim=3, var=1.0, len_scale=1.0) - - rm = RandMeth(model, mode_no, seed) - - np.savetxt(path / 'field_vs_x.txt', x) - np.savetxt(path / 'field_vs_y.txt', y) - np.savetxt(path / 'field_vs_z.txt', z) - np.savetxt(path / 'field_vs_cov_samples.txt', rm._cov_sample) - np.savetxt(path / 'field_vs_z_1.txt', rm._z_1) - np.savetxt(path / 'field_vs_z_2.txt', rm._z_2) + np.savetxt(path / 'field_x.txt', x) + np.savetxt(path / 'field_y.txt', y) + np.savetxt(path / 'field_z.txt', z) + np.savetxt(path / 'field_cov_samples.txt', rm._cov_sample) + np.savetxt(path / 'field_z_1.txt', rm._z_1) + np.savetxt(path / 'field_z_2.txt', rm._z_2) def gen_krige(path, seed): def prepare_data(pos_no, cond_no): @@ -67,21 +50,11 @@ def prepare_data(pos_no, cond_no): return krige_mat, k_vec, krige_cond - pos_no = 10000 - cond_no = 500 - krige_mat, k_vec, krige_cond = prepare_data(pos_no, cond_no) - - np.savetxt(path / 'krige_vs_krige_mat.txt', krige_mat) - np.savetxt(path / 'krige_vs_k_vec.txt', k_vec) - np.savetxt(path / 'krige_vs_krige_cond.txt', krige_cond) - - pos_no = 1000 - cond_no = 50 - krige_mat, k_vec, krige_cond = prepare_data(pos_no, cond_no) + krige_mat, k_vec, krige_cond = prepare_data(pos_no=10_000, cond_no=500) - np.savetxt(path / 'krige_bench_krige_mat.txt', krige_mat) - np.savetxt(path / 'krige_bench_k_vec.txt', k_vec) - np.savetxt(path / 'krige_bench_krige_cond.txt', krige_cond) + np.savetxt(path / 'krige_krige_mat.txt', krige_mat) + np.savetxt(path / 'krige_k_vec.txt', k_vec) + np.savetxt(path / 'krige_krige_cond.txt', krige_cond) if __name__ == '__main__': diff --git a/benches/main.rs b/benches/main.rs index 81a1064..730f601 100644 --- a/benches/main.rs +++ b/benches/main.rs @@ -40,16 +40,16 @@ fn read_2d_from_file(file_path: &Path) -> Array2 { pub fn field_benchmark(c: &mut Criterion) { let path = Path::new("benches/input"); - let x = read_1d_from_file(&path.join("field_bench_x.txt")); - let y = read_1d_from_file(&path.join("field_bench_y.txt")); - let z = read_1d_from_file(&path.join("field_bench_z.txt")); + let x = read_1d_from_file(&path.join("field_x.txt")); + let y = read_1d_from_file(&path.join("field_y.txt")); + let z = read_1d_from_file(&path.join("field_z.txt")); let pos = stack![Axis(0), x, y, z]; - let cov_samples = read_2d_from_file(&path.join("field_bench_cov_samples.txt")); + let cov_samples = read_2d_from_file(&path.join("field_cov_samples.txt")); - let z_1 = read_1d_from_file(&path.join("field_bench_z_1.txt")); - let z_2 = read_1d_from_file(&path.join("field_bench_z_2.txt")); + let z_1 = read_1d_from_file(&path.join("field_z_1.txt")); + let z_2 = read_1d_from_file(&path.join("field_z_2.txt")); c.bench_function("field summate", |b| { b.iter(|| summator(cov_samples.view(), z_1.view(), z_2.view(), pos.view())) @@ -63,9 +63,9 @@ pub fn field_benchmark(c: &mut Criterion) { pub fn krige_benchmark(c: &mut Criterion) { let path = Path::new("benches/input"); - let krige_mat = read_2d_from_file(&path.join("krige_bench_krige_mat.txt")); - let k_vec = read_2d_from_file(&path.join("krige_bench_k_vec.txt")); - let krige_cond = read_1d_from_file(&path.join("krige_bench_krige_cond.txt")); + let krige_mat = read_2d_from_file(&path.join("krige_krige_mat.txt")); + let k_vec = read_2d_from_file(&path.join("krige_k_vec.txt")); + let krige_cond = read_1d_from_file(&path.join("krige_krige_cond.txt")); c.bench_function("krige error", |b| { b.iter(|| { From e913fdd130b84a232fa268f1013c1bcdad566d98 Mon Sep 17 00:00:00 2001 From: Adam Reichold Date: Thu, 30 Sep 2021 09:57:29 +0200 Subject: [PATCH 2/5] Do not parallelize the second loop level of the summator_incompr function When switching to larger problem sizes, this yields a measurable speed-up Benchmarking field summate incompr: Warming up for 3.0000 s Warning: Unable to complete 100 samples in 5.0s. You may wish to increase target time to 149.7s, or reduce sample count to 10. field summate incompr time: [1.4950 s 1.4954 s 1.4958 s] change: [-24.384% -23.691% -23.212%] (p = 0.00 < 0.05) Performance has improved. Found 13 outliers among 100 measurements (13.00%) 4 (4.00%) high mild 9 (9.00%) high severe --- src/field.rs | 42 +++++++++++++----------------------------- 1 file changed, 13 insertions(+), 29 deletions(-) diff --git a/src/field.rs b/src/field.rs index ed2df37..33f9a71 100644 --- a/src/field.rs +++ b/src/field.rs @@ -1,5 +1,4 @@ use ndarray::{Array1, Array2, ArrayView1, ArrayView2, Zip}; -use rayon::prelude::*; pub fn summator( cov_samples: ArrayView2<'_, f64>, @@ -50,36 +49,21 @@ pub fn summator_incompr( Zip::from(pos.columns()) .and(summed_modes.columns_mut()) .par_for_each(|pos, mut summed_modes| { - let sum = Zip::from(cov_samples.columns()) + Zip::from(cov_samples.columns()) .and(z1) .and(z2) - .into_par_iter() - .fold( - || Array1::::zeros(dim), - |mut sum, (cov_samples, z1, z2)| { - let k_2 = cov_samples.dot(&cov_samples); - let phase = cov_samples.dot(&pos); - - Zip::from(&mut sum) - .and(&e1) - .and(cov_samples) - .for_each(|sum, e1, cs| { - let proj = *e1 - cs * cov_samples[0] / k_2; - *sum += proj * (z1 * phase.cos() + z2 * phase.sin()); - }); - - sum - }, - ) - .reduce( - || Array1::::zeros(dim), - |mut lhs, rhs| { - lhs += &rhs; - lhs - }, - ); - - summed_modes.assign(&sum); + .for_each(|cov_samples, z1, z2| { + let k_2 = cov_samples.dot(&cov_samples); + let phase = cov_samples.dot(&pos); + + Zip::from(&mut summed_modes) + .and(&e1) + .and(cov_samples) + .for_each(|sum, e1, cs| { + let proj = *e1 - cs * cov_samples[0] / k_2; + *sum += proj * (z1 * phase.cos() + z2 * phase.sin()); + }); + }); }); summed_modes From c688b82bd4bd09bb42e6adfc80ce73ecca5fa029 Mon Sep 17 00:00:00 2001 From: Adam Reichold Date: Thu, 30 Sep 2021 09:58:43 +0200 Subject: [PATCH 3/5] Parallelize the second loop level of the krige functions When switching to larger problem sizes, this does slightly improve performance: Benchmarking krige error: Warming up for 3.0000 s Warning: Unable to complete 100 samples in 5.0s. You may wish to increase target time to 350.8s, or reduce sample count to 10. krige error time: [1.2209 s 1.2643 s 1.3303 s] change: [-6.9990% -3.7794% +0.8052%] (p = 0.07 > 0.05) No change in performance detected. Found 10 outliers among 100 measurements (10.00%) 1 (1.00%) high mild 9 (9.00%) high severe Benchmarking krige: Warming up for 3.0000 s Warning: Unable to complete 100 samples in 5.0s. You may wish to increase target time to 121.4s, or reduce sample count to 10. krige time: [1.2105 s 1.2118 s 1.2134 s] change: [-8.6930% -8.4186% -8.1473%] (p = 0.00 < 0.05) Performance has improved. Found 6 outliers among 100 measurements (6.00%) 1 (1.00%) low mild 3 (3.00%) high mild 2 (2.00%) high severe --- src/krige.rs | 41 +++++++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/src/krige.rs b/src/krige.rs index e97ec32..436bd4f 100644 --- a/src/krige.rs +++ b/src/krige.rs @@ -1,4 +1,5 @@ use ndarray::{Array1, ArrayView1, ArrayView2, Zip}; +use rayon::prelude::*; pub fn calculator_field_krige_and_variance( krig_mat: ArrayView2<'_, f64>, @@ -16,14 +17,30 @@ pub fn calculator_field_krige_and_variance( .and(error.view_mut()) .and(krig_vecs.columns()) .par_for_each(|f, e, v_col| { - Zip::from(cond) + let acc = Zip::from(cond) .and(v_col) .and(krig_mat.columns()) - .for_each(|c, v, m_row| { - let krig_fac = m_row.dot(&v_col); - *f += c * krig_fac; - *e += v * krig_fac; - }); + .into_par_iter() + .fold( + || (0.0, 0.0), + |mut acc, (c, v, m_row)| { + let krig_fac = m_row.dot(&v_col); + acc.0 += c * krig_fac; + acc.1 += v * krig_fac; + acc + }, + ) + .reduce( + || (0.0, 0.0), + |mut lhs, rhs| { + lhs.0 += rhs.0; + lhs.1 += rhs.1; + lhs + }, + ); + + *f = acc.0; + *e = acc.1; }); (field, error) @@ -43,12 +60,16 @@ pub fn calculator_field_krige( Zip::from(field.view_mut()) .and(krig_vecs.columns()) .par_for_each(|f, v_col| { - Zip::from(cond) + let acc = Zip::from(cond) .and(krig_mat.columns()) - .for_each(|c, m_row| { + .into_par_iter() + .map(|(c, m_row)| { let krig_fac = m_row.dot(&v_col); - *f += c * krig_fac; - }); + c * krig_fac + }) + .sum(); + + *f = acc; }); field From 8f582269a9351b64e3c88fdb31131905bb5fa86d Mon Sep 17 00:00:00 2001 From: Adam Reichold Date: Tue, 5 Oct 2021 14:16:33 +0200 Subject: [PATCH 4/5] Switch loop nesting for summator incompr implementation to avoid repeatedly computing k_2. Benchmarking field summate incompr: Warming up for 3.0000 s Warning: Unable to complete 10 samples in 5.0s. You may wish to increase target time to 10.7s. field summate incompr time: [1.0707 s 1.0732 s 1.0761 s] change: [-28.910% -28.377% -27.966%] (p = 0.00 < 0.05) Performance has improved. --- src/field.rs | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/src/field.rs b/src/field.rs index 33f9a71..4c09563 100644 --- a/src/field.rs +++ b/src/field.rs @@ -6,11 +6,11 @@ pub fn summator( z2: ArrayView1<'_, f64>, pos: ArrayView2<'_, f64>, ) -> Array1 { - assert!(cov_samples.shape()[0] == pos.shape()[0]); - assert!(cov_samples.shape()[1] == z1.shape()[0]); - assert!(z1.shape()[0] == z2.shape()[0]); + assert_eq!(cov_samples.dim().0, pos.dim().0); + assert_eq!(cov_samples.dim().1, z1.dim()); + assert_eq!(cov_samples.dim().1, z2.dim()); - let mut summed_modes = Array1::::zeros(pos.shape()[1]); + let mut summed_modes = Array1::::zeros(pos.dim().1); Zip::from(&mut summed_modes) .and(pos.columns()) @@ -34,34 +34,33 @@ pub fn summator_incompr( z2: ArrayView1<'_, f64>, pos: ArrayView2<'_, f64>, ) -> Array2 { - assert!(cov_samples.shape()[0] == pos.shape()[0]); - assert!(cov_samples.shape()[1] == z1.shape()[0]); - assert!(z1.shape()[0] == z2.shape()[0]); - - let dim = pos.shape()[0]; + assert_eq!(cov_samples.dim().0, pos.dim().0); + assert_eq!(cov_samples.dim().1, z1.dim()); + assert_eq!(cov_samples.dim().1, z2.dim()); let mut summed_modes = Array2::::zeros(pos.dim()); // unit vector in x dir. - let mut e1 = Array1::::zeros(dim); + let mut e1 = Array1::::zeros(pos.dim().0); e1[0] = 1.0; - Zip::from(pos.columns()) - .and(summed_modes.columns_mut()) - .par_for_each(|pos, mut summed_modes| { - Zip::from(cov_samples.columns()) - .and(z1) - .and(z2) - .for_each(|cov_samples, z1, z2| { - let k_2 = cov_samples.dot(&cov_samples); + Zip::from(cov_samples.columns()) + .and(z1) + .and(z2) + .for_each(|cov_samples, z1, z2| { + let k_2 = cov_samples[0] / cov_samples.dot(&cov_samples); + + Zip::from(pos.columns()) + .and(summed_modes.columns_mut()) + .par_for_each(|pos, mut summed_modes| { let phase = cov_samples.dot(&pos); + let z12 = z1 * phase.cos() + z2 * phase.sin(); Zip::from(&mut summed_modes) .and(&e1) .and(cov_samples) .for_each(|sum, e1, cs| { - let proj = *e1 - cs * cov_samples[0] / k_2; - *sum += proj * (z1 * phase.cos() + z2 * phase.sin()); + *sum += (*e1 - cs * k_2) * z12; }); }); }); From 931faf38ed9c607d6d457f8fb21c5008cc4468c0 Mon Sep 17 00:00:00 2001 From: Adam Reichold Date: Fri, 8 Oct 2021 11:35:22 +0200 Subject: [PATCH 5/5] Redo parallel implementation of summator_incompr This is a speed-up due to the call to `with_min_len` which however requires indexed parallel iterators which `ndarray` provides only via `axis_iter` and `axis_chunk_iter`, so this workaround is necessary until [1] is merged. [1] https://github.com/rust-ndarray/ndarray/pull/1081 --- src/field.rs | 64 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 25 deletions(-) diff --git a/src/field.rs b/src/field.rs index 4c09563..0b16c62 100644 --- a/src/field.rs +++ b/src/field.rs @@ -1,4 +1,5 @@ -use ndarray::{Array1, Array2, ArrayView1, ArrayView2, Zip}; +use ndarray::{Array1, Array2, ArrayView1, ArrayView2, Axis, Zip}; +use rayon::prelude::*; pub fn summator( cov_samples: ArrayView2<'_, f64>, @@ -38,34 +39,47 @@ pub fn summator_incompr( assert_eq!(cov_samples.dim().1, z1.dim()); assert_eq!(cov_samples.dim().1, z2.dim()); - let mut summed_modes = Array2::::zeros(pos.dim()); - // unit vector in x dir. let mut e1 = Array1::::zeros(pos.dim().0); e1[0] = 1.0; - Zip::from(cov_samples.columns()) - .and(z1) - .and(z2) - .for_each(|cov_samples, z1, z2| { - let k_2 = cov_samples[0] / cov_samples.dot(&cov_samples); - - Zip::from(pos.columns()) - .and(summed_modes.columns_mut()) - .par_for_each(|pos, mut summed_modes| { - let phase = cov_samples.dot(&pos); - let z12 = z1 * phase.cos() + z2 * phase.sin(); - - Zip::from(&mut summed_modes) - .and(&e1) - .and(cov_samples) - .for_each(|sum, e1, cs| { - *sum += (*e1 - cs * k_2) * z12; - }); - }); - }); - - summed_modes + // FIXME: Use Zip::from(cov_samples.columns()).and(z1).and(z1) after + // https://github.com/rust-ndarray/ndarray/pull/1081 is merged. + cov_samples + .axis_iter(Axis(1)) + .into_par_iter() + .zip(z1.axis_iter(Axis(0))) + .zip(z2.axis_iter(Axis(0))) + .with_min_len(100) + .fold( + || Array2::::zeros(pos.dim()), + |mut summed_modes, ((cov_samples, z1), z2)| { + let k_2 = cov_samples[0] / cov_samples.dot(&cov_samples); + let z1 = z1.into_scalar(); + let z2 = z2.into_scalar(); + + Zip::from(pos.columns()) + .and(summed_modes.columns_mut()) + .par_for_each(|pos, mut summed_modes| { + let phase = cov_samples.dot(&pos); + let z12 = z1 * phase.cos() + z2 * phase.sin(); + + Zip::from(&mut summed_modes) + .and(&e1) + .and(cov_samples) + .for_each(|sum, e1, cs| { + *sum += (*e1 - cs * k_2) * z12; + }); + }); + + summed_modes + }, + ) + .reduce_with(|mut lhs, rhs| { + lhs += &rhs; + lhs + }) + .unwrap() } #[cfg(test)]