diff --git a/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h
index d6870ae1c..7191f8a05 100644
--- a/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h
+++ b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h
@@ -16,11 +16,12 @@
 #pragma once
 
 #include "../common/ann_types.hpp"
-#include "../common/thread_pool.hpp"
 #include "../common/util.hpp"
 
 #include <hnswlib/hnswlib.h>
 
+#include <omp.h>
+
 #include <algorithm>
 #include <atomic>
 #include <cassert>
@@ -66,13 +67,13 @@ class hnsw_lib : public algo<T> {
   struct build_param {
     int m;
     int ef_construction;
-    int num_threads = omp_get_num_procs();
+    int num_threads = omp_get_max_threads();
   };
 
   using search_param_base = typename algo<T>::search_param;
   struct search_param : public search_param_base {
     int ef;
-    int num_threads = 1;
+    int num_threads = omp_get_max_threads();
   };
 
   hnsw_lib(Metric metric, int dim, const build_param& param);
@@ -114,7 +115,6 @@ class hnsw_lib : public algo<T> {
   int ef_construction_;
   int m_;
   int num_threads_;
-  std::shared_ptr<fixed_thread_pool> thread_pool_;
   Mode bench_mode_;
 };
 
@@ -150,22 +150,10 @@ void hnsw_lib<T>::build(const T* dataset, size_t nrow)
   appr_alg_ = std::make_shared<hnswlib::HierarchicalNSW<typename hnsw_dist_t<T>::type>>(
     space_.get(), nrow, m_, ef_construction_);
 
-  thread_pool_                  = std::make_shared<fixed_thread_pool>(num_threads_);
-  const size_t items_per_thread = nrow / (num_threads_ + 1);
-
-  thread_pool_->submit(
-    [&](size_t i) {
-      if (i < items_per_thread && i % 10000 == 0) {
-        char buf[20];
-        std::time_t now = std::time(nullptr);
-        std::strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", std::localtime(&now));
-        printf("%s building %zu / %zu\n", buf, i, items_per_thread);
-        fflush(stdout);
-      }
-
-      appr_alg_->addPoint(dataset + i * dim_, i);
-    },
-    nrow);
+#pragma omp parallel for num_threads(num_threads_)
+  for (size_t i = 0; i < nrow; i++) {
+    appr_alg_->addPoint(dataset + i * dim_, i);
+  }
 }
 
 template <typename T>
@@ -175,12 +163,7 @@ void hnsw_lib<T>::set_search_param(const search_param_base& param_, const void*
   auto param     = dynamic_cast<const search_param&>(param_);
   appr_alg_->ef_ = param.ef;
   num_threads_   = param.num_threads;
-  // bench_mode_ = param.metric_objective;
   bench_mode_ = Mode::kLatency;  // TODO(achirkin): pass the benchmark mode in the algo parameters
-
-  // Create a pool if multiple query threads have been set and the pool hasn't been created already
-  bool create_pool = (bench_mode_ == Mode::kLatency && num_threads_ > 1 && !thread_pool_);
-  if (create_pool) { thread_pool_ = std::make_shared<fixed_thread_pool>(num_threads_); }
 }
 
 template <typename T>
@@ -192,7 +175,10 @@ void hnsw_lib<T>::search(
     get_search_knn_results(query + i * dim_, k, indices + i * k, distances + i * k);
   };
   if (bench_mode_ == Mode::kLatency && num_threads_ > 1) {
-    thread_pool_->submit(f, batch_size);
+#pragma omp parallel for num_threads(num_threads_)
+    for (int i = 0; i < batch_size; i++) {
+      f(i);
+    }
   } else {
     for (int i = 0; i < batch_size; i++) {
       f(i);
diff --git a/cpp/src/neighbors/detail/hnsw.hpp b/cpp/src/neighbors/detail/hnsw.hpp
index 6ab8631d4..1aff6229b 100644
--- a/cpp/src/neighbors/detail/hnsw.hpp
+++ b/cpp/src/neighbors/detail/hnsw.hpp
@@ -489,26 +489,15 @@ void search(raft::resources const& res,
   auto const* hnswlib_index =
     reinterpret_cast<hnswlib::HierarchicalNSW<typename hnsw_dist_t<T>::type> const*>(
       idx.get_index());
+  auto num_threads = params.num_threads == 0 ? omp_get_max_threads() : params.num_threads;
 
-  // when num_threads == 0, automatically maximize parallelism
-  if (params.num_threads) {
-#pragma omp parallel for num_threads(params.num_threads)
-    for (int64_t i = 0; i < queries.extent(0); ++i) {
-      get_search_knn_results(hnswlib_index,
-                             queries.data_handle() + i * queries.extent(1),
-                             neighbors.extent(1),
-                             neighbors.data_handle() + i * neighbors.extent(1),
-                             distances.data_handle() + i * distances.extent(1));
-    }
-  } else {
-#pragma omp parallel for
-    for (int64_t i = 0; i < queries.extent(0); ++i) {
-      get_search_knn_results(hnswlib_index,
-                             queries.data_handle() + i * queries.extent(1),
-                             neighbors.extent(1),
-                             neighbors.data_handle() + i * neighbors.extent(1),
-                             distances.data_handle() + i * distances.extent(1));
-    }
+#pragma omp parallel for num_threads(num_threads)
+  for (int64_t i = 0; i < queries.extent(0); ++i) {
+    get_search_knn_results(hnswlib_index,
+                           queries.data_handle() + i * queries.extent(1),
+                           neighbors.extent(1),
+                           neighbors.data_handle() + i * neighbors.extent(1),
+                           distances.data_handle() + i * distances.extent(1));
   }
 }
 
diff --git a/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra_hnswlib.yaml b/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra_hnswlib.yaml
index 063502290..bf5cd35d3 100644
--- a/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra_hnswlib.yaml
+++ b/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra_hnswlib.yaml
@@ -8,7 +8,7 @@ groups:
       graph_degree: [32, 64, 96, 128]
       intermediate_graph_degree: [32, 64, 96, 128]
       graph_build_algo: ["NN_DESCENT"]
-      hierarchy: ["none", "cpu"]
+      hierarchy: ["none", "cpu", "gpu"]
       ef_construction: [64, 128, 256, 512]
     search:
       ef: [10, 20, 40, 60, 80, 120, 200, 400, 600, 800]