Commit 1de6513

Fix preallocate_batch_size to take into account the hw_load.
Signed-off-by: Joaquin Anton Guirao <janton@nvidia.com>
1 parent de81944 commit 1de6513

1 file changed: +44 -2 lines changed

dali/operators/imgcodec/image_decoder.h

+44 -2
@@ -19,6 +19,7 @@
 #include "dali/core/call_at_exit.h"
 #include "dali/core/mm/memory.h"
 #include "dali/operators.h"
+#include "dali/operators/decoder/nvjpeg/nvjpeg_helper.h"
 #include "dali/operators/decoder/cache/cached_decoder_impl.h"
 #include "dali/operators/generic/slice/slice_attr.h"
 #include "dali/operators/image/crop/crop_attr.h"
@@ -31,6 +32,10 @@
 #include "dali/pipeline/operator/common.h"
 #include "dali/pipeline/operator/operator.h"

+// TODO(janton): remove this when there's no need to query the HW decoder config
+// nvjpeg dynlink wrapper, or true if linking statically
+bool nvjpegIsSymbolAvailable(const char *name);
+
 #if not(WITH_DYNAMIC_NVIMGCODEC_ENABLED)
 nvimgcodecStatus_t get_libjpeg_turbo_extension_desc(nvimgcodecExtensionDesc_t* ext_desc);
 nvimgcodecStatus_t get_libtiff_extension_desc(nvimgcodecExtensionDesc_t* ext_desc);
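The forward declaration above is satisfied by DALI's nvjpeg dynlink wrapper (or is trivially true when nvjpeg is linked statically). As a hedged illustration only, a symbol probe of this kind could be written with a dlopen-based loader; the soname and flags below are assumptions, not the commit's implementation:

#include <dlfcn.h>  // POSIX dlopen/dlsym

// Sketch only: report whether an already-loaded nvjpeg exposes a given symbol.
// "libnvjpeg.so" and the RTLD flags are assumptions, not DALI's actual loader.
bool nvjpegIsSymbolAvailable(const char *name) {
  // RTLD_NOLOAD: succeed only if the library is already mapped into the process.
  static void *lib = dlopen("libnvjpeg.so", RTLD_NOW | RTLD_NOLOAD);
  if (!lib)
    return false;  // nvjpeg is not loaded (or could not be found)
  return dlsym(lib, name) != nullptr;
}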
@@ -354,8 +359,31 @@ class ImageDecoder : public StatelessOperator<Backend> {
     opts_.add_module_option("nvjpeg_cuda_decoder", "preallocate_buffers", true);

     // Batch size
-    opts_.add_module_option("nvjpeg_hw_decoder", "preallocate_batch_size",
-                            std::max(1, max_batch_size_));
+    int hw_batch_size = max_batch_size_;
+    // TODO(janton): workaround to be removed
+    if (need_nvjpeg_hw_preallocate_fix()) {
+      hw_batch_size = 0;
+      if (hw_load > 0.f) {
+        nvjpegHandle_t handle;
+        unsigned int nvjpeg_flags = 0;
+        unsigned int num_hw_engines = 1;
+        unsigned int num_hw_cores_per_engine = 1;
+        if (nvjpegCreateEx(NVJPEG_BACKEND_HARDWARE, NULL, NULL, nvjpeg_flags, &handle) == NVJPEG_STATUS_SUCCESS) {
+          if (nvjpegIsSymbolAvailable("nvjpegGetHardwareDecoderInfo")) {
+            nvjpegGetHardwareDecoderInfo(handle, &num_hw_engines, &num_hw_cores_per_engine);
+          } else {
+            DALI_WARN("nvjpegGetHardwareDecoderInfo API not available. Assuming 5 cores per engine.");
+            num_hw_engines = 1;
+            num_hw_cores_per_engine = 5;
+          }
+          CUDA_CALL(nvjpegDestroy(handle));
+        } else {
+          LOG_LINE << "Failed to create nvjpeg handle for the Hardware backend.\n";
+        }
+      }
+    }
+    opts_.add_module_option("nvjpeg_hw_decoder", "preallocate_batch_size", hw_batch_size);
+
     // Nvjpeg2k parallel tiles
     opts_.add_module_option("nvjpeg2k_cuda_decoder", "num_parallel_tiles", 16);

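The added block queries the number of hardware decode engines and cores per engine before setting the preallocation hint. As a hedged sketch only, not code from this commit, here is one way the hw_load hint and those counts could be folded into a preallocation batch size (all function and parameter names below are hypothetical):

#include <algorithm>
#include <cmath>

// Sketch: scale the preallocation by the fraction of the batch routed to the
// HW decoder and round up to a whole multiple of the available HW decode cores.
inline int HwPreallocateBatchSize(float hw_load, int max_batch_size,
                                  unsigned num_hw_engines,
                                  unsigned num_hw_cores_per_engine) {
  if (hw_load <= 0.f)
    return 0;  // nothing is routed to the HW decoder, so skip preallocation
  int num_cores = static_cast<int>(num_hw_engines * num_hw_cores_per_engine);
  int hw_samples = static_cast<int>(std::ceil(hw_load * max_batch_size));
  int rounded = (hw_samples + num_cores - 1) / num_cores * num_cores;
  return std::min(rounded, max_batch_size);
}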
@@ -610,6 +638,18 @@ class ImageDecoder : public StatelessOperator<Backend> {
     return !version_at_least(0, 3, 0);
   }

+  /**
+   * @brief nvImageCodec up to 0.3 doesn't take the hw load hint into account for memory
+   *        preallocation, which causes some cuMemFree calls during decoding, later in the
+   *        pipeline execution. This works around the issue by limiting the preallocation
+   *        batch size on the DALI side.
+   */
+  bool need_nvjpeg_hw_preallocate_fix() {
+    int major, minor, patch;
+    get_nvimgcodec_version(&major, &minor, &patch);
+    return !version_at_least(0, 4, 0);
+  }
+
   template <typename OutBackend>
   void PrepareOutput(SampleState &st, SampleView<OutBackend> out, const ROI &roi,
                      const Workspace &ws) {
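The version gate relies on a version_at_least() helper of the decoder whose implementation is not part of this diff. A minimal sketch of the comparison it implies, written here as a free function with an assumed six-argument signature for illustration:

#include <tuple>

// Sketch only: lexicographic (major, minor, patch) comparison. The member
// helper used above presumably compares the cached nvImageCodec version;
// this standalone signature is hypothetical.
inline bool VersionAtLeast(int major, int minor, int patch,
                           int min_major, int min_minor, int min_patch) {
  return std::tie(major, minor, patch) >= std::tie(min_major, min_minor, min_patch);
}

// E.g. the workaround applies to every nvImageCodec release below 0.4.0:
//   bool need_fix = !VersionAtLeast(major, minor, patch, 0, 4, 0);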
@@ -803,6 +843,8 @@ class ImageDecoder : public StatelessOperator<Backend> {
       }
       return false;
     };
+
+    // TODO(janton): workaround to be removed
     if (ws.has_stream() && need_host_sync_alloc() && any_need_processing()) {
      DomainTimeRange tr("alloc sync", DomainTimeRange::kOrange);
      CUDA_CALL(cudaStreamSynchronize(ws.stream()));
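The new TODO ties the synchronization below it to the same workaround. As a hedged illustration of the general pattern only (an assumption about the rationale, not DALI internals): when output buffers are not allocated in a stream-ordered fashion, the stream is synchronized first so that recycled memory is no longer in use by the GPU:

#include <cuda_runtime.h>

// Sketch: wait for outstanding GPU work before a host-ordered (re)allocation;
// a stream-ordered allocator (e.g. cudaMallocAsync) would not need this wait.
inline void SyncIfHostOrderedAlloc(cudaStream_t stream, bool stream_ordered_alloc) {
  if (!stream_ordered_alloc) {
    cudaStreamSynchronize(stream);
  }
  // ...allocate or resize output buffers here...
}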
