|
19 | 19 | #include "dali/core/call_at_exit.h"
|
20 | 20 | #include "dali/core/mm/memory.h"
|
21 | 21 | #include "dali/operators.h"
|
| 22 | +#include "dali/operators/decoder/nvjpeg/nvjpeg_helper.h" |
22 | 23 | #include "dali/operators/decoder/cache/cached_decoder_impl.h"
|
23 | 24 | #include "dali/operators/generic/slice/slice_attr.h"
|
24 | 25 | #include "dali/operators/image/crop/crop_attr.h"
|
|
31 | 32 | #include "dali/pipeline/operator/common.h"
|
32 | 33 | #include "dali/pipeline/operator/operator.h"
|
33 | 34 |
|
| 35 | +// TODO(janton): remove this when there's no need to query the HW decoder config |
| 36 | +// nvjpeg dynlink wrapper, or true if linking statically |
| 37 | +bool nvjpegIsSymbolAvailable(const char *name); |
| 38 | + |
34 | 39 | #if not(WITH_DYNAMIC_NVIMGCODEC_ENABLED)
|
35 | 40 | nvimgcodecStatus_t get_libjpeg_turbo_extension_desc(nvimgcodecExtensionDesc_t* ext_desc);
|
36 | 41 | nvimgcodecStatus_t get_libtiff_extension_desc(nvimgcodecExtensionDesc_t* ext_desc);
|
@@ -354,8 +359,31 @@ class ImageDecoder : public StatelessOperator<Backend> {
|
354 | 359 | opts_.add_module_option("nvjpeg_cuda_decoder", "preallocate_buffers", true);
|
355 | 360 |
|
356 | 361 | // Batch size
|
357 |
| - opts_.add_module_option("nvjpeg_hw_decoder", "preallocate_batch_size", |
358 |
| - std::max(1, max_batch_size_)); |
| 362 | + int hw_batch_size = max_batch_size_; |
| 363 | + // TODO(janton): workaround to be removed |
| 364 | + if (need_nvjpeg_hw_preallocate_fix()) { |
| 365 | + hw_batch_size = 0; |
| 366 | + if (hw_load > 0.f) { |
| 367 | + nvjpegHandle_t handle; |
| 368 | + unsigned int nvjpeg_flags = 0; |
| 369 | + unsigned int num_hw_engines = 1; |
| 370 | + unsigned int num_hw_cores_per_engine = 1; |
| 371 | + if (nvjpegCreateEx(NVJPEG_BACKEND_HARDWARE, NULL, NULL, nvjpeg_flags, &handle) == NVJPEG_STATUS_SUCCESS) { |
| 372 | + if (nvjpegIsSymbolAvailable("nvjpegGetHardwareDecoderInfo")) { |
| 373 | + nvjpegGetHardwareDecoderInfo(handle, &num_hw_engines, &num_hw_cores_per_engine); |
| 374 | + } else { |
| 375 | + DALI_WARN("nvjpegGetHardwareDecoderInfo API not available. Assuming 5 cores per engine."); |
| 376 | + num_hw_engines = 1; |
| 377 | + num_hw_cores_per_engine = 5; |
| 378 | + } |
| 379 | + CUDA_CALL(nvjpegDestroy(handle)); |
| 380 | + } else { |
| 381 | + LOG_LINE << "Failed to create nvjpeg handle for the Hardware backend.\n"; |
| 382 | + } |
| 383 | + } |
| 384 | + } |
| 385 | + opts_.add_module_option("nvjpeg_hw_decoder", "preallocate_batch_size", hw_batch_size); |
| 386 | + |
359 | 387 | // Nvjpeg2k parallel tiles
|
360 | 388 | opts_.add_module_option("nvjpeg2k_cuda_decoder", "num_parallel_tiles", 16);
|
361 | 389 |
|
@@ -610,6 +638,18 @@ class ImageDecoder : public StatelessOperator<Backend> {
|
610 | 638 | return !version_at_least(0, 3, 0);
|
611 | 639 | }
|
612 | 640 |
|
| 641 | + /** |
| 642 | + * @brief nvImageCodec up to 0.3 doesn't take into account the hw load hint for the memory |
| 643 | + * preallocation, which causes some cuMemFree calls during the decoding, later in the |
| 644 | + * pipeline execution. This workarounds this issue by limiting the preallocate batch size |
| 645 | + * on the DALI side. |
| 646 | + */ |
| 647 | + bool need_nvjpeg_hw_preallocate_fix() { |
| 648 | + int major, minor, patch; |
| 649 | + get_nvimgcodec_version(&major, &minor, &patch); |
| 650 | + return !version_at_least(0, 4, 0); |
| 651 | + } |
| 652 | + |
613 | 653 | template <typename OutBackend>
|
614 | 654 | void PrepareOutput(SampleState &st, SampleView<OutBackend> out, const ROI &roi,
|
615 | 655 | const Workspace &ws) {
|
@@ -803,6 +843,8 @@ class ImageDecoder : public StatelessOperator<Backend> {
|
803 | 843 | }
|
804 | 844 | return false;
|
805 | 845 | };
|
| 846 | + |
| 847 | + // TODO(janton): workaround to be removed |
806 | 848 | if (ws.has_stream() && need_host_sync_alloc() && any_need_processing()) {
|
807 | 849 | DomainTimeRange tr("alloc sync", DomainTimeRange::kOrange);
|
808 | 850 | CUDA_CALL(cudaStreamSynchronize(ws.stream()));
|
|
0 commit comments