diff --git a/GPU/GPUTracking/DataCompression/GPUTPCClusterStatistics.cxx b/GPU/GPUTracking/DataCompression/GPUTPCClusterStatistics.cxx
index 04e57426c1927..f050563645faf 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCClusterStatistics.cxx
+++ b/GPU/GPUTracking/DataCompression/GPUTPCClusterStatistics.cxx
@@ -110,7 +110,7 @@ void GPUTPCClusterStatistics::RunStatistics(const o2::tpc::ClusterNativeAccess*
   std::vector<ClusterNative> clusterBuffer;
   GPUInfo("Compression statistics, decoding: %d attached (%d tracks), %d unattached", clustersCompressed->nAttachedClusters, clustersCompressed->nTracks, clustersCompressed->nUnattachedClusters);
   auto allocator = [&clusterBuffer](size_t size) { clusterBuffer.resize(size); return clusterBuffer.data(); };
-  mDecoder.decompress(clustersCompressed, clustersNativeDecoded, allocator, param);
+  mDecoder.decompress(clustersCompressed, clustersNativeDecoded, allocator, param, true);
   std::vector<ClusterNative> tmpClusters;
   if (param.rec.tpc.rejectionStrategy == GPUSettings::RejectionNone) { // verification does not make sense if we reject clusters during compression
     for (unsigned int i = 0; i < NSLICES; i++) {
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
index bbe9c338acc79..7cad6b4377f92 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
@@ -126,15 +126,16 @@ GPUdii() ClusterNative GPUTPCDecompressionKernels::decompressTrackStore(const o2
 }
 
 template <>
-GPUdii() void GPUTPCDecompressionKernels::Thread<GPUTPCDecompressionKernels::step1unattached>(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors, int iSlice)
+GPUdii() void GPUTPCDecompressionKernels::Thread<GPUTPCDecompressionKernels::step1unattached>(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors, int sliceStart, int nSlices)
 {
   GPUTPCDecompression& GPUrestrict() decompressor = processors.tpcDecompressor;
   CompressedClusters& GPUrestrict() cmprClusters = decompressor.mInputGPU;
   ClusterNative* GPUrestrict() clusterBuffer = decompressor.mNativeClustersBuffer;
   const ClusterNativeAccess* outputAccess = processors.ioPtrs.clustersNative;
-  unsigned int* offsets = decompressor.mUnattachedClustersOffsets;
-  for (unsigned int iRow = get_global_id(0); iRow < GPUCA_ROW_COUNT; iRow += get_global_size(0)) {
+  for (unsigned int i = get_global_id(0); i < GPUCA_ROW_COUNT * nSlices; i += get_global_size(0)) {
+    unsigned int iRow = i % GPUCA_ROW_COUNT;
+    unsigned int iSlice = sliceStart + (i / GPUCA_ROW_COUNT);
     const int linearIndex = iSlice * GPUCA_ROW_COUNT + iRow;
     unsigned int tmpBufferIndex = computeLinearTmpBufferIndex(iSlice, iRow, decompressor.mMaxNativeClustersPerBuffer);
     ClusterNative* buffer = clusterBuffer + outputAccess->clusterOffset[iSlice][iRow];
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h
index 4d21bcaef1ab8..904dc5aabd2c4 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h
@@ -62,8 +62,7 @@ class GPUTPCDecompressionUtilKernels : public GPUKernelTemplate
 {
  public:
   enum K : int {
-    gatherAttachedClusters = 0,
-    sortPerSectorRow = 1,
+    sortPerSectorRow = 0,
   };
 
   template <int iKernel = defaultKernel>
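The reworked step1unattached specialization above no longer processes one slice per launch: it strides a single flattened index over all (slice, row) pairs of a batch. A minimal sketch of just that index arithmetic, with plain C++ parameters standing in for get_global_id(0)/get_global_size(0) and an illustrative row count:

#include <cstdio>

constexpr unsigned int kRowCount = 152; // stand-in for GPUCA_ROW_COUNT

void processRows(unsigned int sliceStart, unsigned int nSlices, unsigned int globalId, unsigned int globalSize)
{
  // One flat grid-stride loop covers every (slice, row) pair of the batch,
  // so a single launch can decode the unattached clusters of several slices.
  for (unsigned int i = globalId; i < kRowCount * nSlices; i += globalSize) {
    unsigned int iRow = i % kRowCount;                  // row within its slice
    unsigned int iSlice = sliceStart + (i / kRowCount); // slice within the batch
    std::printf("handle slice %u row %u (linear index %u)\n", iSlice, iRow, iSlice * kRowCount + iRow);
  }
}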
diff --git a/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx b/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx
index 08d1c8bac6762..61e3392af0f03 100644
--- a/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx
+++ b/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx
@@ -25,7 +25,7 @@
 using namespace GPUCA_NAMESPACE::gpu;
 using namespace o2::tpc;
 
-int TPCClusterDecompressor::decompress(const CompressedClustersFlat* clustersCompressed, o2::tpc::ClusterNativeAccess& clustersNative, std::function<ClusterNative*(size_t)> allocator, const GPUParam& param)
+int TPCClusterDecompressor::decompress(const CompressedClustersFlat* clustersCompressed, o2::tpc::ClusterNativeAccess& clustersNative, std::function<ClusterNative*(size_t)> allocator, const GPUParam& param, bool deterministicRec)
 {
   CompressedClusters c;
   const CompressedClusters* p;
@@ -35,10 +35,10 @@ int TPCClusterDecompressor::decompress(const CompressedClustersFlat* clustersCom
     c = *clustersCompressed;
     p = &c;
   }
-  return decompress(p, clustersNative, allocator, param);
+  return decompress(p, clustersNative, allocator, param, deterministicRec);
 }
 
-int TPCClusterDecompressor::decompress(const CompressedClusters* clustersCompressed, o2::tpc::ClusterNativeAccess& clustersNative, std::function<ClusterNative*(size_t)> allocator, const GPUParam& param)
+int TPCClusterDecompressor::decompress(const CompressedClusters* clustersCompressed, o2::tpc::ClusterNativeAccess& clustersNative, std::function<ClusterNative*(size_t)> allocator, const GPUParam& param, bool deterministicRec)
 {
   if (clustersCompressed->nTracks && clustersCompressed->solenoidBz != -1e6f && clustersCompressed->solenoidBz != param.bzkG) {
     throw std::runtime_error("Configured solenoid Bz does not match value used for track model encoding");
@@ -105,9 +105,10 @@ int TPCClusterDecompressor::decompress(const CompressedClusters* clustersCompres
           cl.setTime(t);
         }
       }
-      // std::sort(buffer, buffer + clustersNative.nClusters[i][j]);
+      if (deterministicRec) {
+        std::sort(buffer, buffer + clustersNative.nClusters[i][j]);
+      }
     }
   }
-
   return 0;
 }
diff --git a/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.h b/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.h
index 03c40e9b54923..79f14fe71e093 100644
--- a/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.h
+++ b/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.h
@@ -33,8 +33,8 @@ class TPCClusterDecompressor
 {
  public:
   static constexpr unsigned int NSLICES = GPUCA_NSLICES;
-  static int decompress(const o2::tpc::CompressedClustersFlat* clustersCompressed, o2::tpc::ClusterNativeAccess& clustersNative, std::function<o2::tpc::ClusterNative*(size_t)> allocator, const GPUParam& param);
-  static int decompress(const o2::tpc::CompressedClusters* clustersCompressed, o2::tpc::ClusterNativeAccess& clustersNative, std::function<o2::tpc::ClusterNative*(size_t)> allocator, const GPUParam& param);
+  static int decompress(const o2::tpc::CompressedClustersFlat* clustersCompressed, o2::tpc::ClusterNativeAccess& clustersNative, std::function<o2::tpc::ClusterNative*(size_t)> allocator, const GPUParam& param, bool deterministicRec);
+  static int decompress(const o2::tpc::CompressedClusters* clustersCompressed, o2::tpc::ClusterNativeAccess& clustersNative, std::function<o2::tpc::ClusterNative*(size_t)> allocator, const GPUParam& param, bool deterministicRec);
 
   template <typename... Args>
   static void decompressTrack(const o2::tpc::CompressedClusters* clustersCompressed, const GPUParam& param, const unsigned int maxTime, const unsigned int i, unsigned int& offset, Args&... args);
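Rationale for the new deterministicRec branch above: the parallel decoding paths fill each row buffer in a run-dependent order, so results are only bit-comparable after a canonical per-row sort. A stand-alone sketch of the idea, using a simplified stand-in for o2::tpc::ClusterNative (the real type compares its packed time/pad fields):

#include <algorithm>
#include <cstdint>
#include <vector>

// Simplified stand-in for o2::tpc::ClusterNative: ordering on (time, pad)
// mimics the packed-field comparison the real type provides.
struct Cluster {
  uint32_t time;
  uint16_t pad;
  bool operator<(const Cluster& o) const
  {
    return time != o.time ? time < o.time : pad < o.pad;
  }
};

// Sort one row's clusters so the decoded output is identical no matter in
// which order the parallel track-model decoding filled the buffer.
void makeRowDeterministic(std::vector<Cluster>& rowBuffer, bool deterministicRec)
{
  if (deterministicRec) {
    std::sort(rowBuffer.begin(), rowBuffer.end());
  }
}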
diff --git a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h
index db1a0c5863c57..9ab78d034d8bf 100644
--- a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h
+++ b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h
@@ -71,6 +71,8 @@
   #define GPUCA_LB_GPUTPCGMMergerFinalize_2 256
   #define GPUCA_LB_GPUTPCCompressionKernels_step0attached 64, 2
   #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 512, 2
+  #define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 128, 2
+  #define GPUCA_LB_GPUTPCDecompressionKernels_step1unattached 64, 2
   #define GPUCA_LB_GPUTPCCFCheckPadBaseline 64
   #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillIndexMap 512
   #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillFromDigits 512
@@ -135,6 +137,8 @@
   #define GPUCA_LB_GPUTPCGMMergerFinalize_2 256
   #define GPUCA_LB_GPUTPCCompressionKernels_step0attached 192, 2
   #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 512, 2
+  #define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 128, 2
+  #define GPUCA_LB_GPUTPCDecompressionKernels_step1unattached 64, 2
   #define GPUCA_LB_GPUTPCCFCheckPadBaseline 64
   #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillIndexMap 512
   #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillFromDigits 512
@@ -199,8 +203,8 @@
   #define GPUCA_LB_GPUTPCGMMergerFinalize_2 256
   #define GPUCA_LB_GPUTPCCompressionKernels_step0attached 64, 2
   #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 512, 3
-  #define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 64, 1
-  #define GPUCA_LB_GPUTPCDecompressionKernels_step1unattached 64, 1
+  #define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 32, 1
+  #define GPUCA_LB_GPUTPCDecompressionKernels_step1unattached 32, 1
   #define GPUCA_LB_GPUTPCCFCheckPadBaseline 64,8
   #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillIndexMap 448
   #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillFromDigits 448
@@ -265,8 +269,8 @@
   #define GPUCA_LB_GPUTPCGMMergerFinalize_2 256
   #define GPUCA_LB_GPUTPCCompressionKernels_step0attached 128
   #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 512, 2
-  #define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 64, 1
-  #define GPUCA_LB_GPUTPCDecompressionKernels_step1unattached 64, 1
+  #define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 32, 1
+  #define GPUCA_LB_GPUTPCDecompressionKernels_step1unattached 32, 1
   #define GPUCA_LB_COMPRESSION_GATHER 1024
   #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 4
   #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
index 8a9d98d16bf55..955b54645f6ad 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
@@ -17,7 +17,6 @@
 #include "GPUO2DataTypes.h"
 #include "GPUTrackingInputProvider.h"
 #include <numeric>
-#include <chrono>
 
 #ifdef GPUCA_HAVE_O2HEADERS
 #include "GPUTPCCFChainContext.h"
@@ -207,8 +206,7 @@ int GPUChainTracking::RunTPCCompression()
 
 int GPUChainTracking::RunTPCDecompression()
 {
-  auto start = std::chrono::high_resolution_clock::now();
-  //#ifdef GPUCA_HAVE_O2HEADERS
+#ifdef GPUCA_HAVE_O2HEADERS
   if (GetProcessingSettings().tpcUseOldCPUDecoding) {
     const auto& threadContext = GetThreadContext();
     TPCClusterDecompressor decomp;
@@ -219,7 +217,7 @@ int GPUChainTracking::RunTPCDecompression()
     };
     auto& gatherTimer = getTimer<TPCClusterDecompressor>("TPCDecompression", 0);
     gatherTimer.Start();
-    if (decomp.decompress(mIOPtrs.tpcCompressedClusters, *mClusterNativeAccess, allocator, param())) {
+    if (decomp.decompress(mIOPtrs.tpcCompressedClusters, *mClusterNativeAccess, allocator, param(), GetProcessingSettings().deterministicGPUReconstruction)) {
       GPUError("Error decompressing clusters");
       return 1;
     }
@@ -257,7 +255,6 @@ int GPUChainTracking::RunTPCDecompression()
   inputGPU.nComppressionModes = param().rec.tpc.compressionTypeMask;
   inputGPU.solenoidBz = param().bzkG;
   inputGPU.maxTimeBin = param().par.continuousMaxTimeBin;
-
   SetupGPUProcessor(&Decompressor, true);
   WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), inputStream);
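Each GPUCA_LB_* entry above is a per-architecture launch-bound tuning for one kernel, and the pairs read as (threads per block, blocks-per-multiprocessor hint). Assuming the macros ultimately feed CUDA's __launch_bounds__ attribute, which is the usual purpose of such pairs, the new step0attached value of "128, 2" would expand to roughly the following (kernel name and body are placeholders, not the real macro plumbing):

// Hypothetical expansion of the "128, 2" tuning pair: budget registers for
// 128 threads per block while keeping at least 2 resident blocks per
// multiprocessor. The actual wiring lives in the GPUCA_LB macros.
__global__ void __launch_bounds__(128, 2) step0attachedKernel()
{
  // track-model decoding of attached clusters would run here
}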
@@ -286,12 +283,12 @@ int GPUChainTracking::RunTPCDecompression()
   inputGPU.padA = cmprClsHost.padA;
 
   bool toGPU = true;
-  runKernel<GPUMemClean16>(GetGridAutoStep(inputStream, RecoStep::TPCDecompression), krnlRunRangeNone, &mEvents->init, DecompressorShadow.mNativeClustersIndex, NSLICES * GPUCA_ROW_COUNT * sizeof(DecompressorShadow.mNativeClustersIndex[0]));
-  std::exclusive_scan(cmprClsHost.nTrackClusters, cmprClsHost.nTrackClusters + cmprClsHost.nTracks, Decompressor.mAttachedClustersOffsets, 0u);
-  int nStreams = mRec->NStreams() - 1;
+  runKernel<GPUMemClean16>({GetGridAutoStep(inputStream, RecoStep::TPCDecompression), krnlRunRangeNone, &mEvents->init}, DecompressorShadow.mNativeClustersIndex, NSLICES * GPUCA_ROW_COUNT * sizeof(DecompressorShadow.mNativeClustersIndex[0]));
+  std::exclusive_scan(cmprClsHost.nTrackClusters, cmprClsHost.nTrackClusters + cmprClsHost.nTracks, Decompressor.mAttachedClustersOffsets, 0u); // compute per-track cluster offsets for the first kernel
+  int nStreams = doGPU ? mRec->NStreams() - 1 : 1;
   for (unsigned int iStream = 0; iStream < nStreams; ++iStream) {
     unsigned int startTrack = cmprClsHost.nTracks / nStreams * iStream;
-    unsigned int endTrack = cmprClsHost.nTracks / nStreams * (iStream + 1) + (iStream < nStreams - 1 ? 0 : cmprClsHost.nTracks % nStreams);
+    unsigned int endTrack = cmprClsHost.nTracks / nStreams * (iStream + 1) + (iStream < nStreams - 1 ? 0 : cmprClsHost.nTracks % nStreams); // end of this stream's track range (exclusive)
     unsigned int numTracks = endTrack - startTrack;
     unsigned int* offsets = Decompressor.mAttachedClustersOffsets;
     unsigned int numClusters = (endTrack == cmprClsHost.nTracks ? offsets[endTrack - 1] + cmprClsHost.nTrackClusters[endTrack - 1] : offsets[endTrack]) - offsets[startTrack];
@@ -312,10 +309,8 @@ int GPUChainTracking::RunTPCDecompression()
     GPUMemCpy(myStep, inputGPUShadow.sliceA + startTrack, cmprClsHost.sliceA + startTrack, numTracks * sizeof(cmprClsHost.sliceA[0]), iStream, toGPU);
     GPUMemCpy(myStep, inputGPUShadow.timeA + startTrack, cmprClsHost.timeA + startTrack, numTracks * sizeof(cmprClsHost.timeA[0]), iStream, toGPU);
     GPUMemCpy(myStep, inputGPUShadow.padA + startTrack, cmprClsHost.padA + startTrack, numTracks * sizeof(cmprClsHost.padA[0]), iStream, toGPU);
-
-    runKernel<GPUTPCDecompressionKernels, GPUTPCDecompressionKernels::step0attached>({60, 96, iStream}, krnlRunRangeNone, {&mEvents->stream[iStream], &mEvents->init}, startTrack, endTrack);
+    runKernel<GPUTPCDecompressionKernels, GPUTPCDecompressionKernels::step0attached>({GetGridAuto(iStream), krnlRunRangeNone, {&mEvents->stream[iStream], &mEvents->init}}, startTrack, endTrack);
   }
-
   GPUMemCpy(myStep, inputGPUShadow.nSliceRowClusters, cmprClsHost.nSliceRowClusters, NSLICES * GPUCA_ROW_COUNT * sizeof(cmprClsHost.nSliceRowClusters[0]), unattachedStream, toGPU);
   GPUMemCpy(myStep, inputGPUShadow.qTotU, cmprClsHost.qTotU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.qTotU[0]), unattachedStream, toGPU);
   GPUMemCpy(myStep, inputGPUShadow.qMaxU, cmprClsHost.qMaxU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.qMaxU[0]), unattachedStream, toGPU);
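The host-side bookkeeping above (std::exclusive_scan into per-track offsets, then an even track split across streams with the remainder on the last one) can be exercised in isolation. The numbers below are made up and only the standard library is used:

#include <cstdio>
#include <numeric>
#include <vector>

int main()
{
  std::vector<unsigned int> nTrackClusters{3, 5, 2, 4, 6, 1, 7, 2}; // clusters per track (invented)
  const unsigned int nTracks = static_cast<unsigned int>(nTrackClusters.size());

  // Exclusive prefix sum: offsets[t] = index of the first cluster of track t.
  std::vector<unsigned int> offsets(nTracks);
  std::exclusive_scan(nTrackClusters.begin(), nTrackClusters.end(), offsets.begin(), 0u);

  const unsigned int nStreams = 3;
  for (unsigned int iStream = 0; iStream < nStreams; ++iStream) {
    unsigned int startTrack = nTracks / nStreams * iStream;
    // The last stream absorbs the remainder; endTrack is exclusive.
    unsigned int endTrack = nTracks / nStreams * (iStream + 1) + (iStream < nStreams - 1 ? 0 : nTracks % nStreams);
    unsigned int numClusters = (endTrack == nTracks ? offsets[endTrack - 1] + nTrackClusters[endTrack - 1] : offsets[endTrack]) - offsets[startTrack];
    std::printf("stream %u: tracks [%u, %u), %u clusters\n", iStream, startTrack, endTrack, numClusters);
  }
}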
@@ -331,9 +326,8 @@ int GPUChainTracking::RunTPCDecompression()
   DecompressorShadow.mNativeClustersBuffer = mInputsShadow->mPclusterNativeBuffer;
   Decompressor.mNativeClustersBuffer = mInputsHost->mPclusterNativeOutput;
   WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), inputStream);
-  TransferMemoryResourceLinkToHost(RecoStep::TPCDecompression, Decompressor.mResourceTmpIndexes, inputStream,nullptr,mEvents->stream,nStreams);
+  TransferMemoryResourceLinkToHost(RecoStep::TPCDecompression, Decompressor.mResourceTmpIndexes, inputStream, nullptr, mEvents->stream, nStreams);
   SynchronizeStream(inputStream);
-
   unsigned int offset = 0;
   unsigned int decodedAttachedClusters = 0;
   for (unsigned int i = 0; i < NSLICES; i++) {
@@ -350,45 +344,42 @@ int GPUChainTracking::RunTPCDecompression()
   if (decodedAttachedClusters != cmprClsHost.nAttachedClusters) {
     GPUWarning("%u / %u clusters failed track model decoding (%f %%)", cmprClsHost.nAttachedClusters - decodedAttachedClusters, cmprClsHost.nAttachedClusters, 100.f * (float)(cmprClsHost.nAttachedClusters - decodedAttachedClusters) / (float)cmprClsHost.nAttachedClusters);
   }
-
   if (doGPU) {
     mClusterNativeAccess->clustersLinear = mInputsShadow->mPclusterNativeBuffer;
     mClusterNativeAccess->setOffsetPtrs();
     *mInputsHost->mPclusterNativeAccess = *mClusterNativeAccess;
     processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess;
     WriteToConstantMemory(RecoStep::TPCDecompression, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), inputStream);
-    TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression, mInputsHost->mResourceClusterNativeAccess, inputStream, &mEvents->init);
+    TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression, mInputsHost->mResourceClusterNativeAccess, inputStream, &mEvents->single);
   }
   mIOPtrs.clustersNative = mClusterNativeAccess.get();
   mClusterNativeAccess->clustersLinear = mInputsHost->mPclusterNativeOutput;
   mClusterNativeAccess->setOffsetPtrs();
-  auto startU = std::chrono::high_resolution_clock::now();
-  for (unsigned int iSlice = 0; iSlice < NSLICES; ++iSlice) {
-    int iStream = iSlice % mRec->NStreams();
-    runKernel<GPUTPCDecompressionKernels, GPUTPCDecompressionKernels::step1unattached>({120, 32, iStream}, krnlRunRangeNone, {nullptr, &mEvents->init}, iSlice);
-    GPUMemCpy(RecoStep::TPCDecompression, mInputsHost->mPclusterNativeOutput + mClusterNativeAccess->clusterOffset[iSlice][0], DecompressorShadow.mNativeClustersBuffer + mClusterNativeAccess->clusterOffset[iSlice][0], sizeof(Decompressor.mNativeClustersBuffer[0]) * mClusterNativeAccess->nClustersSector[iSlice], iStream, false);
+
+  unsigned int batchSize = doGPU ? 6 : NSLICES;
+  for (unsigned int iSlice = 0; iSlice < NSLICES; iSlice = iSlice + batchSize) {
+    int iStream = (iSlice / batchSize) % mRec->NStreams();
+    runKernel<GPUTPCDecompressionKernels, GPUTPCDecompressionKernels::step1unattached>({GetGridAuto(iStream), krnlRunRangeNone, {nullptr, &mEvents->single}}, iSlice, batchSize);
+    unsigned int copySize = std::accumulate(mClusterNativeAccess->nClustersSector + iSlice, mClusterNativeAccess->nClustersSector + iSlice + batchSize, 0u);
+    GPUMemCpy(RecoStep::TPCDecompression, mInputsHost->mPclusterNativeOutput + mClusterNativeAccess->clusterOffset[iSlice][0], DecompressorShadow.mNativeClustersBuffer + mClusterNativeAccess->clusterOffset[iSlice][0], sizeof(Decompressor.mNativeClustersBuffer[0]) * copySize, iStream, false);
   }
   SynchronizeGPU();
-  auto endU = std::chrono::high_resolution_clock::now();
-  std::chrono::duration<double> durationU = endU - startU;
-  LOGP(info,"Unatt time: {} ms", durationU.count() * 1e3);
 
   if (GetProcessingSettings().deterministicGPUReconstruction || GetProcessingSettings().debugLevel >= 4) {
-    runKernel<GPUTPCDecompressionUtilKernels, GPUTPCDecompressionUtilKernels::sortPerSectorRow>(GetGridAutoStep(unattachedStream, RecoStep::TPCDecompression), krnlRunRangeNone, krnlEventNone);
+    runKernel<GPUTPCDecompressionUtilKernels, GPUTPCDecompressionUtilKernels::sortPerSectorRow>(GetGridAutoStep(unattachedStream, RecoStep::TPCDecompression));
     const ClusterNativeAccess* decoded = mIOPtrs.clustersNative;
-    for (unsigned int i = 0; i < NSLICES; i++) {
-      for (unsigned int j = 0; j < GPUCA_ROW_COUNT; j++) {
-        ClusterNative* begin = mInputsHost->mPclusterNativeOutput + decoded->clusterOffset[i][j];
-        ClusterNative* end = begin + decoded->nClusters[i][j];
-        std::sort(begin, end);
+    if (doGPU) {
+      for (unsigned int i = 0; i < NSLICES; i++) {
+        for (unsigned int j = 0; j < GPUCA_ROW_COUNT; j++) {
+          ClusterNative* begin = mInputsHost->mPclusterNativeOutput + decoded->clusterOffset[i][j];
+          ClusterNative* end = begin + decoded->nClusters[i][j];
+          std::sort(begin, end);
+        }
       }
     }
   }
   mRec->PopNonPersistentMemory(RecoStep::TPCDecompression, qStr2Tag("TPCDCMPR"));
 }
-//#endif
-  auto end = std::chrono::high_resolution_clock::now();
-  std::chrono::duration<double> duration = end - start;
-  LOGP(info,"Exec time: {} ms", duration.count() * 1e3);
+#endif
   return 0;
 }
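The batched unattached pass above replaces one launch per slice with one launch per batch of slices (6 at a time on GPU), sizing each device-to-host copy with std::accumulate over the per-sector cluster counts. Stripped of the GPU machinery, the batching arithmetic looks like this (counts and stream count are invented):

#include <cstdio>
#include <numeric>

constexpr unsigned int kNSlices = 36; // stand-in for NSLICES

int main()
{
  unsigned int nClustersSector[kNSlices];
  for (unsigned int i = 0; i < kNSlices; ++i) {
    nClustersSector[i] = 100 + i; // made-up per-sector cluster counts
  }

  const bool doGPU = true;
  const unsigned int batchSize = doGPU ? 6 : kNSlices;
  const unsigned int nStreams = 4; // illustrative stream count

  for (unsigned int iSlice = 0; iSlice < kNSlices; iSlice += batchSize) {
    unsigned int iStream = (iSlice / batchSize) % nStreams;
    // Size of this batch's copy: sum of the cluster counts of its sectors.
    unsigned int copySize = std::accumulate(nClustersSector + iSlice, nClustersSector + iSlice + batchSize, 0u);
    std::printf("batch starting at slice %u on stream %u copies %u clusters\n", iSlice, iStream, copySize);
  }
}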
diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake
index 51668dfff42a5..272890ac8df78 100644
--- a/GPU/GPUTracking/kernels.cmake
+++ b/GPU/GPUTracking/kernels.cmake
@@ -106,7 +106,7 @@ o2_gpu_add_kernel("GPUTPCCompressionGatherKernels, buffered64" "GPUTPCCom
 o2_gpu_add_kernel("GPUTPCCompressionGatherKernels, buffered128" "GPUTPCCompressionKernels" LB simple)
 o2_gpu_add_kernel("GPUTPCCompressionGatherKernels, multiBlock" "GPUTPCCompressionKernels" LB simple)
 o2_gpu_add_kernel("GPUTPCDecompressionKernels, step0attached" "= TPCDECOMPRESSION" LB simple int trackStart int trackEnd)
-o2_gpu_add_kernel("GPUTPCDecompressionKernels, step1unattached" "= TPCDECOMPRESSION" LB simple int iSlice)
+o2_gpu_add_kernel("GPUTPCDecompressionKernels, step1unattached" "= TPCDECOMPRESSION" LB simple int sliceStart int nSlices)
 o2_gpu_add_kernel("GPUTPCDecompressionUtilKernels, sortPerSectorRow" "GPUTPCDecompressionKernels" LB simple)
 o2_gpu_add_kernel("GPUTPCCFCheckPadBaseline" "= TPCCLUSTERFINDER" LB single)
 o2_gpu_add_kernel("GPUTPCCFChargeMapFiller, fillIndexMap" "= TPCCLUSTERFINDER" LB single)