Skip to content

Commit

Permalink
GPU TPC: Decompression: unattached clusters kernels on multiple streams
Browse files Browse the repository at this point in the history
  • Loading branch information
cima22 authored and Gabriele Cimador committed May 8, 2024
1 parent aadeb98 commit e709f3a
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 35 deletions.
23 changes: 11 additions & 12 deletions GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -126,27 +126,26 @@ GPUdii() ClusterNative GPUTPCDecompressionKernels::decompressTrackStore(const o2
}

template <>
GPUdii() void GPUTPCDecompressionKernels::Thread<GPUTPCDecompressionKernels::step1unattached>(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors)
GPUdii() void GPUTPCDecompressionKernels::Thread<GPUTPCDecompressionKernels::step1unattached>(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors, int iSlice)
{
GPUTPCDecompression& GPUrestrict() decompressor = processors.tpcDecompressor;
CompressedClusters& GPUrestrict() cmprClusters = decompressor.mInputGPU;
ClusterNative* GPUrestrict() clusterBuffer = decompressor.mNativeClustersBuffer;
const ClusterNativeAccess* outputAccess = processors.ioPtrs.clustersNative;

unsigned int* offsets = decompressor.mUnattachedClustersOffsets;
for (unsigned int i = get_global_id(0); i < GPUCA_NSLICES * GPUCA_ROW_COUNT; i += get_global_size(0)) {
unsigned int slice = i / GPUCA_ROW_COUNT;
unsigned int row = i % GPUCA_ROW_COUNT;
unsigned int tmpBufferIndex = computeLinearTmpBufferIndex(slice, row, decompressor.mMaxNativeClustersPerBuffer);
ClusterNative* buffer = clusterBuffer + outputAccess->clusterOffset[slice][row];
if (decompressor.mNativeClustersIndex[i] != 0) {
decompressorMemcpyBasic(buffer, decompressor.mTmpNativeClusters + tmpBufferIndex, decompressor.mNativeClustersIndex[i]);
for (unsigned int iRow = get_global_id(0); iRow < GPUCA_ROW_COUNT; iRow += get_global_size(0)) {
const int linearIndex = iSlice * GPUCA_ROW_COUNT + iRow;
unsigned int tmpBufferIndex = computeLinearTmpBufferIndex(iSlice, iRow, decompressor.mMaxNativeClustersPerBuffer);
ClusterNative* buffer = clusterBuffer + outputAccess->clusterOffset[iSlice][iRow];
if (decompressor.mNativeClustersIndex[linearIndex] != 0) {
decompressorMemcpyBasic(buffer, decompressor.mTmpNativeClusters + tmpBufferIndex, decompressor.mNativeClustersIndex[linearIndex]);
}
ClusterNative* clout = buffer + decompressor.mNativeClustersIndex[i];
unsigned int end = offsets[i] + ((i >= decompressor.mInputGPU.nSliceRows) ? 0 : decompressor.mInputGPU.nSliceRowClusters[i]);
decompressHits(cmprClusters, offsets[i], end, clout);
ClusterNative* clout = buffer + decompressor.mNativeClustersIndex[linearIndex];
unsigned int end = offsets[linearIndex] + ((linearIndex >= decompressor.mInputGPU.nSliceRows) ? 0 : decompressor.mInputGPU.nSliceRowClusters[linearIndex]);
decompressHits(cmprClusters, offsets[linearIndex], end, clout);
if (processors.param.rec.tpc.clustersShiftTimebins != 0.f) {
for (unsigned int k = 0; k < outputAccess->nClusters[slice][row]; k++) {
for (unsigned int k = 0; k < outputAccess->nClusters[iSlice][iRow]; k++) {
auto& cl = buffer[k];
float t = cl.getTime() + processors.param.rec.tpc.clustersShiftTimebins;
if (t < 0) {
Expand Down
4 changes: 2 additions & 2 deletions GPU/GPUTracking/Definitions/GPUDefGPUParameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@
#define GPUCA_LB_GPUTPCCompressionKernels_step0attached 64, 2
#define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 512, 3
#define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 64, 1
#define GPUCA_LB_GPUTPCDecompressionKernels_step1unattached 288, 2
#define GPUCA_LB_GPUTPCDecompressionKernels_step1unattached 64, 1
#define GPUCA_LB_GPUTPCCFCheckPadBaseline 64,8
#define GPUCA_LB_GPUTPCCFChargeMapFiller_fillIndexMap 448
#define GPUCA_LB_GPUTPCCFChargeMapFiller_fillFromDigits 448
Expand Down Expand Up @@ -266,7 +266,7 @@
#define GPUCA_LB_GPUTPCCompressionKernels_step0attached 128
#define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 512, 2
#define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 64, 1
#define GPUCA_LB_GPUTPCDecompressionKernels_step1unattached 288, 2
#define GPUCA_LB_GPUTPCDecompressionKernels_step1unattached 64, 1
#define GPUCA_LB_COMPRESSION_GATHER 1024
#define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 4
#define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20
Expand Down
33 changes: 13 additions & 20 deletions GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ int GPUChainTracking::RunTPCCompression()
int GPUChainTracking::RunTPCDecompression()
{
auto start = std::chrono::high_resolution_clock::now();
//#ifdef GPUCA_HAVE_O2HEADERS
//#ifdef GPUCA_HAVE_O2HEADERS
if (GetProcessingSettings().tpcUseOldCPUDecoding) {
const auto& threadContext = GetThreadContext();
TPCClusterDecompressor decomp;
Expand Down Expand Up @@ -248,7 +248,7 @@ int GPUChainTracking::RunTPCDecompression()
CompressedClusters& inputGPUShadow = DecompressorShadow.mInputGPU;

int inputStream = 0;
int unattachedStream = 4;
int unattachedStream = mRec->NStreams() - 1;
inputGPU.nAttachedClusters = cmprClsHost.nAttachedClusters;
inputGPU.nUnattachedClusters = cmprClsHost.nUnattachedClusters;
inputGPU.nTracks = cmprClsHost.nTracks;
Expand Down Expand Up @@ -288,7 +288,7 @@ int GPUChainTracking::RunTPCDecompression()
bool toGPU = true;
runKernel<GPUMemClean16>(GetGridAutoStep(inputStream, RecoStep::TPCDecompression), krnlRunRangeNone, &mEvents->init, DecompressorShadow.mNativeClustersIndex, NSLICES * GPUCA_ROW_COUNT * sizeof(DecompressorShadow.mNativeClustersIndex[0]));
std::exclusive_scan(cmprClsHost.nTrackClusters, cmprClsHost.nTrackClusters + cmprClsHost.nTracks, Decompressor.mAttachedClustersOffsets,0u);
int nStreams = 5;
int nStreams = mRec->NStreams() - 1;
for (unsigned int iStream = 0; iStream < nStreams; ++iStream) {
unsigned int startTrack = cmprClsHost.nTracks / nStreams * iStream;
unsigned int endTrack = cmprClsHost.nTracks / nStreams * (iStream + 1) + (iStream < nStreams - 1 ? 0 : cmprClsHost.nTracks % nStreams);
Expand All @@ -313,7 +313,7 @@ int GPUChainTracking::RunTPCDecompression()
GPUMemCpy(myStep, inputGPUShadow.timeA + startTrack, cmprClsHost.timeA + startTrack, numTracks * sizeof(cmprClsHost.timeA[0]), iStream, toGPU);
GPUMemCpy(myStep, inputGPUShadow.padA + startTrack, cmprClsHost.padA + startTrack, numTracks * sizeof(cmprClsHost.padA[0]), iStream, toGPU);

runKernel<GPUTPCDecompressionKernels, GPUTPCDecompressionKernels::step0attached>({60,128,iStream}, krnlRunRangeNone, {&mEvents->stream[iStream], &mEvents->init}, startTrack, endTrack);
runKernel<GPUTPCDecompressionKernels, GPUTPCDecompressionKernels::step0attached>({60,96,iStream}, krnlRunRangeNone, {&mEvents->stream[iStream], &mEvents->init}, startTrack, endTrack);
}

GPUMemCpy(myStep, inputGPUShadow.nSliceRowClusters, cmprClsHost.nSliceRowClusters, NSLICES * GPUCA_ROW_COUNT * sizeof(cmprClsHost.nSliceRowClusters[0]), unattachedStream, toGPU);
Expand Down Expand Up @@ -362,23 +362,16 @@ int GPUChainTracking::RunTPCDecompression()
mIOPtrs.clustersNative = mClusterNativeAccess.get();
mClusterNativeAccess->clustersLinear = mInputsHost->mPclusterNativeOutput;
mClusterNativeAccess->setOffsetPtrs();

runKernel<GPUTPCDecompressionKernels, GPUTPCDecompressionKernels::step1unattached>(GetGridAuto(unattachedStream), krnlRunRangeNone, {nullptr, &mEvents->init});

for (unsigned int iSlice = 0; iSlice < NSLICES; iSlice++) {
for (unsigned int iRow = 0; iRow < GPUCA_ROW_COUNT; iRow++) {
int linearIndex = iSlice * GPUCA_ROW_COUNT + iRow;
GPUMemCpy(RecoStep::TPCDecompression, mInputsHost->mPclusterNativeOutput + mClusterNativeAccess->clusterOffset[iSlice][iRow], DecompressorShadow.mTmpNativeClusters + GPUTPCDecompressionKernels::computeLinearTmpBufferIndex(iSlice, iRow, Decompressor.mMaxNativeClustersPerBuffer), sizeof(DecompressorShadow.mTmpNativeClusters[0]) * Decompressor.mNativeClustersIndex[linearIndex], inputStream, false, nullptr,mEvents->stream,nStreams);
}
auto startU = std::chrono::high_resolution_clock::now();
for (unsigned int iSlice = 0; iSlice < NSLICES; ++iSlice) {
int iStream = iSlice % mRec->NStreams();
runKernel<GPUTPCDecompressionKernels, GPUTPCDecompressionKernels::step1unattached>({120,32,iStream}, krnlRunRangeNone, {nullptr, &mEvents->init}, iSlice);
GPUMemCpy(RecoStep::TPCDecompression, mInputsHost->mPclusterNativeOutput + mClusterNativeAccess->clusterOffset[iSlice][0], DecompressorShadow.mNativeClustersBuffer + mClusterNativeAccess->clusterOffset[iSlice][0], sizeof(Decompressor.mNativeClustersBuffer[0]) * mClusterNativeAccess->nClustersSector[iSlice], iStream, false);
}
for (unsigned int iSlice = 0; iSlice < NSLICES; iSlice++) {
for (unsigned int iRow = 0; iRow < GPUCA_ROW_COUNT; iRow++) {
int linearIndex = iSlice * GPUCA_ROW_COUNT + iRow;
GPUMemCpy(RecoStep::TPCDecompression, mInputsHost->mPclusterNativeOutput + mClusterNativeAccess->clusterOffset[iSlice][iRow] + Decompressor.mNativeClustersIndex[linearIndex], DecompressorShadow.mNativeClustersBuffer + mClusterNativeAccess->clusterOffset[iSlice][iRow] + Decompressor.mNativeClustersIndex[linearIndex], sizeof(DecompressorShadow.mNativeClustersBuffer[0]) * ((linearIndex >= cmprClsHost.nSliceRows) ? 0 : cmprClsHost.nSliceRowClusters[linearIndex]), unattachedStream, false);
}
}
SynchronizeStream(inputStream);
SynchronizeStream(unattachedStream);
SynchronizeGPU();
auto endU = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> durationU = endU - startU;
LOGP(info,"Unatt time: {} ms", durationU.count() * 1e3);

if (GetProcessingSettings().deterministicGPUReconstruction || GetProcessingSettings().debugLevel >= 4) {
runKernel<GPUTPCDecompressionUtilKernels, GPUTPCDecompressionUtilKernels::sortPerSectorRow>(GetGridAutoStep(unattachedStream, RecoStep::TPCDecompression), krnlRunRangeNone, krnlEventNone);
Expand Down
2 changes: 1 addition & 1 deletion GPU/GPUTracking/kernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ o2_gpu_add_kernel("GPUTPCCompressionGatherKernels, buffered64" "GPUTPCCom
o2_gpu_add_kernel("GPUTPCCompressionGatherKernels, buffered128" "GPUTPCCompressionKernels" LB simple)
o2_gpu_add_kernel("GPUTPCCompressionGatherKernels, multiBlock" "GPUTPCCompressionKernels" LB simple)
o2_gpu_add_kernel("GPUTPCDecompressionKernels, step0attached" "= TPCDECOMPRESSION" LB simple int trackStart int trackEnd)
o2_gpu_add_kernel("GPUTPCDecompressionKernels, step1unattached" "= TPCDECOMPRESSION" LB simple)
o2_gpu_add_kernel("GPUTPCDecompressionKernels, step1unattached" "= TPCDECOMPRESSION" LB simple int iSlice)
o2_gpu_add_kernel("GPUTPCDecompressionUtilKernels, sortPerSectorRow" "GPUTPCDecompressionKernels" LB simple)
o2_gpu_add_kernel("GPUTPCCFCheckPadBaseline" "= TPCCLUSTERFINDER" LB single)
o2_gpu_add_kernel("GPUTPCCFChargeMapFiller, fillIndexMap" "= TPCCLUSTERFINDER" LB single)
Expand Down

0 comments on commit e709f3a

Please sign in to comment.