Skip to content

Commit b2b1457

Browse files
Jonas HahnfeldJonas Hahnfeld
Jonas Hahnfeld
authored and
Jonas Hahnfeld
committed
Implement overlapped gather with OpenCL
1 parent 9fab4f3 commit b2b1457

File tree

4 files changed

+174
-25
lines changed

4 files changed

+174
-25
lines changed

opencl/CGMultiOpenCL.cpp

+113-14
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,22 @@ class CGMultiOpenCL : public CGOpenCLBase {
1919
int id;
2020
WorkDistribution *workDistribution;
2121

22+
MatrixCRSDevice diagMatrixCRS;
23+
MatrixELLDevice diagMatrixELL;
24+
cl_command_queue gatherQueue;
25+
2226
floatType vectorDotResult;
2327

28+
~MultiDevice() { clReleaseCommandQueue(gatherQueue); }
29+
30+
virtual void init(cl_device_id device_id, CGOpenCLBase *cg) override {
31+
Device::init(device_id, cg);
32+
33+
cl_int err;
34+
gatherQueue = clCreateCommandQueue(ctx, device_id, 0, &err);
35+
checkError(err);
36+
}
37+
2438
int getOffset(Vector v) const {
2539
if (v == VectorX || v == VectorP) {
2640
// These vectors are fully allocated, but we only need the "local" part.
@@ -35,11 +49,16 @@ class CGMultiOpenCL : public CGOpenCLBase {
3549

3650
std::unique_ptr<floatType[]> p;
3751

52+
cl_kernel matvecKernelCRSRoundup = NULL;
53+
cl_kernel matvecKernelELLRoundup = NULL;
54+
3855
virtual int getNumberOfChunks() override { return devices.size(); }
56+
virtual bool supportsOverlappedGather() override { return true; }
3957

4058
virtual void init(const char *matrixFile) override;
4159

4260
void finishAllDevices();
61+
void finishAllDevicesGatherQueue();
4362

4463
virtual void doTransferTo() override;
4564
virtual void doTransferFrom() override;
@@ -55,6 +74,18 @@ class CGMultiOpenCL : public CGOpenCLBase {
5574
virtual floatType vectorDotKernel(Vector _a, Vector _b) override;
5675

5776
virtual void applyPreconditionerKernel(Vector _x, Vector _y) override;
77+
78+
virtual void cleanup() override {
79+
CGOpenCLBase::cleanup();
80+
81+
if (overlappedGather) {
82+
clReleaseKernel(matvecKernelCRSRoundup);
83+
clReleaseKernel(matvecKernelELLRoundup);
84+
}
85+
}
86+
87+
public:
88+
CGMultiOpenCL() : CGOpenCLBase(/* overlappedGather= */ true) {}
5889
};
5990

6091
void CGMultiOpenCL::init(const char *matrixFile) {
@@ -77,6 +108,11 @@ void CGMultiOpenCL::init(const char *matrixFile) {
77108
CGOpenCLBase::init(matrixFile);
78109
assert(workDistribution->numberOfChunks == numberOfDevices);
79110

111+
if (overlappedGather) {
112+
matvecKernelCRSRoundup = checkedCreateKernel("matvecKernelCRSRoundup");
113+
matvecKernelELLRoundup = checkedCreateKernel("matvecKernelELLRoundup");
114+
}
115+
80116
for (MultiDevice &device : devices) {
81117
device.workDistribution = workDistribution.get();
82118
int length = workDistribution->lengths[device.id];
@@ -92,6 +128,12 @@ void CGMultiOpenCL::finishAllDevices() {
92128
}
93129
}
94130

131+
void CGMultiOpenCL::finishAllDevicesGatherQueue() {
132+
for (MultiDevice &device : devices) {
133+
checkError(clFinish(device.gatherQueue));
134+
}
135+
}
136+
95137
void CGMultiOpenCL::doTransferTo() {
96138
size_t fullVectorSize = sizeof(floatType) * N;
97139

@@ -113,12 +155,26 @@ void CGMultiOpenCL::doTransferTo() {
113155

114156
switch (matrixFormat) {
115157
case MatrixFormatCRS:
116-
allocateAndCopyMatrixDataCRS(length, splitMatrixCRS->data[d], device,
117-
device.matrixCRS);
158+
if (!overlappedGather) {
159+
allocateAndCopyMatrixDataCRS(length, splitMatrixCRS->data[d], device,
160+
device.matrixCRS);
161+
} else {
162+
allocateAndCopyMatrixDataCRS(length, partitionedMatrixCRS->diag[d],
163+
device, device.diagMatrixCRS);
164+
allocateAndCopyMatrixDataCRS(length, partitionedMatrixCRS->minor[d],
165+
device, device.matrixCRS);
166+
}
118167
break;
119168
case MatrixFormatELL:
120-
allocateAndCopyMatrixDataELL(length, splitMatrixELL->data[d], device,
121-
device.matrixELL);
169+
if (!overlappedGather) {
170+
allocateAndCopyMatrixDataELL(length, splitMatrixELL->data[d], device,
171+
device.matrixELL);
172+
} else {
173+
allocateAndCopyMatrixDataELL(length, partitionedMatrixELL->diag[d],
174+
device, device.diagMatrixELL);
175+
allocateAndCopyMatrixDataELL(length, partitionedMatrixELL->minor[d],
176+
device, device.matrixELL);
177+
}
122178
break;
123179
default:
124180
assert(0 && "Invalid matrix format!");
@@ -162,9 +218,15 @@ void CGMultiOpenCL::doTransferFrom() {
162218

163219
switch (matrixFormat) {
164220
case MatrixFormatCRS:
221+
if (overlappedGather) {
222+
freeMatrixCRSDevice(device.diagMatrixCRS);
223+
}
165224
freeMatrixCRSDevice(device.matrixCRS);
166225
break;
167226
case MatrixFormatELL:
227+
if (overlappedGather) {
228+
freeMatrixELLDevice(device.diagMatrixELL);
229+
}
168230
freeMatrixELLDevice(device.matrixELL);
169231
break;
170232
default:
@@ -227,10 +289,11 @@ void CGMultiOpenCL::matvecGatherXViaHost(Vector _x) {
227289
cl_mem x = device.getVector(_x);
228290
assert(offset == device.getOffset(_x));
229291

230-
device.checkedEnqueueReadBuffer(x, sizeof(floatType) * offset,
292+
device.checkedEnqueueReadBuffer(device.gatherQueue, x,
293+
sizeof(floatType) * offset,
231294
sizeof(floatType) * length, xHost + offset);
232295
}
233-
finishAllDevices();
296+
finishAllDevicesGatherQueue();
234297

235298
// Transfer x to devices.
236299
for (MultiDevice &device : devices) {
@@ -244,15 +307,41 @@ void CGMultiOpenCL::matvecGatherXViaHost(Vector _x) {
244307
int offset = workDistribution->offsets[src.id];
245308
int length = workDistribution->lengths[src.id];
246309

247-
device.checkedEnqueueWriteBuffer(x, sizeof(floatType) * offset,
248-
sizeof(floatType) * length,
249-
xHost + offset);
310+
device.checkedEnqueueWriteBuffer(
311+
device.gatherQueue, x, sizeof(floatType) * offset,
312+
sizeof(floatType) * length, xHost + offset);
250313
}
251314
}
252-
finishAllDevices();
315+
finishAllDevicesGatherQueue();
253316
}
254317

255318
void CGMultiOpenCL::matvecKernel(Vector _x, Vector _y) {
319+
if (overlappedGather) {
320+
// Start computation on the diagonal that does not require data exchange
321+
// between the devices. It is efficient to do so before the gather because
322+
// the computation is expected to take longer. This effectively even hides
323+
// the overhead of starting the gather.
324+
for (MultiDevice &device : devices) {
325+
int length = workDistribution->lengths[device.id];
326+
cl_mem x = device.getVector(_x);
327+
cl_mem y = device.getVector(_y);
328+
int yOffset = device.getOffset(_y);
329+
330+
switch (matrixFormat) {
331+
case MatrixFormatCRS:
332+
device.checkedEnqueueMatvecKernelCRS(
333+
matvecKernelCRS, device.diagMatrixCRS, x, y, yOffset, length);
334+
break;
335+
case MatrixFormatELL:
336+
device.checkedEnqueueMatvecKernelELL(
337+
matvecKernelELL, device.diagMatrixELL, x, y, yOffset, length);
338+
break;
339+
default:
340+
assert(0 && "Invalid matrix format!");
341+
}
342+
}
343+
}
344+
256345
matvecGatherXViaHost(_x);
257346

258347
for (MultiDevice &device : devices) {
@@ -263,12 +352,22 @@ void CGMultiOpenCL::matvecKernel(Vector _x, Vector _y) {
263352

264353
switch (matrixFormat) {
265354
case MatrixFormatCRS:
266-
device.checkedEnqueueMatvecKernelCRS(matvecKernelCRS, device.matrixCRS, x,
267-
y, yOffset, length);
355+
if (!overlappedGather) {
356+
device.checkedEnqueueMatvecKernelCRS(matvecKernelCRS, device.matrixCRS,
357+
x, y, yOffset, length);
358+
} else {
359+
device.checkedEnqueueMatvecKernelCRS(
360+
matvecKernelCRSRoundup, device.matrixCRS, x, y, yOffset, length);
361+
}
268362
break;
269363
case MatrixFormatELL:
270-
device.checkedEnqueueMatvecKernelELL(matvecKernelELL, device.matrixELL, x,
271-
y, yOffset, length);
364+
if (!overlappedGather) {
365+
device.checkedEnqueueMatvecKernelELL(matvecKernelELL, device.matrixELL,
366+
x, y, yOffset, length);
367+
} else {
368+
device.checkedEnqueueMatvecKernelELL(
369+
matvecKernelELLRoundup, device.matrixELL, x, y, yOffset, length);
370+
}
272371
break;
273372
default:
274373
assert(0 && "Invalid matrix format!");

opencl/CGOpenCLBase.cpp

+3-1
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,9 @@ void CGOpenCLBase::freeMatrixELLDevice(
194194
checkedReleaseMemObject(deviceMatrix.data);
195195
}
196196

197-
CGOpenCLBase::~CGOpenCLBase() {
197+
void CGOpenCLBase::cleanup() {
198+
CG::cleanup();
199+
198200
clReleaseKernel(matvecKernelCRS);
199201
clReleaseKernel(matvecKernelELL);
200202
clReleaseKernel(axpyKernelCL);

opencl/CGOpenCLBase.h

+24-10
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@ class CGOpenCLBase : public CG {
5555

5656
/// This device's id.
5757
cl_device_id device_id;
58+
/// The (cached) context.
59+
cl_context ctx;
5860
/// The queue for this device.
5961
cl_command_queue queue = NULL;
6062

@@ -105,11 +107,12 @@ class CGOpenCLBase : public CG {
105107
~Device() { clReleaseCommandQueue(queue); }
106108

107109
/// Init device with id \a device_id.
108-
void init(cl_device_id device_id, CGOpenCLBase *cg) {
110+
virtual void init(cl_device_id device_id, CGOpenCLBase *cg) {
109111
this->device_id = device_id;
112+
this->ctx = cg->ctx;
110113

111114
cl_int err;
112-
queue = clCreateCommandQueue(cg->ctx, device_id, 0, &err);
115+
queue = clCreateCommandQueue(ctx, device_id, 0, &err);
113116
checkError(err);
114117
}
115118

@@ -133,23 +136,33 @@ class CGOpenCLBase : public CG {
133136
cl_mem y, int yOffset, int N);
134137

135138
/// Enqueue read of \a buffer.
136-
void checkedEnqueueReadBuffer(cl_mem buffer, size_t offset, size_t cb,
137-
void *ptr) {
139+
void checkedEnqueueReadBuffer(cl_command_queue queue, cl_mem buffer,
140+
size_t offset, size_t cb, void *ptr) {
138141
checkError(clEnqueueReadBuffer(queue, buffer, CL_FALSE, offset, cb, ptr,
139142
0, NULL, NULL));
140143
}
141144
/// Enqueue read of \a buffer.
145+
void checkedEnqueueReadBuffer(cl_mem buffer, size_t offset, size_t cb,
146+
void *ptr) {
147+
checkedEnqueueReadBuffer(queue, buffer, offset, cb, ptr);
148+
}
149+
/// Enqueue read of \a buffer.
142150
void checkedEnqueueReadBuffer(cl_mem buffer, size_t cb, void *ptr) {
143151
checkedEnqueueReadBuffer(buffer, 0, cb, ptr);
144152
}
145153

146154
/// Enqueue write of \a buffer.
147-
void checkedEnqueueWriteBuffer(cl_mem buffer, size_t offset, size_t cb,
148-
const void *ptr) {
155+
void checkedEnqueueWriteBuffer(cl_command_queue queue, cl_mem buffer,
156+
size_t offset, size_t cb, const void *ptr) {
149157
checkError(clEnqueueWriteBuffer(queue, buffer, CL_FALSE, offset, cb, ptr,
150158
0, NULL, NULL));
151159
}
152160
/// Enqueue write of \a buffer.
161+
void checkedEnqueueWriteBuffer(cl_mem buffer, size_t offset, size_t cb,
162+
const void *ptr) {
163+
checkedEnqueueWriteBuffer(queue, buffer, offset, cb, ptr);
164+
}
165+
/// Enqueue write of \a buffer.
153166
void checkedEnqueueWriteBuffer(cl_mem buffer, size_t cb, const void *ptr) {
154167
checkedEnqueueWriteBuffer(buffer, 0, cb, ptr);
155168
}
@@ -188,10 +201,9 @@ class CGOpenCLBase : public CG {
188201
/// @return all devices suitable for computation (excluding CPUs).
189202
static std::vector<cl_device_id> getAllDevices();
190203

191-
private:
204+
/// @return the loaded kernel called \a kernelname.
192205
cl_kernel checkedCreateKernel(const char *kernelName);
193206

194-
protected:
195207
/// @return buffer of size \a size created with \a flags.
196208
cl_mem checkedCreateBufferWithFlags(cl_mem_flags flags, size_t size);
197209
/// @return read and write buffer.
@@ -223,10 +235,12 @@ class CGOpenCLBase : public CG {
223235
/// Free \a deviceMatrix.
224236
void freeMatrixELLDevice(const Device::MatrixELLDevice &deviceMatrix);
225237

238+
virtual void cleanup() override;
239+
226240
public:
227241
/// @see CG
228-
CGOpenCLBase() : CG(MatrixFormatELL, PreconditionerJacobi) {}
229-
~CGOpenCLBase();
242+
CGOpenCLBase(bool overlappedGather = false)
243+
: CG(MatrixFormatELL, PreconditionerJacobi, overlappedGather) {}
230244
};
231245

232246
#endif

opencl/kernel.cl

+34
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,23 @@ __kernel void matvecKernelCRS(__global int *ptr, __global int *index,
1818
}
1919
}
2020

21+
__kernel void matvecKernelCRSRoundup(__global int *ptr, __global int *index,
22+
__global floatType *value,
23+
__global floatType *x,
24+
__global floatType *y, int yOffset,
25+
int N) {
26+
for (int i = get_global_id(0); i < N; i += get_global_size(0)) {
27+
// Skip load and store if nothing to be done...
28+
if (ptr[i] != ptr[i + 1]) {
29+
floatType tmp = y[yOffset + i];
30+
for (int j = ptr[i]; j < ptr[i + 1]; j++) {
31+
tmp += value[j] * x[index[j]];
32+
}
33+
y[yOffset + i] = tmp;
34+
}
35+
}
36+
}
37+
2138
__kernel void matvecKernelELL(__global int *length, __global int *index,
2239
__global floatType *data, __global floatType *x,
2340
__global floatType *y, int yOffset, int N) {
@@ -31,6 +48,23 @@ __kernel void matvecKernelELL(__global int *length, __global int *index,
3148
}
3249
}
3350

51+
__kernel void matvecKernelELLRoundup(__global int *length, __global int *index,
52+
__global floatType *data,
53+
__global floatType *x,
54+
__global floatType *y, int yOffset,
55+
int N) {
56+
for (int i = get_global_id(0); i < N; i += get_global_size(0)) {
57+
if (length[i] > 0) {
58+
floatType tmp = y[yOffset + i];
59+
for (int j = 0; j < length[i]; j++) {
60+
int k = j * N + i;
61+
tmp += data[k] * x[index[k]];
62+
}
63+
y[yOffset + i] = tmp;
64+
}
65+
}
66+
}
67+
3468
__kernel void axpyKernel(floatType a, __global floatType *x, int xOffset,
3569
__global floatType *y, int yOffset, int N) {
3670
for (int i = get_global_id(0); i < N; i += get_global_size(0)) {

0 commit comments

Comments
 (0)