@@ -19,8 +19,22 @@ class CGMultiOpenCL : public CGOpenCLBase {
     int id;
     WorkDistribution *workDistribution;

+    MatrixCRSDevice diagMatrixCRS;
+    MatrixELLDevice diagMatrixELL;
+    cl_command_queue gatherQueue;
+
     floatType vectorDotResult;

+    ~MultiDevice() { clReleaseCommandQueue(gatherQueue); }
+
+    virtual void init(cl_device_id device_id, CGOpenCLBase *cg) override {
+      Device::init(device_id, cg);
+
+      cl_int err;
+      gatherQueue = clCreateCommandQueue(ctx, device_id, 0, &err);
+      checkError(err);
+    }
+
     int getOffset(Vector v) const {
       if (v == VectorX || v == VectorP) {
         // These vectors are fully allocated, but we only need the "local" part.
@@ -35,11 +49,16 @@ class CGMultiOpenCL : public CGOpenCLBase {

   std::unique_ptr<floatType[]> p;

+  cl_kernel matvecKernelCRSRoundup = NULL;
+  cl_kernel matvecKernelELLRoundup = NULL;
+
   virtual int getNumberOfChunks() override { return devices.size(); }
+  virtual bool supportsOverlappedGather() override { return true; }

   virtual void init(const char *matrixFile) override;

   void finishAllDevices();
+  void finishAllDevicesGatherQueue();

   virtual void doTransferTo() override;
   virtual void doTransferFrom() override;
@@ -55,6 +74,18 @@ class CGMultiOpenCL : public CGOpenCLBase {
   virtual floatType vectorDotKernel(Vector _a, Vector _b) override;

   virtual void applyPreconditionerKernel(Vector _x, Vector _y) override;
+
+  virtual void cleanup() override {
+    CGOpenCLBase::cleanup();
+
+    if (overlappedGather) {
+      clReleaseKernel(matvecKernelCRSRoundup);
+      clReleaseKernel(matvecKernelELLRoundup);
+    }
+  }
+
+public:
+  CGMultiOpenCL() : CGOpenCLBase(/* overlappedGather= */ true) {}
 };

 void CGMultiOpenCL::init(const char *matrixFile) {
@@ -77,6 +108,11 @@ void CGMultiOpenCL::init(const char *matrixFile) {
   CGOpenCLBase::init(matrixFile);
   assert(workDistribution->numberOfChunks == numberOfDevices);

+  if (overlappedGather) {
+    matvecKernelCRSRoundup = checkedCreateKernel("matvecKernelCRSRoundup");
+    matvecKernelELLRoundup = checkedCreateKernel("matvecKernelELLRoundup");
+  }
+
   for (MultiDevice &device : devices) {
     device.workDistribution = workDistribution.get();
     int length = workDistribution->lengths[device.id];
@@ -92,6 +128,12 @@ void CGMultiOpenCL::finishAllDevices() {
   }
 }

+void CGMultiOpenCL::finishAllDevicesGatherQueue() {
+  for (MultiDevice &device : devices) {
+    checkError(clFinish(device.gatherQueue));
+  }
+}
+
 void CGMultiOpenCL::doTransferTo() {
   size_t fullVectorSize = sizeof(floatType) * N;

@@ -113,12 +155,26 @@ void CGMultiOpenCL::doTransferTo() {

     switch (matrixFormat) {
     case MatrixFormatCRS:
-      allocateAndCopyMatrixDataCRS(length, splitMatrixCRS->data[d], device,
-                                   device.matrixCRS);
+      if (!overlappedGather) {
+        allocateAndCopyMatrixDataCRS(length, splitMatrixCRS->data[d], device,
+                                     device.matrixCRS);
+      } else {
+        allocateAndCopyMatrixDataCRS(length, partitionedMatrixCRS->diag[d],
+                                     device, device.diagMatrixCRS);
+        allocateAndCopyMatrixDataCRS(length, partitionedMatrixCRS->minor[d],
+                                     device, device.matrixCRS);
+      }
       break;
     case MatrixFormatELL:
-      allocateAndCopyMatrixDataELL(length, splitMatrixELL->data[d], device,
-                                   device.matrixELL);
+      if (!overlappedGather) {
+        allocateAndCopyMatrixDataELL(length, splitMatrixELL->data[d], device,
+                                     device.matrixELL);
+      } else {
+        allocateAndCopyMatrixDataELL(length, partitionedMatrixELL->diag[d],
+                                     device, device.diagMatrixELL);
+        allocateAndCopyMatrixDataELL(length, partitionedMatrixELL->minor[d],
+                                     device, device.matrixELL);
+      }
       break;
     default:
       assert(0 && "Invalid matrix format!");
@@ -162,9 +218,15 @@ void CGMultiOpenCL::doTransferFrom() {

     switch (matrixFormat) {
     case MatrixFormatCRS:
+      if (overlappedGather) {
+        freeMatrixCRSDevice(device.diagMatrixCRS);
+      }
       freeMatrixCRSDevice(device.matrixCRS);
       break;
     case MatrixFormatELL:
+      if (overlappedGather) {
+        freeMatrixELLDevice(device.diagMatrixELL);
+      }
       freeMatrixELLDevice(device.matrixELL);
       break;
     default:
@@ -227,10 +289,11 @@ void CGMultiOpenCL::matvecGatherXViaHost(Vector _x) {
     cl_mem x = device.getVector(_x);
     assert(offset == device.getOffset(_x));

-    device.checkedEnqueueReadBuffer(x, sizeof(floatType) * offset,
+    device.checkedEnqueueReadBuffer(device.gatherQueue, x,
+                                    sizeof(floatType) * offset,
                                     sizeof(floatType) * length, xHost + offset);
   }
-  finishAllDevices();
+  finishAllDevicesGatherQueue();

   // Transfer x to devices.
   for (MultiDevice &device : devices) {
@@ -244,15 +307,41 @@ void CGMultiOpenCL::matvecGatherXViaHost(Vector _x) {
       int offset = workDistribution->offsets[src.id];
       int length = workDistribution->lengths[src.id];

-      device.checkedEnqueueWriteBuffer(x, sizeof(floatType) * offset,
-                                       sizeof(floatType) * length,
-                                       xHost + offset);
+      device.checkedEnqueueWriteBuffer(
+          device.gatherQueue, x, sizeof(floatType) * offset,
+          sizeof(floatType) * length, xHost + offset);
     }
   }
-  finishAllDevices();
+  finishAllDevicesGatherQueue();
 }

 void CGMultiOpenCL::matvecKernel(Vector _x, Vector _y) {
+  if (overlappedGather) {
+    // Start computation on the diagonal that does not require data exchange
+    // between the devices. It is efficient to do so before the gather because
+    // the computation is expected to take longer. This effectively even hides
+    // the overhead of starting the gather.
+    for (MultiDevice &device : devices) {
+      int length = workDistribution->lengths[device.id];
+      cl_mem x = device.getVector(_x);
+      cl_mem y = device.getVector(_y);
+      int yOffset = device.getOffset(_y);
+
+      switch (matrixFormat) {
+      case MatrixFormatCRS:
+        device.checkedEnqueueMatvecKernelCRS(
+            matvecKernelCRS, device.diagMatrixCRS, x, y, yOffset, length);
+        break;
+      case MatrixFormatELL:
+        device.checkedEnqueueMatvecKernelELL(
+            matvecKernelELL, device.diagMatrixELL, x, y, yOffset, length);
+        break;
+      default:
+        assert(0 && "Invalid matrix format!");
+      }
+    }
+  }
+
   matvecGatherXViaHost(_x);

   for (MultiDevice &device : devices) {
@@ -263,12 +352,22 @@ void CGMultiOpenCL::matvecKernel(Vector _x, Vector _y) {

     switch (matrixFormat) {
     case MatrixFormatCRS:
-      device.checkedEnqueueMatvecKernelCRS(matvecKernelCRS, device.matrixCRS, x,
-                                           y, yOffset, length);
+      if (!overlappedGather) {
+        device.checkedEnqueueMatvecKernelCRS(matvecKernelCRS, device.matrixCRS,
+                                             x, y, yOffset, length);
+      } else {
+        device.checkedEnqueueMatvecKernelCRS(
+            matvecKernelCRSRoundup, device.matrixCRS, x, y, yOffset, length);
+      }
       break;
     case MatrixFormatELL:
-      device.checkedEnqueueMatvecKernelELL(matvecKernelELL, device.matrixELL, x,
-                                           y, yOffset, length);
+      if (!overlappedGather) {
+        device.checkedEnqueueMatvecKernelELL(matvecKernelELL, device.matrixELL,
+                                             x, y, yOffset, length);
+      } else {
+        device.checkedEnqueueMatvecKernelELL(
+            matvecKernelELLRoundup, device.matrixELL, x, y, yOffset, length);
+      }
       break;
     default:
       assert(0 && "Invalid matrix format!");
0 commit comments