Implement overlapped gather with OpenCL

Jonas Hahnfeld · Jonas Hahnfeld · commit b2b14570246b · 2017-04-13T17:00:15.000+02:00
diff --git a/opencl/CGMultiOpenCL.cpp b/opencl/CGMultiOpenCL.cpp
@@ -19,8 +19,22 @@ class CGMultiOpenCL : public CGOpenCLBase {
     int id;
     WorkDistribution *workDistribution;
 
+    MatrixCRSDevice diagMatrixCRS;
+    MatrixELLDevice diagMatrixELL;
+    cl_command_queue gatherQueue;
+
     floatType vectorDotResult;
 
+    ~MultiDevice() { clReleaseCommandQueue(gatherQueue); }
+
+    virtual void init(cl_device_id device_id, CGOpenCLBase *cg) override {
+      Device::init(device_id, cg);
+
+      cl_int err;
+      gatherQueue = clCreateCommandQueue(ctx, device_id, 0, &err);
+      checkError(err);
+    }
+
     int getOffset(Vector v) const {
       if (v == VectorX || v == VectorP) {
         // These vectors are fully allocated, but we only need the "local" part.
@@ -35,11 +49,16 @@ class CGMultiOpenCL : public CGOpenCLBase {
 
   std::unique_ptr<floatType[]> p;
 
+  cl_kernel matvecKernelCRSRoundup = NULL;
+  cl_kernel matvecKernelELLRoundup = NULL;
+
   virtual int getNumberOfChunks() override { return devices.size(); }
+  virtual bool supportsOverlappedGather() override { return true; }
 
   virtual void init(const char *matrixFile) override;
 
   void finishAllDevices();
+  void finishAllDevicesGatherQueue();
 
   virtual void doTransferTo() override;
   virtual void doTransferFrom() override;
@@ -55,6 +74,18 @@ class CGMultiOpenCL : public CGOpenCLBase {
   virtual floatType vectorDotKernel(Vector _a, Vector _b) override;
 
   virtual void applyPreconditionerKernel(Vector _x, Vector _y) override;
+
+  virtual void cleanup() override {
+    CGOpenCLBase::cleanup();
+
+    if (overlappedGather) {
+      clReleaseKernel(matvecKernelCRSRoundup);
+      clReleaseKernel(matvecKernelELLRoundup);
+    }
+  }
+
+public:
+  CGMultiOpenCL() : CGOpenCLBase(/* overlappedGather= */ true) {}
 };
 
 void CGMultiOpenCL::init(const char *matrixFile) {
@@ -77,6 +108,11 @@ void CGMultiOpenCL::init(const char *matrixFile) {
   CGOpenCLBase::init(matrixFile);
   assert(workDistribution->numberOfChunks == numberOfDevices);
 
+  if (overlappedGather) {
+    matvecKernelCRSRoundup = checkedCreateKernel("matvecKernelCRSRoundup");
+    matvecKernelELLRoundup = checkedCreateKernel("matvecKernelELLRoundup");
+  }
+
   for (MultiDevice &device : devices) {
     device.workDistribution = workDistribution.get();
     int length = workDistribution->lengths[device.id];
@@ -92,6 +128,12 @@ void CGMultiOpenCL::finishAllDevices() {
   }
 }
 
+void CGMultiOpenCL::finishAllDevicesGatherQueue() {
+  for (MultiDevice &device : devices) {
+    checkError(clFinish(device.gatherQueue));
+  }
+}
+
 void CGMultiOpenCL::doTransferTo() {
   size_t fullVectorSize = sizeof(floatType) * N;
 
@@ -113,12 +155,26 @@ void CGMultiOpenCL::doTransferTo() {
 
     switch (matrixFormat) {
     case MatrixFormatCRS:
-      allocateAndCopyMatrixDataCRS(length, splitMatrixCRS->data[d], device,
-                                   device.matrixCRS);
+      if (!overlappedGather) {
+        allocateAndCopyMatrixDataCRS(length, splitMatrixCRS->data[d], device,
+                                     device.matrixCRS);
+      } else {
+        allocateAndCopyMatrixDataCRS(length, partitionedMatrixCRS->diag[d],
+                                     device, device.diagMatrixCRS);
+        allocateAndCopyMatrixDataCRS(length, partitionedMatrixCRS->minor[d],
+                                     device, device.matrixCRS);
+      }
       break;
     case MatrixFormatELL:
-      allocateAndCopyMatrixDataELL(length, splitMatrixELL->data[d], device,
-                                   device.matrixELL);
+      if (!overlappedGather) {
+        allocateAndCopyMatrixDataELL(length, splitMatrixELL->data[d], device,
+                                     device.matrixELL);
+      } else {
+        allocateAndCopyMatrixDataELL(length, partitionedMatrixELL->diag[d],
+                                     device, device.diagMatrixELL);
+        allocateAndCopyMatrixDataELL(length, partitionedMatrixELL->minor[d],
+                                     device, device.matrixELL);
+      }
       break;
     default:
       assert(0 && "Invalid matrix format!");
@@ -162,9 +218,15 @@ void CGMultiOpenCL::doTransferFrom() {
 
     switch (matrixFormat) {
     case MatrixFormatCRS:
+      if (overlappedGather) {
+        freeMatrixCRSDevice(device.diagMatrixCRS);
+      }
       freeMatrixCRSDevice(device.matrixCRS);
       break;
     case MatrixFormatELL:
+      if (overlappedGather) {
+        freeMatrixELLDevice(device.diagMatrixELL);
+      }
       freeMatrixELLDevice(device.matrixELL);
       break;
     default:
@@ -227,10 +289,11 @@ void CGMultiOpenCL::matvecGatherXViaHost(Vector _x) {
     cl_mem x = device.getVector(_x);
     assert(offset == device.getOffset(_x));
 
-    device.checkedEnqueueReadBuffer(x, sizeof(floatType) * offset,
+    device.checkedEnqueueReadBuffer(device.gatherQueue, x,
+                                    sizeof(floatType) * offset,
                                     sizeof(floatType) * length, xHost + offset);
   }
-  finishAllDevices();
+  finishAllDevicesGatherQueue();
 
   // Transfer x to devices.
   for (MultiDevice &device : devices) {
@@ -244,15 +307,41 @@ void CGMultiOpenCL::matvecGatherXViaHost(Vector _x) {
       int offset = workDistribution->offsets[src.id];
       int length = workDistribution->lengths[src.id];
 
-      device.checkedEnqueueWriteBuffer(x, sizeof(floatType) * offset,
-                                       sizeof(floatType) * length,
-                                       xHost + offset);
+      device.checkedEnqueueWriteBuffer(
+          device.gatherQueue, x, sizeof(floatType) * offset,
+          sizeof(floatType) * length, xHost + offset);
     }
   }
-  finishAllDevices();
+  finishAllDevicesGatherQueue();
 }
 
 void CGMultiOpenCL::matvecKernel(Vector _x, Vector _y) {
+  if (overlappedGather) {
+    // Start the computation on the diagonal part first: it does not require
+    // any data exchange between the devices. Enqueueing it before the gather
+    // is efficient because the computation is expected to take longer, so it
+    // even hides the overhead of starting the gather.
+    for (MultiDevice &device : devices) {
+      int length = workDistribution->lengths[device.id];
+      cl_mem x = device.getVector(_x);
+      cl_mem y = device.getVector(_y);
+      int yOffset = device.getOffset(_y);
+
+      switch (matrixFormat) {
+      case MatrixFormatCRS:
+        device.checkedEnqueueMatvecKernelCRS(
+            matvecKernelCRS, device.diagMatrixCRS, x, y, yOffset, length);
+        break;
+      case MatrixFormatELL:
+        device.checkedEnqueueMatvecKernelELL(
+            matvecKernelELL, device.diagMatrixELL, x, y, yOffset, length);
+        break;
+      default:
+        assert(0 && "Invalid matrix format!");
+      }
+    }
+  }
+
   matvecGatherXViaHost(_x);
 
   for (MultiDevice &device : devices) {
@@ -263,12 +352,22 @@ void CGMultiOpenCL::matvecKernel(Vector _x, Vector _y) {
 
     switch (matrixFormat) {
     case MatrixFormatCRS:
-      device.checkedEnqueueMatvecKernelCRS(matvecKernelCRS, device.matrixCRS, x,
-                                           y, yOffset, length);
+      if (!overlappedGather) {
+        device.checkedEnqueueMatvecKernelCRS(matvecKernelCRS, device.matrixCRS,
+                                             x, y, yOffset, length);
+      } else {
+        device.checkedEnqueueMatvecKernelCRS(
+            matvecKernelCRSRoundup, device.matrixCRS, x, y, yOffset, length);
+      }
       break;
     case MatrixFormatELL:
-      device.checkedEnqueueMatvecKernelELL(matvecKernelELL, device.matrixELL, x,
-                                           y, yOffset, length);
+      if (!overlappedGather) {
+        device.checkedEnqueueMatvecKernelELL(matvecKernelELL, device.matrixELL,
+                                             x, y, yOffset, length);
+      } else {
+        device.checkedEnqueueMatvecKernelELL(
+            matvecKernelELLRoundup, device.matrixELL, x, y, yOffset, length);
+      }
       break;
     default:
       assert(0 && "Invalid matrix format!");
diff --git a/opencl/CGOpenCLBase.cpp b/opencl/CGOpenCLBase.cpp
@@ -194,7 +194,9 @@ void CGOpenCLBase::freeMatrixELLDevice(
   checkedReleaseMemObject(deviceMatrix.data);
 }
 
-CGOpenCLBase::~CGOpenCLBase() {
+void CGOpenCLBase::cleanup() {
+  CG::cleanup();
+
   clReleaseKernel(matvecKernelCRS);
   clReleaseKernel(matvecKernelELL);
   clReleaseKernel(axpyKernelCL);
diff --git a/opencl/CGOpenCLBase.h b/opencl/CGOpenCLBase.h
@@ -55,6 +55,8 @@ class CGOpenCLBase : public CG {
 
     /// This device's id.
     cl_device_id device_id;
+    /// The (cached) context.
+    cl_context ctx;
     /// The queue for this device.
     cl_command_queue queue = NULL;
 
@@ -105,11 +107,12 @@ class CGOpenCLBase : public CG {
     ~Device() { clReleaseCommandQueue(queue); }
 
     /// Init device with id \a device_id.
-    void init(cl_device_id device_id, CGOpenCLBase *cg) {
+    virtual void init(cl_device_id device_id, CGOpenCLBase *cg) {
       this->device_id = device_id;
+      this->ctx = cg->ctx;
 
       cl_int err;
-      queue = clCreateCommandQueue(cg->ctx, device_id, 0, &err);
+      queue = clCreateCommandQueue(ctx, device_id, 0, &err);
       checkError(err);
     }
 
@@ -133,23 +136,33 @@ class CGOpenCLBase : public CG {
                                        cl_mem y, int yOffset, int N);
 
     /// Enqueue read of \a buffer.
-    void checkedEnqueueReadBuffer(cl_mem buffer, size_t offset, size_t cb,
-                                  void *ptr) {
+    void checkedEnqueueReadBuffer(cl_command_queue queue, cl_mem buffer,
+                                  size_t offset, size_t cb, void *ptr) {
       checkError(clEnqueueReadBuffer(queue, buffer, CL_FALSE, offset, cb, ptr,
                                      0, NULL, NULL));
     }
     /// Enqueue read of \a buffer.
+    void checkedEnqueueReadBuffer(cl_mem buffer, size_t offset, size_t cb,
+                                  void *ptr) {
+      checkedEnqueueReadBuffer(queue, buffer, offset, cb, ptr);
+    }
+    /// Enqueue read of \a buffer starting at offset 0 on this device's queue.
     void checkedEnqueueReadBuffer(cl_mem buffer, size_t cb, void *ptr) {
       checkedEnqueueReadBuffer(buffer, 0, cb, ptr);
     }
 
     /// Enqueue write of \a buffer.
-    void checkedEnqueueWriteBuffer(cl_mem buffer, size_t offset, size_t cb,
-                                   const void *ptr) {
+    void checkedEnqueueWriteBuffer(cl_command_queue queue, cl_mem buffer,
+                                   size_t offset, size_t cb, const void *ptr) {
       checkError(clEnqueueWriteBuffer(queue, buffer, CL_FALSE, offset, cb, ptr,
                                       0, NULL, NULL));
     }
     /// Enqueue write of \a buffer.
+    void checkedEnqueueWriteBuffer(cl_mem buffer, size_t offset, size_t cb,
+                                   const void *ptr) {
+      checkedEnqueueWriteBuffer(queue, buffer, offset, cb, ptr);
+    }
+    /// Enqueue write of \a buffer starting at offset 0 on this device's queue.
     void checkedEnqueueWriteBuffer(cl_mem buffer, size_t cb, const void *ptr) {
       checkedEnqueueWriteBuffer(buffer, 0, cb, ptr);
     }
@@ -188,10 +201,9 @@ class CGOpenCLBase : public CG {
   /// @return all devices suitable for computation (excluding CPUs).
   static std::vector<cl_device_id> getAllDevices();
 
-private:
+  /// @return the loaded kernel called \a kernelName.
   cl_kernel checkedCreateKernel(const char *kernelName);
 
-protected:
   /// @return buffer of size \a size created with \a flags.
   cl_mem checkedCreateBufferWithFlags(cl_mem_flags flags, size_t size);
   /// @return read and write buffer.
@@ -223,10 +235,12 @@ class CGOpenCLBase : public CG {
   /// Free \a deviceMatrix.
   void freeMatrixELLDevice(const Device::MatrixELLDevice &deviceMatrix);
 
+  virtual void cleanup() override;
+
 public:
   /// @see CG
-  CGOpenCLBase() : CG(MatrixFormatELL, PreconditionerJacobi) {}
-  ~CGOpenCLBase();
+  CGOpenCLBase(bool overlappedGather = false)
+      : CG(MatrixFormatELL, PreconditionerJacobi, overlappedGather) {}
 };
 
 #endif
diff --git a/opencl/kernel.cl b/opencl/kernel.cl
@@ -18,6 +18,23 @@ __kernel void matvecKernelCRS(__global int *ptr, __global int *index,
   }
 }
 
+__kernel void matvecKernelCRSRoundup(__global int *ptr, __global int *index,
+                                     __global floatType *value,
+                                     __global floatType *x,
+                                     __global floatType *y, int yOffset,
+                                     int N) {
+  for (int i = get_global_id(0); i < N; i += get_global_size(0)) {
+    // Skip load and store if nothing to be done...
+    if (ptr[i] != ptr[i + 1]) {
+      floatType tmp = y[yOffset + i];
+      for (int j = ptr[i]; j < ptr[i + 1]; j++) {
+        tmp += value[j] * x[index[j]];
+      }
+      y[yOffset + i] = tmp;
+    }
+  }
+}
+
 __kernel void matvecKernelELL(__global int *length, __global int *index,
                               __global floatType *data, __global floatType *x,
                               __global floatType *y, int yOffset, int N) {
@@ -31,6 +48,23 @@ __kernel void matvecKernelELL(__global int *length, __global int *index,
   }
 }
 
+__kernel void matvecKernelELLRoundup(__global int *length, __global int *index,
+                                     __global floatType *data,
+                                     __global floatType *x,
+                                     __global floatType *y, int yOffset,
+                                     int N) {
+  for (int i = get_global_id(0); i < N; i += get_global_size(0)) {
+    if (length[i] > 0) {
+      floatType tmp = y[yOffset + i];
+      for (int j = 0; j < length[i]; j++) {
+        int k = j * N + i;
+        tmp += data[k] * x[index[k]];
+      }
+      y[yOffset + i] = tmp;
+    }
+  }
+}
+
 __kernel void axpyKernel(floatType a, __global floatType *x, int xOffset,
                          __global floatType *y, int yOffset, int N) {
   for (int i = get_global_id(0); i < N; i += get_global_size(0)) {