forked from takezo5096/cuMat
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmat_vec_mul_kernel.cu
28 lines (20 loc) · 971 Bytes
/
mat_vec_mul_kernel.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
#include "mat_vec_mul_kernel.h"

#include <cstddef>
#include <cstdio>
/* Threads per block along each launch dimension: the host wrapper in this
 * file launches BLOCK_SIZE x BLOCK_SIZE thread blocks. */
#define BLOCK_SIZE 32
/*
 * Element-wise scaling of a row-major m x n matrix by a vector.
 *
 * Each thread handles at most one element: the x launch dimension spans the
 * columns (0..n-1) and the y launch dimension spans the rows (0..m-1).
 *
 *   axis == 0 : dst[r][c] = src_mat[r][c] * src_vec[c]   (src_vec indexed by column)
 *   axis == 1 : dst[r][c] = src_mat[r][c] * src_vec[r]   (src_vec indexed by row)
 *
 * Any other axis value leaves dst untouched.
 *
 * The __restrict__ qualifiers mean dst must not alias src_mat or src_vec.
 */
__global__ void mat_vec_mul_kernel (const float * __restrict__ src_mat,
                                    const float * __restrict__ src_vec,
                                    float * __restrict__ dst, int m, int n, int axis){
    const int row = blockIdx.y*blockDim.y+threadIdx.y;
    const int col = blockIdx.x*blockDim.x+threadIdx.x;

    /* Threads that fall in the padded tail of the grid have no element. */
    if (row >= m || col >= n) return;

    /* Widen before multiplying: in 32-bit int arithmetic row * n overflows
     * once the matrix holds more than INT_MAX elements. */
    const size_t idx = static_cast<size_t>(row) * static_cast<size_t>(n)
                     + static_cast<size_t>(col);

    if (axis == 0) {
        dst[idx] = src_mat[idx] * src_vec[col];
    } else if (axis == 1) {
        dst[idx] = src_mat[idx] * src_vec[row];
    }
}
/*
 * Host-side launcher for mat_vec_mul_kernel.
 *
 * Covers the m x n matrix with BLOCK_SIZE x BLOCK_SIZE thread blocks (grid x
 * spans the n columns, grid y spans the m rows), launches the kernel on the
 * default stream and waits for the device to finish before returning.
 *
 * The pointers are handed to the kernel unchanged, so they must refer to
 * memory the device can access. A non-positive m or n launches nothing.
 *
 * The function returns void, so CUDA failures are reported on stderr rather
 * than propagated to the caller.
 */
void mat_vec_mul_kernel_exec(const float *src_mat, const float *src_vec, float *dst, int m, int n, int axis){
    /* An empty matrix would produce a zero-sized grid (and a negative size a
     * wrapped-around one), both invalid launch configurations: skip the
     * launch but keep the synchronisation below. */
    if (m > 0 && n > 0) {
        /* block and grid size: ceil-divide so partial tiles at the right and
         * bottom edges are still covered. */
        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
        const dim3 grid((n+block.x-1)/block.x, (m+block.y-1)/block.y);

        /* launch kernel */
        mat_vec_mul_kernel<<<grid, block>>>(src_mat, src_vec, dst, m, n, axis);
    }

    /* A kernel launch returns no status of its own; a bad configuration
     * (e.g. a grid exceeding the device's grid-size limits for a very tall
     * matrix) only shows up through cudaGetLastError(). */
    cudaError_t err = cudaGetLastError();

    /* cudaDeviceSynchronize() replaces the deprecated cudaThreadSynchronize()
     * and also surfaces faults raised while the kernel was running. */
    if (err == cudaSuccess) err = cudaDeviceSynchronize();

    if (err != cudaSuccess) {
        fprintf(stderr, "mat_vec_mul_kernel_exec: CUDA error: %s\n", cudaGetErrorString(err));
    }
}