refs #6: Fix and optimize IPsec GPU kernels.
 * Enforce the same alignment for data structures shared by the host CPU
   and CUDA GPUs using the C++11 "alignas" keyword (see the layout-check
   sketch after the util_sa_entry.hh diff below).

 * Fix wrong uses of pkt_idx where item_idx should be used.
   (Note that IPsec parallelizes in units of "blocks", which are 16-byte
    slices of packets; see the block-table sketch after the
    IPsecAES_kernel.cu diff below.)

 * Remove some unnecessary branches in IPsecAES kernels.

 * Let the CUDA engine ignore "cudaErrorCudartUnloading", which may be
   returned from API calls during program termination (a tolerant
   error-check sketch follows the computecontext.hh diff below).

 * Performance is now half that of the CPU version with 64-byte packets.
achimnol committed Feb 11, 2016
1 parent e75e280 commit a7eca05
Showing 10 changed files with 92 additions and 102 deletions.
6 changes: 4 additions & 2 deletions Snakefile
@@ -74,6 +74,7 @@ ELEMENT_HEADER_FILES = [s for s in compilelib.find_all(['elements'], r'^.+\.(h|h

# List of object files
OBJ_DIR = 'build'
os.makedirs('build', exist_ok=True)
OBJ_FILES = [joinpath(OBJ_DIR, o) for o in map(lambda s: re.sub(r'^(.+)\.(c|cc|cpp|cu)$', r'\1.o', s), SOURCE_FILES)]
GTEST_MAIN_OBJ = 'build/src/lib/gtest/gtest_main.o'
GTEST_FUSED_OBJ = 'build/src/lib/gtest/gtest-all.o'
@@ -112,13 +113,14 @@ if v: CFLAGS += ' -DNBA_RANDOM_PORT_ACCESS'

# NVIDIA CUDA configurations
if USE_CUDA:
os.makedirs('build/nvcc-temp', exist_ok=True)
CUDA_ARCHS = compilelib.get_cuda_arch()
NVCFLAGS = '-O2 -g -std=c++11 --use_fast_math --expt-relaxed-constexpr -Iinclude -I/usr/local/cuda/include'
NVCFLAGS = '-O2 -lineinfo -g -std=c++11 --keep --keep-dir build/nvcc-temp --use_fast_math --expt-relaxed-constexpr -Iinclude -I/usr/local/cuda/include'
CFLAGS += ' -I/usr/local/cuda/include'
LIBS += ' -L/usr/local/cuda/lib64 -lcudart' #' -lnvidia-ml'
print(CUDA_ARCHS)
if os.getenv('DEBUG', 0):
NVCFLAGS = '-O0 --device-debug -g -G -std=c++11 --use_fast_math -Iinclude -I/usr/local/cuda/include --ptxas-options=-v'
NVCFLAGS = '-O0 -lineinfo -G -g -std=c++11 --keep --keep-dir build/nvcc-temp --use_fast_math --expt-relaxed-constexpr -Iinclude -I/usr/local/cuda/include --ptxas-options=-v'
if len(CUDA_ARCHS) == 0:
NVCFLAGS += ' -DMP_USE_64BIT=0' \
+ ' -gencode arch=compute_10,code=sm_10' \
26 changes: 13 additions & 13 deletions elements/ipsec/IPsecAES.cc
@@ -48,10 +48,10 @@ int IPsecAES::initialize()
h_sa_table = (unordered_map<struct ipaddr_pair, int> *)ctx->node_local_storage->get_alloc("h_aes_sa_table");

/* Storage for host aes key array */
h_key_array = (struct aes_sa_entry *) ctx->node_local_storage->get_alloc("h_aes_key_array");
h_flows = (struct aes_sa_entry *) ctx->node_local_storage->get_alloc("h_aes_flows");

/* Get device pointer from the node local storage. */
d_key_array_ptr = (dev_mem_t *) ctx->node_local_storage->get_alloc("d_aes_key_array_ptr");
d_flows_ptr = (dev_mem_t *) ctx->node_local_storage->get_alloc("d_aes_flows_ptr");

if (aes_sa_entry_array != NULL) {
free(aes_sa_entry_array);
@@ -115,13 +115,13 @@ int IPsecAES::initialize_per_node()

/* Storage for host aes key array */
size = sizeof(struct aes_sa_entry) * num_tunnels;
ctx->node_local_storage->alloc("h_aes_key_array", size);
temp_array = (struct aes_sa_entry *) ctx->node_local_storage->get_alloc("h_aes_key_array");
ctx->node_local_storage->alloc("h_aes_flows", size);
temp_array = (struct aes_sa_entry *) ctx->node_local_storage->get_alloc("h_aes_flows");
assert(aes_sa_entry_array != NULL);
rte_memcpy(temp_array, aes_sa_entry_array, size);

/* Storage for pointer, which points aes key array in device */
ctx->node_local_storage->alloc("d_aes_key_array_ptr", sizeof(dev_mem_t));
ctx->node_local_storage->alloc("d_aes_flows_ptr", sizeof(dev_mem_t));

return 0;
}
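The two-phase pattern above relies on NBA's node-local storage: initialize_per_node() allocates a named per-NUMA-node buffer once, and every per-thread initialize() later fetches the same pointer by name. A minimal sketch of the semantics this code assumes (the mock class and names are illustrative, not the repo's implementation):

```cpp
#include <cassert>
#include <cstdlib>
#include <string>
#include <unordered_map>

// Hypothetical stand-in for NBA's node-local storage: one buffer per name,
// shared by every worker thread on the same NUMA node.
class MockNodeLocalStorage {
public:
    void alloc(const std::string &name, size_t size) {
        assert(buffers_.count(name) == 0);   // allocated once per node
        buffers_[name] = std::calloc(1, size);
    }
    void *get_alloc(const std::string &name) {
        auto it = buffers_.find(name);
        assert(it != buffers_.end());        // must be alloc()'ed beforehand
        return it->second;                   // same pointer for every caller
    }
private:
    std::unordered_map<std::string, void *> buffers_;
};
```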
@@ -156,7 +156,7 @@ int IPsecAES::process(int input_port, Packet *pkt)
struct aes_sa_entry *sa_entry = NULL;

if (likely(anno_isset(&pkt->anno, NBA_ANNO_IPSEC_FLOW_ID))) {
sa_entry = &h_key_array[anno_get(&pkt->anno, NBA_ANNO_IPSEC_FLOW_ID)];
sa_entry = &h_flows[anno_get(&pkt->anno, NBA_ANNO_IPSEC_FLOW_ID)];
unsigned mode = 0;
#ifdef USE_OPENSSL_EVP
int cipher_body_len = 0;
@@ -182,20 +182,20 @@ int IPsecAES::process(int input_port, Packet *pkt)
void IPsecAES::cuda_init_handler(ComputeDevice *device)
{
// Put key array content to device space.
size_t key_array_size = sizeof(struct aes_sa_entry) * num_tunnels;
h_key_array = (struct aes_sa_entry *) ctx->node_local_storage->get_alloc("h_aes_key_array");
dev_mem_t key_array_in_device = device->alloc_device_buffer(key_array_size);
device->memwrite({ h_key_array }, key_array_in_device, 0, key_array_size);
size_t flows_size = sizeof(struct aes_sa_entry) * num_tunnels;
h_flows = (struct aes_sa_entry *) ctx->node_local_storage->get_alloc("h_aes_flows");
dev_mem_t flows_in_device = device->alloc_device_buffer(flows_size);
device->memwrite({ h_flows }, flows_in_device, 0, flows_size);

// Store the device pointer for per-thread instances.
dev_mem_t *p = (dev_mem_t *) ctx->node_local_storage->get_alloc("d_aes_key_array_ptr");
*p = key_array_in_device;
dev_mem_t *p = (dev_mem_t *) ctx->node_local_storage->get_alloc("d_aes_flows_ptr");
*p = flows_in_device;
}

void IPsecAES::cuda_compute_handler(ComputeContext *cctx, struct resource_param *res)
{
struct kernel_arg arg;
arg = {(void *) &d_key_array_ptr->ptr, sizeof(void *), alignof(void *)};
arg = {(void *) &d_flows_ptr->ptr, sizeof(void *), alignof(void *)};
cctx->push_kernel_arg(arg);

dev_kernel_t kern;
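cuda_compute_handler() pushes the address of the device pointer (&d_flows_ptr->ptr) rather than its value, because a CUDA launch consumes an array of pointers to argument values. A sketch of how such a kernel_arg list maps onto cudaLaunchKernel (the launcher function is a simplified assumption; the struct fields mirror the diff):

```cpp
#include <cstddef>
#include <cuda_runtime.h>
#include <vector>

struct kernel_arg {    // mirrors the struct pushed in the diff above
    void *ptr;         // host address holding the argument's value
    size_t size;
    size_t align;
};

// Simplified launcher: cudaLaunchKernel expects an array of pointers to the
// argument values, so pushing {&d_flows_ptr->ptr, ...} is exactly what it needs.
static cudaError_t launch(const void *func, dim3 grid, dim3 block,
                          std::vector<kernel_arg> &args, cudaStream_t stream)
{
    std::vector<void *> raw;
    for (auto &a : args)
        raw.push_back(a.ptr);   // each value is read through a.ptr at launch
    return cudaLaunchKernel(func, grid, block, raw.data(), 0, stream);
}
```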
4 changes: 2 additions & 2 deletions elements/ipsec/IPsecAES.hh
@@ -61,8 +61,8 @@ protected:

/* Per-thread pointers, which points to the node local storage variables. */
std::unordered_map<struct ipaddr_pair, int> *h_sa_table; // tunnel lookup is done in CPU only. No need for GPU ptr.
struct aes_sa_entry *h_key_array = nullptr; // used in CPU.
dev_mem_t *d_key_array_ptr;
struct aes_sa_entry *h_flows = nullptr; // used in CPU.
dev_mem_t *d_flows_ptr;
};

EXPORT_ELEMENT(IPsecAES);
89 changes: 40 additions & 49 deletions elements/ipsec/IPsecAES_kernel.cu
@@ -683,7 +683,7 @@ __global__ void AES_ctr_encrypt_chunk_SharedMem_5(
struct datablock_kernel_arg **datablocks,
uint32_t count, uint8_t *batch_ids, uint16_t *item_ids,
uint8_t *checkbits_d,
struct aes_sa_entry* flow_info
struct aes_sa_entry* flows
)
{
__shared__ uint32_t shared_Te0[256];
@@ -696,96 +696,87 @@
if (idx < count && count != 0) {

const uint8_t batch_idx = batch_ids[idx];
const uint16_t item_idx = item_ids[idx];

const struct datablock_kernel_arg *db_enc_payloads = datablocks[dbid_enc_payloads_d];
const struct datablock_kernel_arg *db_iv = datablocks[dbid_iv_d];
const struct datablock_kernel_arg *db_flow_ids = datablocks[dbid_flow_ids_d];
const struct datablock_kernel_arg *db_aes_block_info = datablocks[dbid_aes_block_info_d];
const struct datablock_kernel_arg *const db_flow_ids = datablocks[dbid_flow_ids_d];
const struct datablock_kernel_arg *const db_iv = datablocks[dbid_iv_d];
const struct datablock_kernel_arg *const db_aes_block_info = datablocks[dbid_aes_block_info_d];

assert(batch_idx < 32);
assert(item_idx < db_aes_block_info->batches[batch_idx].item_count_in);

uint64_t flow_id = 65536;
const struct aes_block_info cur_block_info = ((struct aes_block_info *)
const struct aes_block_info &cur_block_info = ((struct aes_block_info *)
db_aes_block_info->batches[batch_idx].buffer_bases_in)
[item_idx];
const int pkt_idx = cur_block_info.pkt_idx;
const int block_idx_local = cur_block_info.block_idx;
const uintptr_t offset = (uintptr_t) db_enc_payloads->batches[batch_idx].item_offsets_in[pkt_idx].as_value<uintptr_t>();
const uintptr_t length = (uintptr_t) db_enc_payloads->batches[batch_idx].item_sizes_in[pkt_idx];
const uintptr_t offset = (uintptr_t) db_enc_payloads->batches[batch_idx].item_offsets_in[item_idx].as_value<uintptr_t>();
const uintptr_t length = (uintptr_t) db_enc_payloads->batches[batch_idx].item_sizes_in[item_idx];

if (cur_block_info.magic == 85739 && pkt_idx < 64 && offset != 0 && length != 0) {
flow_id = ((uint64_t *) db_flow_ids->batches[batch_idx].buffer_bases_in)[pkt_idx];
if (flow_id != 65536)
assert(flow_id < 1024);
}

/* Step 1. */
uint4 iv = {0,0,0,0};
uint4 ecounter = {0,0,0,0};
uint8_t *aes_key = NULL;
uint8_t *enc_payload = NULL;

if (flow_id != 65536 && flow_id < 1024 && pkt_idx < 64) {

aes_key = flow_info[flow_id].aes_key;
iv = ((uint4 *) db_iv->batches[batch_idx].buffer_bases_in)[pkt_idx];

if (offset != 0 && length != 0) {

enc_payload = ((uint8_t *) db_enc_payloads->batches[batch_idx].buffer_bases_in) + offset;

/* Step 2. (marginal) */
for (int i = 0; i * blockDim.x < 256; i++) {
int index = threadIdx.x + blockDim.x * i;
if (index < 256) {
shared_Te0[index] = Te0_ConstMem[index];
shared_Te1[index] = Te1_ConstMem[index];
shared_Te2[index] = Te2_ConstMem[index];
shared_Te3[index] = Te3_ConstMem[index];
}
}

for (int i = 0; i * blockDim.x < 10; i++) {
int index = threadIdx.x + blockDim.x * i;
if (index < 10) {
shared_Rcon[index] = rcon[index];
}
}
/* Step 2. (marginal) */
for (int i = 0; i * blockDim.x < 256; i++) {
int index = threadIdx.x + blockDim.x * i;
if (index < 256) {
shared_Te0[index] = Te0_ConstMem[index];
shared_Te1[index] = Te1_ConstMem[index];
shared_Te2[index] = Te2_ConstMem[index];
shared_Te3[index] = Te3_ConstMem[index];
}
}

for (int i = 0; i * blockDim.x < 10; i++) {
int index = threadIdx.x + blockDim.x * i;
if (index < 10) {
shared_Rcon[index] = rcon[index];
}
}

__syncthreads();

if (flow_id != 65536 && flow_id < 1024 && pkt_idx < 64 && enc_payload != NULL && aes_key != NULL) {
if (flow_id != 65536) {
assert(pkt_idx < 64);
assert(length != 0);

const uint8_t *const aes_key = flows[flow_id].aes_key;
uint8_t *iv = ((uint8_t *) db_iv->batches[batch_idx].buffer_bases_in
+ (uintptr_t) (16 * pkt_idx));
const uint8_t *enc_payload = ((uint8_t *) db_enc_payloads->batches[batch_idx].buffer_bases_in) + offset;
uint4 ecounter = {0,0,0,0};

assert(enc_payload != NULL);

/* Step 3: Update the IV counters. */
AES_ctr128_inc((unsigned char*) &iv, block_idx_local);
AES_ctr128_inc(iv, block_idx_local);

/* Step 4: Encrypt the counter (this is the bottleneck) */
AES_encrypt_cu_optimized((uint8_t*) &iv, (uint8_t *) &ecounter,
AES_encrypt_cu_optimized(iv, (uint8_t *) &ecounter,
aes_key, shared_Te0, shared_Te1, shared_Te2,
shared_Te3, shared_Rcon);
//AES_encrypt_cu_optimized((uint8_t*) &iv, (uint8_t *) &ecounter,
//AES_encrypt_cu_optimized(iv, (uint8_t *) &ecounter,
// aes_key, Te0_ConstMem, Te1_ConstMem, Te2_ConstMem,
// Te3_ConstMem, rcon);

/* Step 5: XOR the plain text (in-place). */
uint4 *in_blk = (uint4 *) &enc_payload[block_idx_local * AES_BLOCK_SIZE];
assert((uint8_t*)in_blk + AES_BLOCK_SIZE <=
enc_payload + db_enc_payloads->batches[batch_idx].item_sizes_in[pkt_idx]);
assert((uint8_t*)in_blk + AES_BLOCK_SIZE <= enc_payload + length);
(*in_blk).x = ecounter.x ^ (*in_blk).x;
(*in_blk).y = ecounter.y ^ (*in_blk).y;
(*in_blk).z = ecounter.z ^ (*in_blk).z;
(*in_blk).w = ecounter.w ^ (*in_blk).w;
}

} /* endif (idx < total_count) */

__syncthreads();
if (threadIdx.x == 0 && checkbits_d != NULL)
checkbits_d[blockIdx.x] = 1;

} // endif(valid-idx)
}

void *nba::ipsec_aes_encryption_get_cuda_kernel() {
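The index fix is the heart of this hunk: item_idx enumerates 16-byte AES blocks, while pkt_idx names the packet a block belongs to, so per-item arrays like item_offsets_in must be indexed with item_idx. A hedged host-side sketch of how the aes_block_info table read by this kernel could be built (the real datablock code differs; the magic constant matches the kernel's check):

```cpp
#include <vector>

struct aes_block_info {   // same fields as in util_sa_entry.hh below
    int pkt_idx;
    int block_idx;
    int pkt_offset;
    int magic;
};

// One work item per 16-byte AES block: GPU thread i handles item_idx = i and
// finds its packet through pkt_idx, so per-packet and per-item arrays need
// different indices.
static std::vector<aes_block_info>
build_block_table(const std::vector<int> &payload_lens)
{
    std::vector<aes_block_info> table;
    int pkt_offset = 0;
    for (int pkt_idx = 0; pkt_idx < (int) payload_lens.size(); pkt_idx++) {
        int num_blocks = (payload_lens[pkt_idx] + 15) / 16;   // AES_BLOCK_SIZE
        for (int b = 0; b < num_blocks; b++)
            table.push_back({pkt_idx, b, pkt_offset, 85739}); // kernel's magic
        pkt_offset += payload_lens[pkt_idx];
    }
    return table;
}
```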
26 changes: 13 additions & 13 deletions elements/ipsec/IPsecAuthHMACSHA1.cc
@@ -48,10 +48,10 @@ int IPsecAuthHMACSHA1::initialize()
h_sa_table = (unordered_map<struct ipaddr_pair, int> *)ctx->node_local_storage->get_alloc("h_hmac_sa_table");

/* Storage for host hmac key array */
h_key_array = (struct hmac_sa_entry *) ctx->node_local_storage->get_alloc("h_hmac_key_array");
h_flows = (struct hmac_sa_entry *) ctx->node_local_storage->get_alloc("h_hmac_flows");

/* Get device pointer from the node local storage. */
d_key_array_ptr = (dev_mem_t *) ctx->node_local_storage->get_alloc("d_hmac_key_array_ptr");
d_flows_ptr = (dev_mem_t *) ctx->node_local_storage->get_alloc("d_hmac_flows_ptr");

if (hmac_sa_entry_array != NULL) {
free(hmac_sa_entry_array);
@@ -105,13 +105,13 @@ int IPsecAuthHMACSHA1::initialize_per_node()

/* Storage for host hmac key array */
size = sizeof(struct hmac_sa_entry) * num_tunnels;
ctx->node_local_storage->alloc("h_hmac_key_array", size);
temp_array = (struct hmac_sa_entry *) ctx->node_local_storage->get_alloc("h_hmac_key_array");
ctx->node_local_storage->alloc("h_hmac_flows", size);
temp_array = (struct hmac_sa_entry *) ctx->node_local_storage->get_alloc("h_hmac_flows");
assert(hmac_sa_entry_array != NULL);
rte_memcpy(temp_array, hmac_sa_entry_array, size);

/* Storage for pointer, which points hmac key array in device */
ctx->node_local_storage->alloc("d_hmac_key_array_ptr", sizeof(dev_mem_t));
ctx->node_local_storage->alloc("d_hmac_flows_ptr", sizeof(dev_mem_t));

return 0;
}
@@ -151,7 +151,7 @@ int IPsecAuthHMACSHA1::process(int input_port, Packet *pkt)

uint8_t *hmac_key;
if (likely(anno_isset(&pkt->anno, NBA_ANNO_IPSEC_FLOW_ID))) {
sa_entry = &h_key_array[anno_get(&pkt->anno, NBA_ANNO_IPSEC_FLOW_ID)];
sa_entry = &h_flows[anno_get(&pkt->anno, NBA_ANNO_IPSEC_FLOW_ID)];
hmac_key = sa_entry->hmac_key;

rte_memcpy(hmac_buf + 64, payload_out, payload_len);
@@ -177,21 +177,21 @@ int IPsecAuthHMACSHA1::process(int input_port, Packet *pkt)
void IPsecAuthHMACSHA1::cuda_init_handler(ComputeDevice *device)
{
// Put key array content to device space.
size_t key_array_size = sizeof(struct hmac_sa_entry) * num_tunnels;
h_key_array = (struct hmac_sa_entry *) ctx->node_local_storage->get_alloc("h_hmac_key_array");
dev_mem_t key_array_in_device = device->alloc_device_buffer(key_array_size);
device->memwrite({ h_key_array }, key_array_in_device, 0, key_array_size);
size_t flows_size = sizeof(struct hmac_sa_entry) * num_tunnels;
h_flows = (struct hmac_sa_entry *) ctx->node_local_storage->get_alloc("h_hmac_flows");
dev_mem_t flows_in_device = device->alloc_device_buffer(flows_size);
device->memwrite({ h_flows }, flows_in_device, 0, flows_size);

// Store the device pointer for per-thread instances.
dev_mem_t *p = (dev_mem_t *) ctx->node_local_storage->get_alloc("d_hmac_key_array_ptr");
*p = key_array_in_device;
dev_mem_t *p = (dev_mem_t *) ctx->node_local_storage->get_alloc("d_hmac_flows_ptr");
*p = flows_in_device;
}

void IPsecAuthHMACSHA1::cuda_compute_handler(ComputeContext *cctx,
struct resource_param *res)
{
struct kernel_arg arg;
arg = {(void *) &d_key_array_ptr->ptr, sizeof(void *), alignof(void *)};
arg = {(void *) &d_flows_ptr->ptr, sizeof(void *), alignof(void *)};
cctx->push_kernel_arg(arg);

dev_kernel_t kern;
4 changes: 2 additions & 2 deletions elements/ipsec/IPsecAuthHMACSHA1.hh
@@ -60,8 +60,8 @@ protected:
int dummy_index;

std::unordered_map<struct ipaddr_pair, int> *h_sa_table; // tunnel lookup is done in CPU only. No need for GPU ptr.
struct hmac_sa_entry *h_key_array = nullptr; // used in CPU.
dev_mem_t *d_key_array_ptr; // points to the device buffer.
struct hmac_sa_entry *h_flows = nullptr; // used in CPU.
dev_mem_t *d_flows_ptr; // points to the device buffer.

private:
const int idx_pkt_offset = 0;
14 changes: 7 additions & 7 deletions elements/ipsec/IPsecAuthHMACSHA1_kernel.cu
@@ -1254,21 +1254,21 @@ __global__ void computeHMAC_SHA1_3(
const uint8_t *enc_payload_base = (uint8_t *) db_enc_payloads->batches[batch_idx].buffer_bases_in;
const uintptr_t offset = (uintptr_t) db_enc_payloads->batches[batch_idx].item_offsets_in[item_idx].as_value<uintptr_t>();
const uintptr_t length = (uintptr_t) db_enc_payloads->batches[batch_idx].item_sizes_in[item_idx];
if (enc_payload_base != NULL && offset != 0 && length != 0) {
if (enc_payload_base != NULL && length != 0) {
const uint64_t flow_id = ((uint64_t *) db_flow_ids->batches[batch_idx].buffer_bases_in)[item_idx];
if (flow_id != 65536 && flow_id < 1024) {
//assert(flow_id < 1024);
if (flow_id != 65536) {
assert(flow_id < 1024);
const char *hmac_key = (char *) hmac_key_array[flow_id].hmac_key;
HMAC_SHA1((uint32_t *) (enc_payload_base + offset),
(uint32_t *) (enc_payload_base + offset + length),
length, hmac_key);
}
}
}

__syncthreads();
if (threadIdx.x == 0 && checkbits_d != NULL)
checkbits_d[blockIdx.x] = 1;
__syncthreads();
if (threadIdx.x == 0 && checkbits_d != NULL)
checkbits_d[blockIdx.x] = 1;
} // endif(valid-idx)
}

}
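In both kernels, the final checkbits_d write is a per-block completion flag: after the last __syncthreads(), thread 0 stores 1 into checkbits_d[blockIdx.x], and the host can poll those bytes instead of blocking on the stream. A sketch of the host side, assuming the flags live in mapped pinned memory (allocation and names are illustrative):

```cpp
#include <cstdint>
#include <cstring>

// Poll the per-block completion flags written by each block's thread 0.
// Assumes checkbits_h was allocated with cudaHostAlloc(..., cudaHostAllocMapped)
// and its device alias was passed to the kernel as checkbits_d.
static bool all_blocks_done(volatile const uint8_t *checkbits_h, int num_blocks)
{
    for (int b = 0; b < num_blocks; b++)
        if (checkbits_h[b] == 0)
            return false;          // this block has not finished yet
    return true;
}

// Usage sketch: clear flags, launch, then overlap other work while polling.
//   memset((void *) checkbits_h, 0, num_blocks);
//   kernel<<<num_blocks, threads, 0, stream>>>(..., checkbits_d);
//   while (!all_blocks_done(checkbits_h, num_blocks)) { /* do other work */ }
```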
8 changes: 4 additions & 4 deletions elements/ipsec/util_sa_entry.hh
@@ -9,27 +9,27 @@ enum {
HMAC_KEY_SIZE = 64,
};

struct aes_block_info {
struct alignas(8) aes_block_info {
int pkt_idx;
int block_idx;
int pkt_offset;
int magic;
};

struct aes_sa_entry {
struct alignas(8) aes_sa_entry {
// Below two variables have same value.
uint8_t aes_key[AES_BLOCK_SIZE]; // Used in CUDA encryption.
AES_KEY aes_key_t; // Prepared for AES library function.
EVP_CIPHER_CTX evpctx;
int entry_idx; // Index of current flow: value for verification.
};

struct hmac_sa_entry {
struct alignas(8) hmac_sa_entry {
uint8_t hmac_key[HMAC_KEY_SIZE];
int entry_idx;
};

struct hmac_aes_sa_entry {
struct alignas(8) hmac_aes_sa_entry {
// Below two variables have same value.
uint8_t aes_key[AES_BLOCK_SIZE]; // Used in CUDA encryption.
AES_KEY aes_key_t; // Prepared for AES library function.
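The alignas(8) annotations implement the commit's first bullet: these structs are copied byte-for-byte between host and device memory, so the host compiler (building the .cc files) and nvcc (building the .cu files) must agree on their layout. A minimal compile-time check that a shared header could carry (a hypothetical sketch, not code from the repo):

```cpp
struct alignas(8) aes_block_info_example {  // same shape as aes_block_info
    int pkt_idx;
    int block_idx;
    int pkt_offset;
    int magic;
};

// Placed in a header compiled by both the host compiler and nvcc, these
// checks fail the build if either toolchain picks a different layout.
static_assert(sizeof(aes_block_info_example) == 16,
              "host/device size mismatch");
static_assert(alignof(aes_block_info_example) == 8,
              "host/device alignment mismatch");
```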
3 changes: 2 additions & 1 deletion include/nba/engines/cuda/computecontext.hh
@@ -62,7 +62,8 @@ public:
cudaError_t ret = cudaStreamQuery(_stream);
if (ret == cudaErrorNotReady)
return false;
assert(ret == cudaSuccess);
// ignore non-cudaSuccess results...
// (may happen on termination)
return true;
}

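The relaxed stream-query check matches the last commit bullet: during process termination the CUDA runtime may already be unloading, and any API call can return cudaErrorCudartUnloading, which is harmless at that point. A generic error-check helper with the same policy (an illustration, not the repo's actual macro):

```cpp
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Treat cudaErrorCudartUnloading as success: it appears only while the
// process is terminating and the CUDA runtime is being torn down.
static inline void check_cuda(cudaError_t ret, const char *what)
{
    if (ret == cudaSuccess || ret == cudaErrorCudartUnloading)
        return;
    std::fprintf(stderr, "CUDA error in %s: %s\n",
                 what, cudaGetErrorString(ret));
    std::abort();
}

// Usage (e.g., in a destructor): check_cuda(cudaFree(dptr), "cudaFree");
```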
