diff --git a/Snakefile b/Snakefile index 73df518..86d2a2c 100644 --- a/Snakefile +++ b/Snakefile @@ -74,6 +74,7 @@ ELEMENT_HEADER_FILES = [s for s in compilelib.find_all(['elements'], r'^.+\.(h|h # List of object files OBJ_DIR = 'build' +os.makedirs('build', exist_ok=True) OBJ_FILES = [joinpath(OBJ_DIR, o) for o in map(lambda s: re.sub(r'^(.+)\.(c|cc|cpp|cu)$', r'\1.o', s), SOURCE_FILES)] GTEST_MAIN_OBJ = 'build/src/lib/gtest/gtest_main.o' GTEST_FUSED_OBJ = 'build/src/lib/gtest/gtest-all.o' @@ -112,13 +113,14 @@ if v: CFLAGS += ' -DNBA_RANDOM_PORT_ACCESS' # NVIDIA CUDA configurations if USE_CUDA: + os.makedirs('build/nvcc-temp', exist_ok=True) CUDA_ARCHS = compilelib.get_cuda_arch() - NVCFLAGS = '-O2 -g -std=c++11 --use_fast_math --expt-relaxed-constexpr -Iinclude -I/usr/local/cuda/include' + NVCFLAGS = '-O2 -lineinfo -g -std=c++11 --keep --keep-dir build/nvcc-temp --use_fast_math --expt-relaxed-constexpr -Iinclude -I/usr/local/cuda/include' CFLAGS += ' -I/usr/local/cuda/include' LIBS += ' -L/usr/local/cuda/lib64 -lcudart' #' -lnvidia-ml' print(CUDA_ARCHS) if os.getenv('DEBUG', 0): - NVCFLAGS = '-O0 --device-debug -g -G -std=c++11 --use_fast_math -Iinclude -I/usr/local/cuda/include --ptxas-options=-v' + NVCFLAGS = '-O0 -lineinfo -G -g -std=c++11 --keep --keep-dir build/nvcc-temp --use_fast_math --expt-relaxed-constexpr -Iinclude -I/usr/local/cuda/include --ptxas-options=-v' if len(CUDA_ARCHS) == 0: NVCFLAGS += ' -DMP_USE_64BIT=0' \ + ' -gencode arch=compute_10,code=sm_10' \ diff --git a/elements/ipsec/IPsecAES.cc b/elements/ipsec/IPsecAES.cc index cd06e95..526b515 100644 --- a/elements/ipsec/IPsecAES.cc +++ b/elements/ipsec/IPsecAES.cc @@ -48,10 +48,10 @@ int IPsecAES::initialize() h_sa_table = (unordered_map *)ctx->node_local_storage->get_alloc("h_aes_sa_table"); /* Storage for host aes key array */ - h_key_array = (struct aes_sa_entry *) ctx->node_local_storage->get_alloc("h_aes_key_array"); + h_flows = (struct aes_sa_entry *) ctx->node_local_storage->get_alloc("h_aes_flows"); /* Get device pointer from the node local storage. 
*/ - d_key_array_ptr = (dev_mem_t *) ctx->node_local_storage->get_alloc("d_aes_key_array_ptr"); + d_flows_ptr = (dev_mem_t *) ctx->node_local_storage->get_alloc("d_aes_flows_ptr"); if (aes_sa_entry_array != NULL) { free(aes_sa_entry_array); @@ -115,13 +115,13 @@ int IPsecAES::initialize_per_node() /* Storage for host aes key array */ size = sizeof(struct aes_sa_entry) * num_tunnels; - ctx->node_local_storage->alloc("h_aes_key_array", size); - temp_array = (struct aes_sa_entry *) ctx->node_local_storage->get_alloc("h_aes_key_array"); + ctx->node_local_storage->alloc("h_aes_flows", size); + temp_array = (struct aes_sa_entry *) ctx->node_local_storage->get_alloc("h_aes_flows"); assert(aes_sa_entry_array != NULL); rte_memcpy(temp_array, aes_sa_entry_array, size); /* Storage for pointer, which points aes key array in device */ - ctx->node_local_storage->alloc("d_aes_key_array_ptr", sizeof(dev_mem_t)); + ctx->node_local_storage->alloc("d_aes_flows_ptr", sizeof(dev_mem_t)); return 0; } @@ -156,7 +156,7 @@ int IPsecAES::process(int input_port, Packet *pkt) struct aes_sa_entry *sa_entry = NULL; if (likely(anno_isset(&pkt->anno, NBA_ANNO_IPSEC_FLOW_ID))) { - sa_entry = &h_key_array[anno_get(&pkt->anno, NBA_ANNO_IPSEC_FLOW_ID)]; + sa_entry = &h_flows[anno_get(&pkt->anno, NBA_ANNO_IPSEC_FLOW_ID)]; unsigned mode = 0; #ifdef USE_OPENSSL_EVP int cipher_body_len = 0; @@ -182,20 +182,20 @@ int IPsecAES::process(int input_port, Packet *pkt) void IPsecAES::cuda_init_handler(ComputeDevice *device) { // Put key array content to device space. - size_t key_array_size = sizeof(struct aes_sa_entry) * num_tunnels; - h_key_array = (struct aes_sa_entry *) ctx->node_local_storage->get_alloc("h_aes_key_array"); - dev_mem_t key_array_in_device = device->alloc_device_buffer(key_array_size); - device->memwrite({ h_key_array }, key_array_in_device, 0, key_array_size); + size_t flows_size = sizeof(struct aes_sa_entry) * num_tunnels; + h_flows = (struct aes_sa_entry *) ctx->node_local_storage->get_alloc("h_aes_flows"); + dev_mem_t flows_in_device = device->alloc_device_buffer(flows_size); + device->memwrite({ h_flows }, flows_in_device, 0, flows_size); // Store the device pointer for per-thread instances. - dev_mem_t *p = (dev_mem_t *) ctx->node_local_storage->get_alloc("d_aes_key_array_ptr"); - *p = key_array_in_device; + dev_mem_t *p = (dev_mem_t *) ctx->node_local_storage->get_alloc("d_aes_flows_ptr"); + *p = flows_in_device; } void IPsecAES::cuda_compute_handler(ComputeContext *cctx, struct resource_param *res) { struct kernel_arg arg; - arg = {(void *) &d_key_array_ptr->ptr, sizeof(void *), alignof(void *)}; + arg = {(void *) &d_flows_ptr->ptr, sizeof(void *), alignof(void *)}; cctx->push_kernel_arg(arg); dev_kernel_t kern; diff --git a/elements/ipsec/IPsecAES.hh b/elements/ipsec/IPsecAES.hh index 2d07b1d..df74a10 100644 --- a/elements/ipsec/IPsecAES.hh +++ b/elements/ipsec/IPsecAES.hh @@ -61,8 +61,8 @@ protected: /* Per-thread pointers, which points to the node local storage variables. */ std::unordered_map *h_sa_table; // tunnel lookup is done in CPU only. No need for GPU ptr. - struct aes_sa_entry *h_key_array = nullptr; // used in CPU. - dev_mem_t *d_key_array_ptr; + struct aes_sa_entry *h_flows = nullptr; // used in CPU. 
+ dev_mem_t *d_flows_ptr; }; EXPORT_ELEMENT(IPsecAES); diff --git a/elements/ipsec/IPsecAES_kernel.cu b/elements/ipsec/IPsecAES_kernel.cu index 478d312..d5fab3c 100644 --- a/elements/ipsec/IPsecAES_kernel.cu +++ b/elements/ipsec/IPsecAES_kernel.cu @@ -683,7 +683,7 @@ __global__ void AES_ctr_encrypt_chunk_SharedMem_5( struct datablock_kernel_arg **datablocks, uint32_t count, uint8_t *batch_ids, uint16_t *item_ids, uint8_t *checkbits_d, - struct aes_sa_entry* flow_info + struct aes_sa_entry* flows ) { __shared__ uint32_t shared_Te0[256]; @@ -696,24 +696,23 @@ __global__ void AES_ctr_encrypt_chunk_SharedMem_5( if (idx < count && count != 0) { const uint8_t batch_idx = batch_ids[idx]; - const uint16_t item_idx = item_ids[idx]; + const uint16_t item_idx = item_ids[idx]; const struct datablock_kernel_arg *db_enc_payloads = datablocks[dbid_enc_payloads_d]; - const struct datablock_kernel_arg *db_iv = datablocks[dbid_iv_d]; - const struct datablock_kernel_arg *db_flow_ids = datablocks[dbid_flow_ids_d]; - const struct datablock_kernel_arg *db_aes_block_info = datablocks[dbid_aes_block_info_d]; + const struct datablock_kernel_arg *const db_flow_ids = datablocks[dbid_flow_ids_d]; + const struct datablock_kernel_arg *const db_iv = datablocks[dbid_iv_d]; + const struct datablock_kernel_arg *const db_aes_block_info = datablocks[dbid_aes_block_info_d]; - assert(batch_idx < 32); assert(item_idx < db_aes_block_info->batches[batch_idx].item_count_in); uint64_t flow_id = 65536; - const struct aes_block_info cur_block_info = ((struct aes_block_info *) + const struct aes_block_info &cur_block_info = ((struct aes_block_info *) db_aes_block_info->batches[batch_idx].buffer_bases_in) [item_idx]; const int pkt_idx = cur_block_info.pkt_idx; const int block_idx_local = cur_block_info.block_idx; - const uintptr_t offset = (uintptr_t) db_enc_payloads->batches[batch_idx].item_offsets_in[pkt_idx].as_value(); - const uintptr_t length = (uintptr_t) db_enc_payloads->batches[batch_idx].item_sizes_in[pkt_idx]; + const uintptr_t offset = (uintptr_t) db_enc_payloads->batches[batch_idx].item_offsets_in[item_idx].as_value(); + const uintptr_t length = (uintptr_t) db_enc_payloads->batches[batch_idx].item_sizes_in[item_idx]; if (cur_block_info.magic == 85739 && pkt_idx < 64 && offset != 0 && length != 0) { flow_id = ((uint64_t *) db_flow_ids->batches[batch_idx].buffer_bases_in)[pkt_idx]; @@ -721,71 +720,63 @@ __global__ void AES_ctr_encrypt_chunk_SharedMem_5( assert(flow_id < 1024); } - /* Step 1. */ - uint4 iv = {0,0,0,0}; - uint4 ecounter = {0,0,0,0}; - uint8_t *aes_key = NULL; - uint8_t *enc_payload = NULL; - - if (flow_id != 65536 && flow_id < 1024 && pkt_idx < 64) { - - aes_key = flow_info[flow_id].aes_key; - iv = ((uint4 *) db_iv->batches[batch_idx].buffer_bases_in)[pkt_idx]; - - if (offset != 0 && length != 0) { - - enc_payload = ((uint8_t *) db_enc_payloads->batches[batch_idx].buffer_bases_in) + offset; - - /* Step 2. (marginal) */ - for (int i = 0; i * blockDim.x < 256; i++) { - int index = threadIdx.x + blockDim.x * i; - if (index < 256) { - shared_Te0[index] = Te0_ConstMem[index]; - shared_Te1[index] = Te1_ConstMem[index]; - shared_Te2[index] = Te2_ConstMem[index]; - shared_Te3[index] = Te3_ConstMem[index]; - } - } - - for (int i = 0; i * blockDim.x < 10; i++) { - int index = threadIdx.x + blockDim.x * i; - if (index < 10) { - shared_Rcon[index] = rcon[index]; - } - } + /* Step 2. 
(marginal) */ + for (int i = 0; i * blockDim.x < 256; i++) { + int index = threadIdx.x + blockDim.x * i; + if (index < 256) { + shared_Te0[index] = Te0_ConstMem[index]; + shared_Te1[index] = Te1_ConstMem[index]; + shared_Te2[index] = Te2_ConstMem[index]; + shared_Te3[index] = Te3_ConstMem[index]; + } + } + + for (int i = 0; i * blockDim.x < 10; i++) { + int index = threadIdx.x + blockDim.x * i; + if (index < 10) { + shared_Rcon[index] = rcon[index]; } } __syncthreads(); - if (flow_id != 65536 && flow_id < 1024 && pkt_idx < 64 && enc_payload != NULL && aes_key != NULL) { + if (flow_id != 65536) { + assert(pkt_idx < 64); + assert(length != 0); + + const uint8_t *const aes_key = flows[flow_id].aes_key; + uint8_t *iv = ((uint8_t *) db_iv->batches[batch_idx].buffer_bases_in + + (uintptr_t) (16 * pkt_idx)); + const uint8_t *enc_payload = ((uint8_t *) db_enc_payloads->batches[batch_idx].buffer_bases_in) + offset; + uint4 ecounter = {0,0,0,0}; + + assert(enc_payload != NULL); /* Step 3: Update the IV counters. */ - AES_ctr128_inc((unsigned char*) &iv, block_idx_local); + AES_ctr128_inc(iv, block_idx_local); /* Step 4: Encrypt the counter (this is the bottleneck) */ - AES_encrypt_cu_optimized((uint8_t*) &iv, (uint8_t *) &ecounter, + AES_encrypt_cu_optimized(iv, (uint8_t *) &ecounter, aes_key, shared_Te0, shared_Te1, shared_Te2, shared_Te3, shared_Rcon); - //AES_encrypt_cu_optimized((uint8_t*) &iv, (uint8_t *) &ecounter, + //AES_encrypt_cu_optimized(iv, (uint8_t *) &ecounter, // aes_key, Te0_ConstMem, Te1_ConstMem, Te2_ConstMem, // Te3_ConstMem, rcon); /* Step 5: XOR the plain text (in-place). */ uint4 *in_blk = (uint4 *) &enc_payload[block_idx_local * AES_BLOCK_SIZE]; - assert((uint8_t*)in_blk + AES_BLOCK_SIZE <= - enc_payload + db_enc_payloads->batches[batch_idx].item_sizes_in[pkt_idx]); + assert((uint8_t*)in_blk + AES_BLOCK_SIZE <= enc_payload + length); (*in_blk).x = ecounter.x ^ (*in_blk).x; (*in_blk).y = ecounter.y ^ (*in_blk).y; (*in_blk).z = ecounter.z ^ (*in_blk).z; (*in_blk).w = ecounter.w ^ (*in_blk).w; } - } /* endif (idx < total_count) */ - __syncthreads(); if (threadIdx.x == 0 && checkbits_d != NULL) checkbits_d[blockIdx.x] = 1; + + } // endif(valid-idx) } void *nba::ipsec_aes_encryption_get_cuda_kernel() { diff --git a/elements/ipsec/IPsecAuthHMACSHA1.cc b/elements/ipsec/IPsecAuthHMACSHA1.cc index 29c73b0..a0bee55 100644 --- a/elements/ipsec/IPsecAuthHMACSHA1.cc +++ b/elements/ipsec/IPsecAuthHMACSHA1.cc @@ -48,10 +48,10 @@ int IPsecAuthHMACSHA1::initialize() h_sa_table = (unordered_map *)ctx->node_local_storage->get_alloc("h_hmac_sa_table"); /* Storage for host hmac key array */ - h_key_array = (struct hmac_sa_entry *) ctx->node_local_storage->get_alloc("h_hmac_key_array"); + h_flows = (struct hmac_sa_entry *) ctx->node_local_storage->get_alloc("h_hmac_flows"); /* Get device pointer from the node local storage. 
*/ - d_key_array_ptr = (dev_mem_t *) ctx->node_local_storage->get_alloc("d_hmac_key_array_ptr"); + d_flows_ptr = (dev_mem_t *) ctx->node_local_storage->get_alloc("d_hmac_flows_ptr"); if (hmac_sa_entry_array != NULL) { free(hmac_sa_entry_array); @@ -105,13 +105,13 @@ int IPsecAuthHMACSHA1::initialize_per_node() /* Storage for host hmac key array */ size = sizeof(struct hmac_sa_entry) * num_tunnels; - ctx->node_local_storage->alloc("h_hmac_key_array", size); - temp_array = (struct hmac_sa_entry *) ctx->node_local_storage->get_alloc("h_hmac_key_array"); + ctx->node_local_storage->alloc("h_hmac_flows", size); + temp_array = (struct hmac_sa_entry *) ctx->node_local_storage->get_alloc("h_hmac_flows"); assert(hmac_sa_entry_array != NULL); rte_memcpy(temp_array, hmac_sa_entry_array, size); /* Storage for pointer, which points hmac key array in device */ - ctx->node_local_storage->alloc("d_hmac_key_array_ptr", sizeof(dev_mem_t)); + ctx->node_local_storage->alloc("d_hmac_flows_ptr", sizeof(dev_mem_t)); return 0; } @@ -151,7 +151,7 @@ int IPsecAuthHMACSHA1::process(int input_port, Packet *pkt) uint8_t *hmac_key; if (likely(anno_isset(&pkt->anno, NBA_ANNO_IPSEC_FLOW_ID))) { - sa_entry = &h_key_array[anno_get(&pkt->anno, NBA_ANNO_IPSEC_FLOW_ID)]; + sa_entry = &h_flows[anno_get(&pkt->anno, NBA_ANNO_IPSEC_FLOW_ID)]; hmac_key = sa_entry->hmac_key; rte_memcpy(hmac_buf + 64, payload_out, payload_len); @@ -177,21 +177,21 @@ int IPsecAuthHMACSHA1::process(int input_port, Packet *pkt) void IPsecAuthHMACSHA1::cuda_init_handler(ComputeDevice *device) { // Put key array content to device space. - size_t key_array_size = sizeof(struct hmac_sa_entry) * num_tunnels; - h_key_array = (struct hmac_sa_entry *) ctx->node_local_storage->get_alloc("h_hmac_key_array"); - dev_mem_t key_array_in_device = device->alloc_device_buffer(key_array_size); - device->memwrite({ h_key_array }, key_array_in_device, 0, key_array_size); + size_t flows_size = sizeof(struct hmac_sa_entry) * num_tunnels; + h_flows = (struct hmac_sa_entry *) ctx->node_local_storage->get_alloc("h_hmac_flows"); + dev_mem_t flows_in_device = device->alloc_device_buffer(flows_size); + device->memwrite({ h_flows }, flows_in_device, 0, flows_size); // Store the device pointer for per-thread instances. - dev_mem_t *p = (dev_mem_t *) ctx->node_local_storage->get_alloc("d_hmac_key_array_ptr"); - *p = key_array_in_device; + dev_mem_t *p = (dev_mem_t *) ctx->node_local_storage->get_alloc("d_hmac_flows_ptr"); + *p = flows_in_device; } void IPsecAuthHMACSHA1::cuda_compute_handler(ComputeContext *cctx, struct resource_param *res) { struct kernel_arg arg; - arg = {(void *) &d_key_array_ptr->ptr, sizeof(void *), alignof(void *)}; + arg = {(void *) &d_flows_ptr->ptr, sizeof(void *), alignof(void *)}; cctx->push_kernel_arg(arg); dev_kernel_t kern; diff --git a/elements/ipsec/IPsecAuthHMACSHA1.hh b/elements/ipsec/IPsecAuthHMACSHA1.hh index b2fced0..36e3bdc 100644 --- a/elements/ipsec/IPsecAuthHMACSHA1.hh +++ b/elements/ipsec/IPsecAuthHMACSHA1.hh @@ -60,8 +60,8 @@ protected: int dummy_index; std::unordered_map *h_sa_table; // tunnel lookup is done in CPU only. No need for GPU ptr. - struct hmac_sa_entry *h_key_array = nullptr; // used in CPU. - dev_mem_t *d_key_array_ptr; // points to the device buffer. + struct hmac_sa_entry *h_flows = nullptr; // used in CPU. + dev_mem_t *d_flows_ptr; // points to the device buffer. 
 private:
     const int idx_pkt_offset = 0;
diff --git a/elements/ipsec/IPsecAuthHMACSHA1_kernel.cu b/elements/ipsec/IPsecAuthHMACSHA1_kernel.cu
index ea6bd2b..eff63ae 100644
--- a/elements/ipsec/IPsecAuthHMACSHA1_kernel.cu
+++ b/elements/ipsec/IPsecAuthHMACSHA1_kernel.cu
@@ -1254,21 +1254,21 @@ __global__ void computeHMAC_SHA1_3(
             const uint8_t *enc_payload_base = (uint8_t *) db_enc_payloads->batches[batch_idx].buffer_bases_in;
             const uintptr_t offset = (uintptr_t) db_enc_payloads->batches[batch_idx].item_offsets_in[item_idx].as_value();
             const uintptr_t length = (uintptr_t) db_enc_payloads->batches[batch_idx].item_sizes_in[item_idx];
-            if (enc_payload_base != NULL && offset != 0 && length != 0) {
+            if (enc_payload_base != NULL && length != 0) {
                 const uint64_t flow_id = ((uint64_t *) db_flow_ids->batches[batch_idx].buffer_bases_in)[item_idx];
-                if (flow_id != 65536 && flow_id < 1024) {
-                    //assert(flow_id < 1024);
+                if (flow_id != 65536) {
+                    assert(flow_id < 1024);
                     const char *hmac_key = (char *) hmac_key_array[flow_id].hmac_key;
                     HMAC_SHA1((uint32_t *) (enc_payload_base + offset),
                               (uint32_t *) (enc_payload_base + offset + length),
                               length, hmac_key);
                 }
             }
-        }
-        __syncthreads();
-        if (threadIdx.x == 0 && checkbits_d != NULL)
-            checkbits_d[blockIdx.x] = 1;
+            __syncthreads();
+            if (threadIdx.x == 0 && checkbits_d != NULL)
+                checkbits_d[blockIdx.x] = 1;
+        } // endif(valid-idx)
     }
 }
diff --git a/elements/ipsec/util_sa_entry.hh b/elements/ipsec/util_sa_entry.hh
index a2ce1d8..39f4dc1 100644
--- a/elements/ipsec/util_sa_entry.hh
+++ b/elements/ipsec/util_sa_entry.hh
@@ -9,14 +9,14 @@ enum {
     HMAC_KEY_SIZE = 64,
 };
 
-struct aes_block_info {
+struct alignas(8) aes_block_info {
     int pkt_idx;
     int block_idx;
     int pkt_offset;
     int magic;
 };
 
-struct aes_sa_entry {
+struct alignas(8) aes_sa_entry {
     // Below two variables have same value.
     uint8_t aes_key[AES_BLOCK_SIZE];    // Used in CUDA encryption.
     AES_KEY aes_key_t;                  // Prepared for AES library function.
@@ -24,12 +24,12 @@ struct aes_sa_entry {
     int entry_idx;                      // Index of current flow: value for verification.
 };
 
-struct hmac_sa_entry {
+struct alignas(8) hmac_sa_entry {
     uint8_t hmac_key[HMAC_KEY_SIZE];
     int entry_idx;
 };
 
-struct hmac_aes_sa_entry {
+struct alignas(8) hmac_aes_sa_entry {
     // Below two variables have same value.
     uint8_t aes_key[AES_BLOCK_SIZE];    // Used in CUDA encryption.
     AES_KEY aes_key_t;                  // Prepared for AES library function.
diff --git a/include/nba/engines/cuda/computecontext.hh b/include/nba/engines/cuda/computecontext.hh
index 6b180e3..8a8bb6c 100644
--- a/include/nba/engines/cuda/computecontext.hh
+++ b/include/nba/engines/cuda/computecontext.hh
@@ -62,7 +62,8 @@ public:
         cudaError_t ret = cudaStreamQuery(_stream);
         if (ret == cudaErrorNotReady)
            return false;
-        assert(ret == cudaSuccess);
+        // Ignore non-cudaSuccess results here;
+        // they may happen during termination.
         return true;
     }
 
diff --git a/include/nba/engines/cuda/utils.hh b/include/nba/engines/cuda/utils.hh
index 1146c78..7c4605c 100644
--- a/include/nba/engines/cuda/utils.hh
+++ b/include/nba/engines/cuda/utils.hh
@@ -12,22 +12,18 @@
  * We should have our own cutilSafeCall() macro.
*/ -#ifdef __cplusplus - -#endif +extern "C" { inline void __cudaSafeCall(cudaError err, const char *file, const int line) { - if (cudaSuccess != err) { - fprintf(stderr, "%s(%i): CUDA Runtime Error %d: %s.\n", + if (cudaSuccess == err || cudaErrorCudartUnloading == err) + return; + fprintf(stderr, "%s(%i): CUDA Runtime Error %d: %s.\n", file, line, (int)err, cudaGetErrorString(err)); - exit(-1); - + exit(-1); } -#ifdef __cplusplus } -#endif #define cutilSafeCall(err) __cudaSafeCall(err, __FILE__, __LINE__)
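
A usage note on the relaxed cutilSafeCall() above: together with the cudaStreamQuery() change in computecontext.hh, the wrapper now treats cudaErrorCudartUnloading as benign, so CUDA calls issued while the runtime is tearing down (e.g. from destructors running at process exit) no longer abort via exit(-1). A minimal sketch of how call sites behave under the new semantics; the buffer and sizes are hypothetical:

    #include <cuda_runtime.h>
    #include <nba/engines/cuda/utils.hh>   // provides cutilSafeCall()

    int main() {
        void *d_buf = nullptr;
        cutilSafeCall(cudaMalloc(&d_buf, 4096));   // real errors still print and exit(-1)
        cutilSafeCall(cudaMemset(d_buf, 0, 4096));
        // If this ran during process teardown, the runtime may already be
        // unloading; cudaErrorCudartUnloading is now silently ignored
        // instead of terminating the process.
        cutilSafeCall(cudaFree(d_buf));
        return 0;
    }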
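The alignas(8) qualifiers added in util_sa_entry.hh pin the structs' alignment so that the gcc-compiled host side and the nvcc-compiled device side agree on the stride of the flows array. A compile-time guard along these lines catches any future divergence; this is a sketch that assumes a 4-byte int and that the repository root is on the include path:

    #include "elements/ipsec/util_sa_entry.hh"

    // If host and device compilers ever disagreed on padding, the kernels
    // would index flows[] with the wrong stride; fail the build instead.
    static_assert(alignof(struct aes_block_info) == 8,
                  "aes_block_info must be 8-byte aligned on host and device");
    static_assert(sizeof(struct aes_block_info) == 16,
                  "four 4-byte ints, no padding expected");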
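For orientation, the rewritten body of AES_ctr_encrypt_chunk_SharedMem_5 is plain AES-CTR: each thread advances its packet's 16-byte IV by the thread's block index within the packet (Step 3), encrypts the counter to obtain one keystream block (Step 4), and XORs it into the payload in place (Step 5). A host-side sketch of the same dataflow; encrypt_block is a hypothetical stand-in for AES_encrypt_cu_optimized(), not real AES, and the counter arithmetic mirrors what AES_ctr128_inc() is assumed to do:

    #include <cstdint>
    #include <cstring>

    // Hypothetical stand-in for AES_encrypt_cu_optimized(); any 16-byte
    // keyed transform suffices to illustrate the dataflow (not real AES).
    static void encrypt_block(const uint8_t ctr[16], uint8_t keystream[16],
                              const uint8_t *aes_key) {
        for (int i = 0; i < 16; i++)
            keystream[i] = (uint8_t) (ctr[i] ^ aes_key[i] ^ 0x5a);
    }

    // Big-endian add on the low 32 bits of the counter block.
    static void ctr128_add(uint8_t ctr[16], uint32_t delta) {
        for (int i = 15; i >= 12 && delta != 0; i--) {
            uint32_t sum = ctr[i] + (delta & 0xff);
            ctr[i] = (uint8_t) sum;
            delta = (delta >> 8) + (sum >> 8);   // propagate the carry
        }
    }

    // One thread's work: Step 3 advances the counter, Step 4 produces the
    // keystream, Step 5 XORs the payload block in place.
    void ctr_encrypt_one_block(uint8_t *payload, uint32_t block_idx,
                               const uint8_t iv[16], const uint8_t *aes_key) {
        uint8_t ctr[16], keystream[16];
        std::memcpy(ctr, iv, 16);            // work on a private copy of the IV
        ctr128_add(ctr, block_idx);
        encrypt_block(ctr, keystream, aes_key);
        for (int i = 0; i < 16; i++)
            payload[16 * block_idx + i] ^= keystream[i];
    }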
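Finally, the h_aes_key_array to h_aes_flows renames (and their HMAC counterparts) leave the node-local storage contract intact: initialize_per_node() allocates a named slot once per NUMA node, and each per-thread initialize() later fetches the same pointer by name, so the two sides only need to agree on the string key. A condensed sketch of that contract; NodeLocalStorage here is a hypothetical minimal stand-in for NBA's real class, and the entry size is a placeholder:

    #include <cassert>
    #include <cstdlib>
    #include <string>
    #include <unordered_map>

    // Hypothetical stand-in illustrating the alloc()/get_alloc() protocol.
    class NodeLocalStorage {
        std::unordered_map<std::string, void *> slots;
    public:
        void alloc(const char *name, size_t size) {
            slots[name] = calloc(1, size);      // once per NUMA node
        }
        void *get_alloc(const char *name) {
            assert(slots.count(name) == 1);     // alloc() must have run first
            return slots.at(name);
        }
    };

    void example(NodeLocalStorage *nls, size_t num_tunnels) {
        // initialize_per_node(): create the shared slot once.
        nls->alloc("h_aes_flows", 64 * num_tunnels);  // 64 = placeholder entry size
        // initialize(), per worker thread: look up the same slot by name.
        void *h_flows = nls->get_alloc("h_aes_flows");
        (void) h_flows;
    }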