diff --git a/Makefile b/Makefile index fd6e78bc8..f7144b7db 100644 --- a/Makefile +++ b/Makefile @@ -80,7 +80,7 @@ ifeq ($(CAFFE_MLSL_SHUFFLE), 1) COMMON_FLAGS += -DCAFFE_MLSL_SHUFFLE endif -ifeq ($(FW_OVERLAP_OPT), 1) +ifneq ($(FW_OVERLAP_OPT), 0) COMMON_FLAGS += -DFW_OVERLAP_OPT endif endif @@ -547,6 +547,12 @@ LIBRARY_DIRS += $(LIB_BUILD_DIR) # Automatic dependency generation (nvcc is handled separately) CXXFLAGS += -MMD -MP +##########SGD FUSION####################### +ifeq ($(ENABLE_SGD_FUSION), 1) + COMMON_FLAGS += -DENABLE_SGD_FUSION +endif +########################################### +# # Complete build flags. COMMON_FLAGS += $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir)) CXXFLAGS += -std=c++11 -pthread -fPIC $(COMMON_FLAGS) $(WARNINGS) diff --git a/Makefile.config.example b/Makefile.config.example index 8bfcc57a3..539a00a67 100644 --- a/Makefile.config.example +++ b/Makefile.config.example @@ -170,5 +170,8 @@ DISTRIBUTE_DIR := distribute # The ID of the GPU that 'make runtest' will use to run unit tests. 
TEST_GPUID := 0 +# Uncomment for enabling SGD fusion +# ENABLE_SGD_FUSION := 1 + # enable pretty build (comment to see full commands) Q ?= @ diff --git a/Makefile.mkldnn b/Makefile.mkldnn index ec1a70bc5..d113a8923 100644 --- a/Makefile.mkldnn +++ b/Makefile.mkldnn @@ -1,5 +1,5 @@ CAFFE_ROOTDIR := $(shell pwd) -MKLDNN_ROOTDIR := external/mkldnn +MKLDNN_ROOTDIR := $(CAFFE_ROOTDIR)/external/mkldnn MKLDNN_TMPDIR := $(MKLDNN_ROOTDIR)/tmp MKLDNN_SRCDIR := $(MKLDNN_ROOTDIR)/src MKLDNN_BUILDDIR := $(MKLDNN_ROOTDIR)/build @@ -22,7 +22,7 @@ ifneq (,$(findstring ccache,$(CC))) endif MKLDNN_GITHUB := https://github.com/01org/mkl-dnn.git -MKLDNN_CMAKE_FLAGS += $(MKLDNN_SRCDIR) -DCMAKE_INSTALL_PREFIX=$(CAFFE_ROOTDIR)/$(MKLDNN_INSTALLDIR) -DMKLROOT=${MKL_ROOTDIR} -B$(CAFFE_ROOTDIR)/$(MKLDNN_BUILDDIR) -DCMAKE_CXX_COMPILER="$(MKLDNN_CXX)" -DCMAKE_C_COMPILER="$(MKLDNN_CC)" +MKLDNN_CMAKE_FLAGS += $(MKLDNN_SRCDIR) -DCMAKE_INSTALL_PREFIX=$(MKLDNN_INSTALLDIR) -DMKLROOT=${MKL_ROOTDIR} -B$(MKLDNN_BUILDDIR) -DCMAKE_CXX_COMPILER="$(MKLDNN_CXX)" -DCMAKE_C_COMPILER="$(MKLDNN_CC)" ifeq ("$(wildcard $(MKLDNN_INSTALLDIR)/include/mkldnn.hpp)", "") mkldnn_download: @@ -32,8 +32,8 @@ mkldnn_download: mkldnn_build: mkldnn_download cmake $(MKLDNN_CMAKE_FLAGS) - make -C $(CAFFE_ROOTDIR)/$(MKLDNN_BUILDDIR) -j$(shell cat /proc/cpuinfo |grep 'processor'|wc -l) - make -C $(CAFFE_ROOTDIR)/$(MKLDNN_BUILDDIR) install + make -C $(MKLDNN_BUILDDIR) -j$(shell cat /proc/cpuinfo |grep 'processor'|wc -l) + make -C $(MKLDNN_BUILDDIR) install else mkldnn_download: mkldnn_build: diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 67adf4ba7..b8c5577c6 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -122,7 +122,7 @@ if(USE_MLSL) if(CAFFE_MLSL_SHUFFLE) add_definitions("-DCAFFE_MLSL_SHUFFLE") endif() - if(FW_OVERLAP_OPT) + if(FW_OVERLAP_OPT OR NOT DEFINED FW_OVERLAP_OPT) message(STATUS "Forward overlapping optimization is enabled!") add_definitions("-DFW_OVERLAP_OPT") endif() 
diff --git a/examples/cpp_classification/batch_classification.cpp b/examples/cpp_classification/batch_classification.cpp index 374671baa..8295bf4e5 100644 --- a/examples/cpp_classification/batch_classification.cpp +++ b/examples/cpp_classification/batch_classification.cpp @@ -422,6 +422,10 @@ int main(int argc, char** argv) { cout<<"Use mean file: "< shape_data_; #endif vector shape_; - int count_; - int capacity_; + long count_; + long capacity_; DISABLE_COPY_AND_ASSIGN(Blob); }; // class Blob diff --git a/include/caffe/layer.hpp b/include/caffe/layer.hpp index 45d65c799..5a95a7730 100644 --- a/include/caffe/layer.hpp +++ b/include/caffe/layer.hpp @@ -55,8 +55,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define LOG_BLOB(layer, blob, part, blob_id, description) \ do \ { \ - int elems_to_log = std::min(MAX_ELEMS_TO_LOG, blob->count()); \ - for (int idx = 0; idx < elems_to_log; idx++) \ + long elems_to_log = std::min(static_cast(MAX_ELEMS_TO_LOG), blob->count()); \ + for (long idx = 0; idx < elems_to_log; idx++) \ { \ LOG_LAYER(layer) << description \ << ", blob_id " << blob_id \ @@ -68,8 +68,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define LOG_PARAM_BLOB(blob, part, blob_id, description) \ do \ { \ - int elems_to_log = std::min(MAX_ELEMS_TO_LOG, blob->count()); \ - for (int idx = 0; idx < elems_to_log; idx++) \ + long elems_to_log = std::min(static_cast(MAX_ELEMS_TO_LOG), blob->count()); \ + for (long idx = 0; idx < elems_to_log; idx++) \ { \ DLOG(INFO) << description \ << ", blob_id " << blob_id \ @@ -521,7 +521,12 @@ class Layer { CHECK_EQ(top.size(), num_loss_weights) << "loss_weight must be " "unspecified or specified once per top blob."; for (int top_id = 0; top_id < top.size(); ++top_id) { +#ifdef USE_MLSL + const Dtype loss_weight = layer_param_.loss_weight(top_id) / + GetDistribution().get_data_parts(); +#else const Dtype loss_weight = layer_param_.loss_weight(top_id); +#endif if (loss_weight == Dtype(0)) { continue; } this->set_loss(top_id, loss_weight); const int count = top[top_id]->count(); diff --git a/include/caffe/layers/batch_norm_layer.hpp b/include/caffe/layers/batch_norm_layer.hpp index e83bab953..c777de30c 100644 --- a/include/caffe/layers/batch_norm_layer.hpp +++ b/include/caffe/layers/batch_norm_layer.hpp @@ -117,11 +117,19 @@ class BatchNormLayer : public Layer { const Dtype* data_to_be_replicated, FuncTy op_func); + void ForwardStatsBatch_cpu(const vector*>& bottom, + const vector*>& top, int stats_batch_idx); + void BackwardStatsBatch_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom, + int stats_batch_idx); + Blob mean_, variance_, temp_, x_norm_; bool use_global_stats_; Dtype moving_average_fraction_; int channels_; Dtype eps_; + int num_stats_batches_; + int stats_batch_size_; // extra temporarary variables is used to carry out sums/broadcasting // using BLAS diff --git a/include/caffe/layers/mkl_layers.hpp b/include/caffe/layers/mkl_layers.hpp index 0d5d66416..c9806daee 100644 --- a/include/caffe/layers/mkl_layers.hpp +++ b/include/caffe/layers/mkl_layers.hpp @@ -481,12 +481,12 @@ class MKLBatchNormLayer : public Layer { 
batchNormFwd(static_cast(NULL)), batchNormFwdInference(static_cast(NULL)), batchNormBwd(static_cast(NULL)), - mean_buffer_(static_cast(NULL)), - variance_buffer_(static_cast(NULL)), scaleShift_buffer_(static_cast(NULL)), diffScaleShift_buffer_(static_cast(NULL)), layout_usr_(static_cast(NULL)), - use_global_stats_(false) + use_global_stats_(false), + num_stats_batches_(1), + stats_batch_size_(0) { PERFORMANCE_EVENT_ID_RESET(perf_id_fw_); PERFORMANCE_EVENT_ID_RESET(perf_id_bw_); @@ -515,6 +515,12 @@ class MKLBatchNormLayer : public Layer { virtual void Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); + void ForwardStatsBatch_cpu(const vector*>& bottom, + const vector*>& top, int stats_batch_idx); + void BackwardStatsBatch_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom, + int stats_batch_idx); + void Init(const vector*>& bottom, const vector*>& top); @@ -534,12 +540,14 @@ class MKLBatchNormLayer : public Layer { shared_ptr > bwd_bottom_diff; Blob temp_; dnnPrimitive_t batchNormFwd, batchNormFwdInference, batchNormBwd; - Dtype *mean_buffer_; - Dtype *variance_buffer_; + vector mean_buffers_; + vector variance_buffers_; Dtype *scaleShift_buffer_; Dtype *diffScaleShift_buffer_; dnnLayout_t layout_usr_; bool use_global_stats_; + int num_stats_batches_; + int stats_batch_size_; PERFORMANCE_EVENT_ID_DECL(perf_id_fw_); PERFORMANCE_EVENT_ID_DECL(perf_id_bw_); diff --git a/include/caffe/layers/mkldnn_layers.hpp b/include/caffe/layers/mkldnn_layers.hpp index f63301e2a..f7ce1062e 100644 --- a/include/caffe/layers/mkldnn_layers.hpp +++ b/include/caffe/layers/mkldnn_layers.hpp @@ -68,7 +68,6 @@ class MKLDNNBatchNormLayer : public MKLDNNLayer, public Layer { , fwd_top_data(), fwd_bottom_data() , bwd_top_diff(), bwd_bottom_diff() , BatchNormFwd_pd(), BatchNormBwd_pd() - , mean_memory(), variance_memory() , scaleshift_memory(), bwd_scaleshift_diff_memory() , output_memory(), bwd_bottom_diff_memory() , 
input_primitive(), bwd_top_diff_primitive() @@ -96,22 +95,32 @@ class MKLDNNBatchNormLayer : public MKLDNNLayer, public Layer { void InitBatchNormBwd(const vector*>& top, const vector& propagate_down, const vector*>& bottom); + void InitBatchNormFwdPrimitive(int stats_batch_idx); + void InitBatchNormBwdPrimitive(int stats_batch_idx); + template shared_ptr GetStatsBatchMemory( + shared_ptr > mkldnn_data, int idx); + void InitStatsBatchVars(int batch_size); shared_ptr > fwd_top_data, fwd_bottom_data; shared_ptr > bwd_top_diff, bwd_bottom_diff; shared_ptr BatchNormFwd_pd; shared_ptr BatchNormBwd_pd; - MKLDNNPrimitive BatchNormFwd, BatchNormBwd; - shared_ptr mean_memory, variance_memory; + vector > BatchNormFwd, BatchNormBwd; + vector > mean_memory, variance_memory; shared_ptr scaleshift_memory, bwd_scaleshift_diff_memory; shared_ptr output_memory, bwd_bottom_diff_memory; + vector > input_stats, output_stats, top_diff_stats, bottom_diff_stats; shared_ptr input_primitive, bwd_top_diff_primitive; int32_t num_, width_, height_, channels_; Dtype eps_, moving_average_fraction_; bool use_weight_bias_, bias_term_, use_global_stats_; + int num_stats_batches_; + int stats_batch_size_; + shared_ptr > scaleshift_blob_; + shared_ptr > scaleshift_acc_; PERFORMANCE_EVENT_ID_DECL(perf_id_fw_); PERFORMANCE_EVENT_ID_DECL(perf_id_bw_); diff --git a/include/caffe/mkldnn_memory.hpp b/include/caffe/mkldnn_memory.hpp index a59ce6e12..3b1a1c6ad 100644 --- a/include/caffe/mkldnn_memory.hpp +++ b/include/caffe/mkldnn_memory.hpp @@ -94,6 +94,7 @@ class MKLDNNMemoryDescriptorBase : public PrvMemDescr if (_prv_memory == NULL) allocate(); return _internal_ptr; } + shared_ptr reorder_usr2prv() { return _reorder_usr2prv.aprimitive; } shared_ptr reorder_prv2usr() { return _reorder_prv2usr.aprimitive; } shared_ptr reorder_extprv2prv() { return _reorder_extprv2prv.aprimitive; } @@ -201,6 +202,8 @@ class MKLDNNMemoryDescriptor : public MKLDNNMemoryDescriptorBase { shared_ptr create_output_memory(Blob * 
blob, bool inplace = false); shared_ptr create_input(bool set_prv_ptr); shared_ptr create_output_memory(bool inplace = false); + Dtype* get_memory_ptr(long offset = 0); + shared_ptr get_memory_desc(); void set_mkldnn_primitive(MKLDNNPrimitive& mprimitive) { CHECK(mprimitive.aprimitive); _mkldnn_primitive = mprimitive; } MKLDNNPrimitive& mkldnn_primitive() { return _mkldnn_primitive; } diff --git a/include/caffe/multinode/multi_sync.hpp b/include/caffe/multinode/multi_sync.hpp index 6300c4876..905d9fce7 100644 --- a/include/caffe/multinode/multi_sync.hpp +++ b/include/caffe/multinode/multi_sync.hpp @@ -215,10 +215,6 @@ namespace caffe { } void on_iter_finished(int layer_id) { -#ifdef FW_OVERLAP_OPT - solver->set_layer_finished_flag(layer_id, false); -#endif - boost::shared_ptr> &layer = layers[layer_id]; if (layer->layerOp == nullptr) { return; diff --git a/include/caffe/sgd_solvers.hpp b/include/caffe/sgd_solvers.hpp index a11da89de..09f6ff26e 100644 --- a/include/caffe/sgd_solvers.hpp +++ b/include/caffe/sgd_solvers.hpp @@ -81,6 +81,11 @@ class SGDSolver : public Solver { // of gradients/updates and is not needed in snapshots vector > > history_, update_, temp_; +#ifdef ENABLE_SGD_FUSION + //Fuse the Normalize, Regularize, ComputeUpdateValue and Update process together + void SGDFusion(int param_id, Dtype rate); +#endif /* ENABLE_SGD_FUSION */ + // loss history for 'plateau' LR policy (should be stored in snapshots) Dtype minimum_loss_; int iter_last_event_; diff --git a/include/caffe/util/apply_bn_stats_batch_size.hpp b/include/caffe/util/apply_bn_stats_batch_size.hpp new file mode 100644 index 000000000..872b2c5bf --- /dev/null +++ b/include/caffe/util/apply_bn_stats_batch_size.hpp @@ -0,0 +1,45 @@ +/* +All modification made by Intel Corporation: © 2017 Intel Corporation + +All contributions by the University of California: +Copyright (c) 2014, 2015, The Regents of the University of California (Regents) +All rights reserved. 
+ +All other contributions: +Copyright (c) 2014, 2015, the respective contributors +All rights reserved. +For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md + + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ +#ifndef APPLY_BN_STATS_BATCH_SIZE_HPP_ +#define APPLY_BN_STATS_BATCH_SIZE_HPP_ +#include "caffe/proto/caffe.pb.h" + +namespace caffe { +void ApplyBnStatsBatchSize(const NetParameter& param, + NetParameter* param_with_stats_batch_size); +} +#endif // APPLY_BN_STATS_BATCH_SIZE_HPP_ diff --git a/models/intel_optimized_models/multinode/resnet_50_64_nodes_8k_batch/solver.prototxt b/models/intel_optimized_models/multinode/resnet_50_64_nodes_8k_batch/solver.prototxt new file mode 100644 index 000000000..4f4f21a93 --- /dev/null +++ b/models/intel_optimized_models/multinode/resnet_50_64_nodes_8k_batch/solver.prototxt @@ -0,0 +1,19 @@ +net: "models/intel_optimized_models/multinode/resnet_50_64_nodes_8k_batch/train_val.prototxt" +test_iter: 1000 +test_interval: 156 +test_initialization: false +display: 40 +base_lr: 3.2 +lr_policy: "multistep" +stepvalue:4680 +stepvalue:9360 +stepvalue:12480 +gamma: 0.1 +max_iter: 14075 +warmup_iter: 780 # 1281167 / 8192 * 5 epochs +warmup_start_lr: 0.1 +momentum: 0.9 +weight_decay: 0.0001 +snapshot: 156 +snapshot_prefix: "models/intel_optimized_models/multinode/resnet_50_64_nodes_8k_batch/resnet_50_64_nodes_8k" +solver_mode: CPU diff --git a/models/intel_optimized_models/multinode/resnet_50_64_nodes_8k_batch/train_val.prototxt b/models/intel_optimized_models/multinode/resnet_50_64_nodes_8k_batch/train_val.prototxt new file mode 100644 index 000000000..3dd57aaac --- /dev/null +++ b/models/intel_optimized_models/multinode/resnet_50_64_nodes_8k_batch/train_val.prototxt @@ -0,0 +1,3322 @@ +name: "ResNet-50" +bn_stats_batch_size: 32 +layer { + name: "data" + type: "Data" + top: "data" + top: "label" + include { + phase: TRAIN + } + transform_param { + mirror: true + crop_size: 224 + scale: 0.0078125 + mean_value: 104 + mean_value: 117 + mean_value: 123 + random_aspect_ratio_param { + min_area_ratio: 0.08 + max_area_ratio: 1 + aspect_ratio_change: 0.75 + resize_param { + interp_mode: CUBIC + } + } + } + data_param { + source: 
"examples/imagenet/ilsvrc12_train_lmdb" + batch_size: 128 + backend: LMDB + prefetch: 2 + shuffle: true + } +} +layer { + name: "data" + type: "Data" + top: "data" + top: "label" + include { + phase: TEST + } + transform_param { + mirror: false + crop_size: 224 + scale: 0.0078125 + mean_value: 104 + mean_value: 117 + mean_value: 123 + random_resize_param { + min_size: 256 + max_size: 256 + resize_param { + interp_mode: CUBIC + } + } + } + data_param { + source: "examples/imagenet/ilsvrc12_val_lmdb" + batch_size: 50 + backend: LMDB + } +} + +layer { + bottom: "data" + top: "conv1" + name: "conv1" + type: "Convolution" + convolution_param { + num_output: 64 + kernel_size: 7 + pad: 3 + stride: 2 + weight_filler { + type: "msra" + variance_norm: FAN_OUT + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "conv1" + top: "conv1" + name: "bn_conv1" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "conv1" + top: "conv1" + name: "scale_conv1" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "conv1" + top: "conv1" + name: "conv1_relu" + type: "ReLU" + relu_param { + } +} + +layer { + bottom: "conv1" + top: "pool1" + name: "pool1" + type: "Pooling" + pooling_param { + kernel_size: 3 + stride: 2 + pool: MAX + } +} + +layer { + bottom: "pool1" + top: "res2a_branch1" + name: "res2a_branch1" + type: "Convolution" + convolution_param { + num_output: 256 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res2a_branch1" + top: "res2a_branch1" + name: "bn2a_branch1" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { 
value: 1 } + } +} + +layer { + bottom: "res2a_branch1" + top: "res2a_branch1" + name: "scale2a_branch1" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "pool1" + top: "res2a_branch2a" + name: "res2a_branch2a" + type: "Convolution" + convolution_param { + + num_output: 64 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res2a_branch2a" + top: "res2a_branch2a" + name: "bn2a_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res2a_branch2a" + top: "res2a_branch2a" + name: "scale2a_branch2a" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res2a_branch2a" + top: "res2a_branch2a" + name: "res2a_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res2a_branch2a" + top: "res2a_branch2b" + name: "res2a_branch2b" + type: "Convolution" + convolution_param { + num_output: 64 + kernel_size: 3 + pad: 1 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res2a_branch2b" + top: "res2a_branch2b" + name: "bn2a_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res2a_branch2b" + top: "res2a_branch2b" + name: "scale2a_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res2a_branch2b" + top: "res2a_branch2b" + name: "res2a_branch2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: 
"res2a_branch2b" + top: "res2a_branch2c" + name: "res2a_branch2c" + type: "Convolution" + convolution_param { + num_output: 256 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res2a_branch2c" + top: "res2a_branch2c" + name: "bn2a_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res2a_branch2c" + top: "res2a_branch2c" + name: "scale2a_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res2a_branch1" + bottom: "res2a_branch2c" + top: "res2a" + name: "res2a" + type: "Eltwise" + eltwise_param { + + } +} + +layer { + bottom: "res2a" + top: "res2a" + name: "res2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res2a" + top: "res2b_branch2a" + name: "res2b_branch2a" + type: "Convolution" + convolution_param { + num_output: 64 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res2b_branch2a" + top: "res2b_branch2a" + name: "bn2b_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res2b_branch2a" + top: "res2b_branch2a" + name: "scale2b_branch2a" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res2b_branch2a" + top: "res2b_branch2a" + name: "res2b_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res2b_branch2a" + top: "res2b_branch2b" + name: "res2b_branch2b" + type: "Convolution" + convolution_param { + num_output: 64 
+ kernel_size: 3 + pad: 1 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res2b_branch2b" + top: "res2b_branch2b" + name: "bn2b_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res2b_branch2b" + top: "res2b_branch2b" + name: "scale2b_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res2b_branch2b" + top: "res2b_branch2b" + name: "res2b_branch2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res2b_branch2b" + top: "res2b_branch2c" + name: "res2b_branch2c" + type: "Convolution" + convolution_param { + num_output: 256 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res2b_branch2c" + top: "res2b_branch2c" + name: "bn2b_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res2b_branch2c" + top: "res2b_branch2c" + name: "scale2b_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res2a" + bottom: "res2b_branch2c" + top: "res2b" + name: "res2b" + type: "Eltwise" + eltwise_param { + + } +} + +layer { + bottom: "res2b" + top: "res2b" + name: "res2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res2b" + top: "res2c_branch2a" + name: "res2c_branch2a" + type: "Convolution" + convolution_param { + + num_output: 64 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + 
value: 0 + } + } +} + +layer { + bottom: "res2c_branch2a" + top: "res2c_branch2a" + name: "bn2c_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res2c_branch2a" + top: "res2c_branch2a" + name: "scale2c_branch2a" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res2c_branch2a" + top: "res2c_branch2a" + name: "res2c_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res2c_branch2a" + top: "res2c_branch2b" + name: "res2c_branch2b" + type: "Convolution" + convolution_param { + num_output: 64 + kernel_size: 3 + pad: 1 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res2c_branch2b" + top: "res2c_branch2b" + name: "bn2c_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res2c_branch2b" + top: "res2c_branch2b" + name: "scale2c_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res2c_branch2b" + top: "res2c_branch2b" + name: "res2c_branch2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res2c_branch2b" + top: "res2c_branch2c" + name: "res2c_branch2c" + type: "Convolution" + convolution_param { + + num_output: 256 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res2c_branch2c" + top: "res2c_branch2c" + name: "bn2c_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + 
moving_average_fraction: 0.9 + filler { value: 0 } + } +} + +layer { + bottom: "res2c_branch2c" + top: "res2c_branch2c" + name: "scale2c_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res2b" + bottom: "res2c_branch2c" + top: "res2c" + name: "res2c" + type: "Eltwise" + eltwise_param { + + } +} + +layer { + bottom: "res2c" + top: "res2c" + name: "res2c_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res2c" + top: "res3a_branch1" + name: "res3a_branch1" + type: "Convolution" + convolution_param { + num_output: 512 + kernel_size: 1 + pad: 0 + stride: 2 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res3a_branch1" + top: "res3a_branch1" + name: "bn3a_branch1" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res3a_branch1" + top: "res3a_branch1" + name: "scale3a_branch1" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res2c" + top: "res3a_branch2a" + name: "res3a_branch2a" + type: "Convolution" + convolution_param { + + num_output: 128 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res3a_branch2a" + top: "res3a_branch2a" + name: "bn3a_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res3a_branch2a" + top: "res3a_branch2a" + name: "scale3a_branch2a" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: 
"res3a_branch2a" + top: "res3a_branch2a" + name: "res3a_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res3a_branch2a" + top: "res3a_branch2b" + name: "res3a_branch2b" + type: "Convolution" + convolution_param { + + num_output: 128 + kernel_size: 3 + pad: 1 + stride: 2 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res3a_branch2b" + top: "res3a_branch2b" + name: "bn3a_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res3a_branch2b" + top: "res3a_branch2b" + name: "scale3a_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res3a_branch2b" + top: "res3a_branch2b" + name: "res3a_branch2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res3a_branch2b" + top: "res3a_branch2c" + name: "res3a_branch2c" + type: "Convolution" + convolution_param { + + num_output: 512 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res3a_branch2c" + top: "res3a_branch2c" + name: "bn3a_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res3a_branch2c" + top: "res3a_branch2c" + name: "scale3a_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res3a_branch1" + bottom: "res3a_branch2c" + top: "res3a" + name: "res3a" + type: "Eltwise" + eltwise_param { + + } +} + +layer { + bottom: "res3a" + top: "res3a" + name: "res3a_relu" + type: "ReLU" + relu_param { + + } +} + 
+layer { + bottom: "res3a" + top: "res3b_branch2a" + name: "res3b_branch2a" + type: "Convolution" + convolution_param { + + num_output: 128 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res3b_branch2a" + top: "res3b_branch2a" + name: "bn3b_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res3b_branch2a" + top: "res3b_branch2a" + name: "scale3b_branch2a" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res3b_branch2a" + top: "res3b_branch2a" + name: "res3b_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res3b_branch2a" + top: "res3b_branch2b" + name: "res3b_branch2b" + type: "Convolution" + convolution_param { + + num_output: 128 + kernel_size: 3 + pad: 1 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res3b_branch2b" + top: "res3b_branch2b" + name: "bn3b_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res3b_branch2b" + top: "res3b_branch2b" + name: "scale3b_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res3b_branch2b" + top: "res3b_branch2b" + name: "res3b_branch2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res3b_branch2b" + top: "res3b_branch2c" + name: "res3b_branch2c" + type: "Convolution" + convolution_param { + + num_output: 512 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: 
"msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res3b_branch2c" + top: "res3b_branch2c" + name: "bn3b_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res3b_branch2c" + top: "res3b_branch2c" + name: "scale3b_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res3a" + bottom: "res3b_branch2c" + top: "res3b" + name: "res3b" + type: "Eltwise" + eltwise_param { + + } +} + +layer { + bottom: "res3b" + top: "res3b" + name: "res3b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res3b" + top: "res3c_branch2a" + name: "res3c_branch2a" + type: "Convolution" + convolution_param { + + num_output: 128 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res3c_branch2a" + top: "res3c_branch2a" + name: "bn3c_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res3c_branch2a" + top: "res3c_branch2a" + name: "scale3c_branch2a" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res3c_branch2a" + top: "res3c_branch2a" + name: "res3c_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res3c_branch2a" + top: "res3c_branch2b" + name: "res3c_branch2b" + type: "Convolution" + convolution_param { + + num_output: 128 + kernel_size: 3 + pad: 1 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res3c_branch2b" + top: "res3c_branch2b" 
+ name: "bn3c_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res3c_branch2b" + top: "res3c_branch2b" + name: "scale3c_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res3c_branch2b" + top: "res3c_branch2b" + name: "res3c_branch2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res3c_branch2b" + top: "res3c_branch2c" + name: "res3c_branch2c" + type: "Convolution" + convolution_param { + + num_output: 512 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res3c_branch2c" + top: "res3c_branch2c" + name: "bn3c_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res3c_branch2c" + top: "res3c_branch2c" + name: "scale3c_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res3b" + bottom: "res3c_branch2c" + top: "res3c" + name: "res3c" + type: "Eltwise" + eltwise_param { + + } +} + +layer { + bottom: "res3c" + top: "res3c" + name: "res3c_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res3c" + top: "res3d_branch2a" + name: "res3d_branch2a" + type: "Convolution" + convolution_param { + num_output: 128 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res3d_branch2a" + top: "res3d_branch2a" + name: "bn3d_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param 
{ + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res3d_branch2a" + top: "res3d_branch2a" + name: "scale3d_branch2a" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res3d_branch2a" + top: "res3d_branch2a" + name: "res3d_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res3d_branch2a" + top: "res3d_branch2b" + name: "res3d_branch2b" + type: "Convolution" + convolution_param { + num_output: 128 + kernel_size: 3 + pad: 1 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res3d_branch2b" + top: "res3d_branch2b" + name: "bn3d_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res3d_branch2b" + top: "res3d_branch2b" + name: "scale3d_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res3d_branch2b" + top: "res3d_branch2b" + name: "res3d_branch2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res3d_branch2b" + top: "res3d_branch2c" + name: "res3d_branch2c" + type: "Convolution" + convolution_param { + + num_output: 512 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res3d_branch2c" + top: "res3d_branch2c" + name: "bn3d_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 0 } + } +} + +layer { + bottom: "res3d_branch2c" + top: "res3d_branch2c" + name: "scale3d_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + 
scale_param { + bias_term: true + } +} + +layer { + bottom: "res3c" + bottom: "res3d_branch2c" + top: "res3d" + name: "res3d" + type: "Eltwise" + eltwise_param { + + } +} + +layer { + bottom: "res3d" + top: "res3d" + name: "res3d_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res3d" + top: "res4a_branch1" + name: "res4a_branch1" + type: "Convolution" + convolution_param { + + num_output: 1024 + kernel_size: 1 + pad: 0 + stride: 2 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4a_branch1" + top: "res4a_branch1" + name: "bn4a_branch1" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4a_branch1" + top: "res4a_branch1" + name: "scale4a_branch1" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res3d" + top: "res4a_branch2a" + name: "res4a_branch2a" + type: "Convolution" + convolution_param { + + num_output: 256 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4a_branch2a" + top: "res4a_branch2a" + name: "bn4a_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4a_branch2a" + top: "res4a_branch2a" + name: "scale4a_branch2a" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4a_branch2a" + top: "res4a_branch2a" + name: "res4a_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4a_branch2a" + top: "res4a_branch2b" + name: "res4a_branch2b" + type: 
"Convolution" + convolution_param { + + num_output: 256 + kernel_size: 3 + pad: 1 + stride: 2 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4a_branch2b" + top: "res4a_branch2b" + name: "bn4a_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4a_branch2b" + top: "res4a_branch2b" + name: "scale4a_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4a_branch2b" + top: "res4a_branch2b" + name: "res4a_branch2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4a_branch2b" + top: "res4a_branch2c" + name: "res4a_branch2c" + type: "Convolution" + convolution_param { + + num_output: 1024 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4a_branch2c" + top: "res4a_branch2c" + name: "bn4a_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4a_branch2c" + top: "res4a_branch2c" + name: "scale4a_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4a_branch1" + bottom: "res4a_branch2c" + top: "res4a" + name: "res4a" + type: "Eltwise" + eltwise_param { + + } +} + +layer { + bottom: "res4a" + top: "res4a" + name: "res4a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4a" + top: "res4b_branch2a" + name: "res4b_branch2a" + type: "Convolution" + convolution_param { + + num_output: 256 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + 
weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4b_branch2a" + top: "res4b_branch2a" + name: "bn4b_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4b_branch2a" + top: "res4b_branch2a" + name: "scale4b_branch2a" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4b_branch2a" + top: "res4b_branch2a" + name: "res4b_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4b_branch2a" + top: "res4b_branch2b" + name: "res4b_branch2b" + type: "Convolution" + convolution_param { + + num_output: 256 + kernel_size: 3 + pad: 1 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4b_branch2b" + top: "res4b_branch2b" + name: "bn4b_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4b_branch2b" + top: "res4b_branch2b" + name: "scale4b_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4b_branch2b" + top: "res4b_branch2b" + name: "res4b_branch2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4b_branch2b" + top: "res4b_branch2c" + name: "res4b_branch2c" + type: "Convolution" + convolution_param { + + num_output: 1024 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4b_branch2c" + top: "res4b_branch2c" + name: "bn4b_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + 
param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4b_branch2c" + top: "res4b_branch2c" + name: "scale4b_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4a" + bottom: "res4b_branch2c" + top: "res4b" + name: "res4b" + type: "Eltwise" + eltwise_param { + + } +} + +layer { + bottom: "res4b" + top: "res4b" + name: "res4b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4b" + top: "res4c_branch2a" + name: "res4c_branch2a" + type: "Convolution" + convolution_param { + + num_output: 256 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4c_branch2a" + top: "res4c_branch2a" + name: "bn4c_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4c_branch2a" + top: "res4c_branch2a" + name: "scale4c_branch2a" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4c_branch2a" + top: "res4c_branch2a" + name: "res4c_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4c_branch2a" + top: "res4c_branch2b" + name: "res4c_branch2b" + type: "Convolution" + convolution_param { + + num_output: 256 + kernel_size: 3 + pad: 1 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4c_branch2b" + top: "res4c_branch2b" + name: "bn4c_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + 
+layer { + bottom: "res4c_branch2b" + top: "res4c_branch2b" + name: "scale4c_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4c_branch2b" + top: "res4c_branch2b" + name: "res4c_branch2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4c_branch2b" + top: "res4c_branch2c" + name: "res4c_branch2c" + type: "Convolution" + convolution_param { + + num_output: 1024 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4c_branch2c" + top: "res4c_branch2c" + name: "bn4c_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4c_branch2c" + top: "res4c_branch2c" + name: "scale4c_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4b" + bottom: "res4c_branch2c" + top: "res4c" + name: "res4c" + type: "Eltwise" + eltwise_param { + + } +} + +layer { + bottom: "res4c" + top: "res4c" + name: "res4c_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4c" + top: "res4d_branch2a" + name: "res4d_branch2a" + type: "Convolution" + convolution_param { + + num_output: 256 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4d_branch2a" + top: "res4d_branch2a" + name: "bn4d_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4d_branch2a" + top: "res4d_branch2a" + name: "scale4d_branch2a" + type: "Scale" + param { decay_mult: 0 } 
+ param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4d_branch2a" + top: "res4d_branch2a" + name: "res4d_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4d_branch2a" + top: "res4d_branch2b" + name: "res4d_branch2b" + type: "Convolution" + convolution_param { + + num_output: 256 + kernel_size: 3 + pad: 1 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4d_branch2b" + top: "res4d_branch2b" + name: "bn4d_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4d_branch2b" + top: "res4d_branch2b" + name: "scale4d_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4d_branch2b" + top: "res4d_branch2b" + name: "res4d_branch2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4d_branch2b" + top: "res4d_branch2c" + name: "res4d_branch2c" + type: "Convolution" + convolution_param { + + num_output: 1024 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4d_branch2c" + top: "res4d_branch2c" + name: "bn4d_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4d_branch2c" + top: "res4d_branch2c" + name: "scale4d_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4c" + bottom: "res4d_branch2c" + top: "res4d" + name: "res4d" + type: "Eltwise" + eltwise_param { + + } +} + +layer { + bottom: 
"res4d" + top: "res4d" + name: "res4d_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4d" + top: "res4e_branch2a" + name: "res4e_branch2a" + type: "Convolution" + convolution_param { + + num_output: 256 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4e_branch2a" + top: "res4e_branch2a" + name: "bn4e_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4e_branch2a" + top: "res4e_branch2a" + name: "scale4e_branch2a" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4e_branch2a" + top: "res4e_branch2a" + name: "res4e_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4e_branch2a" + top: "res4e_branch2b" + name: "res4e_branch2b" + type: "Convolution" + convolution_param { + + num_output: 256 + kernel_size: 3 + pad: 1 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4e_branch2b" + top: "res4e_branch2b" + name: "bn4e_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4e_branch2b" + top: "res4e_branch2b" + name: "scale4e_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4e_branch2b" + top: "res4e_branch2b" + name: "res4e_branch2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4e_branch2b" + top: "res4e_branch2c" + name: "res4e_branch2c" + type: "Convolution" + convolution_param { + + num_output: 1024 
+ kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4e_branch2c" + top: "res4e_branch2c" + name: "bn4e_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4e_branch2c" + top: "res4e_branch2c" + name: "scale4e_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4d" + bottom: "res4e_branch2c" + top: "res4e" + name: "res4e" + type: "Eltwise" + eltwise_param { + + } +} + +layer { + bottom: "res4e" + top: "res4e" + name: "res4e_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4e" + top: "res4f_branch2a" + name: "res4f_branch2a" + type: "Convolution" + convolution_param { + + num_output: 256 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4f_branch2a" + top: "res4f_branch2a" + name: "bn4f_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4f_branch2a" + top: "res4f_branch2a" + name: "scale4f_branch2a" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4f_branch2a" + top: "res4f_branch2a" + name: "res4f_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4f_branch2a" + top: "res4f_branch2b" + name: "res4f_branch2b" + type: "Convolution" + convolution_param { + + num_output: 256 + kernel_size: 3 + pad: 1 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" 
+ value: 0 + } + } +} + +layer { + bottom: "res4f_branch2b" + top: "res4f_branch2b" + name: "bn4f_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4f_branch2b" + top: "res4f_branch2b" + name: "scale4f_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4f_branch2b" + top: "res4f_branch2b" + name: "res4f_branch2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4f_branch2b" + top: "res4f_branch2c" + name: "res4f_branch2c" + type: "Convolution" + convolution_param { + + num_output: 1024 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4f_branch2c" + top: "res4f_branch2c" + name: "bn4f_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 0 } + } +} + +layer { + bottom: "res4f_branch2c" + top: "res4f_branch2c" + name: "scale4f_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4e" + bottom: "res4f_branch2c" + top: "res4f" + name: "res4f" + type: "Eltwise" + eltwise_param { + + } +} + +layer { + bottom: "res4f" + top: "res4f" + name: "res4f_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4f" + top: "res5a_branch1" + name: "res5a_branch1" + type: "Convolution" + convolution_param { + + num_output: 2048 + kernel_size: 1 + pad: 0 + stride: 2 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res5a_branch1" + top: "res5a_branch1" + name: "bn5a_branch1" + type: "BatchNorm" + 
param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res5a_branch1" + top: "res5a_branch1" + name: "scale5a_branch1" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4f" + top: "res5a_branch2a" + name: "res5a_branch2a" + type: "Convolution" + convolution_param { + + num_output: 512 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res5a_branch2a" + top: "res5a_branch2a" + name: "bn5a_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res5a_branch2a" + top: "res5a_branch2a" + name: "scale5a_branch2a" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res5a_branch2a" + top: "res5a_branch2a" + name: "res5a_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res5a_branch2a" + top: "res5a_branch2b" + name: "res5a_branch2b" + type: "Convolution" + convolution_param { + + num_output: 512 + kernel_size: 3 + pad: 1 + stride: 2 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res5a_branch2b" + top: "res5a_branch2b" + name: "bn5a_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res5a_branch2b" + top: "res5a_branch2b" + name: "scale5a_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: 
"res5a_branch2b" + top: "res5a_branch2b" + name: "res5a_branch2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res5a_branch2b" + top: "res5a_branch2c" + name: "res5a_branch2c" + type: "Convolution" + convolution_param { + + num_output: 2048 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res5a_branch2c" + top: "res5a_branch2c" + name: "bn5a_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res5a_branch2c" + top: "res5a_branch2c" + name: "scale5a_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res5a_branch1" + bottom: "res5a_branch2c" + top: "res5a" + name: "res5a" + type: "Eltwise" + eltwise_param { + + } +} + +layer { + bottom: "res5a" + top: "res5a" + name: "res5a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res5a" + top: "res5b_branch2a" + name: "res5b_branch2a" + type: "Convolution" + convolution_param { + + num_output: 512 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res5b_branch2a" + top: "res5b_branch2a" + name: "bn5b_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res5b_branch2a" + top: "res5b_branch2a" + name: "scale5b_branch2a" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res5b_branch2a" + top: "res5b_branch2a" + name: "res5b_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { 
+ bottom: "res5b_branch2a" + top: "res5b_branch2b" + name: "res5b_branch2b" + type: "Convolution" + convolution_param { + + num_output: 512 + kernel_size: 3 + pad: 1 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res5b_branch2b" + top: "res5b_branch2b" + name: "bn5b_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res5b_branch2b" + top: "res5b_branch2b" + name: "scale5b_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res5b_branch2b" + top: "res5b_branch2b" + name: "res5b_branch2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res5b_branch2b" + top: "res5b_branch2c" + name: "res5b_branch2c" + type: "Convolution" + convolution_param { + + num_output: 2048 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res5b_branch2c" + top: "res5b_branch2c" + name: "bn5b_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res5b_branch2c" + top: "res5b_branch2c" + name: "scale5b_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res5a" + bottom: "res5b_branch2c" + top: "res5b" + name: "res5b" + type: "Eltwise" + eltwise_param { + } +} + +layer { + bottom: "res5b" + top: "res5b" + name: "res5b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res5b" + top: "res5c_branch2a" + name: "res5c_branch2a" + type: "Convolution" + convolution_param { + 
num_output: 512 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res5c_branch2a" + top: "res5c_branch2a" + name: "bn5c_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res5c_branch2a" + top: "res5c_branch2a" + name: "scale5c_branch2a" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res5c_branch2a" + top: "res5c_branch2a" + name: "res5c_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res5c_branch2a" + top: "res5c_branch2b" + name: "res5c_branch2b" + type: "Convolution" + convolution_param { + num_output: 512 + kernel_size: 3 + pad: 1 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res5c_branch2b" + top: "res5c_branch2b" + name: "bn5c_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res5c_branch2b" + top: "res5c_branch2b" + name: "scale5c_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res5c_branch2b" + top: "res5c_branch2b" + name: "res5c_branch2b_relu" + type: "ReLU" + relu_param { + } +} + +layer { + bottom: "res5c_branch2b" + top: "res5c_branch2c" + name: "res5c_branch2c" + type: "Convolution" + convolution_param { + num_output: 2048 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res5c_branch2c" + top: "res5c_branch2c" + 
name: "bn5c_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 0 } + } +} + +layer { + bottom: "res5c_branch2c" + top: "res5c_branch2c" + name: "scale5c_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res5b" + bottom: "res5c_branch2c" + top: "res5c" + name: "res5c" + type: "Eltwise" + eltwise_param { + } +} + +layer { + bottom: "res5c" + top: "res5c" + name: "res5c_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res5c" + top: "pool5" + name: "pool5" + type: "Pooling" + pooling_param { + kernel_size: 7 + stride: 1 + pool: AVE + } +} + +layer { + bottom: "pool5" + top: "fc1000" + name: "fc1000" + type: "InnerProduct" + inner_product_param { + num_output: 1000 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "fc1000" + bottom: "label" + top: "loss" + name: "prob" + type: "SoftmaxWithLoss" +} +layer { + name: "loss3/top-1" + type: "Accuracy" + bottom: "fc1000" + bottom: "label" + top: "loss3/top-1" +} +layer { + name: "loss3/top-5" + type: "Accuracy" + bottom: "fc1000" + bottom: "label" + top: "loss3/top-5" + accuracy_param { + top_k: 5 + } +} diff --git a/python/caffe/_caffe.cpp b/python/caffe/_caffe.cpp index b9dc23e24..3b02f509b 100644 --- a/python/caffe/_caffe.cpp +++ b/python/caffe/_caffe.cpp @@ -411,7 +411,7 @@ BOOST_PYTHON_MODULE(_caffe) { .add_property("channels", &Blob::channels) .add_property("height", &Blob::height) .add_property("width", &Blob::width) - .add_property("count", static_cast::*)() const>( + .add_property("count", static_cast::*)() const>( &Blob::count)) .def("reshape", bp::raw_function(&Blob_Reshape)) .add_property("data", bp::make_function(&Blob::mutable_cpu_data, diff --git a/scripts/run_intelcaffe.sh b/scripts/run_intelcaffe.sh new 
file mode 100755 index 000000000..29a5309ab --- /dev/null +++ b/scripts/run_intelcaffe.sh @@ -0,0 +1,604 @@
#!/bin/bash
# NOTE: bash is required. The script uses bash-only syntax (the `function`
# keyword, `declare -a` arrays, `[[ ... ]]` pattern matching, `${var::-1}`
# substring expansion), so it cannot run under a plain POSIX /bin/sh.

# Trace every command as it is executed.
set -x

# ---------------------------------------------------------------------------
# Default settings. All of them are plain globals read by the functions below.
# ---------------------------------------------------------------------------

# presumably selected with --benchmark (usage lists all/qperf/mpi/none) - confirm
benchmark_mode="all"

# time/train/resume_train
mode="train"

# it's assigned by detect_cpu
cpu_model=skx

# a list of nodes
host_file=""

# network parameters
network="opa"
tcp_netmask=""

# specify number of MLSL ep servers in command
num_mlsl_servers=-1

# parameters for caffe time
iteration=0
model_file=""
# parameters for resuming training
snapshot=""
# parameters for training
solver_file=""

# specify engine for running caffe
engine="MKL2017"

result_dir=""
debug="off"

# Print the command-line help to stdout.
# Sets the global script_name to $0 as a side effect.
function usage
{
    script_name=$0
    cat <<EOF
Usage:
 $script_name --host host_file [--solver solver_file]
 [--network opa/tcp] [--netmask tcp_netmask] [--debug on/off]
 [--mode train/resume_train/time/none] [--benchmark all/qperf/mpi/none]
 [--iteration iter] [--model_file deploy.prototxt]
 [--snapshot snapshot.caffemodel]
 [--num_mlsl_servers num_mlsl_servers]
 [--output output_folder]

 Parameters:
 host: host file includes list of nodes.

 Optional parameters:
 solver: specify solver file if mode is train/resume_train
 network: opa(default), tcp
 netmask: only used if network is tcp
 debug: off(default). MLSL debug information is outputed if it's on
 mode: train(default), resume_train, time, none(not to run caffe test)
 benchmark: all(default). Includes qperf, all-reduce performance
 Dependency: user needs to install qperf, IMB-MPI1;
 and add them in system path.
 iteration and model_file: only used if mode is time (caffe time)
 snapshot: only used if mode is resume_train
 num_mlsl_servers: number of MLSL ep servers
 output_folder: output folder for storing results
EOF
}

# Display names of the CPU families that detect_cpu has a pattern for.
declare -a cpu_list=("Intel Xeon E5-26xx (Broadwell)" "Intel Xeon Phi 72xx (Knight Landing)"
                     "Intel Xeon Platinum 8180 (Skylake)" "Intel Xeon 6148 (Skylake)")

# Set the global cpu_model (knl/skx/bdw) from lscpu's "Model name" line.
# When no pattern matches, cpu_model keeps its preset value and a notice is
# printed. Also leaves the raw model text in the global model_string.
function detect_cpu
{
    # detect cpu model: keep the text after the colon of the "Model name" line
    model_string=$(lscpu | grep "Model name" | awk -F ':' '{print $2}')

    # Patterns are plain substring matches and are tried in this order,
    # so the first one that matches decides the result.
    case "$model_string" in
        *72*)
            cpu_model=knl
            ;;
        *8180* | *6148*)
            cpu_model=skx
            ;;
        *E5-26*)
            cpu_model=bdw
            ;;
        *)
            echo "CPU model: $model_string"
            echo " Use default settings, which may not be optimal ones."
            ;;
    esac
}

# Set the global numanode: 0 when `numactl -H` reports the value 1 on its
# "available" line ("Cache mode."), otherwise 1 ("Flat mode.").
function set_numa_node
{
    # detect numa mode: cache and flat mode for KNL
    # Second whitespace-separated field of the line containing "available"
    # (presumably the NUMA node count - confirm against numactl output).
    numa_node=($(numactl -H | grep "available" | awk -F ' ' '{print $2}'))

    # Quoted with a 0 fallback: if numactl printed nothing, the test no longer
    # fails with a `[` syntax error; it takes the "Flat mode." branch, which is
    # also where the failing test used to end up.
    if [ "${numa_node[0]:-0}" -eq 1 ]; then
        echo "Cache mode."
        # cache mode, use numa node 0
        numanode=0
    else
        echo "Flat mode."
        numanode=1
    fi
}


# Check that an executable is reachable through PATH.
#   $1 - command name to look for (also stored in the global dep)
# Returns 0 when found; prints a warning and returns 1 otherwise.
function check_dependency
{
    dep=$1
    # `type -P` is bash's built-in PATH lookup: no external `which` binary is
    # needed, and the quoted argument survives empty or space-containing names.
    if ! type -P "$dep" >/dev/null 2>&1; then
        echo "Warning: cannot find $dep"
        return 1
    fi
    return 0
}


# Export the IMPI / PSM2 environment for the network named in $network
# ("opa" or "tcp"). Any other value prints an error and exits the script
# with status 1. Reads the globals network, cpu_model and tcp_netmask.
function init_mpi_envs
{
    # IMPI configuration
    case "$network" in
        opa)
            export I_MPI_FABRICS=tmi
            export I_MPI_TMI_PROVIDER=psm2
            if [ "$cpu_model" == "knl" ]; then
                # PSM2 configuration
                export PSM2_MQ_RNDV_HFI_WINDOW=4194304 #2097152 # to workaround PSM2 bug in IFS 10.2 and 10.3
                export PSM2_MQ_EAGER_SDMA_SZ=65536
                export PSM2_MQ_RNDV_HFI_THRESH=200000
            fi

            export PSM2_IDENTIFY=1 # for debug
            ;;
        tcp)
            export I_MPI_FABRICS=tcp
            export I_MPI_TCP_NETMASK="$tcp_netmask"
            ;;
        *)
            echo "Invalid network: $network"
            exit 1
            ;;
    esac

    # Applied for every valid network.
    export I_MPI_FALLBACK=0
    export I_MPI_DEBUG=6
}


+function clear_shm +{ + clear_command="rm -rf /dev/shm/*" + check_shm_command="df -h | grep shm" + + # TODO: check if 50G is the minimum shm size? + min_shm_size=50 + shm_unit="G" + + for node in "${nodenames[@]}" + do + ssh ${node} "$clear_command" + shm_line=`ssh ${node} "$check_shm_command"` + shm_string=`echo $shm_line | awk -F ' ' '{print $(NF-2)}'` + unit="${shm_string:(-1)}" + shm_size=${shm_string::-1} + if [ "$unit" == "$shm_unit" ] && [ $shm_size -ge ${min_shm_size} ]; then + continue + else + echo "Error: /dev/shm size = ${shm_size}${unit}, on node: ${node}." + echo " It's less than minimum size: ${min_shm_size}${shm_unit}." + echo " Please clean or enlarge it."
+ exit 1 + fi + done +} + +function kill_zombie_processes +{ + kill_command="for process in ep_server caffe mpiexec.hydra; do for i in \$(ps -e | grep -w \$process | awk -F ' ' '{print \$1}'); do kill -9 \$i; echo \"\$process \$i killed.\"; done done" + for node in "${nodenames[@]}" + do + ssh ${node} "$kill_command" + done +} + +function clear_envs +{ + clear_shm + kill_zombie_processes +} + +function set_mlsl_vars +{ + if [ "${num_mlsl_servers}" -eq -1 ]; then + if [ ${numnodes} -eq 1 ]; then + numservers=0 + else + if [ ${cpu_model} == knl ]; then + numservers=4 + else + numservers=2 + fi + fi + else + numservers=$((num_mlsl_servers)) + fi + + echo "MLSL_NUM_SERVERS: $numservers" + export MLSL_NUM_SERVERS=${numservers} + + if [ ${numservers} -gt 0 ]; then + if [ ${cpu_model} == knl ]; then + listep=6,7,8,9,10,11,12,13 + else + listep=6,7,8,9 + fi + export MLSL_SERVER_AFFINITY="${listep}" + echo "MLSL_SERVER_AFFINITY: ${listep}" + fi + + # MLSL configuration + if [ "$debug" == "on" ]; then + export MLSL_LOG_LEVEL=3 + else + export MLSL_LOG_LEVEL=0 + fi +} + +function set_env_vars +{ + set_mlsl_vars + + ppncpu=1 + threadspercore=1 + + cores=`lscpu | grep "Core(s) per socket:" | awk '{print $4}'` + sockets=`lscpu | grep "Socket(s)" | awk '{print $2}'` + maxcores=$((cores*sockets)) + + numthreads=$(((maxcores-numservers)*threadspercore)) + numthreads_per_proc=$((numthreads/ppncpu)) + + export OMP_NUM_THREADS=${numthreads_per_proc} + + # OMP configuration + # threadspercore=1 + affinitystr="proclist=[0-5,$((5+numservers+1))-$((maxcores-1))],granularity=thread,explicit" + export KMP_HW_SUBSET=1t + export KMP_AFFINITY=$affinitystr +} + +function execute_command +{ + local xeonbin_=$1 + local result_dir_=$2 + + if [ ${cpu_model} == knl ]; then + exec_command="numactl --preferred=$numanode $xeonbin_" + else + exec_command="$xeonbin_" + fi + + if [ ${numnodes} -gt 1 ]; then + # Produce the configuration file for mpiexec. 
+ # Each line of the config file contains a # host, environment, binary name. + cfile_=nodeconfig-${cpu_model}-${numnodes}.txt + rm -f $cfile_ + + for node in "${nodenames[@]}" + do + echo "-host ${node} -n $ppncpu $exec_command" >> $cfile_ + done + fi + + clear_envs + log_file=outputCluster-${cpu_model}-${numnodes}.txt + + sensors_bin="sensors" + check_dependency $sensors_bin + has_sensors=$? + if [ $has_sensors -eq 0 ]; then + sensor_log_file=sensors-${cpu_model}-${numnodes}-start.log + $sensors_bin >$sensor_log_file + mv $sensor_log_file $result_dir_/ + fi + + if [ ${numnodes} -eq 1 ]; then + time GLOG_minloglevel=0 $exec_command >${log_file} 2>&1 + else + init_mpi_envs + exec_command="-l -configfile $cfile_" + time GLOG_minloglevel=0 mpiexec.hydra $exec_command >${log_file} 2>&1 + fi + + if [ $has_sensors -eq 0 ]; then + sensor_log_file=sensors-${cpu_model}-${numnodes}-end.log + $sensors_bin >$sensor_log_file + mv $sensor_log_file $result_dir_/ + fi + mv $log_file $cfile_ $result_dir_/ +} + +function run_qperf_bench +{ + qperf_bin="qperf" + check_dependency $qperf_bin + if [ $? -ne 0 ]; then + echo "Skip qperf benchmark." + return + fi + + # measure bandwidth and latency + qperf_result_log="qperf_bench_result.log" + rm -f $qperf_result_log + + server_node="" + port=1234567 + qperf_param="-lp $port -oo msg_size:1024:512M:*2 -vu tcp_bw tcp_lat" + + for ((i=0; i> $qperf_result_log + echo >>$qperf_result_log + + for ((j=i+1; j>$qperf_result_log + done + done + + mv $qperf_result_log $result_dir/ +} + +function run_mpi_bench +{ + # MPI benchmark + mpibench_bin="IMB-MPI1" + check_dependency $mpibench_bin + if [ $? -ne 0 ]; then + echo "Skip MPI benchmark..." + return + fi + + xeonbin="$mpibench_bin allreduce" + + declare -a adjust_values=(1 2 3 5 7 8 9 0) + declare -a collective_values=('tmi' 'none') + + echo "Start mpi bench..." 
+    for ((i=0; i<${#adjust_values[@]}; i++))
+    do
+        for ((j=0; j<${#collective_values[@]}; j++))
+        do
+            if [ ${adjust_values[$i]} -eq 0 ]; then  # 0 is the sentinel for "leave the variable unset"
+                unset I_MPI_ADJUST_ALLREDUCE
+            else
+                export I_MPI_ADJUST_ALLREDUCE=${adjust_values[$i]}
+            fi
+
+            if [ "${collective_values[$j]}" == "none" ]; then  # "none" is the sentinel for "leave the variable unset"
+                unset I_MPI_COLLECTIVE_DEFAULTS
+            else
+                export I_MPI_COLLECTIVE_DEFAULTS=${collective_values[$j]}
+            fi
+            echo "iteration $i, ${j}..."
+            echo "I_MPI_ADJUST_ALLREDUCE=$I_MPI_ADJUST_ALLREDUCE"
+            echo "I_MPI_COLLECTIVE_DEFAULTS=$I_MPI_COLLECTIVE_DEFAULTS"
+
+            test_result_dir=$result_dir/mpibench-${adjust_values[$i]}-${collective_values[$j]}  # one result folder per combination
+            mkdir -p $test_result_dir
+            execute_command "$xeonbin" $test_result_dir
+        done
+    done
+
+    # TODO: analyze the report and select the best algorithm and setting
+    unset I_MPI_COLLECTIVE_DEFAULTS
+    unset I_MPI_ADJUST_ALLREDUCE
+
+    echo "Finished."
+}
+
+function run_benchmark
+{  # Runs the benchmarks selected by $benchmark_mode; nothing is run unless more than one node is listed.
+    echo "Run benchmark with ${numnodes} nodes..."
+    if [ $numnodes -gt 1 ]; then
+        if [ "$benchmark_mode" == "all" ] || [ "$benchmark_mode" == "qperf" ]; then
+            run_qperf_bench
+        fi
+
+        if [ "$benchmark_mode" == "all" ] || [ "$benchmark_mode" == "mpi" ]; then  # compare the value: a lone quoted string is always true
+            set_env_vars
+            run_mpi_bench
+        fi
+    fi
+}
+
+function run_caffe
+{
+    echo "Run caffe with ${numnodes} nodes..."
+ + if [ ${mode} == "time" ]; then + xeonbin="$caffe_bin time --iterations $iteration --model $model_file -engine=$engine" + else + xeonbin="$caffe_bin train --solver $solver_file -engine=$engine" + if [ ${mode} == "resume_train" ]; then + xeonbin+=" --snapshot=${snapshot}" + fi + fi + + set_env_vars + execute_command "$xeonbin" $result_dir +} + + +if [ $# -le 1 ]; then + usage + exit 0 +fi + +root_dir=$(cd $(dirname $(dirname $0)); pwd) +result_dir=${root_dir}/"result-`date +%Y%m%d%H%M%S`" + +while [[ $# -gt 1 ]] +do + key="$1" + case $key in + --solver) + solver_file="$2" + shift + ;; + --host) + host_file="$2" + shift + ;; + --network) + network="$2" + shift + ;; + --netmask) + tcp_netmask="$2" + shift + ;; + --debug) + debug="$2" + shift + ;; + --num_mlsl_servers) + num_mlsl_servers=$2 + shift + ;; + --mode) + mode=$2 + shift + ;; + --iteration) + iteration=$2 + shift + ;; + --model_file) + model_file=$2 + shift + ;; + --snapshot) + snapshot=$2 + shift + ;; + --engine) + engine=$2 + shift + ;; + --benchmark) + benchmark_mode=$2 + shift + ;; + --output) + result_dir=$2 + shift + ;; + *) + echo "Unknown option: $key" + usage + exit 1 + ;; + esac + shift +done + +# check parameters +if [ "$host_file" == "" ]; then + echo "Error: host file is NOT specified." + exit 1 +fi +if [ ! -f $host_file ]; then + echo "Error: host file does NOT exist." + exit 1 +fi + +echo "" +echo "CPUs with optimal settings:" +for ((i=0; i<${#cpu_list[@]}; i++)) +do + echo " ${cpu_list[$i]}" +done +echo "" +echo "Settings:" +echo " Host file: $host_file" +echo " Running mode: $mode" +echo " Benchmark: $benchmark_mode" +echo " Debug option: $debug" +echo " Engine: $engine" +echo " Number of MLSL servers: $num_mlsl_servers" +echo " -1: selected automatically according to CPU model." +echo " BDW/SKX: 2, KNL: 4" + + +if [ "$mode" == "train" ] || [ "$mode" == "resume_train" ]; then + if [ "$solver_file" == "" ]; then + echo "Error: solver file is NOT specified." + exit 1 + fi + if [ ! 
-f "$solver_file" ]; then
+        echo "Error: solver file does NOT exist."
+        exit 1
+    fi
+
+    echo " Solver file: $solver_file"
+
+    if [ "$mode" == "resume_train" ]; then
+        if [ "$snapshot" == "" ]; then
+            echo "Error: snapshot is NOT specified."
+            exit 1
+        fi
+        if [ ! -f "$snapshot" ]; then
+            echo "Error: snapshot file does NOT exist."
+            exit 1
+        fi
+        echo " Snapshot for resuming train: $snapshot"
+    fi
+fi
+
+if [ "$mode" == "time" ]; then
+    if [ "$model_file" == "" ]; then
+        echo "Error: model file is NOT specified."
+        exit 1
+    fi
+    if [ ! -f "$model_file" ]; then
+        echo "Error: model file does NOT exist."
+        exit 1
+    fi
+
+    if [ $iteration -le 0 ]; then
+        echo "Error: iteration ($iteration) <= 0."
+        exit 1
+    fi
+    echo " Iteration for running caffe time: $iteration"
+    echo " Model file for running caffe time: $model_file"
+fi
+
+echo " Network: $network"
+if [ "$network" == "tcp" ]; then
+    if [ "$tcp_netmask" == "" ]; then
+        echo "Error: TCP netmask is NOT specified."
+        exit 1  # configuration error: report failure like the other checks do
+    fi
+    echo " Netmask for TCP network: $tcp_netmask"
+fi
+
+# Node list: the unique, sorted entries of the host file
+nodenames=( `cat $host_file | sort | uniq ` )
+if [ ${#nodenames[@]} -eq 0 ]; then
+    echo "Error: empty host file! Exit."
+    exit 1  # nothing to run on: report failure to the caller
+fi
+numnodes=${#nodenames[@]}
+echo "Number of nodes: $numnodes"
+
+detect_cpu
+
+if [ $cpu_model == knl ]; then  # the NUMA node is only selected for the knl model
+    set_numa_node
+fi
+
+if [ ! -d "$result_dir" ]; then
+    echo "Create result directory: $result_dir"
+    mkdir -p "$result_dir"
+fi
+
+if [ "${benchmark_mode}" != "none" ]; then
+    run_benchmark
+fi
+
+if [ "${mode}" != "none" ]; then
+    caffe_bin="./build/tools/caffe"  # relative path: resolved against the current working directory
+    check_dependency $caffe_bin
+    if [ $? -ne 0 ]; then
+        echo "Exit."
+ exit 0 + fi + + run_caffe +fi + +echo "Result folder: $result_dir" diff --git a/src/caffe/blob.cpp b/src/caffe/blob.cpp index dd5546bde..48ae68dc7 100644 --- a/src/caffe/blob.cpp +++ b/src/caffe/blob.cpp @@ -72,7 +72,7 @@ void Blob::Reshape(const vector& shape) { for (int i = 0; i < shape.size(); ++i) { CHECK_GE(shape[i], 0); if (count_ != 0) { - CHECK_LE(shape[i], INT_MAX / count_) << "blob size exceeds INT_MAX"; + CHECK_LE(shape[i], LONG_MAX / count_) << "blob size exceeds LONG_MAX"; } count_ *= shape[i]; if (shape_[i] != shape[i]) { @@ -369,7 +369,13 @@ Dtype Blob::asum_diff() const { switch (diff_->head()) { case SyncedMemory::SYNCED_PRV: case SyncedMemory::HEAD_AT_PRV: - return caffe_cpu_asum( prv_diff_count(), prv_diff()); + { + const Dtype* prv_ptr = prv_diff(); + if (prv_ptr == NULL) + return caffe_cpu_asum(count_, cpu_diff()); + else + return caffe_cpu_asum(prv_diff_count(), prv_diff()); + } case SyncedMemory::HEAD_AT_CPU: return caffe_cpu_asum(count_, cpu_diff()); case SyncedMemory::HEAD_AT_GPU: @@ -462,7 +468,11 @@ Dtype Blob::sumsq_diff() const { case SyncedMemory::SYNCED_PRV: case SyncedMemory::HEAD_AT_PRV: diff = prv_diff(); - sumsq = caffe_cpu_dot(prv_diff_count(), diff, diff); + if (diff == NULL) { + diff = cpu_diff(); + sumsq = caffe_cpu_dot(count_, diff, diff); + } else + sumsq = caffe_cpu_dot(prv_diff_count(), diff, diff); break; case SyncedMemory::HEAD_AT_CPU: diff = cpu_diff(); diff --git a/src/caffe/layer_factory.cpp b/src/caffe/layer_factory.cpp index 2b52007cc..0a6f83a21 100644 --- a/src/caffe/layer_factory.cpp +++ b/src/caffe/layer_factory.cpp @@ -92,6 +92,7 @@ shared_ptr > GetConvolutionLayer( for (int i = 0; i < conv_param.dilation_size(); ++i) { if (conv_param.dilation(i) > 1) { use_dilation = true; + break; } } #endif @@ -589,10 +590,10 @@ shared_ptr > GetEltwiseLayer(const LayerParameter& param) { #if defined(MKL2017_SUPPORTED) else if (ep.isEngine("MKL2017")) engine = EltwiseParameter_Engine_MKL2017; -#endif -#if 
defined(MKLDNN_SUPPORTED) - else if (ep.isEngine("MKLDNN")) - engine = EltwiseParameter_Engine_MKLDNN; +#endif +#if defined(MKLDNN_SUPPORTED) + else if (ep.isEngine("MKLDNN")) + engine = EltwiseParameter_Engine_MKLDNN; #endif } @@ -605,9 +606,9 @@ shared_ptr > GetEltwiseLayer(const LayerParameter& param) { } else if (engine == EltwiseParameter_Engine_MKL2017) { return shared_ptr >(new MKLEltwiseLayer(param)); #endif -#ifdef MKLDNN_SUPPORTED - } else if (engine == EltwiseParameter_Engine_MKLDNN) { - return shared_ptr >(new MKLDNNEltwiseLayer(param)); +#ifdef MKLDNN_SUPPORTED + } else if (engine == EltwiseParameter_Engine_MKLDNN) { + return shared_ptr >(new MKLDNNEltwiseLayer(param)); #endif } else { LOG(FATAL) << "Layer " << param.name() << " has unknow engine."; diff --git a/src/caffe/layers/batch_norm_layer.cpp b/src/caffe/layers/batch_norm_layer.cpp index b7746d988..8331dd7d7 100644 --- a/src/caffe/layers/batch_norm_layer.cpp +++ b/src/caffe/layers/batch_norm_layer.cpp @@ -81,13 +81,22 @@ void BatchNormLayer::Reshape(const vector*>& bottom, CHECK_EQ(bottom[0]->shape(1), channels_); top[0]->ReshapeLike(*bottom[0]); + num_stats_batches_ = 1; + stats_batch_size_ = bottom[0]->shape(0); + BatchNormParameter param = this->layer_param_.batch_norm_param(); + if (!use_global_stats_ && param.stats_batch_size() > 0) { + CHECK_EQ(bottom[0]->shape(0) % param.stats_batch_size(), 0); + num_stats_batches_ = bottom[0]->shape(0) / param.stats_batch_size(); + stats_batch_size_ = param.stats_batch_size(); + } + vector sz; sz.push_back(channels_); mean_.Reshape(sz); variance_.Reshape(sz); temp_.ReshapeLike(*bottom[0]); x_norm_.ReshapeLike(*bottom[0]); - sz[0]=bottom[0]->shape(0); + sz[0]=stats_batch_size_; batch_sum_multiplier_.Reshape(sz); int spatial_dim = bottom[0]->count(2); @@ -99,7 +108,7 @@ void BatchNormLayer::Reshape(const vector*>& bottom, caffe_set(spatial_sum_multiplier_.count(), Dtype(1), multiplier_data); } - int numbychans = channels_*bottom[0]->shape(0); + int 
numbychans = channels_*stats_batch_size_; if (num_by_chans_.num_axes() == 0 || num_by_chans_.shape(0) != numbychans) { sz[0] = numbychans; @@ -149,18 +158,20 @@ void BatchNormLayer::replicate_to_op(Dtype* buffer_to_write, } } - - template -void BatchNormLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = top[0]->mutable_cpu_data(); - int num = bottom[0]->shape(0); +void BatchNormLayer::ForwardStatsBatch_cpu(const vector*>& bottom, + const vector*>& top, int stats_batch_idx) { + long data_stats_count = stats_batch_size_ * bottom[0]->count(1); + long data_offset = stats_batch_idx * data_stats_count; + const Dtype* bottom_data = bottom[0]->cpu_data() + data_offset; + Dtype* top_data = top[0]->mutable_cpu_data() + data_offset; + Dtype* temp_data = temp_.mutable_cpu_data() + data_offset; + Dtype* x_norm_data = x_norm_.mutable_cpu_data() + data_offset; + int num = stats_batch_size_; int spatial_dim = bottom[0]->count()/(bottom[0]->shape(0)*channels_); if (bottom[0] != top[0]) { - caffe_copy(bottom[0]->count(), bottom_data, top_data); + caffe_copy(data_stats_count, bottom_data, top_data); } if (use_global_stats_) { @@ -192,10 +203,10 @@ void BatchNormLayer::Forward_cpu(const vector*>& bottom, if (!use_global_stats_) { // compute variance using var(X) = E((X-EX)^2) - caffe_powx(top[0]->count(), top_data, Dtype(2), - temp_.mutable_cpu_data()); // (X-EX)^2 + caffe_powx(data_stats_count, top_data, Dtype(2), + temp_data); // (X-EX)^2 caffe_cpu_gemv(CblasNoTrans, channels_ * num, spatial_dim, - 1. / (num * spatial_dim), temp_.cpu_data(), + 1. 
/ (num * spatial_dim), temp_data, spatial_sum_multiplier_.cpu_data(), 0., num_by_chans_.mutable_cpu_data()); caffe_cpu_gemv(CblasTrans, num, channels_, 1., @@ -207,7 +218,7 @@ void BatchNormLayer::Forward_cpu(const vector*>& bottom, this->blobs_[2]->mutable_cpu_data()[0] += 1; caffe_cpu_axpby(mean_.count(), Dtype(1), mean_.cpu_data(), moving_average_fraction_, this->blobs_[0]->mutable_cpu_data()); - int m = bottom[0]->count()/channels_; + int m = bottom[0]->count()/num_stats_batches_/channels_; Dtype bias_correction_factor = m > 1 ? Dtype(m)/(m-1) : 1; caffe_cpu_axpby(variance_.count(), bias_correction_factor, variance_.cpu_data(), moving_average_fraction_, @@ -220,37 +231,40 @@ void BatchNormLayer::Forward_cpu(const vector*>& bottom, variance_.mutable_cpu_data()); // replicate variance to input size - this->replicate(temp_.mutable_cpu_data(), + this->replicate(temp_data, num, spatial_dim*channels_, spatial_dim, variance_.cpu_data()); - caffe_div(temp_.count(), top_data, temp_.cpu_data(), top_data); + caffe_div(data_stats_count, top_data, temp_data, top_data); // TODO(cdoersch): The caching is only needed because later in-place layers // might clobber the data. Can we skip this if they won't? 
- caffe_copy(x_norm_.count(), top_data, - x_norm_.mutable_cpu_data()); + caffe_copy(data_stats_count, top_data, + x_norm_data); } template -void BatchNormLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { +void BatchNormLayer::BackwardStatsBatch_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom, + int stats_batch_idx) { + long data_stats_count = stats_batch_size_ * bottom[0]->count(1); + long data_offset = stats_batch_idx * data_stats_count; const Dtype* top_diff; if (bottom[0] != top[0]) { - top_diff = top[0]->cpu_diff(); + top_diff = top[0]->cpu_diff() + data_offset; } else { - caffe_copy(x_norm_.count(), top[0]->cpu_diff(), x_norm_.mutable_cpu_diff()); - top_diff = x_norm_.cpu_diff(); + caffe_copy(data_stats_count, top[0]->cpu_diff() + data_offset, + x_norm_.mutable_cpu_diff() + data_offset); + top_diff = x_norm_.cpu_diff() + data_offset; } - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff() + data_offset; if (use_global_stats_) { - caffe_div(temp_.count(), top_diff, temp_.cpu_data(), bottom_diff); + caffe_div(data_stats_count, top_diff, temp_.cpu_data() + data_offset, bottom_diff); return; } - const Dtype* top_data = x_norm_.cpu_data(); - int num = bottom[0]->shape()[0]; + const Dtype* top_data = x_norm_.cpu_data() + data_offset; + int num = stats_batch_size_; int spatial_dim = bottom[0]->count()/(bottom[0]->shape(0)*channels_); // if Y = (X-mean(X))/(sqrt(var(X)+eps)), then // @@ -265,7 +279,7 @@ void BatchNormLayer::Backward_cpu(const vector*>& top, // dimensions except the channels dimension where required. 
// sum(dE/dY \cdot Y) - caffe_mul(temp_.count(), top_data, top_diff, bottom_diff); + caffe_mul(data_stats_count, top_data, top_diff, bottom_diff); caffe_cpu_gemv(CblasNoTrans, channels_ * num, spatial_dim, 1., bottom_diff, spatial_sum_multiplier_.cpu_data(), 0., num_by_chans_.mutable_cpu_data()); @@ -280,7 +294,7 @@ void BatchNormLayer::Backward_cpu(const vector*>& top, mean_.cpu_data()); // sum(dE/dY \cdot Y) \cdot Y - caffe_mul(temp_.count(), top_data, bottom_diff, bottom_diff); + caffe_mul(data_stats_count, top_data, bottom_diff, bottom_diff); // sum(dE/dY)-sum(dE/dY \cdot Y) \cdot Y caffe_cpu_gemv(CblasNoTrans, channels_ * num, spatial_dim, 1., @@ -300,12 +314,29 @@ void BatchNormLayer::Backward_cpu(const vector*>& top, std::plus()); // dE/dY - mean(dE/dY)-mean(dE/dY \cdot Y) \cdot Y - caffe_cpu_axpby(temp_.count(), Dtype(1), top_diff, + caffe_cpu_axpby(data_stats_count, Dtype(1), top_diff, Dtype(-1. / (num * spatial_dim)), bottom_diff); // note: temp_ still contains sqrt(var(X)+eps), computed during the forward // pass. 
- caffe_div(temp_.count(), bottom_diff, temp_.cpu_data(), bottom_diff); + caffe_div(data_stats_count, bottom_diff, temp_.cpu_data() + data_offset, bottom_diff); +} + +template +void BatchNormLayer::Forward_cpu(const vector*>& bottom, + const vector*>& top) { + for (int i = 0; i < num_stats_batches_; i++) { + ForwardStatsBatch_cpu(bottom, top, i); + } +} + +template +void BatchNormLayer::Backward_cpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + for (int i = 0; i < num_stats_batches_; i++) { + BackwardStatsBatch_cpu(top, propagate_down, bottom, i); + } } diff --git a/src/caffe/layers/mkl_batch_norm_layer.cpp b/src/caffe/layers/mkl_batch_norm_layer.cpp index 6dce50243..a24500c69 100755 --- a/src/caffe/layers/mkl_batch_norm_layer.cpp +++ b/src/caffe/layers/mkl_batch_norm_layer.cpp @@ -52,8 +52,12 @@ MKLBatchNormLayer::~MKLBatchNormLayer() { dnnDelete(batchNormFwdInference); dnnDelete(batchNormBwd); dnnLayoutDelete(layout_usr_); - dnnReleaseBuffer(mean_buffer_); - dnnReleaseBuffer(variance_buffer_); + for (int i = 0; i < mean_buffers_.size(); i++) { + dnnReleaseBuffer(mean_buffers_[i]); + } + for (int i = 0; i < variance_buffers_.size(); i++) { + dnnReleaseBuffer(variance_buffers_[i]); + } dnnReleaseBuffer(scaleShift_buffer_); dnnReleaseBuffer(diffScaleShift_buffer_); } @@ -71,6 +75,15 @@ void MKLBatchNormLayer::Init(const vector*>& bottom, if (this->layer_param_.batch_norm_param().has_use_global_stats()) use_global_stats_ = this->layer_param_.batch_norm_param().use_global_stats(); + num_stats_batches_ = 1; + stats_batch_size_ = bottom[0]->shape(0); + BatchNormParameter param = this->layer_param_.batch_norm_param(); + if (!use_global_stats_ && param.stats_batch_size() > 0) { + CHECK_EQ(bottom[0]->shape(0) % param.stats_batch_size(), 0); + num_stats_batches_ = bottom[0]->shape(0) / param.stats_batch_size(); + stats_batch_size_ = param.stats_batch_size(); + } + CHECK(use_weight_bias_) << "BatchNorm without scaling have not supported 
yet"; size_t dim = 4, sizes[4], strides[4]; @@ -99,18 +112,25 @@ void MKLBatchNormLayer::Init(const vector*>& bottom, // TODO: Make a cleanup routine to avoid // copy of following code in the Destructor - dnnError_t e; - dnnLayoutDelete(layout_usr_); - e = dnnLayoutCreate(&layout_usr_, dim, sizes, strides); - CHECK_EQ(e, E_SUCCESS); - fwd_bottom_data->create_user_layout(dim, sizes, strides, false); fwd_top_data ->create_user_layout(dim, sizes, strides, false); bwd_bottom_diff->create_user_layout(dim, sizes, strides, false); bwd_top_diff ->create_user_layout(dim, sizes, strides, false); - dnnReleaseBuffer(mean_buffer_); - dnnReleaseBuffer(variance_buffer_); + sizes[3] /= num_stats_batches_; + dnnError_t e; + dnnLayoutDelete(layout_usr_); + e = dnnLayoutCreate(&layout_usr_, dim, sizes, strides); + CHECK_EQ(e, E_SUCCESS); + + for (int i = 0; i < mean_buffers_.size(); i++) { + dnnReleaseBuffer(mean_buffers_[i]); + } + for (int i = 0; i < variance_buffers_.size(); i++) { + dnnReleaseBuffer(variance_buffers_[i]); + } + mean_buffers_.resize(num_stats_batches_, NULL); + variance_buffers_.resize(num_stats_batches_, NULL); dnnReleaseBuffer(scaleShift_buffer_); dnnReleaseBuffer(diffScaleShift_buffer_); @@ -223,26 +243,30 @@ void MKLBatchNormLayer::Reshape(const vector*>& bottom, strides[2] = sizes[0]*sizes[1]; strides[3] = sizes[0]*sizes[1]*sizes[2]; - dnnError_t e; - dnnLayoutDelete(layout_usr_); - e = dnnLayoutCreate(&layout_usr_, dim, sizes, strides); - CHECK_EQ(e, E_SUCCESS); fwd_bottom_data->create_user_layout(dim, sizes, strides, false); fwd_top_data ->create_user_layout(dim, sizes, strides, false); bwd_bottom_diff->create_user_layout(dim, sizes, strides, false); bwd_top_diff ->create_user_layout(dim, sizes, strides, false); + + sizes[3] /= num_stats_batches_; + dnnError_t e; + dnnLayoutDelete(layout_usr_); + e = dnnLayoutCreate(&layout_usr_, dim, sizes, strides); + CHECK_EQ(e, E_SUCCESS); } } template -void MKLBatchNormLayer::Forward_cpu( - const vector*>& bottom, 
const vector*>& top) { +void MKLBatchNormLayer::ForwardStatsBatch_cpu(const vector*>& bottom, + const vector*>& top, int stats_batch_idx) { + long data_offset = stats_batch_idx * stats_batch_size_ * bottom[0]->count(1); void* bottom_data = reinterpret_cast(const_cast(bottom[0]->prv_data())); int is_first_pass = 0; - unsigned int amount_to_copy =0; + long amount_to_copy =0; - if (NULL != bottom_data) { + // TODO: support private memory with num_stats_batches_ > 1 + if (NULL != bottom_data && num_stats_batches_ == 1) { amount_to_copy = bottom[0]->prv_data_count(); // Is it the first pass? Create a primitive. if (batchNormFwd == NULL) { @@ -311,7 +335,7 @@ void MKLBatchNormLayer::Forward_cpu( } bottom_data = reinterpret_cast(const_cast(bottom[0]->cpu_data())); - amount_to_copy = bottom[0]->count(); + amount_to_copy = bottom[0]->count() / num_stats_batches_; } if (is_first_pass == 1) { dnnError_t e; @@ -319,18 +343,22 @@ void MKLBatchNormLayer::Forward_cpu( e = dnnLayoutCreateFromPrimitive( &mean_buffer_l, batchNormFwd, dnnResourceMean); CHECK_EQ(e, E_SUCCESS); - e = dnnAllocateBuffer( - reinterpret_cast(&mean_buffer_), mean_buffer_l); - CHECK_EQ(e, E_SUCCESS); + for (int i = 0; i < num_stats_batches_; i++) { + e = dnnAllocateBuffer( + reinterpret_cast(&mean_buffers_[i]), mean_buffer_l); + CHECK_EQ(e, E_SUCCESS); + } dnnLayoutDelete(mean_buffer_l); dnnLayout_t variance_buffer_l = NULL; e = dnnLayoutCreateFromPrimitive( &variance_buffer_l, batchNormFwd, dnnResourceVariance); CHECK_EQ(e, E_SUCCESS); - e = dnnAllocateBuffer( - reinterpret_cast(&variance_buffer_), variance_buffer_l); - CHECK_EQ(e, E_SUCCESS); + for (int i = 0; i < num_stats_batches_; i++) { + e = dnnAllocateBuffer( + reinterpret_cast(&variance_buffers_[i]), variance_buffer_l); + CHECK_EQ(e, E_SUCCESS); + } dnnLayoutDelete(variance_buffer_l); dnnLayout_t diffScaleShift_buffer_l = NULL; @@ -374,8 +402,8 @@ void MKLBatchNormLayer::Forward_cpu( // Note that this is only necessary for Backward; we skip this if 
not // doing Backward // TODO: make a caffe_coppy working on blobs - caffe_copy(amount_to_copy, static_cast(bottom_data), - temp_.mutable_cpu_data()); + caffe_copy(amount_to_copy, static_cast(bottom_data) + data_offset, + temp_.mutable_cpu_data() + data_offset); } if (use_global_stats_) { @@ -383,24 +411,25 @@ void MKLBatchNormLayer::Forward_cpu( const Dtype scale_factor = this->blobs_[2]->cpu_data()[0] == 0 ? 0 : 1 / this->blobs_[2]->cpu_data()[0]; caffe_cpu_scale(this->blobs_[0]->count(), scale_factor, - this->blobs_[0]->cpu_data(), mean_buffer_); + this->blobs_[0]->cpu_data(), mean_buffers_[stats_batch_idx]); caffe_cpu_scale(this->blobs_[1]->count(), scale_factor, - this->blobs_[1]->cpu_data(), variance_buffer_); + this->blobs_[1]->cpu_data(), variance_buffers_[stats_batch_idx]); } dnnError_t e; void* BatchNorm_res[dnnResourceNumber]; - BatchNorm_res[dnnResourceMean] = mean_buffer_; - BatchNorm_res[dnnResourceVariance] = variance_buffer_; - BatchNorm_res[dnnResourceSrc] = bottom_data; + BatchNorm_res[dnnResourceMean] = mean_buffers_[stats_batch_idx]; + BatchNorm_res[dnnResourceVariance] = variance_buffers_[stats_batch_idx]; + BatchNorm_res[dnnResourceSrc] = (Dtype*)bottom_data + data_offset; BatchNorm_res[dnnResourceScaleShift] = scaleShift_buffer_; if (fwd_top_data->conversion_needed()) { top[0]->set_prv_data_descriptor(fwd_top_data); + data_offset = stats_batch_idx * (top[0]->prv_data_count() / num_stats_batches_); BatchNorm_res[dnnResourceDst] = - reinterpret_cast(top[0]->mutable_prv_data()); + reinterpret_cast(top[0]->mutable_prv_data() + data_offset); } else { BatchNorm_res[dnnResourceDst] = - reinterpret_cast(top[0]->mutable_cpu_data()); + reinterpret_cast(top[0]->mutable_cpu_data() + data_offset); DLOG(INFO) << "Using cpu_data for top in DnnBatchNorm."; } @@ -415,20 +444,21 @@ void MKLBatchNormLayer::Forward_cpu( // compute and save moving average this->blobs_[2]->mutable_cpu_data()[0] *= moving_average_fraction_; this->blobs_[2]->mutable_cpu_data()[0] += 
1; - caffe_cpu_axpby(this->blobs_[0]->count(), Dtype(1), mean_buffer_, + caffe_cpu_axpby(this->blobs_[0]->count(), Dtype(1), mean_buffers_[stats_batch_idx], moving_average_fraction_, this->blobs_[0]->mutable_cpu_data()); - int m = bottom[0]->count()/channels_; + int m = bottom[0]->count()/num_stats_batches_/channels_; Dtype bias_correction_factor = m > 1 ? Dtype(m)/(m-1) : 1; caffe_cpu_axpby(this->blobs_[1]->count(), bias_correction_factor, - variance_buffer_, moving_average_fraction_, + variance_buffers_[stats_batch_idx], moving_average_fraction_, this->blobs_[1]->mutable_cpu_data()); } } template -void MKLBatchNormLayer::Backward_cpu( - const vector*>& top, const vector& propagate_down, - const vector*>& bottom) { +void MKLBatchNormLayer::BackwardStatsBatch_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom, + int stats_batch_idx) { + long data_offset = stats_batch_idx * stats_batch_size_ * bottom[0]->count(1); void *bottom_data = NULL; if (bottom[0] == top[0]) { bottom_data = reinterpret_cast( @@ -437,7 +467,7 @@ void MKLBatchNormLayer::Backward_cpu( bottom_data = reinterpret_cast( const_cast(bottom[0]->prv_data())); - if (NULL == bottom_data) + if (NULL == bottom_data || num_stats_batches_ > 1) bottom_data = reinterpret_cast( const_cast(bottom[0]->cpu_data())); @@ -445,19 +475,19 @@ void MKLBatchNormLayer::Backward_cpu( dnnError_t e; void* BatchNorm_res[dnnResourceNumber]; - BatchNorm_res[dnnResourceMean] = mean_buffer_; - BatchNorm_res[dnnResourceVariance] = variance_buffer_; - BatchNorm_res[dnnResourceSrc] = bottom_data; + BatchNorm_res[dnnResourceMean] = mean_buffers_[stats_batch_idx]; + BatchNorm_res[dnnResourceVariance] = variance_buffers_[stats_batch_idx]; + BatchNorm_res[dnnResourceSrc] = (Dtype*)bottom_data + data_offset; BatchNorm_res[dnnResourceScaleShift] = scaleShift_buffer_; BatchNorm_res[dnnResourceDiffScaleShift] = diffScaleShift_buffer_; - - BatchNorm_res[dnnResourceDiffDst] = bwd_top_diff->get_converted_prv(top[0], 
- true); + BatchNorm_res[dnnResourceDiffDst] = + bwd_top_diff->get_converted_prv(top[0], true) + data_offset; if (bwd_bottom_diff->conversion_needed()) { bottom[0]->set_prv_diff_descriptor(bwd_bottom_diff); - BatchNorm_res[dnnResourceDiffSrc] = bottom[0]->mutable_prv_diff(); + data_offset = stats_batch_idx * (bottom[0]->prv_diff_count() / num_stats_batches_); + BatchNorm_res[dnnResourceDiffSrc] = bottom[0]->mutable_prv_diff() + data_offset; } else { - BatchNorm_res[dnnResourceDiffSrc] = bottom[0]->mutable_cpu_diff(); + BatchNorm_res[dnnResourceDiffSrc] = bottom[0]->mutable_cpu_diff() + data_offset; } PERFORMANCE_EVENT_ID_INIT(perf_id_bw_, PERFORMANCE_MKL_NAME("BW")); @@ -479,6 +509,23 @@ void MKLBatchNormLayer::Backward_cpu( } } +template +void MKLBatchNormLayer::Forward_cpu( + const vector*>& bottom, const vector*>& top) { + for (int i = 0; i < num_stats_batches_; i++) { + ForwardStatsBatch_cpu(bottom, top, i); + } +} + +template +void MKLBatchNormLayer::Backward_cpu( + const vector*>& top, const vector& propagate_down, + const vector*>& bottom) { + for (int i = 0; i < num_stats_batches_; i++) { + BackwardStatsBatch_cpu(top, propagate_down, bottom, i); + } +} + #ifdef CPU_ONLY STUB_GPU(MKLBatchNormLayer); diff --git a/src/caffe/layers/mkldnn_batch_norm_layer.cpp b/src/caffe/layers/mkldnn_batch_norm_layer.cpp index 4db92b943..f1edfebd4 100644 --- a/src/caffe/layers/mkldnn_batch_norm_layer.cpp +++ b/src/caffe/layers/mkldnn_batch_norm_layer.cpp @@ -44,6 +44,18 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
namespace caffe { +template +void MKLDNNBatchNormLayer::InitStatsBatchVars(int batch_size) { + num_stats_batches_ = 1; + stats_batch_size_ = batch_size; + BatchNormParameter param = this->layer_param_.batch_norm_param(); + if (!use_global_stats_ && param.stats_batch_size() > 0) { + CHECK_EQ(batch_size % param.stats_batch_size(), 0); + num_stats_batches_ = batch_size / param.stats_batch_size(); + stats_batch_size_ = param.stats_batch_size(); + } +} + template void MKLDNNBatchNormLayer::LayerSetUp(const vector*>& bottom ,const vector*>& top) @@ -62,6 +74,10 @@ void MKLDNNBatchNormLayer::LayerSetUp(const vector*>& bottom bias_term_ = this->layer_param_.batch_norm_param().bias_term(); moving_average_fraction_ = this->layer_param_.batch_norm_param().moving_average_fraction(); use_global_stats_ = this->phase_ == TEST; + if (this->layer_param_.batch_norm_param().has_use_global_stats()) + use_global_stats_ = this->layer_param_.batch_norm_param().use_global_stats(); + + InitStatsBatchVars(num_); this->blobs_.resize(3 + (use_weight_bias_ ? 1:0) + (use_weight_bias_ && bias_term_ ? 1:0)); @@ -76,6 +92,22 @@ void MKLDNNBatchNormLayer::LayerSetUp(const vector*>& bottom this->blobs_[i]->mutable_cpu_data()); } + //IntelCaffe treat scale and shift as different blobs, so current MKL-DNN integration has additional copies from Caffe to MKL-DNN buffer on fwd pass and from MKL-DNN to Caffe buffer on bwd pass. + //Optimization: use the temp blob to combine the scale and shift together. Avoid the additional copies. 
+ // Initialize scale and shift combination blob + vector scaleshift_blob_shape(1); + scaleshift_blob_shape[0] = 2*channels_; + scaleshift_blob_.reset(new Blob(scaleshift_blob_shape)); + //Should initialize the scaleshift_blob_ buffer to 0, because when bias_term_ == false, need to pass zero bias to MKLDNN + caffe_set(scaleshift_blob_shape[0], static_cast(0), + scaleshift_blob_->mutable_cpu_data()); + shared_ptr > scaleshift_diff_blob = scaleshift_blob_; + scaleshift_acc_ = scaleshift_blob_; + if (num_stats_batches_ > 1) { + this->scaleshift_acc_.reset(new Blob(scaleshift_blob_shape)); + scaleshift_diff_blob = scaleshift_acc_; + } + if (use_weight_bias_) { // Initialize scale and shift vector scaleshift_shape(1); @@ -83,6 +115,8 @@ void MKLDNNBatchNormLayer::LayerSetUp(const vector*>& bottom VLOG(1) << "MKLDNNBatchNormLayer::LayerSetUp: channels_ = " << channels_; this->blobs_[3].reset(new Blob(scaleshift_shape)); + this->blobs_[3]->set_cpu_data(scaleshift_blob_->mutable_cpu_data()); + this->blobs_[3]->set_cpu_diff(scaleshift_diff_blob->mutable_cpu_diff()); FillerParameter filler_param(this->layer_param_.batch_norm_param().filler()); if (!this->layer_param_.batch_norm_param().has_filler()) { filler_param.set_type("constant"); @@ -92,8 +126,10 @@ void MKLDNNBatchNormLayer::LayerSetUp(const vector*>& bottom VLOG(1) << "MKLDNNBatchNormLayer::LayerSetUp: scaleshift " << __LINE__ << ":" << this->layer_param_.name(); filler->Fill(this->blobs_[3].get()); - if ( bias_term_ ) { + if (bias_term_) { this->blobs_[4].reset(new Blob(scaleshift_shape)); + this->blobs_[4]->set_cpu_data(scaleshift_blob_->mutable_cpu_data() + scaleshift_blob_->offset(channels_)); + this->blobs_[4]->set_cpu_diff(scaleshift_diff_blob->mutable_cpu_diff() + scaleshift_blob_->offset(channels_)); FillerParameter bias_filler_param(this->layer_param_.batch_norm_param().bias_filler()); if (!this->layer_param_.batch_norm_param().has_bias_filler()) { bias_filler_param.set_type("constant"); @@ -130,6 +166,8 @@ 
void MKLDNNBatchNormLayer::Reshape(const vector*>& bottom this->num_ = bottom[0]->num(); this->channels_ = bottom[0]->channels(); + InitStatsBatchVars(this->num_); + //Fix: should reshape the top blob with the real size of bottom blob //top[0]->Reshape(this->num_, this->channels_, this->height_, this->width_); #ifdef DEBUG @@ -159,8 +197,9 @@ void MKLDNNBatchNormLayer::InitBatchNorm(const vector*>& bott memory::data_type mpcsn = memory::data_type::f32; // ---- Initialize memory descriptors ------------- - shared_ptr input_md, output_md, scaleshift_md; - shared_ptr usr_mpd, prv_mpd, scaleshift_mpd; + shared_ptr input_md, input_stats_md, output_md, scaleshift_md; + shared_ptr usr_mpd, prv_mpd; + shared_ptr scaleshift_mpd; if (bottom_data_is_prv) { shared_ptr > mem_descr = get_mkldnn_prv_descriptor(bottom[0]); @@ -172,9 +211,13 @@ void MKLDNNBatchNormLayer::InitBatchNorm(const vector*>& bott usr_mpd.reset(new memory::primitive_desc(*input_md, cpu_engine)); } output_md = input_md; + input_stats_md.reset(new memory::desc(*input_md)); + CHECK(input_stats_md->data.ndims > 0 && + input_stats_md->data.dims[0] == this->num_); + input_stats_md->data.dims[0] = stats_batch_size_; // ---- Initialize BatchNorm primitive descriptor ------------- - batch_normalization_forward::desc BatchNormFwd_desc(propagation, *input_md, eps_, flags); + batch_normalization_forward::desc BatchNormFwd_desc(propagation, *input_stats_md, eps_, flags); // ---- Determining engine to use ----------------------- std::string subengines = this->layer_param_.engine(); if (subengines == "" || subengines == "MKLDNN") @@ -196,7 +239,7 @@ void MKLDNNBatchNormLayer::InitBatchNorm(const vector*>& bott // ---- Create memory --------------------- if (use_weight_bias_) { - scaleshift_memory.reset(new memory(BatchNormFwd_pd->weights_primitive_desc())); + scaleshift_memory.reset(new memory(BatchNormFwd_pd->weights_primitive_desc(), this->scaleshift_blob_->mutable_cpu_data())); } // --- init primitive and prv_memory 
descriptors ---------------------- @@ -206,44 +249,13 @@ void MKLDNNBatchNormLayer::InitBatchNorm(const vector*>& bott fwd_top_data.reset(new MKLDNNData(usr_mpd, prv_mpd, top[0], this)); output_memory = fwd_top_data->create_output_memory(); - // ---- Create BatchNorm -------------------- - if (this->phase_ == TEST && !use_global_stats_) { - if (use_weight_bias_) { - BatchNormFwd.reset(new batch_normalization_forward(*BatchNormFwd_pd, - *input_primitive, *scaleshift_memory, *output_memory)); - } else { - BatchNormFwd.reset(new batch_normalization_forward(*BatchNormFwd_pd, - *input_primitive, *output_memory)); - } - } else { - mean_memory.reset(new memory(BatchNormFwd_pd->mean_primitive_desc())); - variance_memory.reset(new memory(BatchNormFwd_pd->variance_primitive_desc())); - - if (use_global_stats_) { - caffe_copy(this->channels_, this->blobs_[0]->cpu_data(), - static_cast(mean_memory->get_data_handle())); - caffe_copy(this->channels_, this->blobs_[1]->cpu_data(), - static_cast(variance_memory->get_data_handle())); - if (use_weight_bias_) { - BatchNormFwd.reset(new batch_normalization_forward(*BatchNormFwd_pd, - *input_primitive, (const primitive::at)*mean_memory, - (const primitive::at)*variance_memory, *scaleshift_memory, - *output_memory)); - } else { - BatchNormFwd.reset(new batch_normalization_forward(*BatchNormFwd_pd, - *input_primitive, (const primitive::at)*mean_memory, - (const primitive::at)*variance_memory, *output_memory)); - } - } else { - if (use_weight_bias_) { - BatchNormFwd.reset(new batch_normalization_forward(*BatchNormFwd_pd, - *input_primitive, *scaleshift_memory, *output_memory, - *mean_memory, *variance_memory)); - } else { - BatchNormFwd.reset(new batch_normalization_forward(*BatchNormFwd_pd, - *input_primitive, *output_memory, *mean_memory, *variance_memory)); - } - } + mean_memory.resize(num_stats_batches_); + variance_memory.resize(num_stats_batches_); + input_stats.resize(num_stats_batches_); + output_stats.resize(num_stats_batches_); + 
BatchNormFwd.resize(num_stats_batches_); + for (int i = 0; i < num_stats_batches_; i++) { + InitBatchNormFwdPrimitive(i); } //fwd_bottom_data->set_mkldnn_primitive(BatchNormFwd); //Wrong passed primitive! (TODO: Checking!) @@ -272,6 +284,70 @@ void MKLDNNBatchNormLayer::InitBatchNorm(const vector*>& bott } } +template +template +shared_ptr MKLDNNBatchNormLayer::GetStatsBatchMemory( + shared_ptr > mkldnn_mem, int idx) { + long data_offset = + idx * stats_batch_size_ * this->channels_ * this->width_ * this->height_; + engine cpu_engine = CpuEngine::Instance().get_engine(); + shared_ptr stats_md = mkldnn_mem->get_memory_desc(); + CHECK(stats_md->data.ndims > 0 && + stats_md->data.dims[0] == this->num_); + stats_md->data.dims[0] = stats_batch_size_; + shared_ptr stats_mpd( + new memory::primitive_desc(*stats_md, cpu_engine)); + shared_ptr stats( + new memory(*stats_mpd, mkldnn_mem->get_memory_ptr(data_offset))); + return stats; +} + +template +void MKLDNNBatchNormLayer::InitBatchNormFwdPrimitive(int idx) { + input_stats[idx] = GetStatsBatchMemory(fwd_bottom_data, idx); + output_stats[idx] = GetStatsBatchMemory(fwd_top_data, idx); + + // ---- Create BatchNorm -------------------- + if (this->phase_ == TEST && !use_global_stats_) { + if (use_weight_bias_) { + BatchNormFwd[idx].reset(new batch_normalization_forward(*BatchNormFwd_pd, + *input_stats[idx], *scaleshift_memory, + *output_stats[idx])); + } else { + BatchNormFwd[idx].reset(new batch_normalization_forward(*BatchNormFwd_pd, + *input_stats[idx], *output_stats[idx])); + } + } else { + mean_memory[idx].reset(new memory(BatchNormFwd_pd->mean_primitive_desc())); + variance_memory[idx].reset(new memory(BatchNormFwd_pd->variance_primitive_desc())); + + if (use_global_stats_) { + caffe_copy(this->channels_, this->blobs_[0]->cpu_data(), + static_cast(mean_memory[idx]->get_data_handle())); + caffe_copy(this->channels_, this->blobs_[1]->cpu_data(), + static_cast(variance_memory[idx]->get_data_handle())); + if 
(use_weight_bias_) { + BatchNormFwd[idx].reset(new batch_normalization_forward(*BatchNormFwd_pd, + *input_stats[idx], (const primitive::at)*mean_memory[idx], + (const primitive::at)*variance_memory[idx], *scaleshift_memory, + *output_stats[idx])); + } else { + BatchNormFwd[idx].reset(new batch_normalization_forward(*BatchNormFwd_pd, + *input_stats[idx], (const primitive::at)*mean_memory[idx], + (const primitive::at)*variance_memory[idx], *output_stats[idx])); + } + } else { + if (use_weight_bias_) { + BatchNormFwd[idx].reset(new batch_normalization_forward(*BatchNormFwd_pd, + *input_stats[idx], *scaleshift_memory, *output_stats[idx], + *mean_memory[idx], *variance_memory[idx])); + } else { + BatchNormFwd[idx].reset(new batch_normalization_forward(*BatchNormFwd_pd, + *input_stats[idx], *output_stats[idx], *mean_memory[idx], *variance_memory[idx])); + } + } + } +} template void MKLDNNBatchNormLayer::Forward_cpu(const vector*>& bottom @@ -289,49 +365,40 @@ void MKLDNNBatchNormLayer::Forward_cpu(const vector*>& bottom // update top that head at prv fwd_top_data->sync_before_write(); - if (use_global_stats_) { + for (int stats_batch_idx = 0; stats_batch_idx < num_stats_batches_; stats_batch_idx++) { + if (use_global_stats_) { // use the stored mean/variance estimates. const Dtype scale_factor = this->blobs_[2]->cpu_data()[0] == 0 ? 
0 : 1 / this->blobs_[2]->cpu_data()[0]; - Dtype *mean_buffer_ = (Dtype *)(mean_memory->get_data_handle()); - Dtype *variance_buffer_ = (Dtype *)(variance_memory->get_data_handle()); + Dtype *mean_buffer_ = (Dtype *)(mean_memory[stats_batch_idx]->get_data_handle()); + Dtype *variance_buffer_ = (Dtype *)(variance_memory[stats_batch_idx]->get_data_handle()); //TODO: optimize, do this operation in the InitBatchNorm, so no need to calculate each time caffe_cpu_scale(this->blobs_[0]->count(), scale_factor, this->blobs_[0]->cpu_data(), mean_buffer_); caffe_cpu_scale(this->blobs_[1]->count(), scale_factor, this->blobs_[1]->cpu_data(), variance_buffer_); - } - if (use_weight_bias_) { - Dtype* scaleShift_buffer_ = (Dtype *)(scaleshift_memory->get_data_handle()); - // Fill ScaleShift buffer - for (int i = 0; i < this->channels_; i++) { - scaleShift_buffer_[i] = this->blobs_[3]->cpu_data()[i]; - scaleShift_buffer_[channels_ + i] = 0; - if (bias_term_) { - scaleShift_buffer_[channels_ + i] = this->blobs_[4]->cpu_data()[i]; - } - } - } - - PERFORMANCE_EVENT_ID_INIT(perf_id_fw_, PERFORMANCE_MKLDNN_NAME("FW")); - PERFORMANCE_MEASUREMENT_BEGIN(); - BatchNormFwd.submit(); - PERFORMANCE_MEASUREMENT_END_ID(perf_id_fw_); + } + + PERFORMANCE_EVENT_ID_INIT(perf_id_fw_, PERFORMANCE_MKLDNN_NAME("FW")); + PERFORMANCE_MEASUREMENT_BEGIN(); + BatchNormFwd[stats_batch_idx].submit(); + PERFORMANCE_MEASUREMENT_END_ID(perf_id_fw_); - if (this->phase_ == TRAIN && !use_global_stats_) { + if (this->phase_ == TRAIN && !use_global_stats_) { // compute and save moving average - Dtype *mean_buffer_ = (Dtype *)(mean_memory->get_data_handle()); - Dtype *variance_buffer_ = (Dtype *)(variance_memory->get_data_handle()); + Dtype *mean_buffer_ = (Dtype *)(mean_memory[stats_batch_idx]->get_data_handle()); + Dtype *variance_buffer_ = (Dtype *)(variance_memory[stats_batch_idx]->get_data_handle()); this->blobs_[2]->mutable_cpu_data()[0] *= moving_average_fraction_; this->blobs_[2]->mutable_cpu_data()[0] += 1; 
caffe_cpu_axpby(this->channels_, Dtype(1), mean_buffer_, moving_average_fraction_, this->blobs_[0]->mutable_cpu_data()); - int m = bottom[0]->count()/channels_; + int m = bottom[0]->count()/num_stats_batches_/channels_; Dtype bias_correction_factor = m > 1 ? Dtype(m)/(m-1) : 1; caffe_cpu_axpby(this->channels_, bias_correction_factor, variance_buffer_, moving_average_fraction_, this->blobs_[1]->mutable_cpu_data()); + } } } @@ -359,7 +426,7 @@ void MKLDNNBatchNormLayer::InitBatchNormBwd( memory::data_type mpcsn = memory::data_type::f32; // ---- Initialize memory descriptors ------------- - shared_ptr top_diff_md, top_data_md; + shared_ptr top_diff_md, top_diff_stats_md, top_data_md, output_stats_md; shared_ptr usr_diff_mpd(NULL), prv_diff_mpd(NULL); if (top_diff_is_prv) { shared_ptr > mem_descr @@ -371,10 +438,18 @@ void MKLDNNBatchNormLayer::InitBatchNormBwd( top_diff_md.reset(new memory::desc({{n, c, h, w}}, mpcsn, memory::format::nchw)); //MKLDNN batch norm only support 4D memory descriptor! 
usr_diff_mpd.reset(new memory::primitive_desc(*top_diff_md, cpu_engine)); } + top_diff_stats_md.reset(new memory::desc(*top_diff_md)); + CHECK(top_diff_stats_md->data.ndims > 0 && + top_diff_stats_md->data.dims[0] == this->num_); + top_diff_stats_md->data.dims[0] = stats_batch_size_; + output_stats_md.reset(new memory::desc(output_memory->get_primitive_desc().desc())); + CHECK(output_stats_md->data.ndims > 0 && + output_stats_md->data.dims[0] == this->num_); + output_stats_md->data.dims[0] = stats_batch_size_; // ---- Initialize bnrm primitive descriptor ------------- batch_normalization_backward::desc BatchNormBwd_desc(prop_kind::backward, - *top_diff_md, output_memory->get_primitive_desc().desc(), eps_, + *top_diff_stats_md, *output_stats_md, eps_, flags); // ---- Determining engine to use ----------------------- std::string subengines = this->layer_param_.engine(); @@ -396,6 +471,11 @@ void MKLDNNBatchNormLayer::InitBatchNormBwd( CHECK(BatchNormBwd_pd); + if (use_weight_bias_) { + bwd_scaleshift_diff_memory.reset(new memory( + BatchNormFwd_pd->weights_primitive_desc(), this->scaleshift_blob_->mutable_cpu_diff())); + } + // --- init primitive and prv_memory descriptors ---------------------- bwd_top_diff.reset(new MKLDNNDiff(usr_diff_mpd, prv_diff_mpd, top[0], this)); bwd_top_diff->name = "bwd_top_diff_data @ " + this->layer_param_.name(); @@ -405,17 +485,11 @@ void MKLDNNBatchNormLayer::InitBatchNormBwd( bwd_bottom_diff->name = "bwd_bottom_diff_data @ " + this->layer_param_.name(); bwd_bottom_diff_memory = bwd_bottom_diff->create_output_memory(inplace); - if (use_weight_bias_) { - bwd_scaleshift_diff_memory.reset(new memory( - BatchNormFwd_pd->weights_primitive_desc())); - BatchNormBwd.reset(new batch_normalization_backward(*BatchNormBwd_pd, - *input_primitive, *mean_memory, *variance_memory, - *bwd_top_diff_primitive, *scaleshift_memory, - *bwd_bottom_diff_memory, *bwd_scaleshift_diff_memory)); - } else { - BatchNormBwd.reset(new 
batch_normalization_backward(*BatchNormBwd_pd, - *input_primitive, *mean_memory, *variance_memory, - *bwd_top_diff_primitive, *bwd_bottom_diff_memory)); + top_diff_stats.resize(num_stats_batches_); + bottom_diff_stats.resize(num_stats_batches_); + BatchNormBwd.resize(num_stats_batches_); + for (int i = 0; i < num_stats_batches_; i++) { + InitBatchNormBwdPrimitive(i); } //bwd_top_diff->set_mkldnn_primitive(BatchNormBwd); //Wrong passed primitive! (TODO: Checking!) @@ -427,6 +501,23 @@ void MKLDNNBatchNormLayer::InitBatchNormBwd( bwd_bottom_diff->set_mkldnn_primitive(bwd_bottom_diff_memory_transfer); } +template +void MKLDNNBatchNormLayer::InitBatchNormBwdPrimitive(int idx) { + top_diff_stats[idx] = GetStatsBatchMemory(bwd_top_diff, idx); + bottom_diff_stats[idx] = GetStatsBatchMemory(bwd_bottom_diff, idx); + + if (use_weight_bias_) { + BatchNormBwd[idx].reset(new batch_normalization_backward(*BatchNormBwd_pd, + *input_stats[idx], *mean_memory[idx], *variance_memory[idx], + *top_diff_stats[idx], *scaleshift_memory, + *bottom_diff_stats[idx], *bwd_scaleshift_diff_memory)); + } else { + BatchNormBwd[idx].reset(new batch_normalization_backward(*BatchNormBwd_pd, + *input_stats[idx], *mean_memory[idx], *variance_memory[idx], + *top_diff_stats[idx], *bottom_diff_stats[idx])); + } +} + template void MKLDNNBatchNormLayer::Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) @@ -443,53 +534,50 @@ void MKLDNNBatchNormLayer::Backward_cpu(const vector*>& top, // update bottom that head at prv bwd_bottom_diff->sync_before_write(); - PERFORMANCE_EVENT_ID_INIT(perf_id_bw_, PERFORMANCE_MKLDNN_NAME("BW")); - PERFORMANCE_MEASUREMENT_BEGIN(); + for (int stats_batch_idx = 0; stats_batch_idx < num_stats_batches_; stats_batch_idx++) { + + PERFORMANCE_EVENT_ID_INIT(perf_id_bw_, PERFORMANCE_MKLDNN_NAME("BW")); + PERFORMANCE_MEASUREMENT_BEGIN(); #ifdef DEBUG - if (bottom[0]->prv_data() != NULL) - { + if (bottom[0]->prv_data() != NULL) + { LOG(INFO) << 
"Debug: Bottom prv data: " << *bottom[0]->prv_data(); - } - else - { + } + else + { LOG(INFO) << "Debug: Bottom prv data is NULL!"; - } - - if (top[0]->prv_diff() != NULL) - { + } + + if (top[0]->prv_diff() != NULL) + { LOG(INFO) << "Debug: Top prv diff: " << *top[0]->prv_diff(); - } - else - { + } + else + { LOG(INFO) << "Debug: Top prv diff is NULL!"; LOG(INFO) << "Debug: Top cpu diff: " << *top[0]->cpu_diff(); - } + } #endif - BatchNormBwd.submit(); + BatchNormBwd[stats_batch_idx].submit(); #ifdef DEBUG - if (bottom[0]->prv_diff() != NULL) - { + if (bottom[0]->prv_diff() != NULL) + { LOG(INFO) << "Debug: Bottom prv diff: " << *bottom[0]->prv_diff(); - } - else - { + } + else + { LOG(INFO) << "Debug: Bottom prv diff is NULL!"; LOG(INFO) << "Debug: Bottom cpu diff: " << *bottom[0]->cpu_diff(); - } + } #endif - PERFORMANCE_MEASUREMENT_END_ID(perf_id_bw_); - - /* FIXME: this wouldn't work with lazy stream */ - if (use_weight_bias_) { - Dtype* dw = (Dtype *)(bwd_scaleshift_diff_memory->get_data_handle()); - for (int i = 0; i < this->channels_; i++) - this->blobs_[3]->mutable_cpu_diff()[i] = dw[i]; - - if (bias_term_) { - dw += channels_; - for (int i = 0; i < this->channels_; i++) - this->blobs_[4]->mutable_cpu_diff()[i] = dw[i]; - } + PERFORMANCE_MEASUREMENT_END_ID(perf_id_bw_); + if (num_stats_batches_ > 1) { + CHECK(scaleshift_blob_ != scaleshift_acc_); + CHECK(scaleshift_blob_->count() == scaleshift_acc_->count()); + caffe_cpu_axpby(scaleshift_acc_->count(), Dtype(1), + scaleshift_blob_->mutable_cpu_diff(), + Dtype(1), scaleshift_acc_->mutable_cpu_diff()); + } } } diff --git a/src/caffe/layers/mkldnn_split_layer.cpp b/src/caffe/layers/mkldnn_split_layer.cpp index ab2c5156a..12359c141 100644 --- a/src/caffe/layers/mkldnn_split_layer.cpp +++ b/src/caffe/layers/mkldnn_split_layer.cpp @@ -94,10 +94,15 @@ void MKLDNNSplitLayer::InitSplitBwd(const vector*>& bottom, // Dimensions of bottom and top blobs. 
There is a number of // top blobs each of the same size as the bottom one - memory::dims bottom_tz = {static_cast(this->sizes_src_[0]), - static_cast(this->sizes_src_[1]), - static_cast(this->sizes_src_[2]), - static_cast(this->sizes_src_[3])}; + memory::dims bottom_tz; + bottom_tz.resize(4); + for(int i=0; i<4; i++) { + if(i < this->sizes_src_.size()) { + bottom_tz[i] = static_cast(this->sizes_src_[i]); + } else { + bottom_tz[i] = 1; + } + } shared_ptr prv_diff_dst_mpd; shared_ptr usr_diff_dst_mpd( diff --git a/src/caffe/mkldnn_memory.cpp b/src/caffe/mkldnn_memory.cpp index bacb6ae61..c53cff7ff 100644 --- a/src/caffe/mkldnn_memory.cpp +++ b/src/caffe/mkldnn_memory.cpp @@ -212,8 +212,7 @@ void MKLDNNMemoryDescriptor::convert_from_extprv(shared_ptr_reorder_extprv2prv_pd == NULL) return; - if (this->_extprv_memory_pd->desc().data.format == this->_prv_memory_pd->desc().data.format && - this->_extprv_memory_pd->desc().data.data_type == this->_prv_memory_pd->desc().data.data_type) + if (*this->_extprv_memory_pd == *this->_prv_memory_pd) { #ifdef DEBUG LOG(INFO) << "The format and data_type of _extprv_memory_pd and _prv_memory_pd is same, no need do conversion."; @@ -453,6 +452,32 @@ shared_ptr MKLDNNMemoryDescriptor::create_output_memory( return omem; } +template +Dtype* MKLDNNMemoryDescriptor::get_memory_ptr(long offset) { + if (this->conversion_needed()) { + // TODO: support DFP16 offset + if (this->prv_ptr() != NULL) return (Dtype*)this->prv_ptr() + offset; + // when _internal_ptr is null, having same private layout as _blob + else return is_diff ? + (Dtype*)this->_blob->prv_diff() + offset : + (Dtype*)this->_blob->prv_data() + offset; + } else { + return const_cast( + is_diff ? 
this->_blob->cpu_diff() + offset : this->_blob->cpu_data() + offset); + } +} + +template +shared_ptr MKLDNNMemoryDescriptor::get_memory_desc() { + shared_ptr desc; + if (this->conversion_needed()) { + desc.reset(new memory::desc(this->prv_memory_pd()->desc())); + } else { + desc.reset(new memory::desc(this->usr_memory_pd()->desc())); + } + return desc; +} + template shared_ptr > get_mkldnn_prv_descriptor(Blob* blob) { diff --git a/src/caffe/multinode/multi_solver.cpp b/src/caffe/multinode/multi_solver.cpp index 13ad8da2b..59eec8c7c 100644 --- a/src/caffe/multinode/multi_solver.cpp +++ b/src/caffe/multinode/multi_solver.cpp @@ -105,12 +105,15 @@ Dtype MultiSolver::ForwardBackwardImpl(bool first, bool last) { for (int i = 0; i < layers.size(); ++i) { #ifdef FW_OVERLAP_OPT - if (first && IsSkipWaitGradient(i) == false) { + if (first) { while (layer_finished_flags_[i] == false) { + if (IsSkipWaitGradient(i)) + break; WaitAndUpdateGradient(i); if (layer_finished_flags_[i]) break; + // wait and update gradient for next layers for (int k=i+1; k::ForwardBackwardImpl(bool first, bool last) { break; } } + layer_finished_flags_[i] = false; } #endif @@ -129,6 +133,11 @@ Dtype MultiSolver::ForwardBackwardImpl(bool first, bool last) { LAYER_TIMING_STOP(forward, i); } + // Clear parameter diffs after communication is finished (that is, after + // calling WaitGradientComm) + if (first) + root_solver_->net()->ClearParamDiffs(); + for (int i = layers.size() - 1; i >= 0; --i) { if (!layer_need_backward[i]) { continue; @@ -160,6 +169,10 @@ Dtype MultiSolver::ForwardBackwardImpl(bool first, bool last) { if (last) { #endif for (int i = 0; i < layers.size(); ++i) { +#ifdef FW_OVERLAP_OPT + if (layer_finished_flags_[i]) + continue; +#endif if (IsSkipWaitGradient(i)) { #ifdef FW_OVERLAP_OPT finished_count++; @@ -167,10 +180,6 @@ Dtype MultiSolver::ForwardBackwardImpl(bool first, bool last) { #endif continue; } -#ifdef FW_OVERLAP_OPT - if (layer_finished_flags_[i]) - continue; -#endif 
WaitAndUpdateGradient(i); #ifdef FW_OVERLAP_OPT @@ -190,7 +199,6 @@ Dtype MultiSolver::ForwardBackwardImpl(bool first, bool last) { template Dtype MultiSolver::ForwardBackward() { Dtype loss = 0; - root_solver_->net()->ClearParamDiffs(); for (int i = 0; i < iter_size; ++i) { loss += ForwardBackwardImpl( (i == 0), (i + 1 == iter_size)); diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index 0a8aeb981..9fda127c6 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -62,6 +62,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "caffe/multinode/mlsl.hpp" #include "caffe/multinode/apply_mn_param.hpp" #include "caffe/util/remove_batch_norm.hpp" +#include "caffe/util/apply_bn_stats_batch_size.hpp" PERFORMANCE_CREATE_MONITOR(); @@ -147,6 +148,12 @@ void Net::Init(const NetParameter& in_param) { this->kept_bn_layers_.push_back(param.compile_net_state().kept_bn_layers(idx)); } + NetParameter param_with_stats_batch_size; + if (param.has_bn_stats_batch_size()) { + ApplyBnStatsBatchSize(param, ¶m_with_stats_batch_size); + param = param_with_stats_batch_size; + } + #ifdef USE_MLSL NetParameter param_with_mn; if (mn::is_multinode()) { @@ -628,13 +635,24 @@ void Net::CompilationRuleTwo(const NetParameter& param, // Note: Currently merging of convolution and relu layers is feasible // If current layer is Convolution of MKLDNN engine.. 
if ((layer_param->type().compare("Convolution") == 0) && - ((layer_param->convolution_param().engine() == ConvolutionParameter_Engine_MKLDNN) - || (((layer_param->convolution_param().engine() == ConvolutionParameter_Engine_DEFAULT) && - (param.engine().compare(0, 6, "MKLDNN") == 0 - && param.engine().find(":DLA", 6) == string::npos)) || - (param.engine() == "" && - layer_param->engine().compare(0, 6, "MKLDNN") == 0 && - layer_param->engine().find(":DLA", 6) == string::npos)))) { + ((layer_param->convolution_param().engine() == ConvolutionParameter_Engine_MKLDNN) || + ((layer_param->convolution_param().engine() == ConvolutionParameter_Engine_DEFAULT) && + (layer_param->engine().compare(0, 6, "MKLDNN") == 0) && + (layer_param->engine().find(":DLA", 6) == string::npos)) || + ((layer_param->convolution_param().engine() == ConvolutionParameter_Engine_DEFAULT) && + (layer_param->engine() == "") && + (param.engine().compare(0, 6, "MKLDNN") == 0 && + param.engine().find(":DLA", 6) == string::npos)))) { + // check if Dialation is larger than 1. if yes, don't fuse the following Relu layer with this conv layer + // as MKLDNN doesn't support dilation convolution yet. + bool dilation = false; + for (int i = 0; i < layer_param->convolution_param().dilation_size(); ++i) { + if (layer_param->convolution_param().dilation(i) > 1) { + dilation = true; + break; + } + } + std::vector consumer_layer_params; GetBlobConsumers(consumer_layer_params, layer_param->top(0), param, i+1 < param.layer_size() ? 
i+1 : i); @@ -644,14 +662,16 @@ void Net::CompilationRuleTwo(const NetParameter& param, // Consumer layer of blob produced by Conv // has to be ReLU layer with one Input Blob - if ((consumer_layer_param.type().compare("ReLU") == 0) && - ((consumer_layer_param.relu_param().engine() == ReLUParameter_Engine_MKLDNN) - || (((consumer_layer_param.relu_param().engine() == ReLUParameter_Engine_DEFAULT) && - (param.engine().compare(0, 6, "MKLDNN") == 0 - && param.engine().find(":DLA", 6) == string::npos)) || - (param.engine() == "" && - layer_param->engine().compare(0, 6, "MKLDNN") == 0 && - layer_param->engine().find(":DLA", 6) == string::npos)))) { + if (!dilation && + (consumer_layer_param.type().compare("ReLU") == 0) && + ((consumer_layer_param.relu_param().engine() == ReLUParameter_Engine_MKLDNN) || + ((consumer_layer_param.relu_param().engine() == ReLUParameter_Engine_DEFAULT) && + (consumer_layer_param.engine().compare(0, 6, "MKLDNN") == 0 && + consumer_layer_param.engine().find(":DLA", 6) == string::npos)) || + ((consumer_layer_param.relu_param().engine() == ReLUParameter_Engine_DEFAULT) && + (consumer_layer_param.engine() == "") && + (param.engine().compare(0, 6, "MKLDNN") == 0 && + param.engine().find(":DLA", 6) == string::npos)))) { string& convolution_top_blob_name = const_cast(layer_param->top(0)); @@ -715,11 +735,12 @@ void Net::CompilationRuleThree(const NetParameter& param, // If current layer is BatchNorm of MKL2017 engine.. 
if (((layer_param->type().compare("BatchNorm") == 0) && ((layer_param->batch_norm_param().engine() == - BatchNormParameter_Engine_MKL2017) + BatchNormParameter_Engine_MKL2017 || layer_param->batch_norm_param().engine() == + BatchNormParameter_Engine_MKLDNN) || ((layer_param->batch_norm_param().engine() == BatchNormParameter_Engine_DEFAULT) && - param.engine().compare("MKL2017") == 0))) && - (layer_param->top(0) == layer_param->bottom(0) )) { + (param.engine().compare("MKL2017") == 0 || param.engine().compare("MKLDNN") == 0)))) && + (layer_param->top(0) == layer_param->bottom(0))) { std::string& batch_norm_top = const_cast(layer_param->top(0)); std::vector consumer_layer_params; GetBlobConsumers(consumer_layer_params, diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index eaf9b6e6b..cd6cb761f 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -208,6 +208,9 @@ message NetParameter { optional string engine = 9 [default = ""]; + // Batch size used for BatchNorm statistics, 0 would use the batch size of bottom blob + optional uint32 bn_stats_batch_size = 11 [default = 0]; + // The layers that make up the net. Each of their configurations, including // connectivity and behavior, is specified as a LayerParameter. repeated LayerParameter layer = 100; // ID 100 so layers are printed last. 
@@ -900,6 +903,8 @@ message BatchNormParameter { optional bool bias_term = 6 [default = true]; // whether to have bias terms optional FillerParameter filler = 7; // The filler for the weight optional FillerParameter bias_filler = 8; // The filler for the bias + // Batch size used for statistics, 0 would use the batch size of bottom blob + optional uint32 stats_batch_size = 9 [default = 0]; } message SplitParameter { diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index 3c8d1e66b..f7e7ac1cd 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -321,7 +321,12 @@ void Solver::Step(int iters) { const string& output_name = net_->blob_names()[net_->output_blob_indices()[j]]; const Dtype loss_weight = - net_->blob_loss_weights()[net_->output_blob_indices()[j]]; + net_->blob_loss_weights()[net_->output_blob_indices()[j]] +#ifdef USE_MLSL + * mn::get_distrib()->get_data_parts() +#endif + ; + for (int k = 0; k < result[j]->count(); ++k) { ostringstream loss_msg_stream; if (loss_weight) { @@ -928,6 +933,10 @@ void Solver::Restore(const char* state_file) { template void Solver::UpdateSmoothedLoss(Dtype loss, int start_iter, int average_loss) { +#ifdef USE_MLSL + loss *= mn::get_distrib()->get_data_parts(); +#endif + if (losses_.size() < average_loss) { losses_.push_back(loss); int size = losses_.size(); diff --git a/src/caffe/solvers/sgd_solver.cpp b/src/caffe/solvers/sgd_solver.cpp index 264ac954f..5347dcdf7 100644 --- a/src/caffe/solvers/sgd_solver.cpp +++ b/src/caffe/solvers/sgd_solver.cpp @@ -43,6 +43,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "caffe/util/io.hpp" #include "caffe/util/upgrade_proto.hpp" + namespace caffe { template <typename Dtype> Dtype SGDSolver<Dtype>::GetWarmUpLR(int cur_iter, int warmup_iter, Dtype warmup_start_lr) {
@@ -208,13 +209,27 @@ void SGDSolver<Dtype>::ApplyUpdate(int param_id) {
     return;
   }

+#ifdef ENABLE_SGD_FUSION
+  if (Caffe::mode() == Caffe::CPU && this->type() == string("SGD"))
+  {
+    // SGDFusion hard-codes the momentum-SGD rule; solvers derived from
+    // SGDSolver must not take it. NOTE(review): assumes type() is "SGD" here.
+    SGDFusion(param_id, rate);
+    return;
+  }
+#endif /* ENABLE_SGD_FUSION */
+
+  //LOG(INFO) << "No Fusion: Param_id: " << param_id;
   Normalize(param_id);
+
   LOG_PARAM_BLOB(this->net_->learnable_params()[param_id], diff, param_id,
                  "ApplyUpdate: delwt after Normalize:");
   Regularize(param_id);
+
   LOG_PARAM_BLOB(this->net_->learnable_params()[param_id], diff, param_id,
                  "ApplyUpdate: delwt after Regularize:");
   ComputeUpdateValue(param_id, rate);
+
   LOG_PARAM_BLOB(this->net_->learnable_params()[param_id], diff, param_id, "ApplyUpdate: wtinc:");
   LOG_PARAM_BLOB(this->net_->learnable_params()[param_id], data, param_id,
                  "ApplyUpdate: weight before update:");
@@ -224,25 +239,174 @@ void SGDSolver<Dtype>::ApplyUpdate(int param_id) {
   LOG_PARAM_BLOB(this->net_->learnable_params()[param_id], data, param_id,
                  "ApplyUpdate: weight after update:");
 }
+#ifdef ENABLE_SGD_FUSION
+// Fused element-wise kernels for the CPU SGD update. Each makes one pass over
+// the parameter instead of chaining separate axpy / axpby / copy calls.
+
+// Fused Regularize + ComputeUpdateValue. For every i in [0, count):
+//   v = rate * (decay * net_params_data[i] + net_params_diff[i])
+//       + momentum * history_data[i]
+// and v is written to both history_data[i] and net_params_diff[i].
+// SGDFusion uses it for L1 regularization, passing sign(weight) as
+// net_params_data. One template serves float and double alike.
+template <typename Dtype>
+void axpy_axpby_copy(size_t count, const Dtype decay, const Dtype* net_params_data, Dtype *net_params_diff,
+                     const Dtype rate, const Dtype momentum, Dtype* history_data)
+{
+#ifdef _OPENMP
+//#pragma omp parallel for simd schedule(static)  //Does not work with GCC 4.8
+#pragma omp parallel for schedule(static)
+#pragma simd
+#endif
+  for (size_t i = 0; i < count; ++i) {
+    // Declared inside the loop so each OpenMP thread has its own copy; a
+    // function-scope temporary would be shared across threads and race.
+    const Dtype temp_result = rate * (decay * net_params_data[i] + net_params_diff[i]) + momentum * history_data[i];
+    history_data[i] = temp_result;
+    net_params_diff[i] = temp_result;
+  }
+}
+
+// Fused Regularize + ComputeUpdateValue + Update. Computes v as
+// axpy_axpby_copy does, writes it to history_data[i] and net_params_diff[i],
+// then applies net_params_data[i] += update_param * v.
+// SGDFusion uses it for L2 regularization with update_param = -1.
+template <typename Dtype>
+void axpy_axpby_copy_axpy(size_t count, const Dtype decay, Dtype* net_params_data, Dtype *net_params_diff,
+                          const Dtype rate, const Dtype momentum, Dtype* history_data, const Dtype update_param)
+{
+#ifdef _OPENMP
+//#pragma omp parallel for simd schedule(static)  //Does not work with GCC 4.8
+#pragma omp parallel for schedule(static)
+#pragma simd
+#endif
+  for (size_t i = 0; i < count; ++i) {
+    const Dtype temp_result = rate * (decay * net_params_data[i] + net_params_diff[i]) + momentum * history_data[i];
+    // Keep the momentum history in step with the diff for every Dtype.
+    history_data[i] = temp_result;
+    net_params_diff[i] = temp_result;
+    net_params_data[i] = update_param * temp_result + net_params_data[i];
+  }
+}
+
+// Fused CPU replacement for the Normalize -> Regularize ->
+// ComputeUpdateValue -> Update sequence that ApplyUpdate runs for learnable
+// parameter `param_id`; `rate` is the global learning rate. Cases that have
+// no fused kernel go through the unfused stages instead.
+template <typename Dtype>
+void SGDSolver<Dtype>::SGDFusion(int param_id, Dtype rate) {
+  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
+
+  // Regularize inputs.
+  const vector<float>& net_params_weight_decay =
+      this->net_->params_weight_decay();
+  const Dtype weight_decay = this->param_.weight_decay();
+  const string regularization_type = this->param_.regularization_type();
+  const Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
+
+  // ComputeUpdateValue inputs.
+  const vector<float>& net_params_lr = this->net_->params_lr();
+  const Dtype momentum = this->param_.momentum();
+  const Dtype local_rate = rate * net_params_lr[param_id];
+
+  // True when prv_diff() is set and spans the whole blob.
+  const bool prv_diff_condition_flag = net_params[param_id]->prv_diff()
+      && (net_params[param_id]->prv_diff_count()
+          == net_params[param_id]->count());
+
+  // Normalize stage: rescale the accumulated gradient unless iter_size is 1.
+  if (this->param_.iter_size() != 1) {
+    const Dtype accum_normalization = Dtype(1.) / this->param_.iter_size();
+
+    if (prv_diff_condition_flag) {
+      caffe_scal(net_params[param_id]->prv_diff_count(), accum_normalization,
+                 net_params[param_id]->mutable_prv_diff());
+    } else {
+      caffe_scal(net_params[param_id]->count(), accum_normalization,
+                 net_params[param_id]->mutable_cpu_diff());
+    }
+  }
+
+  if (!local_decay) {
+    // No weight decay for this parameter: skip Regularize and run the plain
+    // ComputeUpdateValue stage followed by Update.
+    if (prv_diff_condition_flag) {
+      caffe_cpu_axpby(net_params[param_id]->prv_diff_count(), local_rate,
+                      net_params[param_id]->prv_diff(), momentum,
+                      history_[param_id]->mutable_cpu_data());
+
+      caffe_copy(net_params[param_id]->count(),
+                 history_[param_id]->cpu_data(),
+                 net_params[param_id]->mutable_prv_diff());
+    } else {
+      caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
+                      net_params[param_id]->cpu_diff(), momentum,
+                      history_[param_id]->mutable_cpu_data());
+
+      caffe_copy(net_params[param_id]->count(),
+                 history_[param_id]->cpu_data(),
+                 net_params[param_id]->mutable_cpu_diff());
+    }
+
+    net_params[param_id]->Update();
+    return;
+  }
+
+  if (regularization_type == "L2") {
+    const bool prv_data_condition_flag = net_params[param_id]->prv_data()
+        && (net_params[param_id]->prv_data_count()
+            == net_params[param_id]->count());
+
+    if (prv_data_condition_flag && prv_diff_condition_flag) {
+      // Data and diff are both held in prv buffers: fuse directly on them.
+      CHECK_EQ(true,
+          net_params[param_id]->get_prv_data_descriptor()->layout_compare(
+          net_params[param_id]->get_prv_diff_descriptor()));
+      axpy_axpby_copy_axpy(net_params[param_id]->prv_data_count(), local_decay,
+          net_params[param_id]->mutable_prv_data(), net_params[param_id]->mutable_prv_diff(),
+          local_rate, momentum, history_[param_id]->mutable_cpu_data(), Dtype(-1));
+    } else if (!prv_data_condition_flag && !prv_diff_condition_flag) {
+      axpy_axpby_copy_axpy(net_params[param_id]->count(), local_decay,
+          net_params[param_id]->mutable_cpu_data(), net_params[param_id]->mutable_cpu_diff(),
+          local_rate, momentum, history_[param_id]->mutable_cpu_data(), Dtype(-1));
+    } else {
+      // Only one of data/diff is in a prv buffer, so no fused kernel applies.
+      // Run the unfused stages; skipping Regularize here would silently
+      // drop the weight decay.
+      Regularize(param_id);
+      ComputeUpdateValue(param_id, rate);
+      net_params[param_id]->Update();
+    }
+  } else if (regularization_type == "L1") {
+    caffe_cpu_sign(net_params[param_id]->count(),
+                   net_params[param_id]->cpu_data(),
+                   temp_[param_id]->mutable_cpu_data());
+
+    axpy_axpby_copy(net_params[param_id]->count(), local_decay,
+        temp_[param_id]->cpu_data(), net_params[param_id]->mutable_cpu_diff(),
+        local_rate, momentum, history_[param_id]->mutable_cpu_data());
+
+    // Update is not fused into the L1 kernel.
+    net_params[param_id]->Update();
+  } else {
+    LOG(FATAL) << "Unknown regularization type: " << regularization_type;
+  }
+}
+#endif /* ENABLE_SGD_FUSION */
+

 template <typename Dtype>
 void SGDSolver<Dtype>::Normalize(int param_id) {
-#ifdef USE_MLSL
-  if ((this->param_.iter_size() == 1) && !mn::is_multinode()) {
+  if (this->param_.iter_size() == 1) {
+    //LOG(INFO) << "Normalize stage: Normalize stage is skipped.";
     return;
   }
-#else /* !USE_MLSL */
-  if (this->param_.iter_size() == 1) { return; }
-#endif /* USE_MLSL */

+  //LOG(INFO) << "Normalize stage: Normalize stage is not skipped.";
   // Scale gradient to counterbalance accumulation.
   const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
-
-#ifdef USE_MLSL
-  const Dtype accum_normalization = Dtype(1.) / (this->param_.iter_size() * mn::get_nodes_count());
-#else /* !USE_MLSL */
+
   const Dtype accum_normalization = Dtype(1.) / this->param_.iter_size();
-#endif /* USE_MLSL */

   switch (Caffe::mode()) {
   case Caffe::CPU: {
@@ -250,8 +414,7 @@ void SGDSolver<Dtype>::Normalize(int param_id) {
     if (net_params[param_id]->prv_diff()
         && (net_params[param_id]->prv_diff_count()
             == net_params[param_id]->count())) {
-
-        caffe_scal(net_params[param_id]->count(), accum_normalization,
+        caffe_scal(net_params[param_id]->prv_diff_count(), accum_normalization,
                    net_params[param_id]->mutable_prv_diff());
     }
     else {
@@ -295,7 +458,7 @@ void SGDSolver<Dtype>::Regularize(int param_id) {
             net_params[param_id]->get_prv_data_descriptor()->layout_compare(
             net_params[param_id]->get_prv_diff_descriptor()));

-        caffe_axpy(net_params[param_id]->count(),
+        caffe_axpy(net_params[param_id]->prv_data_count(),
                    local_decay,
                    net_params[param_id]->prv_data(),
                    net_params[param_id]->mutable_prv_diff());
@@ -376,8 +539,7 @@ void SGDSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
     if (net_params[param_id]->prv_diff()
         && (net_params[param_id]->prv_diff_count()
             == net_params[param_id]->count())) {
-
-      caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
+      caffe_cpu_axpby(net_params[param_id]->prv_diff_count(), local_rate,
                       net_params[param_id]->prv_diff(), momentum,
                       history_[param_id]->mutable_cpu_data());

@@ -392,6 +554,12 @@ void SGDSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
caffe_copy(net_params[param_id]->count(), history_[param_id]->cpu_data(), net_params[param_id]->mutable_cpu_diff()); + + if (net_params[param_id]->prv_diff() + && (net_params[param_id]->prv_diff_count() + != net_params[param_id]->count())) { + net_params[param_id]->mutable_prv_diff(); + } } break; } diff --git a/src/caffe/util/apply_bn_stats_batch_size.cpp b/src/caffe/util/apply_bn_stats_batch_size.cpp new file mode 100644 index 000000000..078cf6bc5 --- /dev/null +++ b/src/caffe/util/apply_bn_stats_batch_size.cpp @@ -0,0 +1,57 @@ +/* +All modification made by Intel Corporation: © 2017 Intel Corporation + +All contributions by the University of California: +Copyright (c) 2014, 2015, The Regents of the University of California (Regents) +All rights reserved. + +All other contributions: +Copyright (c) 2014, 2015, the respective contributors +All rights reserved. +For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md + + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <string>
+#include "caffe/common.hpp"
+#include "caffe/util/apply_bn_stats_batch_size.hpp"
+
+namespace caffe {
+// Copies `param` to the output and stamps bn_stats_batch_size on every BatchNorm layer.
+void ApplyBnStatsBatchSize(const NetParameter& param,
+    NetParameter* param_with_stats_batch_size) {
+  CHECK(param.has_bn_stats_batch_size());
+  const unsigned int bn_stats_batch_size = param.bn_stats_batch_size();
+  // CopyFrom already clones every layer, so patch the copies in place.
+  param_with_stats_batch_size->CopyFrom(param);
+  for (int i = 0; i < param_with_stats_batch_size->layer_size(); i++) {
+    LayerParameter *layer_param = param_with_stats_batch_size->mutable_layer(i);
+    if (layer_param->type() == "BatchNorm") {
+      layer_param->mutable_batch_norm_param()->set_stats_batch_size(bn_stats_batch_size);
+    }
+  }
+}
+}  // namespace caffe
diff --git a/tools/caffe.cpp b/tools/caffe.cpp
index 231209127..5d0ea7f49 100644
--- a/tools/caffe.cpp
+++ b/tools/caffe.cpp
@@ -547,6 +547,22 @@ int time() {
   const vector<vector<Blob<float>*> >& top_vecs = caffe_net.top_vecs();
   const vector<vector<bool> >& bottom_need_backward =
       caffe_net.bottom_need_backward();
+
+  // Warm up 5 iterations here, because the first several iteration times
+  // have huge variance in some machines.
+ int warmup_iterations = 5; + for (int j = 0; j < warmup_iterations; ++j) { + for (int i = 0; i < layers.size(); ++i) { + layers[i]->Forward(bottom_vecs[i], top_vecs[i]); + } + if (!FLAGS_forward_only) { + for (int i = layers.size() - 1; i >= 0; --i) { + layers[i]->Backward(top_vecs[i], bottom_need_backward[i], + bottom_vecs[i]); + } + } + } + LOG(INFO) << "*** Benchmark begins ***"; LOG(INFO) << "Testing for " << FLAGS_iterations << " iterations."; Timer total_timer;