diff --git a/Makefile b/Makefile
index fd6e78bc8..f7144b7db 100644
--- a/Makefile
+++ b/Makefile
@@ -80,7 +80,7 @@ ifeq ($(CAFFE_MLSL_SHUFFLE), 1)
 	COMMON_FLAGS += -DCAFFE_MLSL_SHUFFLE
 endif
 
-ifeq ($(FW_OVERLAP_OPT), 1)
+ifneq ($(strip $(FW_OVERLAP_OPT)),0)  # on unless set to 0; strip tolerates trailing blanks
 	COMMON_FLAGS += -DFW_OVERLAP_OPT
 endif
 endif
@@ -547,6 +547,12 @@ LIBRARY_DIRS += $(LIB_BUILD_DIR)
 # Automatic dependency generation (nvcc is handled separately)
 CXXFLAGS += -MMD -MP
 
+# SGD fusion: ENABLE_SGD_FUSION := 1 (see Makefile.config.example) adds
+# -DENABLE_SGD_FUSION; $(strip) tolerates trailing whitespace in the value.
+ifeq ($(strip $(ENABLE_SGD_FUSION)),1)
+        COMMON_FLAGS += -DENABLE_SGD_FUSION
+endif
+
 # Complete build flags.
 COMMON_FLAGS += $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir))
 CXXFLAGS += -std=c++11 -pthread -fPIC $(COMMON_FLAGS) $(WARNINGS)
diff --git a/Makefile.config.example b/Makefile.config.example
index 8bfcc57a3..539a00a67 100644
--- a/Makefile.config.example
+++ b/Makefile.config.example
@@ -170,5 +170,8 @@ DISTRIBUTE_DIR := distribute
 # The ID of the GPU that 'make runtest' will use to run unit tests.
 TEST_GPUID := 0
 
+# Uncomment to enable SGD fusion (builds with -DENABLE_SGD_FUSION)
+# ENABLE_SGD_FUSION := 1
+
 # enable pretty build (comment to see full commands)
 Q ?= @
diff --git a/Makefile.mkldnn b/Makefile.mkldnn
index ec1a70bc5..d113a8923 100644
--- a/Makefile.mkldnn
+++ b/Makefile.mkldnn
@@ -1,5 +1,5 @@
 CAFFE_ROOTDIR := $(shell pwd)
-MKLDNN_ROOTDIR := external/mkldnn
+MKLDNN_ROOTDIR := $(CAFFE_ROOTDIR)/external/mkldnn
 MKLDNN_TMPDIR := $(MKLDNN_ROOTDIR)/tmp
 MKLDNN_SRCDIR := $(MKLDNN_ROOTDIR)/src
 MKLDNN_BUILDDIR := $(MKLDNN_ROOTDIR)/build
@@ -22,7 +22,7 @@ ifneq (,$(findstring ccache,$(CC)))
 endif
 
 MKLDNN_GITHUB := https://github.com/01org/mkl-dnn.git
-MKLDNN_CMAKE_FLAGS += $(MKLDNN_SRCDIR) -DCMAKE_INSTALL_PREFIX=$(CAFFE_ROOTDIR)/$(MKLDNN_INSTALLDIR) -DMKLROOT=${MKL_ROOTDIR} -B$(CAFFE_ROOTDIR)/$(MKLDNN_BUILDDIR) -DCMAKE_CXX_COMPILER="$(MKLDNN_CXX)" -DCMAKE_C_COMPILER="$(MKLDNN_CC)"
+MKLDNN_CMAKE_FLAGS += $(MKLDNN_SRCDIR) -DCMAKE_INSTALL_PREFIX=$(MKLDNN_INSTALLDIR) -DMKLROOT=${MKL_ROOTDIR} -B$(MKLDNN_BUILDDIR) -DCMAKE_CXX_COMPILER="$(MKLDNN_CXX)" -DCMAKE_C_COMPILER="$(MKLDNN_CC)"
 
 ifeq ("$(wildcard $(MKLDNN_INSTALLDIR)/include/mkldnn.hpp)", "")
 mkldnn_download:
@@ -32,8 +32,8 @@ mkldnn_download:
 
 mkldnn_build: mkldnn_download
 	cmake $(MKLDNN_CMAKE_FLAGS)
-	make -C $(CAFFE_ROOTDIR)/$(MKLDNN_BUILDDIR) -j$(shell cat /proc/cpuinfo |grep 'processor'|wc -l)
-	make -C $(CAFFE_ROOTDIR)/$(MKLDNN_BUILDDIR) install
+	$(MAKE) -C $(MKLDNN_BUILDDIR) -j$(shell grep -c 'processor' /proc/cpuinfo)
+	$(MAKE) -C $(MKLDNN_BUILDDIR) install
 else
 mkldnn_download:
 mkldnn_build:
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 67adf4ba7..b8c5577c6 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -122,7 +122,7 @@ if(USE_MLSL)
   if(CAFFE_MLSL_SHUFFLE)
     add_definitions("-DCAFFE_MLSL_SHUFFLE")
   endif()
-  if(FW_OVERLAP_OPT)
+  if(FW_OVERLAP_OPT OR NOT DEFINED FW_OVERLAP_OPT)
     message(STATUS "Forward overlapping optimization is enabled!")
     add_definitions("-DFW_OVERLAP_OPT")
   endif()
diff --git a/examples/cpp_classification/batch_classification.cpp b/examples/cpp_classification/batch_classification.cpp
index 374671baa..8295bf4e5 100644
--- a/examples/cpp_classification/batch_classification.cpp
+++ b/examples/cpp_classification/batch_classification.cpp
@@ -422,6 +422,10 @@ int main(int argc, char** argv) {
         cout<<"Use mean file: "<<FLAGS_mean_file<<endl;
     }
 
+#ifdef USE_MLSL
+    caffe::mn::init(&argc,&argv);
+#endif
+
     Classifier classifier(FLAGS_model, FLAGS_weights, FLAGS_mean_file,
             FLAGS_mean_value, FLAGS_label_file, FLAGS_engine, FLAGS_batch_size);
 
diff --git a/examples/cpp_classification/classification.cpp b/examples/cpp_classification/classification.cpp
index 5b8aa21e4..3d8448033 100644
--- a/examples/cpp_classification/classification.cpp
+++ b/examples/cpp_classification/classification.cpp
@@ -285,6 +285,10 @@ int main(int argc, char** argv) {
     engine = argv[6];
   }
 
+#ifdef USE_MLSL
+  caffe::mn::init(&argc,&argv);
+#endif
+
   Classifier classifier(model_file, trained_file, mean_file, label_file, engine);
 
 
diff --git a/examples/pycaffe/tune_model.py b/examples/pycaffe/tune_model.py
index 8305b081b..628adf9c0 100644
--- a/examples/pycaffe/tune_model.py
+++ b/examples/pycaffe/tune_model.py
@@ -23,7 +23,7 @@ def tuneModelDefinition(model_path, iteration):
     caffe_path = os.path.join(working_dir, "..", "..", "build", "tools", "caffe")
     if not os.path.exists(caffe_path):
         print "Caffe binary does not exist; please build Caffe binary first."
-        sys,exit(1)
+        sys.exit(1)
 
     base_model_name = os.path.basename(model_path)
     model_dir = os.path.dirname(model_path)
diff --git a/include/caffe/blob.hpp b/include/caffe/blob.hpp
index 3295f7ab1..47d0d751c 100644
--- a/include/caffe/blob.hpp
+++ b/include/caffe/blob.hpp
@@ -109,7 +109,7 @@ class Blob {
     return shape_[CanonicalAxisIndex(index)];
   }
   inline int num_axes() const { return shape_.size(); }
-  inline int count() const { return count_; }
+  inline long count() const { return count_; }
 
   /**
    * @brief Compute the volume of a slice; i.e., the product of dimensions
@@ -332,8 +332,8 @@ class Blob {
   shared_ptr<SyncedMemory> shape_data_;
 #endif
   vector<int> shape_;
-  int count_;
-  int capacity_;
+  long count_;
+  long capacity_;
 
   DISABLE_COPY_AND_ASSIGN(Blob);
 };  // class Blob
diff --git a/include/caffe/layer.hpp b/include/caffe/layer.hpp
index 45d65c799..5a95a7730 100644
--- a/include/caffe/layer.hpp
+++ b/include/caffe/layer.hpp
@@ -55,8 +55,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define LOG_BLOB(layer, blob, part, blob_id, description)              \
   do                                                                   \
   {                                                                    \
-      int elems_to_log = std::min(MAX_ELEMS_TO_LOG, blob->count());    \
-      for (int idx = 0; idx < elems_to_log; idx++)                     \
+      long elems_to_log = std::min(static_cast<long>(MAX_ELEMS_TO_LOG), blob->count());    \
+      for (long idx = 0; idx < elems_to_log; idx++)                     \
       {                                                                \
           LOG_LAYER(layer) << description                              \
                            << ", blob_id " << blob_id                  \
@@ -68,8 +68,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define LOG_PARAM_BLOB(blob, part, blob_id, description)               \
   do                                                                   \
   {                                                                    \
-      int elems_to_log = std::min(MAX_ELEMS_TO_LOG, blob->count());    \
-      for (int idx = 0; idx < elems_to_log; idx++)                     \
+      long elems_to_log = std::min(static_cast<long>(MAX_ELEMS_TO_LOG), blob->count());    \
+      for (long idx = 0; idx < elems_to_log; idx++)                     \
       {                                                                \
           DLOG(INFO) << description                                    \
                      << ", blob_id " << blob_id                        \
@@ -521,7 +521,12 @@ class Layer {
       CHECK_EQ(top.size(), num_loss_weights) << "loss_weight must be "
           "unspecified or specified once per top blob.";
       for (int top_id = 0; top_id < top.size(); ++top_id) {
+#ifdef USE_MLSL
+        const Dtype loss_weight = layer_param_.loss_weight(top_id) /
+          GetDistribution().get_data_parts();
+#else
         const Dtype loss_weight = layer_param_.loss_weight(top_id);
+#endif
         if (loss_weight == Dtype(0)) { continue; }
         this->set_loss(top_id, loss_weight);
         const int count = top[top_id]->count();
diff --git a/include/caffe/layers/batch_norm_layer.hpp b/include/caffe/layers/batch_norm_layer.hpp
index e83bab953..c777de30c 100644
--- a/include/caffe/layers/batch_norm_layer.hpp
+++ b/include/caffe/layers/batch_norm_layer.hpp
@@ -117,11 +117,19 @@ class BatchNormLayer : public Layer<Dtype> {
                        const Dtype* data_to_be_replicated,
                        FuncTy op_func);
 
+  void ForwardStatsBatch_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top, int stats_batch_idx);
+  void BackwardStatsBatch_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom,
+      int stats_batch_idx);
+
   Blob<Dtype> mean_, variance_, temp_, x_norm_;
   bool use_global_stats_;
   Dtype moving_average_fraction_;
   int channels_;
   Dtype eps_;
+  int num_stats_batches_;
+  int stats_batch_size_;
 
   // extra temporarary variables is used to carry out sums/broadcasting
   // using BLAS
diff --git a/include/caffe/layers/mkl_layers.hpp b/include/caffe/layers/mkl_layers.hpp
index 0d5d66416..c9806daee 100644
--- a/include/caffe/layers/mkl_layers.hpp
+++ b/include/caffe/layers/mkl_layers.hpp
@@ -481,12 +481,12 @@ class MKLBatchNormLayer : public Layer<Dtype> {
         batchNormFwd(static_cast<dnnPrimitive_t>(NULL)),
         batchNormFwdInference(static_cast<dnnPrimitive_t>(NULL)),
         batchNormBwd(static_cast<dnnPrimitive_t>(NULL)),
-        mean_buffer_(static_cast<Dtype*>(NULL)),
-        variance_buffer_(static_cast<Dtype*>(NULL)),
         scaleShift_buffer_(static_cast<Dtype*>(NULL)),
         diffScaleShift_buffer_(static_cast<Dtype*>(NULL)),
         layout_usr_(static_cast<dnnLayout_t>(NULL)),
-        use_global_stats_(false)
+        use_global_stats_(false),
+        num_stats_batches_(1),
+        stats_batch_size_(0)
       {
         PERFORMANCE_EVENT_ID_RESET(perf_id_fw_);
         PERFORMANCE_EVENT_ID_RESET(perf_id_bw_);
@@ -515,6 +515,12 @@ class MKLBatchNormLayer : public Layer<Dtype> {
   virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
+  void ForwardStatsBatch_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top, int stats_batch_idx);
+  void BackwardStatsBatch_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom,
+      int stats_batch_idx);
+
   void Init(const vector<Blob<Dtype>*>& bottom,
             const vector<Blob<Dtype>*>& top);
 
@@ -534,12 +540,14 @@ class MKLBatchNormLayer : public Layer<Dtype> {
   shared_ptr<MKLDiff<Dtype> > bwd_bottom_diff;
   Blob<Dtype> temp_;
   dnnPrimitive_t batchNormFwd, batchNormFwdInference, batchNormBwd;
-  Dtype *mean_buffer_;
-  Dtype *variance_buffer_;
+  vector<Dtype *> mean_buffers_;
+  vector<Dtype *> variance_buffers_;
   Dtype *scaleShift_buffer_;
   Dtype *diffScaleShift_buffer_;
   dnnLayout_t layout_usr_;
   bool use_global_stats_;
+  int num_stats_batches_;
+  int stats_batch_size_;
 
   PERFORMANCE_EVENT_ID_DECL(perf_id_fw_);
   PERFORMANCE_EVENT_ID_DECL(perf_id_bw_);
diff --git a/include/caffe/layers/mkldnn_layers.hpp b/include/caffe/layers/mkldnn_layers.hpp
index f63301e2a..f7ce1062e 100644
--- a/include/caffe/layers/mkldnn_layers.hpp
+++ b/include/caffe/layers/mkldnn_layers.hpp
@@ -68,7 +68,6 @@ class MKLDNNBatchNormLayer : public MKLDNNLayer<Dtype>, public Layer<Dtype> {
         , fwd_top_data(), fwd_bottom_data()
         , bwd_top_diff(), bwd_bottom_diff()
         , BatchNormFwd_pd(), BatchNormBwd_pd()
-        , mean_memory(), variance_memory()
         , scaleshift_memory(), bwd_scaleshift_diff_memory()
         , output_memory(), bwd_bottom_diff_memory()
         , input_primitive(), bwd_top_diff_primitive()
@@ -96,22 +95,32 @@ class MKLDNNBatchNormLayer : public MKLDNNLayer<Dtype>, public Layer<Dtype> {
     void InitBatchNormBwd(const vector<Blob<Dtype>*>& top,
             const vector<bool>& propagate_down,
             const vector<Blob<Dtype>*>& bottom);
+    void InitBatchNormFwdPrimitive(int stats_batch_idx);
+    void InitBatchNormBwdPrimitive(int stats_batch_idx);
+    template <bool diff> shared_ptr<memory> GetStatsBatchMemory(
+      shared_ptr<MKLDNNMemoryDescriptor<Dtype, diff> > mkldnn_data, int idx);
+    void InitStatsBatchVars(int batch_size);
     shared_ptr<MKLDNNData<Dtype> > fwd_top_data, fwd_bottom_data;
     shared_ptr<MKLDNNDiff<Dtype> > bwd_top_diff, bwd_bottom_diff;
     shared_ptr<batch_normalization_forward::primitive_desc> BatchNormFwd_pd;
     shared_ptr<batch_normalization_backward::primitive_desc> BatchNormBwd_pd;
 
-    MKLDNNPrimitive<Dtype> BatchNormFwd, BatchNormBwd;
-    shared_ptr<memory> mean_memory, variance_memory;
+    vector<MKLDNNPrimitive<Dtype> > BatchNormFwd, BatchNormBwd;
+    vector<shared_ptr<memory> > mean_memory, variance_memory;
 
     shared_ptr<memory> scaleshift_memory, bwd_scaleshift_diff_memory;
     shared_ptr<memory> output_memory, bwd_bottom_diff_memory;
+    vector<shared_ptr<memory> > input_stats, output_stats, top_diff_stats, bottom_diff_stats;
 
     shared_ptr<primitive> input_primitive, bwd_top_diff_primitive;
 
     int32_t num_, width_, height_, channels_;
     Dtype eps_, moving_average_fraction_;
     bool use_weight_bias_, bias_term_, use_global_stats_;
+    int num_stats_batches_;
+    int stats_batch_size_;
+    shared_ptr<Blob<Dtype> > scaleshift_blob_;
+    shared_ptr<Blob<Dtype> > scaleshift_acc_;
 
     PERFORMANCE_EVENT_ID_DECL(perf_id_fw_);
     PERFORMANCE_EVENT_ID_DECL(perf_id_bw_);
diff --git a/include/caffe/mkldnn_memory.hpp b/include/caffe/mkldnn_memory.hpp
index a59ce6e12..3b1a1c6ad 100644
--- a/include/caffe/mkldnn_memory.hpp
+++ b/include/caffe/mkldnn_memory.hpp
@@ -94,6 +94,7 @@ class MKLDNNMemoryDescriptorBase : public PrvMemDescr
         if (_prv_memory == NULL) allocate();
         return _internal_ptr;
     }
+
     shared_ptr<primitive>  reorder_usr2prv() { return _reorder_usr2prv.aprimitive; }
     shared_ptr<primitive>  reorder_prv2usr() { return _reorder_prv2usr.aprimitive; }
     shared_ptr<primitive>  reorder_extprv2prv() { return _reorder_extprv2prv.aprimitive; }
@@ -201,6 +202,8 @@ class MKLDNNMemoryDescriptor : public MKLDNNMemoryDescriptorBase<Dtype> {
     shared_ptr<memory> create_output_memory(Blob<Dtype> * blob, bool inplace = false);
     shared_ptr<primitive> create_input(bool set_prv_ptr);
     shared_ptr<memory> create_output_memory(bool inplace = false);
+    Dtype* get_memory_ptr(long offset = 0);
+    shared_ptr<memory::desc> get_memory_desc();
 
     void set_mkldnn_primitive(MKLDNNPrimitive<Dtype>& mprimitive) { CHECK(mprimitive.aprimitive); _mkldnn_primitive = mprimitive;  }
     MKLDNNPrimitive<Dtype>&  mkldnn_primitive() { return _mkldnn_primitive; }
diff --git a/include/caffe/multinode/multi_sync.hpp b/include/caffe/multinode/multi_sync.hpp
index 6300c4876..905d9fce7 100644
--- a/include/caffe/multinode/multi_sync.hpp
+++ b/include/caffe/multinode/multi_sync.hpp
@@ -215,10 +215,6 @@ namespace caffe {
     }
 
     void on_iter_finished(int layer_id) {
-#ifdef FW_OVERLAP_OPT
-      solver->set_layer_finished_flag(layer_id, false);
-#endif
-
       boost::shared_ptr<Layer<Dtype>> &layer = layers[layer_id];
       if (layer->layerOp == nullptr) {
         return;
diff --git a/include/caffe/sgd_solvers.hpp b/include/caffe/sgd_solvers.hpp
index a11da89de..09f6ff26e 100644
--- a/include/caffe/sgd_solvers.hpp
+++ b/include/caffe/sgd_solvers.hpp
@@ -81,6 +81,11 @@ class SGDSolver : public Solver<Dtype> {
   //   of gradients/updates and is not needed in snapshots
   vector<shared_ptr<Blob<Dtype> > > history_, update_, temp_;
 
+#ifdef ENABLE_SGD_FUSION
+  // Fuse the Normalize, Regularize, ComputeUpdateValue and Update steps together.
+  void SGDFusion(int param_id, Dtype rate);
+#endif /* ENABLE_SGD_FUSION */
+
   // loss history for 'plateau' LR policy (should be stored in snapshots)
   Dtype minimum_loss_;
   int iter_last_event_;
diff --git a/include/caffe/util/apply_bn_stats_batch_size.hpp b/include/caffe/util/apply_bn_stats_batch_size.hpp
new file mode 100644
index 000000000..872b2c5bf
--- /dev/null
+++ b/include/caffe/util/apply_bn_stats_batch_size.hpp
@@ -0,0 +1,45 @@
+/*
+All modification made by Intel Corporation: © 2017 Intel Corporation
+
+All contributions by the University of California:
+Copyright (c) 2014, 2015, The Regents of the University of California (Regents)
+All rights reserved.
+
+All other contributions:
+Copyright (c) 2014, 2015, the respective contributors
+All rights reserved.
+For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md
+
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef APPLY_BN_STATS_BATCH_SIZE_HPP_
+#define APPLY_BN_STATS_BATCH_SIZE_HPP_
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+void ApplyBnStatsBatchSize(const NetParameter& param,  // source net, read-only
+    NetParameter* param_with_stats_batch_size);  // out (presumed); confirm
+}  // namespace caffe
+#endif // APPLY_BN_STATS_BATCH_SIZE_HPP_
diff --git a/models/intel_optimized_models/multinode/resnet_50_64_nodes_8k_batch/solver.prototxt b/models/intel_optimized_models/multinode/resnet_50_64_nodes_8k_batch/solver.prototxt
new file mode 100644
index 000000000..4f4f21a93
--- /dev/null
+++ b/models/intel_optimized_models/multinode/resnet_50_64_nodes_8k_batch/solver.prototxt
@@ -0,0 +1,19 @@
+net: "models/intel_optimized_models/multinode/resnet_50_64_nodes_8k_batch/train_val.prototxt"
+test_iter: 1000
+test_interval: 156 # ~1 epoch: 1281167 / 8192 ~= 156.4 iterations
+test_initialization: false
+display: 40
+base_lr: 3.2
+lr_policy: "multistep"
+stepvalue:4680 # 30 * 156 iterations
+stepvalue:9360 # 60 * 156 iterations
+stepvalue:12480 # 80 * 156 iterations
+gamma: 0.1
+max_iter: 14075 # ~90 epochs (90 * 1281167 / 8192)
+warmup_iter: 780 # 5 * 156, i.e. ~5 epochs of 1281167 / 8192 ~= 156.4 iterations
+warmup_start_lr: 0.1
+momentum: 0.9
+weight_decay: 0.0001
+snapshot: 156 # same period as test_interval
+snapshot_prefix: "models/intel_optimized_models/multinode/resnet_50_64_nodes_8k_batch/resnet_50_64_nodes_8k"
+solver_mode: CPU
diff --git a/models/intel_optimized_models/multinode/resnet_50_64_nodes_8k_batch/train_val.prototxt b/models/intel_optimized_models/multinode/resnet_50_64_nodes_8k_batch/train_val.prototxt
new file mode 100644
index 000000000..3dd57aaac
--- /dev/null
+++ b/models/intel_optimized_models/multinode/resnet_50_64_nodes_8k_batch/train_val.prototxt
@@ -0,0 +1,3322 @@
+name: "ResNet-50"
+bn_stats_batch_size: 32
+layer {
+  name: "data"
+  type: "Data"
+  top: "data"
+  top: "label"
+  include {
+    phase: TRAIN
+  }
+  transform_param {
+    mirror: true
+    crop_size: 224
+    scale: 0.0078125
+    mean_value: 104
+    mean_value: 117
+    mean_value: 123
+    random_aspect_ratio_param {
+      min_area_ratio: 0.08
+      max_area_ratio: 1
+      aspect_ratio_change: 0.75
+      resize_param {
+        interp_mode: CUBIC
+      }
+    }
+  }
+  data_param {
+    source: "examples/imagenet/ilsvrc12_train_lmdb"
+    batch_size: 128
+    backend: LMDB
+    prefetch: 2
+    shuffle: true
+  }
+}
+layer {
+  name: "data"
+  type: "Data"
+  top: "data"
+  top: "label"
+  include {
+    phase: TEST
+  }
+  transform_param {
+    mirror: false
+    crop_size: 224
+    scale: 0.0078125
+    mean_value: 104
+    mean_value: 117
+    mean_value: 123
+    random_resize_param {
+      min_size: 256
+      max_size: 256
+      resize_param {
+        interp_mode: CUBIC
+      }
+    }
+  }
+  data_param {
+    source: "examples/imagenet/ilsvrc12_val_lmdb"
+    batch_size: 50
+    backend: LMDB
+  }
+}
+
+layer {
+  bottom: "data"
+  top: "conv1"
+  name: "conv1"
+  type: "Convolution"
+  convolution_param {
+    num_output: 64
+    kernel_size: 7
+    pad: 3
+    stride: 2
+    weight_filler {
+      type: "msra"
+      variance_norm: FAN_OUT
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "conv1"
+  top: "conv1"
+  name: "bn_conv1"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "conv1"
+  top: "conv1"
+  name: "scale_conv1"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "conv1"
+  top: "conv1"
+  name: "conv1_relu"
+  type: "ReLU"
+  relu_param {
+  }
+}
+
+layer {
+  bottom: "conv1"
+  top: "pool1"
+  name: "pool1"
+  type: "Pooling"
+  pooling_param {
+    kernel_size: 3
+    stride: 2
+    pool: MAX
+  }
+}
+
+layer {
+  bottom: "pool1"
+  top: "res2a_branch1"
+  name: "res2a_branch1"
+  type: "Convolution"
+  convolution_param {
+    num_output: 256
+    kernel_size: 1
+    pad: 0
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res2a_branch1"
+  top: "res2a_branch1"
+  name: "bn2a_branch1"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res2a_branch1"
+  top: "res2a_branch1"
+  name: "scale2a_branch1"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "pool1"
+  top: "res2a_branch2a"
+  name: "res2a_branch2a"
+  type: "Convolution"
+  convolution_param {
+
+    num_output: 64
+    kernel_size: 1
+    pad: 0
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res2a_branch2a"
+  top: "res2a_branch2a"
+  name: "bn2a_branch2a"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res2a_branch2a"
+  top: "res2a_branch2a"
+  name: "scale2a_branch2a"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res2a_branch2a"
+  top: "res2a_branch2a"
+  name: "res2a_branch2a_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res2a_branch2a"
+  top: "res2a_branch2b"
+  name: "res2a_branch2b"
+  type: "Convolution"
+  convolution_param {
+    num_output: 64
+    kernel_size: 3
+    pad: 1
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res2a_branch2b"
+  top: "res2a_branch2b"
+  name: "bn2a_branch2b"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res2a_branch2b"
+  top: "res2a_branch2b"
+  name: "scale2a_branch2b"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res2a_branch2b"
+  top: "res2a_branch2b"
+  name: "res2a_branch2b_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res2a_branch2b"
+  top: "res2a_branch2c"
+  name: "res2a_branch2c"
+  type: "Convolution"
+  convolution_param {
+    num_output: 256
+    kernel_size: 1
+    pad: 0
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res2a_branch2c"
+  top: "res2a_branch2c"
+  name: "bn2a_branch2c"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res2a_branch2c"
+  top: "res2a_branch2c"
+  name: "scale2a_branch2c"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res2a_branch1"
+  bottom: "res2a_branch2c"
+  top: "res2a"
+  name: "res2a"
+  type: "Eltwise"
+  eltwise_param {
+
+  }
+}
+
+layer {
+  bottom: "res2a"
+  top: "res2a"
+  name: "res2a_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res2a"
+  top: "res2b_branch2a"
+  name: "res2b_branch2a"
+  type: "Convolution"
+  convolution_param {
+    num_output: 64
+    kernel_size: 1
+    pad: 0
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res2b_branch2a"
+  top: "res2b_branch2a"
+  name: "bn2b_branch2a"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res2b_branch2a"
+  top: "res2b_branch2a"
+  name: "scale2b_branch2a"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res2b_branch2a"
+  top: "res2b_branch2a"
+  name: "res2b_branch2a_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res2b_branch2a"
+  top: "res2b_branch2b"
+  name: "res2b_branch2b"
+  type: "Convolution"
+  convolution_param {
+    num_output: 64
+    kernel_size: 3
+    pad: 1
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res2b_branch2b"
+  top: "res2b_branch2b"
+  name: "bn2b_branch2b"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res2b_branch2b"
+  top: "res2b_branch2b"
+  name: "scale2b_branch2b"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res2b_branch2b"
+  top: "res2b_branch2b"
+  name: "res2b_branch2b_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res2b_branch2b"
+  top: "res2b_branch2c"
+  name: "res2b_branch2c"
+  type: "Convolution"
+  convolution_param {
+    num_output: 256
+    kernel_size: 1
+    pad: 0
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res2b_branch2c"
+  top: "res2b_branch2c"
+  name: "bn2b_branch2c"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res2b_branch2c"
+  top: "res2b_branch2c"
+  name: "scale2b_branch2c"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res2a"
+  bottom: "res2b_branch2c"
+  top: "res2b"
+  name: "res2b"
+  type: "Eltwise"
+  eltwise_param {
+
+  }
+}
+
+layer {
+  bottom: "res2b"
+  top: "res2b"
+  name: "res2b_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res2b"
+  top: "res2c_branch2a"
+  name: "res2c_branch2a"
+  type: "Convolution"
+  convolution_param {
+
+    num_output: 64
+    kernel_size: 1
+    pad: 0
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res2c_branch2a"
+  top: "res2c_branch2a"
+  name: "bn2c_branch2a"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res2c_branch2a"
+  top: "res2c_branch2a"
+  name: "scale2c_branch2a"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res2c_branch2a"
+  top: "res2c_branch2a"
+  name: "res2c_branch2a_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res2c_branch2a"
+  top: "res2c_branch2b"
+  name: "res2c_branch2b"
+  type: "Convolution"
+  convolution_param {
+    num_output: 64
+    kernel_size: 3
+    pad: 1
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res2c_branch2b"
+  top: "res2c_branch2b"
+  name: "bn2c_branch2b"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res2c_branch2b"
+  top: "res2c_branch2b"
+  name: "scale2c_branch2b"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res2c_branch2b"
+  top: "res2c_branch2b"
+  name: "res2c_branch2b_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res2c_branch2b"
+  top: "res2c_branch2c"
+  name: "res2c_branch2c"
+  type: "Convolution"
+  convolution_param {
+
+    num_output: 256
+    kernel_size: 1
+    pad: 0
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res2c_branch2c"
+  top: "res2c_branch2c"
+  name: "bn2c_branch2c"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 0 }  # NOTE(review): 0 here, 1 in the BatchNorm layers above; confirm intentional
+  }
+}
+
+layer {
+  bottom: "res2c_branch2c"
+  top: "res2c_branch2c"
+  name: "scale2c_branch2c"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res2b"
+  bottom: "res2c_branch2c"
+  top: "res2c"
+  name: "res2c"
+  type: "Eltwise"
+  eltwise_param {
+
+  }
+}
+
+layer {
+  bottom: "res2c"
+  top: "res2c"
+  name: "res2c_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res2c"
+  top: "res3a_branch1"
+  name: "res3a_branch1"
+  type: "Convolution"
+  convolution_param {
+    num_output: 512
+    kernel_size: 1
+    pad: 0
+    stride: 2
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res3a_branch1"
+  top: "res3a_branch1"
+  name: "bn3a_branch1"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res3a_branch1"
+  top: "res3a_branch1"
+  name: "scale3a_branch1"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res2c"
+  top: "res3a_branch2a"
+  name: "res3a_branch2a"
+  type: "Convolution"
+  convolution_param {
+
+    num_output: 128
+    kernel_size: 1
+    pad: 0
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res3a_branch2a"
+  top: "res3a_branch2a"
+  name: "bn3a_branch2a"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res3a_branch2a"
+  top: "res3a_branch2a"
+  name: "scale3a_branch2a"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res3a_branch2a"
+  top: "res3a_branch2a"
+  name: "res3a_branch2a_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res3a_branch2a"
+  top: "res3a_branch2b"
+  name: "res3a_branch2b"
+  type: "Convolution"
+  convolution_param {
+
+    num_output: 128
+    kernel_size: 3
+    pad: 1
+    stride: 2
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res3a_branch2b"
+  top: "res3a_branch2b"
+  name: "bn3a_branch2b"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res3a_branch2b"
+  top: "res3a_branch2b"
+  name: "scale3a_branch2b"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res3a_branch2b"
+  top: "res3a_branch2b"
+  name: "res3a_branch2b_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res3a_branch2b"
+  top: "res3a_branch2c"
+  name: "res3a_branch2c"
+  type: "Convolution"
+  convolution_param {
+
+    num_output: 512
+    kernel_size: 1
+    pad: 0
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res3a_branch2c"
+  top: "res3a_branch2c"
+  name: "bn3a_branch2c"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res3a_branch2c"
+  top: "res3a_branch2c"
+  name: "scale3a_branch2c"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res3a_branch1"
+  bottom: "res3a_branch2c"
+  top: "res3a"
+  name: "res3a"
+  type: "Eltwise"
+  eltwise_param {
+
+  }
+}
+
+layer {
+  bottom: "res3a"
+  top: "res3a"
+  name: "res3a_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res3a"
+  top: "res3b_branch2a"
+  name: "res3b_branch2a"
+  type: "Convolution"
+  convolution_param {
+
+    num_output: 128
+    kernel_size: 1
+    pad: 0
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res3b_branch2a"
+  top: "res3b_branch2a"
+  name: "bn3b_branch2a"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res3b_branch2a"
+  top: "res3b_branch2a"
+  name: "scale3b_branch2a"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res3b_branch2a"
+  top: "res3b_branch2a"
+  name: "res3b_branch2a_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res3b_branch2a"
+  top: "res3b_branch2b"
+  name: "res3b_branch2b"
+  type: "Convolution"
+  convolution_param {
+
+    num_output: 128
+    kernel_size: 3
+    pad: 1
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res3b_branch2b"
+  top: "res3b_branch2b"
+  name: "bn3b_branch2b"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res3b_branch2b"
+  top: "res3b_branch2b"
+  name: "scale3b_branch2b"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res3b_branch2b"
+  top: "res3b_branch2b"
+  name: "res3b_branch2b_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res3b_branch2b"
+  top: "res3b_branch2c"
+  name: "res3b_branch2c"
+  type: "Convolution"
+  convolution_param {
+
+    num_output: 512
+    kernel_size: 1
+    pad: 0
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res3b_branch2c"
+  top: "res3b_branch2c"
+  name: "bn3b_branch2c"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res3b_branch2c"
+  top: "res3b_branch2c"
+  name: "scale3b_branch2c"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res3a"
+  bottom: "res3b_branch2c"
+  top: "res3b"
+  name: "res3b"
+  type: "Eltwise"
+  eltwise_param {
+
+  }
+}
+
+layer {
+  bottom: "res3b"
+  top: "res3b"
+  name: "res3b_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res3b"
+  top: "res3c_branch2a"
+  name: "res3c_branch2a"
+  type: "Convolution"
+  convolution_param {
+
+    num_output: 128
+    kernel_size: 1
+    pad: 0
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res3c_branch2a"
+  top: "res3c_branch2a"
+  name: "bn3c_branch2a"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res3c_branch2a"
+  top: "res3c_branch2a"
+  name: "scale3c_branch2a"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res3c_branch2a"
+  top: "res3c_branch2a"
+  name: "res3c_branch2a_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res3c_branch2a"
+  top: "res3c_branch2b"
+  name: "res3c_branch2b"
+  type: "Convolution"
+  convolution_param {
+
+    num_output: 128
+    kernel_size: 3
+    pad: 1
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res3c_branch2b"
+  top: "res3c_branch2b"
+  name: "bn3c_branch2b"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res3c_branch2b"
+  top: "res3c_branch2b"
+  name: "scale3c_branch2b"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res3c_branch2b"
+  top: "res3c_branch2b"
+  name: "res3c_branch2b_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res3c_branch2b"
+  top: "res3c_branch2c"
+  name: "res3c_branch2c"
+  type: "Convolution"
+  convolution_param {
+
+    num_output: 512
+    kernel_size: 1
+    pad: 0
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res3c_branch2c"
+  top: "res3c_branch2c"
+  name: "bn3c_branch2c"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res3c_branch2c"
+  top: "res3c_branch2c"
+  name: "scale3c_branch2c"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res3b"
+  bottom: "res3c_branch2c"
+  top: "res3c"
+  name: "res3c"
+  type: "Eltwise"
+  eltwise_param {
+
+  }
+}
+
+layer {
+  bottom: "res3c"
+  top: "res3c"
+  name: "res3c_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res3c"
+  top: "res3d_branch2a"
+  name: "res3d_branch2a"
+  type: "Convolution"
+  convolution_param {
+    num_output: 128
+    kernel_size: 1
+    pad: 0
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res3d_branch2a"
+  top: "res3d_branch2a"
+  name: "bn3d_branch2a"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res3d_branch2a"
+  top: "res3d_branch2a"
+  name: "scale3d_branch2a"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res3d_branch2a"
+  top: "res3d_branch2a"
+  name: "res3d_branch2a_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res3d_branch2a"
+  top: "res3d_branch2b"
+  name: "res3d_branch2b"
+  type: "Convolution"
+  convolution_param {
+    num_output: 128
+    kernel_size: 3
+    pad: 1
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res3d_branch2b"
+  top: "res3d_branch2b"
+  name: "bn3d_branch2b"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res3d_branch2b"
+  top: "res3d_branch2b"
+  name: "scale3d_branch2b"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res3d_branch2b"
+  top: "res3d_branch2b"
+  name: "res3d_branch2b_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res3d_branch2b"
+  top: "res3d_branch2c"
+  name: "res3d_branch2c"
+  type: "Convolution"
+  convolution_param {
+
+    num_output: 512
+    kernel_size: 1
+    pad: 0
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res3d_branch2c"
+  top: "res3d_branch2c"
+  name: "bn3d_branch2c"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 0 }
+  }
+}
+
+layer {
+  bottom: "res3d_branch2c"
+  top: "res3d_branch2c"
+  name: "scale3d_branch2c"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res3c"
+  bottom: "res3d_branch2c"
+  top: "res3d"
+  name: "res3d"
+  type: "Eltwise"
+  eltwise_param {
+
+  }
+}
+
+layer {
+  bottom: "res3d"
+  top: "res3d"
+  name: "res3d_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res3d"
+  top: "res4a_branch1"
+  name: "res4a_branch1"
+  type: "Convolution"
+  convolution_param {
+
+    num_output: 1024
+    kernel_size: 1
+    pad: 0
+    stride: 2
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res4a_branch1"
+  top: "res4a_branch1"
+  name: "bn4a_branch1"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res4a_branch1"
+  top: "res4a_branch1"
+  name: "scale4a_branch1"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res3d"
+  top: "res4a_branch2a"
+  name: "res4a_branch2a"
+  type: "Convolution"
+  convolution_param {
+
+    num_output: 256
+    kernel_size: 1
+    pad: 0
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res4a_branch2a"
+  top: "res4a_branch2a"
+  name: "bn4a_branch2a"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res4a_branch2a"
+  top: "res4a_branch2a"
+  name: "scale4a_branch2a"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res4a_branch2a"
+  top: "res4a_branch2a"
+  name: "res4a_branch2a_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res4a_branch2a"
+  top: "res4a_branch2b"
+  name: "res4a_branch2b"
+  type: "Convolution"
+  convolution_param {
+
+    num_output: 256
+    kernel_size: 3
+    pad: 1
+    stride: 2
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res4a_branch2b"
+  top: "res4a_branch2b"
+  name: "bn4a_branch2b"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res4a_branch2b"
+  top: "res4a_branch2b"
+  name: "scale4a_branch2b"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res4a_branch2b"
+  top: "res4a_branch2b"
+  name: "res4a_branch2b_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res4a_branch2b"
+  top: "res4a_branch2c"
+  name: "res4a_branch2c"
+  type: "Convolution"
+  convolution_param {
+
+    num_output: 1024
+    kernel_size: 1
+    pad: 0
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res4a_branch2c"
+  top: "res4a_branch2c"
+  name: "bn4a_branch2c"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res4a_branch2c"
+  top: "res4a_branch2c"
+  name: "scale4a_branch2c"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res4a_branch1"
+  bottom: "res4a_branch2c"
+  top: "res4a"
+  name: "res4a"
+  type: "Eltwise"
+  eltwise_param {
+
+  }
+}
+
+layer {
+  bottom: "res4a"
+  top: "res4a"
+  name: "res4a_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res4a"
+  top: "res4b_branch2a"
+  name: "res4b_branch2a"
+  type: "Convolution"
+  convolution_param {
+
+    num_output: 256
+    kernel_size: 1
+    pad: 0
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res4b_branch2a"
+  top: "res4b_branch2a"
+  name: "bn4b_branch2a"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res4b_branch2a"
+  top: "res4b_branch2a"
+  name: "scale4b_branch2a"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res4b_branch2a"
+  top: "res4b_branch2a"
+  name: "res4b_branch2a_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res4b_branch2a"
+  top: "res4b_branch2b"
+  name: "res4b_branch2b"
+  type: "Convolution"
+  convolution_param {
+
+    num_output: 256
+    kernel_size: 3
+    pad: 1
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res4b_branch2b"
+  top: "res4b_branch2b"
+  name: "bn4b_branch2b"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res4b_branch2b"
+  top: "res4b_branch2b"
+  name: "scale4b_branch2b"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res4b_branch2b"
+  top: "res4b_branch2b"
+  name: "res4b_branch2b_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res4b_branch2b"
+  top: "res4b_branch2c"
+  name: "res4b_branch2c"
+  type: "Convolution"
+  convolution_param {
+
+    num_output: 1024
+    kernel_size: 1
+    pad: 0
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res4b_branch2c"
+  top: "res4b_branch2c"
+  name: "bn4b_branch2c"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res4b_branch2c"
+  top: "res4b_branch2c"
+  name: "scale4b_branch2c"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res4a"
+  bottom: "res4b_branch2c"
+  top: "res4b"
+  name: "res4b"
+  type: "Eltwise"
+  eltwise_param {
+
+  }
+}
+
+layer {
+  bottom: "res4b"
+  top: "res4b"
+  name: "res4b_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res4b"
+  top: "res4c_branch2a"
+  name: "res4c_branch2a"
+  type: "Convolution"
+  convolution_param {
+
+    num_output: 256
+    kernel_size: 1
+    pad: 0
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res4c_branch2a"
+  top: "res4c_branch2a"
+  name: "bn4c_branch2a"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res4c_branch2a"
+  top: "res4c_branch2a"
+  name: "scale4c_branch2a"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res4c_branch2a"
+  top: "res4c_branch2a"
+  name: "res4c_branch2a_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res4c_branch2a"
+  top: "res4c_branch2b"
+  name: "res4c_branch2b"
+  type: "Convolution"
+  convolution_param {
+
+    num_output: 256
+    kernel_size: 3
+    pad: 1
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res4c_branch2b"
+  top: "res4c_branch2b"
+  name: "bn4c_branch2b"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res4c_branch2b"
+  top: "res4c_branch2b"
+  name: "scale4c_branch2b"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res4c_branch2b"
+  top: "res4c_branch2b"
+  name: "res4c_branch2b_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res4c_branch2b"
+  top: "res4c_branch2c"
+  name: "res4c_branch2c"
+  type: "Convolution"
+  convolution_param {
+
+    num_output: 1024
+    kernel_size: 1
+    pad: 0
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res4c_branch2c"
+  top: "res4c_branch2c"
+  name: "bn4c_branch2c"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res4c_branch2c"
+  top: "res4c_branch2c"
+  name: "scale4c_branch2c"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res4b"
+  bottom: "res4c_branch2c"
+  top: "res4c"
+  name: "res4c"
+  type: "Eltwise"
+  eltwise_param {
+
+  }
+}
+
+layer {
+  bottom: "res4c"
+  top: "res4c"
+  name: "res4c_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res4c"
+  top: "res4d_branch2a"
+  name: "res4d_branch2a"
+  type: "Convolution"
+  convolution_param {
+
+    num_output: 256
+    kernel_size: 1
+    pad: 0
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res4d_branch2a"
+  top: "res4d_branch2a"
+  name: "bn4d_branch2a"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res4d_branch2a"
+  top: "res4d_branch2a"
+  name: "scale4d_branch2a"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res4d_branch2a"
+  top: "res4d_branch2a"
+  name: "res4d_branch2a_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res4d_branch2a"
+  top: "res4d_branch2b"
+  name: "res4d_branch2b"
+  type: "Convolution"
+  convolution_param {
+
+    num_output: 256
+    kernel_size: 3
+    pad: 1
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res4d_branch2b"
+  top: "res4d_branch2b"
+  name: "bn4d_branch2b"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res4d_branch2b"
+  top: "res4d_branch2b"
+  name: "scale4d_branch2b"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res4d_branch2b"
+  top: "res4d_branch2b"
+  name: "res4d_branch2b_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res4d_branch2b"
+  top: "res4d_branch2c"
+  name: "res4d_branch2c"
+  type: "Convolution"
+  convolution_param {
+
+    num_output: 1024
+    kernel_size: 1
+    pad: 0
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res4d_branch2c"
+  top: "res4d_branch2c"
+  name: "bn4d_branch2c"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res4d_branch2c"
+  top: "res4d_branch2c"
+  name: "scale4d_branch2c"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res4c"
+  bottom: "res4d_branch2c"
+  top: "res4d"
+  name: "res4d"
+  type: "Eltwise"
+  eltwise_param {
+
+  }
+}
+
+layer {
+  bottom: "res4d"
+  top: "res4d"
+  name: "res4d_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res4d"
+  top: "res4e_branch2a"
+  name: "res4e_branch2a"
+  type: "Convolution"
+  convolution_param {
+
+    num_output: 256
+    kernel_size: 1
+    pad: 0
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res4e_branch2a"
+  top: "res4e_branch2a"
+  name: "bn4e_branch2a"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res4e_branch2a"
+  top: "res4e_branch2a"
+  name: "scale4e_branch2a"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res4e_branch2a"
+  top: "res4e_branch2a"
+  name: "res4e_branch2a_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res4e_branch2a"
+  top: "res4e_branch2b"
+  name: "res4e_branch2b"
+  type: "Convolution"
+  convolution_param {
+
+    num_output: 256
+    kernel_size: 3
+    pad: 1
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res4e_branch2b"
+  top: "res4e_branch2b"
+  name: "bn4e_branch2b"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res4e_branch2b"
+  top: "res4e_branch2b"
+  name: "scale4e_branch2b"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res4e_branch2b"
+  top: "res4e_branch2b"
+  name: "res4e_branch2b_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res4e_branch2b"
+  top: "res4e_branch2c"
+  name: "res4e_branch2c"
+  type: "Convolution"
+  convolution_param {
+
+    num_output: 1024
+    kernel_size: 1
+    pad: 0
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res4e_branch2c"
+  top: "res4e_branch2c"
+  name: "bn4e_branch2c"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res4e_branch2c"
+  top: "res4e_branch2c"
+  name: "scale4e_branch2c"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res4d"
+  bottom: "res4e_branch2c"
+  top: "res4e"
+  name: "res4e"
+  type: "Eltwise"
+  eltwise_param {
+
+  }
+}
+
+layer {
+  bottom: "res4e"
+  top: "res4e"
+  name: "res4e_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res4e"
+  top: "res4f_branch2a"
+  name: "res4f_branch2a"
+  type: "Convolution"
+  convolution_param {
+
+    num_output: 256
+    kernel_size: 1
+    pad: 0
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res4f_branch2a"
+  top: "res4f_branch2a"
+  name: "bn4f_branch2a"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res4f_branch2a"
+  top: "res4f_branch2a"
+  name: "scale4f_branch2a"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res4f_branch2a"
+  top: "res4f_branch2a"
+  name: "res4f_branch2a_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res4f_branch2a"
+  top: "res4f_branch2b"
+  name: "res4f_branch2b"
+  type: "Convolution"
+  convolution_param {
+
+    num_output: 256
+    kernel_size: 3
+    pad: 1
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res4f_branch2b"
+  top: "res4f_branch2b"
+  name: "bn4f_branch2b"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res4f_branch2b"
+  top: "res4f_branch2b"
+  name: "scale4f_branch2b"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res4f_branch2b"
+  top: "res4f_branch2b"
+  name: "res4f_branch2b_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res4f_branch2b"
+  top: "res4f_branch2c"
+  name: "res4f_branch2c"
+  type: "Convolution"
+  convolution_param {
+
+    num_output: 1024
+    kernel_size: 1
+    pad: 0
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res4f_branch2c"
+  top: "res4f_branch2c"
+  name: "bn4f_branch2c"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 0 }
+  }
+}
+
+layer {
+  bottom: "res4f_branch2c"
+  top: "res4f_branch2c"
+  name: "scale4f_branch2c"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res4e"
+  bottom: "res4f_branch2c"
+  top: "res4f"
+  name: "res4f"
+  type: "Eltwise"
+  eltwise_param {
+
+  }
+}
+
+layer {
+  bottom: "res4f"
+  top: "res4f"
+  name: "res4f_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res4f"
+  top: "res5a_branch1"
+  name: "res5a_branch1"
+  type: "Convolution"
+  convolution_param {
+
+    num_output: 2048
+    kernel_size: 1
+    pad: 0
+    stride: 2
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res5a_branch1"
+  top: "res5a_branch1"
+  name: "bn5a_branch1"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res5a_branch1"
+  top: "res5a_branch1"
+  name: "scale5a_branch1"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res4f"
+  top: "res5a_branch2a"
+  name: "res5a_branch2a"
+  type: "Convolution"
+  convolution_param {
+
+    num_output: 512
+    kernel_size: 1
+    pad: 0
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res5a_branch2a"
+  top: "res5a_branch2a"
+  name: "bn5a_branch2a"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res5a_branch2a"
+  top: "res5a_branch2a"
+  name: "scale5a_branch2a"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res5a_branch2a"
+  top: "res5a_branch2a"
+  name: "res5a_branch2a_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res5a_branch2a"
+  top: "res5a_branch2b"
+  name: "res5a_branch2b"
+  type: "Convolution"
+  convolution_param {
+
+    num_output: 512
+    kernel_size: 3
+    pad: 1
+    stride: 2
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res5a_branch2b"
+  top: "res5a_branch2b"
+  name: "bn5a_branch2b"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res5a_branch2b"
+  top: "res5a_branch2b"
+  name: "scale5a_branch2b"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res5a_branch2b"
+  top: "res5a_branch2b"
+  name: "res5a_branch2b_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res5a_branch2b"
+  top: "res5a_branch2c"
+  name: "res5a_branch2c"
+  type: "Convolution"
+  convolution_param {
+
+    num_output: 2048
+    kernel_size: 1
+    pad: 0
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res5a_branch2c"
+  top: "res5a_branch2c"
+  name: "bn5a_branch2c"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res5a_branch2c"
+  top: "res5a_branch2c"
+  name: "scale5a_branch2c"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res5a_branch1"
+  bottom: "res5a_branch2c"
+  top: "res5a"
+  name: "res5a"
+  type: "Eltwise"
+  eltwise_param {
+
+  }
+}
+
+layer {
+  bottom: "res5a"
+  top: "res5a"
+  name: "res5a_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res5a"
+  top: "res5b_branch2a"
+  name: "res5b_branch2a"
+  type: "Convolution"
+  convolution_param {
+
+    num_output: 512
+    kernel_size: 1
+    pad: 0
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res5b_branch2a"
+  top: "res5b_branch2a"
+  name: "bn5b_branch2a"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res5b_branch2a"
+  top: "res5b_branch2a"
+  name: "scale5b_branch2a"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res5b_branch2a"
+  top: "res5b_branch2a"
+  name: "res5b_branch2a_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res5b_branch2a"
+  top: "res5b_branch2b"
+  name: "res5b_branch2b"
+  type: "Convolution"
+  convolution_param {
+
+    num_output: 512
+    kernel_size: 3
+    pad: 1
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res5b_branch2b"
+  top: "res5b_branch2b"
+  name: "bn5b_branch2b"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res5b_branch2b"
+  top: "res5b_branch2b"
+  name: "scale5b_branch2b"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res5b_branch2b"
+  top: "res5b_branch2b"
+  name: "res5b_branch2b_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res5b_branch2b"
+  top: "res5b_branch2c"
+  name: "res5b_branch2c"
+  type: "Convolution"
+  convolution_param {
+
+    num_output: 2048
+    kernel_size: 1
+    pad: 0
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res5b_branch2c"
+  top: "res5b_branch2c"
+  name: "bn5b_branch2c"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res5b_branch2c"
+  top: "res5b_branch2c"
+  name: "scale5b_branch2c"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res5a"
+  bottom: "res5b_branch2c"
+  top: "res5b"
+  name: "res5b"
+  type: "Eltwise"
+  eltwise_param {
+  }
+}
+
+layer {
+  bottom: "res5b"
+  top: "res5b"
+  name: "res5b_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res5b"
+  top: "res5c_branch2a"
+  name: "res5c_branch2a"
+  type: "Convolution"
+  convolution_param {
+    num_output: 512
+    kernel_size: 1
+    pad: 0
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res5c_branch2a"
+  top: "res5c_branch2a"
+  name: "bn5c_branch2a"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res5c_branch2a"
+  top: "res5c_branch2a"
+  name: "scale5c_branch2a"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res5c_branch2a"
+  top: "res5c_branch2a"
+  name: "res5c_branch2a_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res5c_branch2a"
+  top: "res5c_branch2b"
+  name: "res5c_branch2b"
+  type: "Convolution"
+  convolution_param {
+    num_output: 512
+    kernel_size: 3
+    pad: 1
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res5c_branch2b"
+  top: "res5c_branch2b"
+  name: "bn5c_branch2b"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+    filler { value: 1 }
+  }
+}
+
+layer {
+  bottom: "res5c_branch2b"
+  top: "res5c_branch2b"
+  name: "scale5c_branch2b"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res5c_branch2b"
+  top: "res5c_branch2b"
+  name: "res5c_branch2b_relu"
+  type: "ReLU"
+  relu_param {
+  }
+}
+
+layer {
+  bottom: "res5c_branch2b"
+  top: "res5c_branch2c"
+  name: "res5c_branch2c"
+  type: "Convolution"
+  convolution_param {
+    num_output: 2048
+    kernel_size: 1
+    pad: 0
+    stride: 1
+    bias_term: false
+    weight_filler {
+      type: "msra"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "res5c_branch2c"
+  top: "res5c_branch2c"
+  name: "bn5c_branch2c"
+  type: "BatchNorm"
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  param { lr_mult: 0 }
+  batch_norm_param {
+    moving_average_fraction: 0.9
+  filler { value: 0 }
+  }
+}
+
+layer {
+  bottom: "res5c_branch2c"
+  top: "res5c_branch2c"
+  name: "scale5c_branch2c"
+  type: "Scale"
+  param { decay_mult: 0 }
+  param { decay_mult: 0 }
+  scale_param {
+    bias_term: true
+  }
+}
+
+layer {
+  bottom: "res5b"
+  bottom: "res5c_branch2c"
+  top: "res5c"
+  name: "res5c"
+  type: "Eltwise"
+  eltwise_param {
+  }
+}
+
+layer {
+  bottom: "res5c"
+  top: "res5c"
+  name: "res5c_relu"
+  type: "ReLU"
+  relu_param {
+
+  }
+}
+
+layer {
+  bottom: "res5c"
+  top: "pool5"
+  name: "pool5"
+  type: "Pooling"
+  pooling_param {
+    kernel_size: 7
+    stride: 1
+    pool: AVE
+  }
+}
+
+layer {
+  bottom: "pool5"
+  top: "fc1000"
+  name: "fc1000"
+  type: "InnerProduct"
+  inner_product_param {
+    num_output: 1000
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+
+layer {
+  bottom: "fc1000"
+  bottom: "label"
+  top: "loss"
+  name: "prob"
+  type: "SoftmaxWithLoss"
+}
+layer {
+  name: "loss3/top-1"
+  type: "Accuracy"
+  bottom: "fc1000"
+  bottom: "label"
+  top: "loss3/top-1"
+}
+layer {
+  name: "loss3/top-5"
+  type: "Accuracy"
+  bottom: "fc1000"
+  bottom: "label"
+  top: "loss3/top-5"
+  accuracy_param {
+    top_k: 5
+  }
+}
diff --git a/python/caffe/_caffe.cpp b/python/caffe/_caffe.cpp
index b9dc23e24..3b02f509b 100644
--- a/python/caffe/_caffe.cpp
+++ b/python/caffe/_caffe.cpp
@@ -411,7 +411,7 @@ BOOST_PYTHON_MODULE(_caffe) {
     .add_property("channels", &Blob<Dtype>::channels)
     .add_property("height",   &Blob<Dtype>::height)
     .add_property("width",    &Blob<Dtype>::width)
-    .add_property("count",    static_cast<int (Blob<Dtype>::*)() const>(
+    .add_property("count",    static_cast<long (Blob<Dtype>::*)() const>(
         &Blob<Dtype>::count))
     .def("reshape",           bp::raw_function(&Blob_Reshape))
     .add_property("data",     bp::make_function(&Blob<Dtype>::mutable_cpu_data,
diff --git a/scripts/run_intelcaffe.sh b/scripts/run_intelcaffe.sh
new file mode 100755
index 000000000..29a5309ab
--- /dev/null
+++ b/scripts/run_intelcaffe.sh
@@ -0,0 +1,604 @@
+#!/bin/bash
+set -x
+# bash (not sh) is required: arrays, [[ ]], "function" and += are used below
+benchmark_mode="all"   # all/qperf/mpi/none (--benchmark)
+
+# running mode: train/resume_train/time/none (--mode)
+mode="train"
+
+# default; detect_cpu overwrites it when the CPU model is recognized
+cpu_model=skx
+
+# file listing the cluster nodes (--host)
+host_file=""
+
+# network parameters: fabric (opa/tcp) and the netmask used with tcp
+network="opa"
+tcp_netmask=""
+
+# number of MLSL ep servers; -1 lets set_mlsl_vars choose
+num_mlsl_servers=-1
+
+# parameters for caffe time
+iteration=0
+model_file=""
+# parameters for resuming training
+snapshot=""
+# parameters for training
+solver_file=""
+
+# engine passed to caffe with -engine (--engine)
+engine="MKL2017"
+# output folder (--output) and MLSL debug logging switch (--debug)
+result_dir=""
+debug="off"
+# Print the command-line help to stdout.
+function usage {
+    script_name=$0; cat <<EOF
+Usage:
+  $script_name --host host_file [--solver solver_file]
+               [--network opa/tcp] [--netmask tcp_netmask] [--debug on/off]
+               [--mode train/resume_train/time/none] [--benchmark all/qperf/mpi/none]
+               [--iteration iter] [--model_file deploy.prototxt]
+               [--snapshot snapshot.caffemodel]
+               [--num_mlsl_servers num_mlsl_servers]
+               [--output output_folder]
+
+  Parameters:
+    host: host file includes list of nodes.
+
+  Optional parameters:
+    solver: specify solver file if mode is train/resume_train
+    network: opa(default), tcp
+    netmask: only used if network is tcp
+    debug: off(default). MLSL debug information is outputed if it's on
+    mode: train(default), resume_train, time, none(not to run caffe test)
+    benchmark: all(default). Includes qperf, all-reduce performance
+      Dependency: user needs to install qperf, IMB-MPI1;
+                  and add them in system path.
+    iteration and model_file: only used if mode is time (caffe time)
+    snapshot: only used if mode is resume_train
+    num_mlsl_servers: number of MLSL ep servers
+    output_folder: output folder for storing results
+EOF
+}
+# CPU families listed under "CPUs with optimal settings" in the startup banner
+cpu_list=("Intel Xeon E5-26xx (Broadwell)" "Intel Xeon Phi 72xx (Knight Landing)"
+          "Intel Xeon Platinum 8180 (Skylake)" "Intel Xeon 6148 (Skylake)")
+
+function detect_cpu
+{
+    # Classify the local CPU from the lscpu "Model name" line. Patterns are
+    # tried top to bottom and the first match sets cpu_model; when nothing
+    # matches, cpu_model keeps its preset value and a notice is printed.
+    model_string=$(lscpu | grep "Model name" | awk -F ':' '{print $2}')
+    case "$model_string" in
+        *"72"*)    cpu_model=knl ;;
+        *"8180"*)  cpu_model=skx ;;
+        *"6148"*)  cpu_model=skx ;;
+        *"E5-26"*) cpu_model=bdw ;;
+        *)
+            echo "CPU model: $model_string"
+            echo "  Use default settings, which may not be optimal ones."
+            ;;
+    esac
+}
+
+function set_numa_node
+{
+    # KNL only: a single NUMA node reported by "numactl -H" is treated as
+    # cache mode (numanode=0); anything else as flat mode (numanode=1).
+    numa_node=($(numactl -H | awk '/available/ {print $2}'))
+    if [ $numa_node -eq 1 ]; then
+        echo "Cache mode."
+        numanode=0
+    else
+        echo "Flat mode."
+        numanode=1
+    fi
+}
+
+
+# Return 0 if "which" finds command $1; otherwise print a warning, return 1.
+function check_dependency
+{
+    dep=$1
+    if which $dep >/dev/null 2>&1; then
+        return 0
+    fi
+    echo "Warning: cannot find $dep"
+    return 1
+}
+
+
+function init_mpi_envs
+{
+    # IMPI configuration; any network other than opa/tcp aborts the script
+    case "$network" in
+        opa)
+            export I_MPI_FABRICS=tmi I_MPI_TMI_PROVIDER=psm2
+            if [ "$cpu_model" == "knl" ]; then
+                # PSM2 configuration
+                export PSM2_MQ_RNDV_HFI_WINDOW=4194304 #2097152 # to workaround PSM2 bug in IFS 10.2 and 10.3
+                export PSM2_MQ_EAGER_SDMA_SZ=65536
+                export PSM2_MQ_RNDV_HFI_THRESH=200000
+            fi
+            export PSM2_IDENTIFY=1 # for debug
+            ;;
+        tcp)
+            export I_MPI_FABRICS=tcp I_MPI_TCP_NETMASK=$tcp_netmask
+            ;;
+        *)
+            echo "Invalid network: $network"
+            exit 1
+            ;;
+    esac
+    # exported for both fabrics
+    export I_MPI_FALLBACK=0 I_MPI_DEBUG=6
+}
+
+
+function clear_shm
+{
+    # Wipe /dev/shm on every node, then check the third field from the end
+    # of the node's df line (presumably "Avail"); exit 1 if it is too small.
+    clear_command="rm -rf /dev/shm/*"
+    check_shm_command="df -h | grep shm"
+
+    # TODO: check if 50G is the minimum shm size?
+    min_shm_size=50
+    shm_unit="G"
+
+    for node in "${nodenames[@]}"
+    do
+        ssh ${node} "$clear_command"
+        shm_line=$(ssh ${node} "$check_shm_command")
+        shm_string=$(echo $shm_line | awk -F ' ' '{print $(NF-2)}')
+        unit="${shm_string:(-1)}"
+        shm_size=${shm_string::-1}
+        if [ "$unit" != "$shm_unit" ] || ! [ $shm_size -ge ${min_shm_size} ]; then
+            echo "Error: /dev/shm size = ${shm_size}${unit}, on node: ${node}."
+            echo "       It's less than minimum size: ${min_shm_size}${shm_unit}."
+            echo "       Please clean or enlarge it."
+            exit 1
+        fi
+    done
+}
+
+function kill_zombie_processes
+{
+    # On every node: "kill -9" each ep_server, caffe and mpiexec.hydra process listed by "ps -e"
+    kill_command="for process in ep_server caffe mpiexec.hydra; do for i in \$(ps -e | grep -w \$process | awk -F ' ' '{print \$1}'); do kill -9 \$i; echo \"\$process \$i killed.\"; done done"
+    for node in "${nodenames[@]}"; do
+        ssh ${node} "$kill_command"
+    done
+}
+
+# Prepare the nodes for a run: clean /dev/shm, then kill leftover processes.
+function clear_envs {
+    clear_shm
+    kill_zombie_processes
+}
+
+function set_mlsl_vars
+{
+    # Number of MLSL ep servers: an explicit --num_mlsl_servers value wins;
+    # with -1 it is 0 on a single node, else 4 on KNL and 2 elsewhere.
+    if [ "${num_mlsl_servers}" -eq -1 ]; then
+        if [ ${numnodes} -eq 1 ]; then
+            numservers=0
+        elif [ ${cpu_model} == knl ]; then
+            numservers=4
+        else
+            numservers=2
+        fi
+    else
+        numservers=$((num_mlsl_servers))
+    fi
+
+    echo "MLSL_NUM_SERVERS: $numservers"
+    export MLSL_NUM_SERVERS=${numservers}
+
+    # core list for the ep servers; only exported when servers are used
+    if [ ${numservers} -gt 0 ]; then
+        if [ ${cpu_model} == knl ]; then
+            listep=6,7,8,9,10,11,12,13
+        else
+            listep=6,7,8,9
+        fi
+        export MLSL_SERVER_AFFINITY="${listep}"
+        echo "MLSL_SERVER_AFFINITY: ${listep}"
+    fi
+    # MLSL configuration: log level 3 with --debug on, 0 otherwise
+    if [ "$debug" == "on" ]; then
+        export MLSL_LOG_LEVEL=3
+    else
+        export MLSL_LOG_LEVEL=0
+    fi
+}
+
+function set_env_vars
+{
+    # OpenMP setup from the local lscpu core count; cores 6..(5+numservers)
+    # are left out of the OMP proclist.
+    set_mlsl_vars
+
+    ppncpu=1
+    threadspercore=1
+
+    cores=$(lscpu | grep "Core(s) per socket:" | awk '{print $4}')
+    sockets=$(lscpu | grep "Socket(s)" | awk '{print $2}')
+    maxcores=$((cores*sockets))
+
+    numthreads=$(((maxcores-numservers)*threadspercore))
+    numthreads_per_proc=$((numthreads/ppncpu))
+    export OMP_NUM_THREADS=${numthreads_per_proc}
+
+    # OMP configuration
+    # threadspercore=1
+    affinitystr="proclist=[0-5,$((numservers+6))-$((maxcores-1))],granularity=thread,explicit"
+    export KMP_HW_SUBSET=1t KMP_AFFINITY=$affinitystr
+}
+
+function execute_command
+{
+    local xeonbin_=$1        # command line to launch
+    local result_dir_=$2     # directory that receives the logs
+    # Run $1 directly on one node, or on all nodes through mpiexec.hydra.
+    if [ ${cpu_model} == knl ]; then
+        exec_command="numactl --preferred=$numanode $xeonbin_"
+    else
+        exec_command="$xeonbin_"
+    fi
+
+    if [ ${numnodes} -gt 1 ]; then
+        # Produce the configuration file for mpiexec:
+        # one line per node with the host, the process count and the command.
+        cfile_=nodeconfig-${cpu_model}-${numnodes}.txt
+        rm -f $cfile_
+
+        for node in "${nodenames[@]}"
+        do
+            echo "-host ${node} -n $ppncpu $exec_command" >> $cfile_
+        done
+    fi
+    # clean /dev/shm and kill leftover processes before the run
+    clear_envs
+    log_file=outputCluster-${cpu_model}-${numnodes}.txt
+    # optional: record "sensors" output before and after the run
+    sensors_bin="sensors"
+    check_dependency $sensors_bin
+    has_sensors=$?
+    if [ $has_sensors -eq 0 ]; then
+        sensor_log_file=sensors-${cpu_model}-${numnodes}-start.log
+        $sensors_bin >$sensor_log_file
+        mv $sensor_log_file $result_dir_/
+    fi
+    # stdout and stderr of the run are captured in $log_file
+    if [ ${numnodes} -eq 1 ]; then
+        time GLOG_minloglevel=0 $exec_command >${log_file} 2>&1
+    else
+        init_mpi_envs
+        exec_command="-l -configfile $cfile_"
+        time GLOG_minloglevel=0 mpiexec.hydra $exec_command >${log_file} 2>&1 
+    fi
+
+    if [ $has_sensors -eq 0 ]; then
+        sensor_log_file=sensors-${cpu_model}-${numnodes}-end.log
+        $sensors_bin >$sensor_log_file
+        mv $sensor_log_file $result_dir_/
+    fi
+    mv $log_file $cfile_ $result_dir_/   # $cfile_ is only assigned in the multi-node branch above
+}
+
+function run_qperf_bench
+{
+    # Pairwise TCP test: node i runs a qperf server and every later node j
+    # runs the client against it. Skipped when qperf is not found locally.
+    qperf_bin="qperf"
+    if ! check_dependency $qperf_bin; then
+        echo "Skip qperf benchmark."
+        return
+    fi
+
+    # measure bandwidth and latency
+    qperf_result_log="qperf_bench_result.log"
+    rm -f $qperf_result_log
+
+    server_node=""
+    # the listen port has to be a valid TCP port (at most 65535)
+    port=12345
+    qperf_param="-lp $port -oo msg_size:1024:512M:*2 -vu tcp_bw tcp_lat"
+
+    for ((i=0; i<numnodes-1; i++)); do
+        server_node=${nodenames[$i]}
+        echo "Run qperf server on ${server_node}..." | tee -a $qperf_result_log
+        ssh -f $server_node "$qperf_bin -lp $port" >> $qperf_result_log
+        echo >>$qperf_result_log
+        for ((j=i+1; j<numnodes; j++)); do
+            client_node=${nodenames[$j]}
+            echo "Run qperf client on ${client_node}..." | tee -a $qperf_result_log
+            qperf_command="$qperf_bin $server_node $qperf_param"
+            # the last client (j = numnodes-1) also asks the server to quit
+            if [ ${j} -eq $((numnodes-1)) ]; then
+                qperf_command+=" quit"
+            fi
+            echo "ssh $client_node $qperf_command" | tee -a $qperf_result_log
+            ssh $client_node "$qperf_command" | tee -a $qperf_result_log
+            echo >>$qperf_result_log
+        done
+    done
+
+    mv $qperf_result_log $result_dir/
+}
+
+function run_mpi_bench
+{
+    # MPI benchmark: run "IMB-MPI1 allreduce" once for every combination of
+    # I_MPI_ADJUST_ALLREDUCE and I_MPI_COLLECTIVE_DEFAULTS listed below,
+    # each into its own result directory. Skipped if IMB-MPI1 is not found.
+    mpibench_bin="IMB-MPI1"
+    if ! check_dependency $mpibench_bin; then
+        echo "Skip MPI benchmark..."
+        return
+    fi
+
+    xeonbin="$mpibench_bin allreduce"
+
+    local -a adjust_values=(1 2 3 5 7 8 9 0)
+    local -a collective_values=('tmi' 'none')
+
+    echo "Start mpi bench..."
+    for ((i=0; i<${#adjust_values[@]}; i++)); do
+        for ((j=0; j<${#collective_values[@]}; j++)); do
+            # 0 stands for "variable not set"
+            if [ ${adjust_values[$i]} -eq 0 ]; then
+                unset I_MPI_ADJUST_ALLREDUCE
+            else
+                export I_MPI_ADJUST_ALLREDUCE=${adjust_values[$i]}
+            fi
+            # "none" stands for "variable not set"
+            if [ "${collective_values[$j]}" == "none" ]; then
+                unset I_MPI_COLLECTIVE_DEFAULTS
+            else
+                export I_MPI_COLLECTIVE_DEFAULTS=${collective_values[$j]}
+            fi
+            echo "iteration $i, ${j}..."
+            echo "I_MPI_ADJUST_ALLREDUCE=$I_MPI_ADJUST_ALLREDUCE"
+            echo "I_MPI_COLLECTIVE_DEFAULTS=$I_MPI_COLLECTIVE_DEFAULTS"
+
+            test_result_dir=$result_dir/mpibench-${adjust_values[$i]}-${collective_values[$j]}
+            mkdir -p $test_result_dir
+            execute_command "$xeonbin" $test_result_dir
+        done
+    done
+
+    # TODO: analyze the report and select the best algorithm and setting
+    unset I_MPI_COLLECTIVE_DEFAULTS
+    unset I_MPI_ADJUST_ALLREDUCE
+
+    echo "Finished."
+}
+
+function run_benchmark
+{
+    # Run qperf and/or the MPI benchmark as selected by $benchmark_mode; both need more than one node.
+    echo "Run benchmark with ${numnodes} nodes..."
+    if [ $numnodes -gt 1 ]; then
+        if [ "$benchmark_mode" == "all" ] || [ "$benchmark_mode" == "qperf" ]; then
+            run_qperf_bench
+        fi
+        if [ "$benchmark_mode" == "all" ] || [ "$benchmark_mode" == "mpi" ]; then
+            set_env_vars
+            run_mpi_bench
+        fi
+    fi
+}
+
+function run_caffe
+{
+    # Build the caffe command line for the selected mode and launch it.
+    echo "Run caffe with ${numnodes} nodes..."
+    case "${mode}" in
+        time)
+            xeonbin="$caffe_bin time --iterations $iteration --model $model_file  -engine=$engine" ;;
+        resume_train)
+            xeonbin="$caffe_bin train --solver $solver_file -engine=$engine --snapshot=${snapshot}" ;;
+        *)
+            xeonbin="$caffe_bin train --solver $solver_file -engine=$engine" ;;
+    esac
+
+    set_env_vars
+    execute_command "$xeonbin" $result_dir
+}
+
+# Fewer than two arguments cannot hold an option/value pair: show the help.
+if [[ $# -lt 2 ]]; then
+    usage
+    exit 0
+fi
+# default result folder: result-<timestamp> in the parent of the script's directory
+root_dir=$(cd $(dirname $(dirname $0)); pwd)
+result_dir="${root_dir}/result-$(date +%Y%m%d%H%M%S)"
+# Parse "--option value" pairs; the loop ends once fewer than two arguments remain.
+while [[ $# -gt 1 ]]
+do
+    key="$1"
+    case $key in
+        --solver)
+            solver_file="$2"
+            shift   # drops the option name; the shift after esac drops its value
+            ;;
+        --host)
+            host_file="$2"
+            shift
+            ;;
+        --network)
+            network="$2"
+            shift
+            ;;
+        --netmask)
+            tcp_netmask="$2"
+            shift
+            ;;
+        --debug)
+            debug="$2"
+            shift
+            ;;
+        --num_mlsl_servers)
+            num_mlsl_servers=$2
+            shift
+            ;;
+        --mode)
+            mode=$2
+            shift
+            ;;
+        --iteration)
+            iteration=$2
+            shift
+            ;;
+        --model_file)
+            model_file=$2
+            shift
+            ;;
+        --snapshot)
+            snapshot=$2
+            shift
+            ;;
+        --engine)
+            engine=$2
+            shift
+            ;;
+        --benchmark)
+            benchmark_mode=$2
+            shift
+            ;;
+        --output)
+            result_dir=$2   # replaces the timestamped default
+            shift
+            ;;
+        *)
+            echo "Unknown option: $key"
+            usage
+            exit 1
+            ;;
+    esac
+    shift   # second shift of the pair
+done
+
+# check parameters
+if [ -z "$host_file" ]; then
+    echo "Error: host file is NOT specified."
+    exit 1
+fi
+if [ ! -f $host_file ]; then
+    echo "Error: host file does NOT exist."
+    exit 1
+fi
+echo ""
+echo "CPUs with optimal settings:"
+for ((i=0; i<${#cpu_list[@]}; i++)); do
+    echo "    ${cpu_list[$i]}"
+done
+cat <<EOF
+
+Settings:
+    Host file: $host_file
+    Running mode: $mode
+    Benchmark: $benchmark_mode
+    Debug option: $debug
+    Engine: $engine
+    Number of MLSL servers: $num_mlsl_servers
+        -1: selected automatically according to CPU model.
+            BDW/SKX: 2, KNL: 4
+EOF
+
+# train/resume_train need an existing solver file; resume_train also a snapshot
+if [ "$mode" == "train" ] || [ "$mode" == "resume_train" ]; then
+    if [ "$solver_file" == "" ]; then
+        echo "Error: solver file is NOT specified."
+        exit 1
+    fi
+    # quoted so that a path with blanks is still tested as one file name
+    if [ ! -f "$solver_file" ]; then
+        echo "Error: solver file does NOT exist."
+        exit 1
+    fi
+    echo "    Solver file: $solver_file"
+
+    if [ "$mode" == "resume_train" ]; then
+        if [ "$snapshot" == "" ]; then
+            echo "Error: snapshot is NOT specified."
+            exit 1
+        fi
+        if [ ! -f "$snapshot" ]; then
+            echo "Error: snapshot file does NOT exist."
+            exit 1
+        fi
+        echo "    Snapshot for resuming train: $snapshot"
+    fi
+fi
+# "caffe time" needs an existing model file and a positive iteration count
+if [ "$mode" == "time" ]; then
+    if [ "$model_file" == "" ]; then
+        echo "Error: model file is NOT specified."
+        exit 1
+    fi
+    if [ ! -f "$model_file" ]; then
+        echo "Error: model file does NOT exist."
+        exit 1
+    fi
+
+    if [ $iteration -le 0 ]; then
+        echo "Error: iteration ($iteration) <= 0."
+        exit 1
+    fi
+    echo "    Iteration for running caffe time: $iteration"
+    echo "    Model file for running caffe time: $model_file"
+fi
+# tcp needs a netmask; a missing one is reported with a non-zero exit status
+echo "    Network: $network"
+if [ "$network" == "tcp" ]; then
+    if  [ "$tcp_netmask" == "" ]; then
+        echo "Error: TCP netmask is NOT specified."
+        exit 1
+    fi
+    echo "    Netmask for TCP network: $tcp_netmask"
+fi
+
+# Node names from the host file, sorted with duplicates removed
+nodenames=( $(sort -u "$host_file") )
+if [ ${#nodenames[@]} -eq 0 ]; then
+    echo "Error: empty host file! Exit."
+    exit 1
+fi
+numnodes=${#nodenames[@]}
+echo "Number of nodes: $numnodes"
+
+detect_cpu
+# the NUMA mode is only probed when detect_cpu reported knl
+if [ $cpu_model == knl ]; then
+    set_numa_node
+fi
+# create the result folder when it does not exist yet
+if [ ! -d $result_dir ]; then
+    echo "Create result directory: $result_dir"
+    mkdir -p $result_dir
+fi
+
+# benchmarks first (unless disabled with "none"), then the caffe run itself
+if [ "${benchmark_mode}" != "none" ]; then
+    run_benchmark
+fi
+
+if [ "${mode}" != "none" ]; then
+    caffe_bin="./build/tools/caffe"
+    # a missing caffe binary is an error: report it through the exit status
+    if ! check_dependency $caffe_bin; then
+        echo "Exit."
+        exit 1
+    fi
+    run_caffe
+fi
+
+echo "Result folder: $result_dir"
diff --git a/src/caffe/blob.cpp b/src/caffe/blob.cpp
index dd5546bde..48ae68dc7 100644
--- a/src/caffe/blob.cpp
+++ b/src/caffe/blob.cpp
@@ -72,7 +72,7 @@ void Blob<Dtype>::Reshape(const vector<int>& shape) {
   for (int i = 0; i < shape.size(); ++i) {
     CHECK_GE(shape[i], 0);
     if (count_ != 0) {
-      CHECK_LE(shape[i], INT_MAX / count_) << "blob size exceeds INT_MAX";
+      CHECK_LE(shape[i], LONG_MAX / count_) << "blob size exceeds LONG_MAX";
     }
     count_ *= shape[i];
     if (shape_[i] != shape[i]) {
@@ -369,7 +369,13 @@ Dtype Blob<Dtype>::asum_diff() const {
   switch (diff_->head()) {
   case SyncedMemory::SYNCED_PRV:
   case SyncedMemory::HEAD_AT_PRV:
-    return caffe_cpu_asum( prv_diff_count(), prv_diff());
+    {
+      const Dtype* prv_ptr = prv_diff();
+      if (prv_ptr == NULL)
+        return caffe_cpu_asum(count_, cpu_diff());
+      else
+        return caffe_cpu_asum(prv_diff_count(), prv_diff());
+    }
   case SyncedMemory::HEAD_AT_CPU:
     return caffe_cpu_asum(count_, cpu_diff());
   case SyncedMemory::HEAD_AT_GPU:
@@ -462,7 +468,11 @@ Dtype Blob<Dtype>::sumsq_diff() const {
   case SyncedMemory::SYNCED_PRV:
   case SyncedMemory::HEAD_AT_PRV:
       diff = prv_diff();
-      sumsq = caffe_cpu_dot(prv_diff_count(), diff, diff);
+      if (diff == NULL) {
+        diff = cpu_diff();
+        sumsq = caffe_cpu_dot(count_, diff, diff); 
+      } else
+        sumsq = caffe_cpu_dot(prv_diff_count(), diff, diff);
       break;
   case SyncedMemory::HEAD_AT_CPU:
     diff = cpu_diff();
diff --git a/src/caffe/layer_factory.cpp b/src/caffe/layer_factory.cpp
index 2b52007cc..0a6f83a21 100644
--- a/src/caffe/layer_factory.cpp
+++ b/src/caffe/layer_factory.cpp
@@ -92,6 +92,7 @@ shared_ptr<Layer<Dtype> > GetConvolutionLayer(
   for (int i = 0; i < conv_param.dilation_size(); ++i) {
     if (conv_param.dilation(i) > 1) {
       use_dilation = true;
+      break;
     }
   }
 #endif
@@ -589,10 +590,10 @@ shared_ptr<Layer<Dtype> > GetEltwiseLayer(const LayerParameter& param) {
 #if defined(MKL2017_SUPPORTED)
     else if (ep.isEngine("MKL2017"))
       engine = EltwiseParameter_Engine_MKL2017;
-#endif
-#if defined(MKLDNN_SUPPORTED)
-    else if (ep.isEngine("MKLDNN"))
-      engine = EltwiseParameter_Engine_MKLDNN;
+#endif
+#if defined(MKLDNN_SUPPORTED)
+    else if (ep.isEngine("MKLDNN"))
+      engine = EltwiseParameter_Engine_MKLDNN;
 #endif
   }
 
@@ -605,9 +606,9 @@ shared_ptr<Layer<Dtype> > GetEltwiseLayer(const LayerParameter& param) {
   } else if (engine == EltwiseParameter_Engine_MKL2017) {
     return shared_ptr<Layer<Dtype> >(new MKLEltwiseLayer<Dtype>(param));
 #endif
-#ifdef MKLDNN_SUPPORTED
-  } else if (engine == EltwiseParameter_Engine_MKLDNN) {
-    return shared_ptr<Layer<Dtype> >(new MKLDNNEltwiseLayer<Dtype>(param));
+#ifdef MKLDNN_SUPPORTED
+  } else if (engine == EltwiseParameter_Engine_MKLDNN) {
+    return shared_ptr<Layer<Dtype> >(new MKLDNNEltwiseLayer<Dtype>(param));
 #endif
   } else {
     LOG(FATAL) << "Layer " << param.name() << " has unknow engine.";
diff --git a/src/caffe/layers/batch_norm_layer.cpp b/src/caffe/layers/batch_norm_layer.cpp
index b7746d988..8331dd7d7 100644
--- a/src/caffe/layers/batch_norm_layer.cpp
+++ b/src/caffe/layers/batch_norm_layer.cpp
@@ -81,13 +81,22 @@ void BatchNormLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
     CHECK_EQ(bottom[0]->shape(1), channels_);
   top[0]->ReshapeLike(*bottom[0]);
 
+  num_stats_batches_ = 1;
+  stats_batch_size_ = bottom[0]->shape(0);
+  BatchNormParameter param = this->layer_param_.batch_norm_param();
+  if (!use_global_stats_ && param.stats_batch_size() > 0) {
+    CHECK_EQ(bottom[0]->shape(0) % param.stats_batch_size(), 0);
+    num_stats_batches_ = bottom[0]->shape(0) / param.stats_batch_size();
+    stats_batch_size_ = param.stats_batch_size();
+  }
+
   vector<int> sz;
   sz.push_back(channels_);
   mean_.Reshape(sz);
   variance_.Reshape(sz);
   temp_.ReshapeLike(*bottom[0]);
   x_norm_.ReshapeLike(*bottom[0]);
-  sz[0]=bottom[0]->shape(0);
+  sz[0]=stats_batch_size_;
   batch_sum_multiplier_.Reshape(sz);
 
   int spatial_dim = bottom[0]->count(2);
@@ -99,7 +108,7 @@ void BatchNormLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
     caffe_set(spatial_sum_multiplier_.count(), Dtype(1), multiplier_data);
   }
 
-  int numbychans = channels_*bottom[0]->shape(0);
+  int numbychans = channels_*stats_batch_size_;
   if (num_by_chans_.num_axes() == 0 ||
       num_by_chans_.shape(0) != numbychans) {
     sz[0] = numbychans;
@@ -149,18 +158,20 @@ void BatchNormLayer<Dtype>::replicate_to_op(Dtype* buffer_to_write,
   }
 }
 
-
-
 template <typename Dtype>
-void BatchNormLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->cpu_data();
-  Dtype* top_data = top[0]->mutable_cpu_data();
-  int num = bottom[0]->shape(0);
+void BatchNormLayer<Dtype>::ForwardStatsBatch_cpu(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top, int stats_batch_idx) {
+  long data_stats_count = stats_batch_size_ * bottom[0]->count(1);
+  long data_offset = stats_batch_idx * data_stats_count;
+  const Dtype* bottom_data = bottom[0]->cpu_data() + data_offset;
+  Dtype* top_data = top[0]->mutable_cpu_data() + data_offset;
+  Dtype* temp_data = temp_.mutable_cpu_data() + data_offset;
+  Dtype* x_norm_data = x_norm_.mutable_cpu_data() + data_offset;
+  int num = stats_batch_size_;
   int spatial_dim = bottom[0]->count()/(bottom[0]->shape(0)*channels_);
 
   if (bottom[0] != top[0]) {
-    caffe_copy(bottom[0]->count(), bottom_data, top_data);
+    caffe_copy(data_stats_count, bottom_data, top_data);
   }
 
   if (use_global_stats_) {
@@ -192,10 +203,10 @@ void BatchNormLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
 
   if (!use_global_stats_) {
     // compute variance using var(X) = E((X-EX)^2)
-    caffe_powx(top[0]->count(), top_data, Dtype(2),
-        temp_.mutable_cpu_data());  // (X-EX)^2
+    caffe_powx(data_stats_count, top_data, Dtype(2),
+        temp_data);  // (X-EX)^2
     caffe_cpu_gemv<Dtype>(CblasNoTrans, channels_ * num, spatial_dim,
-        1. / (num * spatial_dim), temp_.cpu_data(),
+        1. / (num * spatial_dim), temp_data,
         spatial_sum_multiplier_.cpu_data(), 0.,
         num_by_chans_.mutable_cpu_data());
     caffe_cpu_gemv<Dtype>(CblasTrans, num, channels_, 1.,
@@ -207,7 +218,7 @@ void BatchNormLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
     this->blobs_[2]->mutable_cpu_data()[0] += 1;
     caffe_cpu_axpby(mean_.count(), Dtype(1), mean_.cpu_data(),
         moving_average_fraction_, this->blobs_[0]->mutable_cpu_data());
-    int m = bottom[0]->count()/channels_;
+    int m = bottom[0]->count()/num_stats_batches_/channels_;
     Dtype bias_correction_factor = m > 1 ? Dtype(m)/(m-1) : 1;
     caffe_cpu_axpby(variance_.count(), bias_correction_factor,
         variance_.cpu_data(), moving_average_fraction_,
@@ -220,37 +231,40 @@ void BatchNormLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
              variance_.mutable_cpu_data());
 
   // replicate variance to input size
-  this->replicate(temp_.mutable_cpu_data(),
+  this->replicate(temp_data,
                   num,
                   spatial_dim*channels_,
                   spatial_dim,
                   variance_.cpu_data());
 
-  caffe_div(temp_.count(), top_data, temp_.cpu_data(), top_data);
+  caffe_div(data_stats_count, top_data, temp_data, top_data);
   // TODO(cdoersch): The caching is only needed because later in-place layers
   //                 might clobber the data.  Can we skip this if they won't?
-  caffe_copy(x_norm_.count(), top_data,
-      x_norm_.mutable_cpu_data());
+  caffe_copy(data_stats_count, top_data,
+      x_norm_data);
 }
 
 template <typename Dtype>
-void BatchNormLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
+void BatchNormLayer<Dtype>::BackwardStatsBatch_cpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom,
+    int stats_batch_idx) {
+  long data_stats_count = stats_batch_size_ * bottom[0]->count(1);
+  long data_offset = stats_batch_idx * data_stats_count;
   const Dtype* top_diff;
   if (bottom[0] != top[0]) {
-    top_diff = top[0]->cpu_diff();
+    top_diff = top[0]->cpu_diff() + data_offset;
   } else {
-    caffe_copy(x_norm_.count(), top[0]->cpu_diff(), x_norm_.mutable_cpu_diff());
-    top_diff = x_norm_.cpu_diff();
+    caffe_copy(data_stats_count, top[0]->cpu_diff() + data_offset,
+               x_norm_.mutable_cpu_diff() + data_offset);
+    top_diff = x_norm_.cpu_diff() + data_offset;
   }
-  Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+  Dtype* bottom_diff = bottom[0]->mutable_cpu_diff() + data_offset;
   if (use_global_stats_) {
-    caffe_div(temp_.count(), top_diff, temp_.cpu_data(), bottom_diff);
+    caffe_div(data_stats_count, top_diff, temp_.cpu_data() + data_offset, bottom_diff);
     return;
   }
-  const Dtype* top_data = x_norm_.cpu_data();
-  int num = bottom[0]->shape()[0];
+  const Dtype* top_data = x_norm_.cpu_data() + data_offset;
+  int num = stats_batch_size_;
   int spatial_dim = bottom[0]->count()/(bottom[0]->shape(0)*channels_);
   // if Y = (X-mean(X))/(sqrt(var(X)+eps)), then
   //
@@ -265,7 +279,7 @@ void BatchNormLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   // dimensions except the channels dimension where required.
 
   // sum(dE/dY \cdot Y)
-  caffe_mul(temp_.count(), top_data, top_diff, bottom_diff);
+  caffe_mul(data_stats_count, top_data, top_diff, bottom_diff);
   caffe_cpu_gemv<Dtype>(CblasNoTrans, channels_ * num, spatial_dim, 1.,
       bottom_diff, spatial_sum_multiplier_.cpu_data(), 0.,
       num_by_chans_.mutable_cpu_data());
@@ -280,7 +294,7 @@ void BatchNormLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
                   mean_.cpu_data());
 
   // sum(dE/dY \cdot Y) \cdot Y
-  caffe_mul(temp_.count(), top_data, bottom_diff, bottom_diff);
+  caffe_mul(data_stats_count, top_data, bottom_diff, bottom_diff);
 
   // sum(dE/dY)-sum(dE/dY \cdot Y) \cdot Y
   caffe_cpu_gemv<Dtype>(CblasNoTrans, channels_ * num, spatial_dim, 1.,
@@ -300,12 +314,29 @@ void BatchNormLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
                   std::plus<Dtype>());
 
   // dE/dY - mean(dE/dY)-mean(dE/dY \cdot Y) \cdot Y
-  caffe_cpu_axpby(temp_.count(), Dtype(1), top_diff,
+  caffe_cpu_axpby(data_stats_count, Dtype(1), top_diff,
       Dtype(-1. / (num * spatial_dim)), bottom_diff);
 
   // note: temp_ still contains sqrt(var(X)+eps), computed during the forward
   // pass.
-  caffe_div(temp_.count(), bottom_diff, temp_.cpu_data(), bottom_diff);
+  caffe_div(data_stats_count, bottom_diff, temp_.cpu_data() + data_offset, bottom_diff);
+}
+
+template <typename Dtype>  // CPU forward entry point: runs ForwardStatsBatch_cpu once per statistics batch.
+void BatchNormLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  for (int i = 0; i < num_stats_batches_; i++) {  // i is passed on as stats_batch_idx
+    ForwardStatsBatch_cpu(bottom, top, i);  // processes the slice at offset i * stats_batch_size_ * bottom[0]->count(1)
+  }
+}
+
+template <typename Dtype>  // CPU backward entry point: runs BackwardStatsBatch_cpu once per statistics batch.
+void BatchNormLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down,
+    const vector<Blob<Dtype>*>& bottom) {
+  for (int i = 0; i < num_stats_batches_; i++) {  // i is passed on as stats_batch_idx
+    BackwardStatsBatch_cpu(top, propagate_down, bottom, i);  // arguments are forwarded unchanged; only the index varies
+  }
 }
 
 
diff --git a/src/caffe/layers/mkl_batch_norm_layer.cpp b/src/caffe/layers/mkl_batch_norm_layer.cpp
index 6dce50243..a24500c69 100755
--- a/src/caffe/layers/mkl_batch_norm_layer.cpp
+++ b/src/caffe/layers/mkl_batch_norm_layer.cpp
@@ -52,8 +52,12 @@ MKLBatchNormLayer<Dtype>::~MKLBatchNormLayer() {
   dnnDelete<Dtype>(batchNormFwdInference);
   dnnDelete<Dtype>(batchNormBwd);
   dnnLayoutDelete<Dtype>(layout_usr_);
-  dnnReleaseBuffer<Dtype>(mean_buffer_);
-  dnnReleaseBuffer<Dtype>(variance_buffer_);
+  for (int i = 0; i < mean_buffers_.size(); i++) {
+    dnnReleaseBuffer<Dtype>(mean_buffers_[i]);
+  }
+  for (int i = 0; i < variance_buffers_.size(); i++) {
+    dnnReleaseBuffer<Dtype>(variance_buffers_[i]);
+  }
   dnnReleaseBuffer<Dtype>(scaleShift_buffer_);
   dnnReleaseBuffer<Dtype>(diffScaleShift_buffer_);
 }
@@ -71,6 +75,15 @@ void MKLBatchNormLayer<Dtype>::Init(const vector<Blob<Dtype>*>& bottom,
   if (this->layer_param_.batch_norm_param().has_use_global_stats())
     use_global_stats_ = this->layer_param_.batch_norm_param().use_global_stats();
 
+  num_stats_batches_ = 1;
+  stats_batch_size_ = bottom[0]->shape(0);
+  BatchNormParameter param = this->layer_param_.batch_norm_param();
+  if (!use_global_stats_ && param.stats_batch_size() > 0) {
+    CHECK_EQ(bottom[0]->shape(0) % param.stats_batch_size(), 0);
+    num_stats_batches_ = bottom[0]->shape(0) / param.stats_batch_size();
+    stats_batch_size_ = param.stats_batch_size();
+  }
+
   CHECK(use_weight_bias_) << "BatchNorm without scaling have not supported yet";
 
   size_t dim = 4, sizes[4], strides[4];
@@ -99,18 +112,25 @@ void MKLBatchNormLayer<Dtype>::Init(const vector<Blob<Dtype>*>& bottom,
   // TODO: Make a cleanup routine to avoid
   // copy of following code in the Destructor
 
-  dnnError_t e;
-  dnnLayoutDelete<Dtype>(layout_usr_);
-  e = dnnLayoutCreate<Dtype>(&layout_usr_, dim, sizes, strides);
-  CHECK_EQ(e, E_SUCCESS);
-
   fwd_bottom_data->create_user_layout(dim, sizes, strides, false);
   fwd_top_data   ->create_user_layout(dim, sizes, strides, false);
   bwd_bottom_diff->create_user_layout(dim, sizes, strides, false);
   bwd_top_diff   ->create_user_layout(dim, sizes, strides, false);
 
-  dnnReleaseBuffer<Dtype>(mean_buffer_);
-  dnnReleaseBuffer<Dtype>(variance_buffer_);
+  sizes[3] /= num_stats_batches_;
+  dnnError_t e;
+  dnnLayoutDelete<Dtype>(layout_usr_);
+  e = dnnLayoutCreate<Dtype>(&layout_usr_, dim, sizes, strides);
+  CHECK_EQ(e, E_SUCCESS);
+
+  for (int i = 0; i < mean_buffers_.size(); i++) {
+    dnnReleaseBuffer<Dtype>(mean_buffers_[i]);
+  }
+  for (int i = 0; i < variance_buffers_.size(); i++) {
+    dnnReleaseBuffer<Dtype>(variance_buffers_[i]);
+  }
+  mean_buffers_.resize(num_stats_batches_, NULL);
+  variance_buffers_.resize(num_stats_batches_, NULL);
   dnnReleaseBuffer<Dtype>(scaleShift_buffer_);
   dnnReleaseBuffer<Dtype>(diffScaleShift_buffer_);
 
@@ -223,26 +243,30 @@ void MKLBatchNormLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
     strides[2] = sizes[0]*sizes[1];
     strides[3] = sizes[0]*sizes[1]*sizes[2];
 
-    dnnError_t e;
-    dnnLayoutDelete<Dtype>(layout_usr_);
-    e = dnnLayoutCreate<Dtype>(&layout_usr_, dim, sizes, strides);
-    CHECK_EQ(e, E_SUCCESS);
     fwd_bottom_data->create_user_layout(dim, sizes, strides, false);
     fwd_top_data   ->create_user_layout(dim, sizes, strides, false);
     bwd_bottom_diff->create_user_layout(dim, sizes, strides, false);
     bwd_top_diff   ->create_user_layout(dim, sizes, strides, false);
+
+    sizes[3] /= num_stats_batches_;
+    dnnError_t e;
+    dnnLayoutDelete<Dtype>(layout_usr_);
+    e = dnnLayoutCreate<Dtype>(&layout_usr_, dim, sizes, strides);
+    CHECK_EQ(e, E_SUCCESS);
   }
 }
 
 template <typename Dtype>
-void MKLBatchNormLayer<Dtype>::Forward_cpu(
-    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+void MKLBatchNormLayer<Dtype>::ForwardStatsBatch_cpu(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top, int stats_batch_idx) {
+  long data_offset = stats_batch_idx * stats_batch_size_ * bottom[0]->count(1);
   void* bottom_data =
     reinterpret_cast<void *>(const_cast<Dtype*>(bottom[0]->prv_data()));
   int is_first_pass = 0;
-  unsigned int amount_to_copy =0;
+  long amount_to_copy =0;
 
-  if (NULL != bottom_data) {
+  // TODO: support private memory with num_stats_batches_ > 1
+  if (NULL != bottom_data && num_stats_batches_ == 1) {
     amount_to_copy = bottom[0]->prv_data_count();
     // Is it the first pass? Create a primitive.
     if (batchNormFwd == NULL) {
@@ -311,7 +335,7 @@ void MKLBatchNormLayer<Dtype>::Forward_cpu(
     }
     bottom_data =
       reinterpret_cast<void *>(const_cast<Dtype*>(bottom[0]->cpu_data()));
-    amount_to_copy = bottom[0]->count();
+    amount_to_copy = bottom[0]->count() / num_stats_batches_;
   }
   if (is_first_pass == 1) {
       dnnError_t e;
@@ -319,18 +343,22 @@ void MKLBatchNormLayer<Dtype>::Forward_cpu(
       e = dnnLayoutCreateFromPrimitive<Dtype>(
         &mean_buffer_l, batchNormFwd, dnnResourceMean);
       CHECK_EQ(e, E_SUCCESS);
-      e = dnnAllocateBuffer<Dtype>(
-        reinterpret_cast<void**>(&mean_buffer_), mean_buffer_l);
-      CHECK_EQ(e, E_SUCCESS);
+      for (int i = 0; i < num_stats_batches_; i++) {
+        e = dnnAllocateBuffer<Dtype>(
+          reinterpret_cast<void**>(&mean_buffers_[i]), mean_buffer_l);
+        CHECK_EQ(e, E_SUCCESS);
+      }
       dnnLayoutDelete<Dtype>(mean_buffer_l);
 
       dnnLayout_t variance_buffer_l = NULL;
       e = dnnLayoutCreateFromPrimitive<Dtype>(
         &variance_buffer_l, batchNormFwd, dnnResourceVariance);
       CHECK_EQ(e, E_SUCCESS);
-      e = dnnAllocateBuffer<Dtype>(
-        reinterpret_cast<void**>(&variance_buffer_), variance_buffer_l);
-      CHECK_EQ(e, E_SUCCESS);
+      for (int i = 0; i < num_stats_batches_; i++) {
+        e = dnnAllocateBuffer<Dtype>(
+          reinterpret_cast<void**>(&variance_buffers_[i]), variance_buffer_l);
+        CHECK_EQ(e, E_SUCCESS);
+      }
       dnnLayoutDelete<Dtype>(variance_buffer_l);
 
        dnnLayout_t diffScaleShift_buffer_l = NULL;
@@ -374,8 +402,8 @@ void MKLBatchNormLayer<Dtype>::Forward_cpu(
     // Note that this is only necessary for Backward; we skip this if not
     // doing Backward
     // TODO: make a caffe_coppy working on blobs
-    caffe_copy(amount_to_copy, static_cast<Dtype*>(bottom_data),
-               temp_.mutable_cpu_data());
+    caffe_copy(amount_to_copy, static_cast<Dtype*>(bottom_data) + data_offset,
+               temp_.mutable_cpu_data() + data_offset);
   }
 
   if (use_global_stats_) {
@@ -383,24 +411,25 @@ void MKLBatchNormLayer<Dtype>::Forward_cpu(
     const Dtype scale_factor = this->blobs_[2]->cpu_data()[0] == 0 ?
                                0 : 1 / this->blobs_[2]->cpu_data()[0];
     caffe_cpu_scale(this->blobs_[0]->count(), scale_factor,
-                    this->blobs_[0]->cpu_data(), mean_buffer_);
+                    this->blobs_[0]->cpu_data(), mean_buffers_[stats_batch_idx]);
     caffe_cpu_scale(this->blobs_[1]->count(), scale_factor,
-                    this->blobs_[1]->cpu_data(), variance_buffer_);
+                    this->blobs_[1]->cpu_data(), variance_buffers_[stats_batch_idx]);
   }
 
   dnnError_t e;
   void* BatchNorm_res[dnnResourceNumber];
-  BatchNorm_res[dnnResourceMean] = mean_buffer_;
-  BatchNorm_res[dnnResourceVariance] = variance_buffer_;
-  BatchNorm_res[dnnResourceSrc] = bottom_data;
+  BatchNorm_res[dnnResourceMean] = mean_buffers_[stats_batch_idx];
+  BatchNorm_res[dnnResourceVariance] = variance_buffers_[stats_batch_idx];
+  BatchNorm_res[dnnResourceSrc] = (Dtype*)bottom_data + data_offset;
   BatchNorm_res[dnnResourceScaleShift] = scaleShift_buffer_;
   if (fwd_top_data->conversion_needed()) {
     top[0]->set_prv_data_descriptor(fwd_top_data);
+    data_offset = stats_batch_idx * (top[0]->prv_data_count() / num_stats_batches_);
     BatchNorm_res[dnnResourceDst] =
-            reinterpret_cast<void *>(top[0]->mutable_prv_data());
+            reinterpret_cast<void *>(top[0]->mutable_prv_data() + data_offset);
   } else {
     BatchNorm_res[dnnResourceDst] =
-            reinterpret_cast<void *>(top[0]->mutable_cpu_data());
+            reinterpret_cast<void *>(top[0]->mutable_cpu_data() + data_offset);
     DLOG(INFO) << "Using cpu_data for top in DnnBatchNorm.";
   }
 
@@ -415,20 +444,21 @@ void MKLBatchNormLayer<Dtype>::Forward_cpu(
      // compute and save moving average
     this->blobs_[2]->mutable_cpu_data()[0] *= moving_average_fraction_;
     this->blobs_[2]->mutable_cpu_data()[0] += 1;
-    caffe_cpu_axpby(this->blobs_[0]->count(), Dtype(1), mean_buffer_,
+    caffe_cpu_axpby(this->blobs_[0]->count(), Dtype(1), mean_buffers_[stats_batch_idx],
         moving_average_fraction_, this->blobs_[0]->mutable_cpu_data());
-    int m = bottom[0]->count()/channels_;
+    int m = bottom[0]->count()/num_stats_batches_/channels_;
     Dtype bias_correction_factor = m > 1 ? Dtype(m)/(m-1) : 1;
     caffe_cpu_axpby(this->blobs_[1]->count(), bias_correction_factor,
-        variance_buffer_, moving_average_fraction_,
+        variance_buffers_[stats_batch_idx], moving_average_fraction_,
         this->blobs_[1]->mutable_cpu_data());
   }
 }
 
 template <typename Dtype>
-void MKLBatchNormLayer<Dtype>::Backward_cpu(
-    const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
+void MKLBatchNormLayer<Dtype>::BackwardStatsBatch_cpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom,
+    int stats_batch_idx) {
+  long data_offset = stats_batch_idx * stats_batch_size_ * bottom[0]->count(1);
   void *bottom_data = NULL;
   if (bottom[0] == top[0]) {
     bottom_data = reinterpret_cast<void *>(
@@ -437,7 +467,7 @@ void MKLBatchNormLayer<Dtype>::Backward_cpu(
     bottom_data =
             reinterpret_cast<void *>(
                         const_cast<Dtype*>(bottom[0]->prv_data()));
-    if (NULL == bottom_data)
+    if (NULL == bottom_data || num_stats_batches_ > 1)
       bottom_data =
             reinterpret_cast<void *>(
                         const_cast<Dtype*>(bottom[0]->cpu_data()));
@@ -445,19 +475,19 @@ void MKLBatchNormLayer<Dtype>::Backward_cpu(
 
   dnnError_t e;
   void* BatchNorm_res[dnnResourceNumber];
-  BatchNorm_res[dnnResourceMean] = mean_buffer_;
-  BatchNorm_res[dnnResourceVariance] = variance_buffer_;
-  BatchNorm_res[dnnResourceSrc] = bottom_data;
+  BatchNorm_res[dnnResourceMean] = mean_buffers_[stats_batch_idx];
+  BatchNorm_res[dnnResourceVariance] = variance_buffers_[stats_batch_idx];
+  BatchNorm_res[dnnResourceSrc] = (Dtype*)bottom_data + data_offset;
   BatchNorm_res[dnnResourceScaleShift] = scaleShift_buffer_;
   BatchNorm_res[dnnResourceDiffScaleShift] = diffScaleShift_buffer_;
-
-  BatchNorm_res[dnnResourceDiffDst] = bwd_top_diff->get_converted_prv(top[0],
-          true);
+  BatchNorm_res[dnnResourceDiffDst] =
+    bwd_top_diff->get_converted_prv(top[0], true) + data_offset;
   if (bwd_bottom_diff->conversion_needed()) {
     bottom[0]->set_prv_diff_descriptor(bwd_bottom_diff);
-    BatchNorm_res[dnnResourceDiffSrc] = bottom[0]->mutable_prv_diff();
+    data_offset = stats_batch_idx * (bottom[0]->prv_diff_count() / num_stats_batches_);
+    BatchNorm_res[dnnResourceDiffSrc] = bottom[0]->mutable_prv_diff() + data_offset;
   } else {
-    BatchNorm_res[dnnResourceDiffSrc] = bottom[0]->mutable_cpu_diff();
+    BatchNorm_res[dnnResourceDiffSrc] = bottom[0]->mutable_cpu_diff() + data_offset;
   }
 
   PERFORMANCE_EVENT_ID_INIT(perf_id_bw_, PERFORMANCE_MKL_NAME("BW"));
@@ -479,6 +509,23 @@ void MKLBatchNormLayer<Dtype>::Backward_cpu(
   }
 }
 
+template <typename Dtype>  // CPU forward entry point: runs ForwardStatsBatch_cpu once per statistics batch.
+void MKLBatchNormLayer<Dtype>::Forward_cpu(
+    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+  for (int i = 0; i < num_stats_batches_; i++) {  // i selects mean_buffers_[i] / variance_buffers_[i] in the callee
+    ForwardStatsBatch_cpu(bottom, top, i);  // i is passed on as stats_batch_idx
+  }
+}
+
+template <typename Dtype>  // CPU backward entry point: runs BackwardStatsBatch_cpu once per statistics batch.
+void MKLBatchNormLayer<Dtype>::Backward_cpu(
+    const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
+    const vector<Blob<Dtype>*>& bottom) {
+  for (int i = 0; i < num_stats_batches_; i++) {  // i selects mean_buffers_[i] / variance_buffers_[i] in the callee
+    BackwardStatsBatch_cpu(top, propagate_down, bottom, i);  // arguments are forwarded unchanged; only the index varies
+  }
+}
+
 
 #ifdef CPU_ONLY
 STUB_GPU(MKLBatchNormLayer);
diff --git a/src/caffe/layers/mkldnn_batch_norm_layer.cpp b/src/caffe/layers/mkldnn_batch_norm_layer.cpp
index 4db92b943..f1edfebd4 100644
--- a/src/caffe/layers/mkldnn_batch_norm_layer.cpp
+++ b/src/caffe/layers/mkldnn_batch_norm_layer.cpp
@@ -44,6 +44,18 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 namespace caffe {
 
+template <typename Dtype>  // Derives num_stats_batches_ and stats_batch_size_ from the given mini-batch size.
+void MKLDNNBatchNormLayer<Dtype>::InitStatsBatchVars(int batch_size) {
+    num_stats_batches_ = 1;  // default: the whole mini-batch forms a single statistics batch
+    stats_batch_size_ = batch_size;
+    const BatchNormParameter& param = this->layer_param_.batch_norm_param();  // bind by reference: avoids copying the message on every call
+    if (!use_global_stats_ && param.stats_batch_size() > 0) {  // split only when global stats are off and a positive size is configured
+      CHECK_EQ(batch_size % param.stats_batch_size(), 0);  // batch_size must be a multiple of stats_batch_size
+      num_stats_batches_ = batch_size / param.stats_batch_size();
+      stats_batch_size_ = param.stats_batch_size();
+    }
+}
+
 template <typename Dtype>
 void MKLDNNBatchNormLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom
                                         ,const vector<Blob<Dtype>*>& top)
@@ -62,6 +74,10 @@ void MKLDNNBatchNormLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom
     bias_term_ = this->layer_param_.batch_norm_param().bias_term();
     moving_average_fraction_ = this->layer_param_.batch_norm_param().moving_average_fraction();
     use_global_stats_ = this->phase_ == TEST;
+    if (this->layer_param_.batch_norm_param().has_use_global_stats())
+      use_global_stats_ = this->layer_param_.batch_norm_param().use_global_stats();
+
+    InitStatsBatchVars(num_);
 
     this->blobs_.resize(3 + (use_weight_bias_ ? 1:0) + (use_weight_bias_ && bias_term_ ? 1:0));
 
@@ -76,6 +92,22 @@ void MKLDNNBatchNormLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom
             this->blobs_[i]->mutable_cpu_data());
     }
 
+    // IntelCaffe treats scale and shift as separate blobs, so the current MKL-DNN integration needs additional copies from the Caffe buffers to the MKL-DNN buffer on the forward pass and from the MKL-DNN buffer to the Caffe buffers on the backward pass.
+    // Optimization: keep scale and shift together in one combined blob so that these additional copies are avoided.
+    // Initialize the combined scale-and-shift blob.
+    vector<int> scaleshift_blob_shape(1);
+    scaleshift_blob_shape[0] = 2*channels_;
+    scaleshift_blob_.reset(new Blob<Dtype>(scaleshift_blob_shape));
+    // Zero-initialize the scaleshift_blob_ buffer: when bias_term_ == false, a zero bias still has to be passed to MKLDNN.
+    caffe_set(scaleshift_blob_shape[0], static_cast<Dtype>(0),
+              scaleshift_blob_->mutable_cpu_data());
+    shared_ptr<Blob<Dtype> > scaleshift_diff_blob = scaleshift_blob_;
+    scaleshift_acc_ = scaleshift_blob_;
+    if (num_stats_batches_ > 1) {
+      this->scaleshift_acc_.reset(new Blob<Dtype>(scaleshift_blob_shape));
+      scaleshift_diff_blob = scaleshift_acc_;
+    }
+
     if (use_weight_bias_) {
         // Initialize scale and shift
         vector<int> scaleshift_shape(1);
@@ -83,6 +115,8 @@ void MKLDNNBatchNormLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom
         VLOG(1) << "MKLDNNBatchNormLayer<Dtype>::LayerSetUp: channels_  = " << channels_;
 
         this->blobs_[3].reset(new Blob<Dtype>(scaleshift_shape));
+        this->blobs_[3]->set_cpu_data(scaleshift_blob_->mutable_cpu_data());
+        this->blobs_[3]->set_cpu_diff(scaleshift_diff_blob->mutable_cpu_diff());
         FillerParameter filler_param(this->layer_param_.batch_norm_param().filler());
         if (!this->layer_param_.batch_norm_param().has_filler()) {
             filler_param.set_type("constant");
@@ -92,8 +126,10 @@ void MKLDNNBatchNormLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom
         VLOG(1) << "MKLDNNBatchNormLayer<Dtype>::LayerSetUp: scaleshift " << __LINE__ << ":" << this->layer_param_.name();
         filler->Fill(this->blobs_[3].get());
 
-        if ( bias_term_ ) {
+        if (bias_term_) {
             this->blobs_[4].reset(new Blob<Dtype>(scaleshift_shape));
+            this->blobs_[4]->set_cpu_data(scaleshift_blob_->mutable_cpu_data() + scaleshift_blob_->offset(channels_));
+            this->blobs_[4]->set_cpu_diff(scaleshift_diff_blob->mutable_cpu_diff() + scaleshift_blob_->offset(channels_));
             FillerParameter bias_filler_param(this->layer_param_.batch_norm_param().bias_filler());
             if (!this->layer_param_.batch_norm_param().has_bias_filler()) {
                 bias_filler_param.set_type("constant");
@@ -130,6 +166,8 @@ void MKLDNNBatchNormLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom
     this->num_ = bottom[0]->num();
     this->channels_ = bottom[0]->channels();
 
+    InitStatsBatchVars(this->num_);
+
     //Fix: should reshape the top blob with the real size of bottom blob
     //top[0]->Reshape(this->num_, this->channels_, this->height_, this->width_);
 #ifdef DEBUG
@@ -159,8 +197,9 @@ void MKLDNNBatchNormLayer<Dtype>::InitBatchNorm(const vector<Blob<Dtype>*>& bott
     memory::data_type mpcsn = memory::data_type::f32;
     
     // ---- Initialize memory descriptors -------------
-    shared_ptr<memory::desc> input_md, output_md, scaleshift_md;
-    shared_ptr<memory::primitive_desc> usr_mpd, prv_mpd, scaleshift_mpd;
+    shared_ptr<memory::desc> input_md, input_stats_md, output_md, scaleshift_md;
+    shared_ptr<memory::primitive_desc> usr_mpd, prv_mpd;
+    shared_ptr<memory::primitive_desc> scaleshift_mpd;
     if (bottom_data_is_prv) {
         shared_ptr<MKLDNNMemoryDescriptor<Dtype, false> > mem_descr
             = get_mkldnn_prv_descriptor<Dtype, false>(bottom[0]);
@@ -172,9 +211,13 @@ void MKLDNNBatchNormLayer<Dtype>::InitBatchNorm(const vector<Blob<Dtype>*>& bott
         usr_mpd.reset(new memory::primitive_desc(*input_md, cpu_engine));
     }
     output_md = input_md;
+    input_stats_md.reset(new memory::desc(*input_md));
+    CHECK(input_stats_md->data.ndims > 0 &&
+          input_stats_md->data.dims[0] == this->num_);
+    input_stats_md->data.dims[0] = stats_batch_size_;
 
     // ---- Initialize BatchNorm primitive descriptor -------------
-    batch_normalization_forward::desc BatchNormFwd_desc(propagation, *input_md, eps_, flags);
+    batch_normalization_forward::desc BatchNormFwd_desc(propagation, *input_stats_md, eps_, flags);
     // ---- Determining engine to use -----------------------
     std::string subengines = this->layer_param_.engine();
     if (subengines == "" || subengines == "MKLDNN")
@@ -196,7 +239,7 @@ void MKLDNNBatchNormLayer<Dtype>::InitBatchNorm(const vector<Blob<Dtype>*>& bott
 
     // ---- Create memory  ---------------------
     if (use_weight_bias_) {
-        scaleshift_memory.reset(new memory(BatchNormFwd_pd->weights_primitive_desc()));
+        scaleshift_memory.reset(new memory(BatchNormFwd_pd->weights_primitive_desc(), this->scaleshift_blob_->mutable_cpu_data()));
     }
 
     // ---  init primitive and prv_memory descriptors ----------------------
@@ -206,44 +249,13 @@ void MKLDNNBatchNormLayer<Dtype>::InitBatchNorm(const vector<Blob<Dtype>*>& bott
     fwd_top_data.reset(new MKLDNNData<Dtype>(usr_mpd, prv_mpd, top[0], this));
     output_memory = fwd_top_data->create_output_memory();
 
-    // ---- Create BatchNorm --------------------
-    if (this->phase_ == TEST && !use_global_stats_) {
-        if (use_weight_bias_) {
-            BatchNormFwd.reset(new batch_normalization_forward(*BatchNormFwd_pd,
-                    *input_primitive, *scaleshift_memory, *output_memory));
-        } else {
-            BatchNormFwd.reset(new batch_normalization_forward(*BatchNormFwd_pd,
-                    *input_primitive, *output_memory));
-        }
-    } else {
-        mean_memory.reset(new memory(BatchNormFwd_pd->mean_primitive_desc()));
-        variance_memory.reset(new memory(BatchNormFwd_pd->variance_primitive_desc()));
-
-        if (use_global_stats_) {
-            caffe_copy<Dtype>(this->channels_, this->blobs_[0]->cpu_data(),
-                static_cast<Dtype *>(mean_memory->get_data_handle()));
-            caffe_copy<Dtype>(this->channels_, this->blobs_[1]->cpu_data(),
-               static_cast<Dtype *>(variance_memory->get_data_handle()));
-            if (use_weight_bias_) {
-                BatchNormFwd.reset(new batch_normalization_forward(*BatchNormFwd_pd,
-                        *input_primitive, (const primitive::at)*mean_memory,
-                        (const primitive::at)*variance_memory, *scaleshift_memory,
-                        *output_memory));
-            } else {
-                BatchNormFwd.reset(new batch_normalization_forward(*BatchNormFwd_pd,
-                        *input_primitive, (const primitive::at)*mean_memory,
-                        (const primitive::at)*variance_memory, *output_memory));
-            }
-        } else {
-            if (use_weight_bias_) {
-                BatchNormFwd.reset(new batch_normalization_forward(*BatchNormFwd_pd,
-                        *input_primitive, *scaleshift_memory, *output_memory,
-                        *mean_memory, *variance_memory));
-            } else {
-                BatchNormFwd.reset(new batch_normalization_forward(*BatchNormFwd_pd,
-                        *input_primitive, *output_memory, *mean_memory, *variance_memory));
-            }
-        }
+    mean_memory.resize(num_stats_batches_);
+    variance_memory.resize(num_stats_batches_);
+    input_stats.resize(num_stats_batches_);
+    output_stats.resize(num_stats_batches_);
+    BatchNormFwd.resize(num_stats_batches_);
+    for (int i = 0; i < num_stats_batches_; i++) {
+      InitBatchNormFwdPrimitive(i);
     }
 
     //fwd_bottom_data->set_mkldnn_primitive(BatchNormFwd);  //Wrong passed primitive! (TODO: Checking!)
@@ -272,6 +284,70 @@ void MKLDNNBatchNormLayer<Dtype>::InitBatchNorm(const vector<Blob<Dtype>*>& bott
     }
 }
 
+template <typename Dtype>  // Builds a memory object for statistics batch `idx` on top of mkldnn_mem's buffer.
+template <bool diff>
+shared_ptr<memory> MKLDNNBatchNormLayer<Dtype>::GetStatsBatchMemory(
+  shared_ptr<MKLDNNMemoryDescriptor<Dtype, diff> > mkldnn_mem, int idx) {
+    long data_offset =  // idx times the per-stats-batch count (stats_batch_size_ * C * W * H)
+      idx * stats_batch_size_ * this->channels_ * this->width_ * this->height_;
+    engine cpu_engine = CpuEngine::Instance().get_engine();
+    shared_ptr<memory::desc> stats_md = mkldnn_mem->get_memory_desc();
+    CHECK(stats_md->data.ndims > 0 &&  // the descriptor must still describe the full batch (dims[0] == num_)
+          stats_md->data.dims[0] == this->num_);
+    stats_md->data.dims[0] = stats_batch_size_;  // NOTE(review): edits the returned desc in place; assumes get_memory_desc() hands out a fresh copy - confirm
+    shared_ptr<memory::primitive_desc> stats_mpd(
+      new memory::primitive_desc(*stats_md, cpu_engine));
+    shared_ptr<memory> stats(  // presumably get_memory_ptr() takes an element offset - verify against its definition
+      new memory(*stats_mpd, mkldnn_mem->get_memory_ptr(data_offset)));
+    return stats;
+}
+
+template <typename Dtype>  // Creates the forward batch-norm primitive and its memories for statistics batch `idx`.
+void MKLDNNBatchNormLayer<Dtype>::InitBatchNormFwdPrimitive(int idx) {
+    input_stats[idx] = GetStatsBatchMemory<false>(fwd_bottom_data, idx);  // memory for this stats batch of the bottom data
+    output_stats[idx] = GetStatsBatchMemory<false>(fwd_top_data, idx);  // memory for this stats batch of the top data
+
+    // ---- Create BatchNorm --------------------
+    if (this->phase_ == TEST && !use_global_stats_) {  // TEST phase without global stats: no mean/variance memories are created
+        if (use_weight_bias_) {
+            BatchNormFwd[idx].reset(new batch_normalization_forward(*BatchNormFwd_pd,
+                    *input_stats[idx], *scaleshift_memory,
+                    *output_stats[idx]));
+        } else {
+            BatchNormFwd[idx].reset(new batch_normalization_forward(*BatchNormFwd_pd,
+                    *input_stats[idx], *output_stats[idx]));
+        }
+    } else {
+        mean_memory[idx].reset(new memory(BatchNormFwd_pd->mean_primitive_desc()));  // per-stats-batch mean memory
+        variance_memory[idx].reset(new memory(BatchNormFwd_pd->variance_primitive_desc()));  // per-stats-batch variance memory
+
+        if (use_global_stats_) {  // seed mean/variance from blobs_[0]/blobs_[1]; they are passed before the destination
+            caffe_copy<Dtype>(this->channels_, this->blobs_[0]->cpu_data(),
+                static_cast<Dtype *>(mean_memory[idx]->get_data_handle()));
+            caffe_copy<Dtype>(this->channels_, this->blobs_[1]->cpu_data(),
+               static_cast<Dtype *>(variance_memory[idx]->get_data_handle()));
+            if (use_weight_bias_) {
+                BatchNormFwd[idx].reset(new batch_normalization_forward(*BatchNormFwd_pd,
+                        *input_stats[idx], (const primitive::at)*mean_memory[idx],
+                        (const primitive::at)*variance_memory[idx], *scaleshift_memory,
+                        *output_stats[idx]));
+            } else {
+                BatchNormFwd[idx].reset(new batch_normalization_forward(*BatchNormFwd_pd,
+                        *input_stats[idx], (const primitive::at)*mean_memory[idx],
+                        (const primitive::at)*variance_memory[idx], *output_stats[idx]));
+            }
+        } else {  // no global stats: mean/variance follow the destination (presumably outputs of the primitive - confirm)
+            if (use_weight_bias_) {
+                BatchNormFwd[idx].reset(new batch_normalization_forward(*BatchNormFwd_pd,
+                        *input_stats[idx], *scaleshift_memory, *output_stats[idx],
+                        *mean_memory[idx], *variance_memory[idx]));
+            } else {
+                BatchNormFwd[idx].reset(new batch_normalization_forward(*BatchNormFwd_pd,
+                        *input_stats[idx], *output_stats[idx], *mean_memory[idx], *variance_memory[idx]));
+            }
+        }
+    }
+}
 
 template <typename Dtype>
 void MKLDNNBatchNormLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom
@@ -289,49 +365,40 @@ void MKLDNNBatchNormLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom
     // update top that head at prv
     fwd_top_data->sync_before_write();
 
-    if (use_global_stats_) {
+    for (int stats_batch_idx = 0; stats_batch_idx < num_stats_batches_; stats_batch_idx++) {
+      if (use_global_stats_) {
         // use the stored mean/variance estimates.
         const Dtype scale_factor = this->blobs_[2]->cpu_data()[0] == 0 ?
             0 : 1 / this->blobs_[2]->cpu_data()[0];
-        Dtype *mean_buffer_ = (Dtype *)(mean_memory->get_data_handle());
-        Dtype *variance_buffer_ = (Dtype *)(variance_memory->get_data_handle());
+        Dtype *mean_buffer_ = (Dtype *)(mean_memory[stats_batch_idx]->get_data_handle());
+        Dtype *variance_buffer_ = (Dtype *)(variance_memory[stats_batch_idx]->get_data_handle());
 
         //TODO: optimize, do this operation in the InitBatchNorm, so no need to calculate each time
         caffe_cpu_scale(this->blobs_[0]->count(), scale_factor,
                     this->blobs_[0]->cpu_data(), mean_buffer_);
         caffe_cpu_scale(this->blobs_[1]->count(), scale_factor,
                     this->blobs_[1]->cpu_data(), variance_buffer_);
-    }
-    if (use_weight_bias_) {
-        Dtype* scaleShift_buffer_ = (Dtype *)(scaleshift_memory->get_data_handle());
-        // Fill ScaleShift buffer
-        for (int i = 0; i < this->channels_; i++) {
-            scaleShift_buffer_[i] = this->blobs_[3]->cpu_data()[i];
-            scaleShift_buffer_[channels_ + i] = 0;
-            if (bias_term_) {
-                scaleShift_buffer_[channels_ + i] = this->blobs_[4]->cpu_data()[i];
-            }
-        }
-    }
-
-    PERFORMANCE_EVENT_ID_INIT(perf_id_fw_, PERFORMANCE_MKLDNN_NAME("FW"));
-    PERFORMANCE_MEASUREMENT_BEGIN();
-    BatchNormFwd.submit();
-    PERFORMANCE_MEASUREMENT_END_ID(perf_id_fw_);
+      }
+      
+      PERFORMANCE_EVENT_ID_INIT(perf_id_fw_, PERFORMANCE_MKLDNN_NAME("FW"));
+      PERFORMANCE_MEASUREMENT_BEGIN();
+      BatchNormFwd[stats_batch_idx].submit();
+      PERFORMANCE_MEASUREMENT_END_ID(perf_id_fw_);
 
-    if (this->phase_ == TRAIN && !use_global_stats_) {
+      if (this->phase_ == TRAIN && !use_global_stats_) {
         // compute and save moving average
-        Dtype *mean_buffer_ = (Dtype *)(mean_memory->get_data_handle());
-        Dtype *variance_buffer_ = (Dtype *)(variance_memory->get_data_handle());
+        Dtype *mean_buffer_ = (Dtype *)(mean_memory[stats_batch_idx]->get_data_handle());
+        Dtype *variance_buffer_ = (Dtype *)(variance_memory[stats_batch_idx]->get_data_handle());
         this->blobs_[2]->mutable_cpu_data()[0] *= moving_average_fraction_;
         this->blobs_[2]->mutable_cpu_data()[0] += 1;
         caffe_cpu_axpby<Dtype>(this->channels_, Dtype(1), mean_buffer_,
             moving_average_fraction_, this->blobs_[0]->mutable_cpu_data());
-        int m = bottom[0]->count()/channels_;
+        int m = bottom[0]->count()/num_stats_batches_/channels_;
         Dtype bias_correction_factor = m > 1 ? Dtype(m)/(m-1) : 1;
         caffe_cpu_axpby<Dtype>(this->channels_, bias_correction_factor,
             variance_buffer_, moving_average_fraction_,
             this->blobs_[1]->mutable_cpu_data());
+      }
     }
 
 }
@@ -359,7 +426,7 @@ void MKLDNNBatchNormLayer<Dtype>::InitBatchNormBwd(
     memory::data_type mpcsn = memory::data_type::f32;
 
     // ---- Initialize memory descriptors -------------
-    shared_ptr<memory::desc> top_diff_md, top_data_md;
+    shared_ptr<memory::desc> top_diff_md, top_diff_stats_md, top_data_md, output_stats_md;
     shared_ptr<memory::primitive_desc> usr_diff_mpd(NULL), prv_diff_mpd(NULL);
     if (top_diff_is_prv) {
         shared_ptr<MKLDNNMemoryDescriptor<Dtype, true> > mem_descr
@@ -371,10 +438,18 @@ void MKLDNNBatchNormLayer<Dtype>::InitBatchNormBwd(
         top_diff_md.reset(new memory::desc({{n, c, h, w}}, mpcsn, memory::format::nchw));   //MKLDNN batch norm only support 4D memory descriptor!
         usr_diff_mpd.reset(new memory::primitive_desc(*top_diff_md, cpu_engine));
     }
+    top_diff_stats_md.reset(new memory::desc(*top_diff_md));
+    CHECK(top_diff_stats_md->data.ndims > 0 &&
+          top_diff_stats_md->data.dims[0] == this->num_);
+    top_diff_stats_md->data.dims[0] = stats_batch_size_;
+    output_stats_md.reset(new memory::desc(output_memory->get_primitive_desc().desc()));
+    CHECK(output_stats_md->data.ndims > 0 &&
+          output_stats_md->data.dims[0] == this->num_);
+    output_stats_md->data.dims[0] = stats_batch_size_;
 
     // ---- Initialize bnrm primitive descriptor -------------
     batch_normalization_backward::desc BatchNormBwd_desc(prop_kind::backward,
-            *top_diff_md, output_memory->get_primitive_desc().desc(), eps_,
+            *top_diff_stats_md, *output_stats_md, eps_,
             flags);
     // ---- Determining engine to use -----------------------
     std::string subengines = this->layer_param_.engine();
@@ -396,6 +471,11 @@ void MKLDNNBatchNormLayer<Dtype>::InitBatchNormBwd(
 
     CHECK(BatchNormBwd_pd);
 
+    if (use_weight_bias_) {
+        bwd_scaleshift_diff_memory.reset(new memory(
+                    BatchNormFwd_pd->weights_primitive_desc(), this->scaleshift_blob_->mutable_cpu_diff()));
+    }
+
     // ---  init primitive and prv_memory descriptors ----------------------
     bwd_top_diff.reset(new MKLDNNDiff<Dtype>(usr_diff_mpd, prv_diff_mpd, top[0], this));
     bwd_top_diff->name = "bwd_top_diff_data   @ " + this->layer_param_.name();
@@ -405,17 +485,11 @@ void MKLDNNBatchNormLayer<Dtype>::InitBatchNormBwd(
     bwd_bottom_diff->name = "bwd_bottom_diff_data   @ " + this->layer_param_.name();
     bwd_bottom_diff_memory = bwd_bottom_diff->create_output_memory(inplace);
 
-    if (use_weight_bias_) {
-        bwd_scaleshift_diff_memory.reset(new memory(
-                    BatchNormFwd_pd->weights_primitive_desc()));
-        BatchNormBwd.reset(new batch_normalization_backward(*BatchNormBwd_pd,
-                    *input_primitive, *mean_memory, *variance_memory,
-                    *bwd_top_diff_primitive, *scaleshift_memory,
-                    *bwd_bottom_diff_memory, *bwd_scaleshift_diff_memory));
-    } else {
-        BatchNormBwd.reset(new batch_normalization_backward(*BatchNormBwd_pd,
-                    *input_primitive, *mean_memory, *variance_memory,
-                    *bwd_top_diff_primitive, *bwd_bottom_diff_memory));
+    top_diff_stats.resize(num_stats_batches_);
+    bottom_diff_stats.resize(num_stats_batches_);
+    BatchNormBwd.resize(num_stats_batches_);
+    for (int i = 0; i < num_stats_batches_; i++) {
+      InitBatchNormBwdPrimitive(i);
     }
 
     //bwd_top_diff->set_mkldnn_primitive(BatchNormBwd);     //Wrong passed primitive! (TODO: Checking!)
@@ -427,6 +501,23 @@ void MKLDNNBatchNormLayer<Dtype>::InitBatchNormBwd(
     bwd_bottom_diff->set_mkldnn_primitive(bwd_bottom_diff_memory_transfer);
 }
 
+template <typename Dtype>
+void MKLDNNBatchNormLayer<Dtype>::InitBatchNormBwdPrimitive(int idx) {
+    // Build the backward primitive of stats batch idx from that batch's diff/input/mean/variance memories.
+    top_diff_stats[idx] = GetStatsBatchMemory<true>(bwd_top_diff, idx);
+    bottom_diff_stats[idx] = GetStatsBatchMemory<true>(bwd_bottom_diff, idx);
+    if (!use_weight_bias_) {
+        BatchNormBwd[idx].reset(new batch_normalization_backward(*BatchNormBwd_pd,
+                    *input_stats[idx], *mean_memory[idx], *variance_memory[idx],
+                    *top_diff_stats[idx], *bottom_diff_stats[idx]));
+        return;
+    }
+    BatchNormBwd[idx].reset(new batch_normalization_backward(*BatchNormBwd_pd,
+                *input_stats[idx], *mean_memory[idx], *variance_memory[idx],
+                *top_diff_stats[idx], *scaleshift_memory,
+                *bottom_diff_stats[idx], *bwd_scaleshift_diff_memory));
+}
+
 template <typename Dtype>
 void MKLDNNBatchNormLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
         const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom)
@@ -443,53 +534,50 @@ void MKLDNNBatchNormLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
     // update bottom that head at prv
     bwd_bottom_diff->sync_before_write();
 
-    PERFORMANCE_EVENT_ID_INIT(perf_id_bw_, PERFORMANCE_MKLDNN_NAME("BW"));
-    PERFORMANCE_MEASUREMENT_BEGIN();
+    for (int stats_batch_idx = 0; stats_batch_idx < num_stats_batches_; stats_batch_idx++) {
+
+      PERFORMANCE_EVENT_ID_INIT(perf_id_bw_, PERFORMANCE_MKLDNN_NAME("BW"));
+      PERFORMANCE_MEASUREMENT_BEGIN();
 #ifdef DEBUG
-    if (bottom[0]->prv_data() != NULL)
-    {
+      if (bottom[0]->prv_data() != NULL)
+      {
         LOG(INFO) << "Debug: Bottom prv data: " << *bottom[0]->prv_data();
-    }
-    else
-    {
+      }
+      else
+      {
         LOG(INFO) << "Debug: Bottom prv data is NULL!";
-    }
-
-    if (top[0]->prv_diff() != NULL)
-    {
+      }
+      
+      if (top[0]->prv_diff() != NULL)
+      {
         LOG(INFO) << "Debug: Top prv diff: " << *top[0]->prv_diff();
-    }
-    else
-    {
+      }
+      else
+      {
         LOG(INFO) << "Debug: Top prv diff is NULL!";
         LOG(INFO) << "Debug: Top cpu diff: " << *top[0]->cpu_diff();
-    }
+      }
 #endif
-    BatchNormBwd.submit();
+      BatchNormBwd[stats_batch_idx].submit();
 #ifdef DEBUG
-    if (bottom[0]->prv_diff() != NULL)
-    {
+      if (bottom[0]->prv_diff() != NULL)
+      {
         LOG(INFO) << "Debug: Bottom prv diff: " << *bottom[0]->prv_diff();
-    }
-    else
-    {
+      }
+      else
+      {
         LOG(INFO) << "Debug: Bottom prv diff is NULL!";
         LOG(INFO) << "Debug: Bottom cpu diff: " << *bottom[0]->cpu_diff();
-    }
+      }
 #endif
-    PERFORMANCE_MEASUREMENT_END_ID(perf_id_bw_);
-
-    /* FIXME: this wouldn't work with lazy stream */
-    if (use_weight_bias_) {
-        Dtype* dw = (Dtype *)(bwd_scaleshift_diff_memory->get_data_handle());
-        for (int i = 0; i < this->channels_; i++)
-            this->blobs_[3]->mutable_cpu_diff()[i] = dw[i];
-
-        if (bias_term_) {
-            dw += channels_;
-            for (int i = 0; i < this->channels_; i++)
-                this->blobs_[4]->mutable_cpu_diff()[i] = dw[i];
-        }
+      PERFORMANCE_MEASUREMENT_END_ID(perf_id_bw_);
+      if (num_stats_batches_ > 1) {
+        CHECK(scaleshift_blob_ != scaleshift_acc_);
+        CHECK(scaleshift_blob_->count() == scaleshift_acc_->count());
+        caffe_cpu_axpby(scaleshift_acc_->count(), Dtype(1),
+                        scaleshift_blob_->mutable_cpu_diff(),
+                        Dtype(1), scaleshift_acc_->mutable_cpu_diff());
+      }
     }
 }
 
diff --git a/src/caffe/layers/mkldnn_split_layer.cpp b/src/caffe/layers/mkldnn_split_layer.cpp
index ab2c5156a..12359c141 100644
--- a/src/caffe/layers/mkldnn_split_layer.cpp
+++ b/src/caffe/layers/mkldnn_split_layer.cpp
@@ -94,10 +94,15 @@ void MKLDNNSplitLayer<Dtype>::InitSplitBwd(const vector<Blob<Dtype>*>& bottom,
 
   // Dimensions of bottom and top blobs. There is a number of
   // top blobs each of the same size as the bottom one
-  memory::dims bottom_tz = {static_cast<int>(this->sizes_src_[0]),
-                            static_cast<int>(this->sizes_src_[1]),
-                            static_cast<int>(this->sizes_src_[2]),
-                            static_cast<int>(this->sizes_src_[3])};
+  memory::dims bottom_tz;
+  bottom_tz.resize(4);
+  for(int i=0; i<4; i++) {
+    if(i < this->sizes_src_.size()) {
+      bottom_tz[i] = static_cast<int>(this->sizes_src_[i]);
+    } else {
+      bottom_tz[i] = 1;
+    }
+  }
 
   shared_ptr<memory::primitive_desc> prv_diff_dst_mpd;
   shared_ptr<memory::primitive_desc> usr_diff_dst_mpd(
diff --git a/src/caffe/mkldnn_memory.cpp b/src/caffe/mkldnn_memory.cpp
index bacb6ae61..c53cff7ff 100644
--- a/src/caffe/mkldnn_memory.cpp
+++ b/src/caffe/mkldnn_memory.cpp
@@ -212,8 +212,7 @@ void MKLDNNMemoryDescriptor<Dtype, is_diff>::convert_from_extprv(shared_ptr<prim
     CHECK(aprimitive);
     if(this->_reorder_extprv2prv_pd == NULL)
         return;
-    if (this->_extprv_memory_pd->desc().data.format == this->_prv_memory_pd->desc().data.format &&
-        this->_extprv_memory_pd->desc().data.data_type == this->_prv_memory_pd->desc().data.data_type)
+    if (*this->_extprv_memory_pd == *this->_prv_memory_pd)
     {
 #ifdef DEBUG
         LOG(INFO) << "The format and data_type of _extprv_memory_pd and _prv_memory_pd is same, no need do conversion.";
@@ -453,6 +452,32 @@ shared_ptr<memory> MKLDNNMemoryDescriptor<Dtype, is_diff>::create_output_memory(
     return omem;
 }
 
+template <typename Dtype, bool is_diff>
+Dtype* MKLDNNMemoryDescriptor<Dtype, is_diff>::get_memory_ptr(long offset) {
+    // Returns a pointer `offset` elements past the start of the buffer selected below.
+    if (!this->conversion_needed()) {
+      // Conversion not needed: address the blob's cpu buffer (diff or data, per is_diff).
+      const Dtype* cpu_base = is_diff ? this->_blob->cpu_diff() : this->_blob->cpu_data();
+      return const_cast<Dtype*>(cpu_base + offset);
+    }
+    // TODO: support DFP16 offset
+    if (this->prv_ptr() != NULL) return (Dtype*)this->prv_ptr() + offset;
+    // when _internal_ptr is null, having same private layout as _blob
+    if (is_diff) return (Dtype*)this->_blob->prv_diff() + offset;
+    return (Dtype*)this->_blob->prv_data() + offset;
+}
+
+template <typename Dtype, bool is_diff>
+shared_ptr<memory::desc> MKLDNNMemoryDescriptor<Dtype, is_diff>::get_memory_desc() {
+    // Hand back a freshly allocated copy of a memory descriptor:
+    // the prv_memory_pd() one when conversion is needed, the usr_memory_pd() one otherwise.
+    if (this->conversion_needed()) {
+        return shared_ptr<memory::desc>(
+            new memory::desc(this->prv_memory_pd()->desc()));
+    }
+    return shared_ptr<memory::desc>(new memory::desc(this->usr_memory_pd()->desc()));
+}
+
 template <typename Dtype, bool is_diff>
 shared_ptr<MKLDNNMemoryDescriptor<Dtype, is_diff> > get_mkldnn_prv_descriptor(Blob<Dtype>* blob)
 {
diff --git a/src/caffe/multinode/multi_solver.cpp b/src/caffe/multinode/multi_solver.cpp
index 13ad8da2b..59eec8c7c 100644
--- a/src/caffe/multinode/multi_solver.cpp
+++ b/src/caffe/multinode/multi_solver.cpp
@@ -105,12 +105,15 @@ Dtype MultiSolver<Dtype>::ForwardBackwardImpl(bool first, bool last) {
 
   for (int i = 0; i < layers.size(); ++i) {
 #ifdef FW_OVERLAP_OPT
-    if (first && IsSkipWaitGradient(i) == false) {
+    if (first) {
       while (layer_finished_flags_[i] == false) {
+        if (IsSkipWaitGradient(i))
+         break;
         WaitAndUpdateGradient(i);
         if (layer_finished_flags_[i])
           break;
 
+        // wait and update gradient for next layers
         for (int k=i+1; k<layers.size(); k++) {
           if (layer_finished_flags_[k] || IsSkipWaitGradient(k)) {
             layer_finished_flags_[k] = true;
@@ -121,6 +124,7 @@ Dtype MultiSolver<Dtype>::ForwardBackwardImpl(bool first, bool last) {
             break;
         }
       }
+      layer_finished_flags_[i] = false;
     }
 #endif
 
@@ -129,6 +133,11 @@ Dtype MultiSolver<Dtype>::ForwardBackwardImpl(bool first, bool last) {
     LAYER_TIMING_STOP(forward, i);
   }
 
+  // Clear parameter diffs after communication is finished (that is, after 
+  // calling WaitGradientComm)
+  if (first)
+    root_solver_->net()->ClearParamDiffs();
+
   for (int i = layers.size() - 1; i >= 0; --i) {
     if (!layer_need_backward[i]) {
       continue;
@@ -160,6 +169,10 @@ Dtype MultiSolver<Dtype>::ForwardBackwardImpl(bool first, bool last) {
   if (last) {
 #endif
       for (int i = 0; i < layers.size(); ++i) {
+#ifdef FW_OVERLAP_OPT
+        if (layer_finished_flags_[i])
+          continue;
+#endif
         if (IsSkipWaitGradient(i)) {
 #ifdef FW_OVERLAP_OPT
           finished_count++;
@@ -167,10 +180,6 @@ Dtype MultiSolver<Dtype>::ForwardBackwardImpl(bool first, bool last) {
 #endif
           continue;
         }
-#ifdef FW_OVERLAP_OPT
-        if (layer_finished_flags_[i])
-          continue;
-#endif
 
         WaitAndUpdateGradient(i);
 #ifdef FW_OVERLAP_OPT
@@ -190,7 +199,6 @@ Dtype MultiSolver<Dtype>::ForwardBackwardImpl(bool first, bool last) {
 template <typename Dtype>
 Dtype MultiSolver<Dtype>::ForwardBackward() {
   Dtype loss = 0;
-  root_solver_->net()->ClearParamDiffs();
   for (int i = 0; i < iter_size; ++i) {
     loss += ForwardBackwardImpl(
       (i == 0), (i + 1 == iter_size));
diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp
index 0a8aeb981..9fda127c6 100644
--- a/src/caffe/net.cpp
+++ b/src/caffe/net.cpp
@@ -62,6 +62,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "caffe/multinode/mlsl.hpp"
 #include "caffe/multinode/apply_mn_param.hpp"
 #include "caffe/util/remove_batch_norm.hpp"
+#include "caffe/util/apply_bn_stats_batch_size.hpp"
 
 PERFORMANCE_CREATE_MONITOR();
 
@@ -147,6 +148,12 @@ void Net<Dtype>::Init(const NetParameter& in_param) {
     this->kept_bn_layers_.push_back(param.compile_net_state().kept_bn_layers(idx));
   }
 
+  NetParameter param_with_stats_batch_size;
+  if (param.has_bn_stats_batch_size()) {
+    ApplyBnStatsBatchSize(param, &param_with_stats_batch_size);
+    param = param_with_stats_batch_size;
+  }
+
 #ifdef USE_MLSL
   NetParameter param_with_mn;
   if (mn::is_multinode()) {
@@ -628,13 +635,24 @@ void Net<Dtype>::CompilationRuleTwo(const NetParameter& param,
     // Note: Currently merging of convolution and relu layers is feasible
     // If current layer is Convolution of MKLDNN engine..
     if ((layer_param->type().compare("Convolution") == 0) &&
-       ((layer_param->convolution_param().engine() == ConvolutionParameter_Engine_MKLDNN)
-       || (((layer_param->convolution_param().engine() == ConvolutionParameter_Engine_DEFAULT) &&
-            (param.engine().compare(0, 6, "MKLDNN") == 0
-            && param.engine().find(":DLA", 6) == string::npos)) ||
-            (param.engine() == "" &&
-              layer_param->engine().compare(0, 6, "MKLDNN") == 0 &&
-              layer_param->engine().find(":DLA", 6) == string::npos)))) {
+        ((layer_param->convolution_param().engine() == ConvolutionParameter_Engine_MKLDNN) ||
+         ((layer_param->convolution_param().engine() == ConvolutionParameter_Engine_DEFAULT) &&
+          (layer_param->engine().compare(0, 6, "MKLDNN") == 0) &&
+          (layer_param->engine().find(":DLA", 6) == string::npos)) ||
+         ((layer_param->convolution_param().engine() == ConvolutionParameter_Engine_DEFAULT) &&
+          (layer_param->engine() == "") &&
+          (param.engine().compare(0, 6, "MKLDNN") == 0 &&
+           param.engine().find(":DLA", 6) == string::npos)))) {
+      // Check whether any dilation is larger than 1. If so, don't fuse the following ReLU layer with this conv layer,
+      // as MKLDNN doesn't support dilated convolution yet.
+      bool dilation = false;
+      for (int i = 0; i < layer_param->convolution_param().dilation_size(); ++i) {
+        if (layer_param->convolution_param().dilation(i) > 1) {
+          dilation = true;
+          break;
+        }
+      }
+
       std::vector<const LayerParameter*> consumer_layer_params;
       GetBlobConsumers(consumer_layer_params, layer_param->top(0),
                        param, i+1 < param.layer_size() ? i+1 : i);
@@ -644,14 +662,16 @@ void Net<Dtype>::CompilationRuleTwo(const NetParameter& param,
 
       // Consumer layer of blob produced by Conv
       // has to be ReLU layer with one Input Blob
-      if ((consumer_layer_param.type().compare("ReLU") == 0) &&
-        ((consumer_layer_param.relu_param().engine() == ReLUParameter_Engine_MKLDNN)
-        || (((consumer_layer_param.relu_param().engine() == ReLUParameter_Engine_DEFAULT) &&
-            (param.engine().compare(0, 6, "MKLDNN") == 0
-            && param.engine().find(":DLA", 6) == string::npos)) ||
-            (param.engine() == "" &&
-              layer_param->engine().compare(0, 6, "MKLDNN") == 0 &&
-              layer_param->engine().find(":DLA", 6) == string::npos)))) {
+      if (!dilation &&
+          (consumer_layer_param.type().compare("ReLU") == 0) &&
+          ((consumer_layer_param.relu_param().engine() == ReLUParameter_Engine_MKLDNN) ||
+           ((consumer_layer_param.relu_param().engine() == ReLUParameter_Engine_DEFAULT) &&
+            (consumer_layer_param.engine().compare(0, 6, "MKLDNN") == 0 &&
+             consumer_layer_param.engine().find(":DLA", 6) == string::npos)) ||
+           ((consumer_layer_param.relu_param().engine() == ReLUParameter_Engine_DEFAULT) &&
+            (consumer_layer_param.engine() == "") &&
+            (param.engine().compare(0, 6, "MKLDNN") == 0 &&
+             param.engine().find(":DLA", 6) == string::npos)))) {
         string& convolution_top_blob_name =
             const_cast<string&>(layer_param->top(0));
 
@@ -715,11 +735,12 @@ void Net<Dtype>::CompilationRuleThree(const NetParameter& param,
     // If current layer is BatchNorm of MKL2017 engine..
     if (((layer_param->type().compare("BatchNorm") == 0) &&
         ((layer_param->batch_norm_param().engine() ==
-         BatchNormParameter_Engine_MKL2017)
+         BatchNormParameter_Engine_MKL2017 || layer_param->batch_norm_param().engine() ==
+         BatchNormParameter_Engine_MKLDNN)
         || ((layer_param->batch_norm_param().engine() ==
            BatchNormParameter_Engine_DEFAULT) &&
-            param.engine().compare("MKL2017") == 0))) &&
-        (layer_param->top(0) == layer_param->bottom(0) )) {
+            (param.engine().compare("MKL2017") == 0 || param.engine().compare("MKLDNN") == 0)))) &&
+        (layer_param->top(0) == layer_param->bottom(0))) {
       std::string& batch_norm_top = const_cast<string&>(layer_param->top(0));
       std::vector<const LayerParameter*> consumer_layer_params;
       GetBlobConsumers(consumer_layer_params,
diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto
index eaf9b6e6b..cd6cb761f 100644
--- a/src/caffe/proto/caffe.proto
+++ b/src/caffe/proto/caffe.proto
@@ -208,6 +208,9 @@ message NetParameter {
 
   optional string engine = 9 [default = ""];
 
+  // Batch size used for BatchNorm statistics, 0 would use the batch size of bottom blob
+  optional uint32 bn_stats_batch_size = 11 [default = 0];
+
   // The layers that make up the net.  Each of their configurations, including
   // connectivity and behavior, is specified as a LayerParameter.
   repeated LayerParameter layer = 100;  // ID 100 so layers are printed last.
@@ -900,6 +903,8 @@ message BatchNormParameter {
   optional bool bias_term = 6 [default = true]; // whether to have bias terms
   optional FillerParameter filler = 7; // The filler for the weight
   optional FillerParameter bias_filler = 8; // The filler for the bias
+  // Batch size used for statistics, 0 would use the batch size of bottom blob
+  optional uint32 stats_batch_size = 9 [default = 0];
 }
 
 message SplitParameter {
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index 3c8d1e66b..f7e7ac1cd 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -321,7 +321,12 @@ void Solver<Dtype>::Step(int iters) {
         const string& output_name =
             net_->blob_names()[net_->output_blob_indices()[j]];
         const Dtype loss_weight =
-            net_->blob_loss_weights()[net_->output_blob_indices()[j]];
+            net_->blob_loss_weights()[net_->output_blob_indices()[j]]
+#ifdef USE_MLSL
+            * mn::get_distrib()->get_data_parts()
+#endif
+              ;
+
         for (int k = 0; k < result[j]->count(); ++k) {
           ostringstream loss_msg_stream;
           if (loss_weight) {
@@ -928,6 +933,10 @@ void Solver<Dtype>::Restore(const char* state_file) {
 template <typename Dtype>
 void Solver<Dtype>::UpdateSmoothedLoss(Dtype loss, int start_iter,
     int average_loss) {
+#ifdef USE_MLSL
+  loss *= mn::get_distrib()->get_data_parts();
+#endif
+
   if (losses_.size() < average_loss) {
     losses_.push_back(loss);
     int size = losses_.size();
diff --git a/src/caffe/solvers/sgd_solver.cpp b/src/caffe/solvers/sgd_solver.cpp
index 264ac954f..5347dcdf7 100644
--- a/src/caffe/solvers/sgd_solver.cpp
+++ b/src/caffe/solvers/sgd_solver.cpp
@@ -43,6 +43,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "caffe/util/io.hpp"
 #include "caffe/util/upgrade_proto.hpp"
 
+
 namespace caffe {
 template <typename Dtype>
 Dtype SGDSolver<Dtype>::GetWarmUpLR(int cur_iter, int warmup_iter, Dtype warmup_start_lr) {
@@ -208,13 +209,27 @@ void SGDSolver<Dtype>::ApplyUpdate(int param_id) {
     return;
   }
 
+#ifdef ENABLE_SGD_FUSION
+  if (Caffe::mode() == Caffe::CPU) 
+  {
+    //VLOG(1) << "Use Normalize_Regularize_ComputeUpdateValue_Update_Fusion for SGD";
+    //LOG(INFO) << "Use Normalize_Regularize_ComputeUpdateValue_Update_Fusion for SGD";
+    SGDFusion(param_id, rate);
+    return;
+  }
+#endif /* ENABLE_SGD_FUSION */
+
+  //LOG(INFO) << "No Fusion: Param_id: " << param_id;
   Normalize(param_id);
+  
   LOG_PARAM_BLOB(this->net_->learnable_params()[param_id], diff, param_id, "ApplyUpdate: delwt after Normalize:");
 
   Regularize(param_id);
+
   LOG_PARAM_BLOB(this->net_->learnable_params()[param_id], diff, param_id, "ApplyUpdate: delwt after Regularize:");
 
   ComputeUpdateValue(param_id, rate);
+
   LOG_PARAM_BLOB(this->net_->learnable_params()[param_id], diff, param_id, "ApplyUpdate: wtinc:");
 
   LOG_PARAM_BLOB(this->net_->learnable_params()[param_id], data, param_id, "ApplyUpdate: weight before update:");
@@ -224,25 +239,241 @@ void SGDSolver<Dtype>::ApplyUpdate(int param_id) {
   LOG_PARAM_BLOB(this->net_->learnable_params()[param_id], data, param_id, "ApplyUpdate: weight after update:");
 }
 
+#ifdef ENABLE_SGD_FUSION
+//Math function for fusion
+//Function 1: axpy_axpby_copy
+//Start: For L1 Regularize_ComputeUpdateValue_Fusion
+template <typename Dtype>
+void axpy_axpby_copy(size_t count, const Dtype decay, const Dtype* net_params_data, Dtype *net_params_diff,
+                     const Dtype rate, const Dtype momentum, Dtype* history_data);
+
+template <>
+void axpy_axpby_copy<float>(size_t count, const float decay, const float* net_params_data, float *net_params_diff,
+                            const float rate, const float momentum, float* history_data)
+{
+  // temp_result lives inside the loop body so every OpenMP thread works on its own copy (no shared-variable race).
+#ifdef _OPENMP
+//#pragma omp parallel for simd schedule(static)  //Not work for GCC 4.8
+#pragma omp parallel for schedule(static)
+#pragma simd
+#endif
+  for (size_t i = 0; i < count; ++i) {
+    const float temp_result = rate * (decay * net_params_data[i] + net_params_diff[i]) + momentum * history_data[i];
+    history_data[i] = temp_result;
+    net_params_diff[i] = temp_result;
+  }
+}
+
+template <>
+void axpy_axpby_copy<double>(size_t count, const double decay, const double* net_params_data, double *net_params_diff,
+                             const double rate, const double momentum, double* history_data)
+{
+  // temp_result lives inside the loop body so every OpenMP thread works on its own copy (no shared-variable race).
+#ifdef _OPENMP
+//#pragma omp parallel for simd schedule(static)  //Not work for GCC 4.8
+#pragma omp parallel for schedule(static)
+#pragma simd
+#endif
+  for (size_t i = 0; i < count; ++i) {
+    const double temp_result = rate * (decay * net_params_data[i] + net_params_diff[i]) + momentum * history_data[i];
+    history_data[i] = temp_result;
+    net_params_diff[i] = temp_result;
+  }
+}
+//End: For L1 Regularize_ComputeUpdateValue_Fusion
+
+//Function 2: axpy_axpby_copy_axpy
+//Start: For L2 Regularize_ComputeUpdateValue_Update_Fusion
+template <typename Dtype>
+void axpy_axpby_copy_axpy(size_t count, const Dtype decay, Dtype* net_params_data, Dtype *net_params_diff,
+                     const Dtype rate, const Dtype momentum, Dtype* history_data, const Dtype update_param);
+
+template <>
+void axpy_axpby_copy_axpy<float>(size_t count, const float decay, float* net_params_data, float *net_params_diff,
+                            const float rate, const float momentum, float* history_data, const float update_param)
+{
+  // temp_result lives inside the loop body so every OpenMP thread works on its own copy (no shared-variable race).
+#ifdef _OPENMP
+//#pragma omp parallel for simd schedule(static)  //Not work for GCC 4.8
+#pragma omp parallel for schedule(static)
+#pragma simd
+#endif
+  for (size_t i = 0; i < count; ++i) {
+    const float temp_result = rate * (decay * net_params_data[i] + net_params_diff[i]) + momentum * history_data[i];
+    history_data[i] = temp_result;
+    net_params_diff[i] = temp_result;
+    net_params_data[i] = update_param * temp_result + net_params_data[i];
+  }
+}
+
+template <>
+void axpy_axpby_copy_axpy<double>(size_t count, const double decay, double* net_params_data, double *net_params_diff,
+                             const double rate, const double momentum, double* history_data, const double update_param)
+{
+#ifdef _OPENMP
+//#pragma omp parallel for simd schedule(static)  //Not work for GCC 4.8
+#pragma omp parallel for schedule(static)
+#pragma simd
+#endif
+  for (size_t i = 0; i < count; ++i) {  // temp_result is loop-local, hence private to each OpenMP thread
+    const double temp_result = rate * (decay * net_params_data[i] + net_params_diff[i]) + momentum * history_data[i];
+    history_data[i] = temp_result;  // keep the momentum history updated, as the float specialization does
+    net_params_diff[i] = temp_result;
+    net_params_data[i] = update_param * temp_result + net_params_data[i];
+  }
+}
+//End: For L2 Regularize_ComputeUpdateValue_Update_Fusion
+
+
+// Fused CPU implementation of the Normalize -> Regularize -> ComputeUpdateValue -> Update stages
+// that ApplyUpdate() otherwise runs one by one, for learnable parameter param_id at base rate `rate`.
+template <typename Dtype>
+void SGDSolver<Dtype>::SGDFusion(int param_id, Dtype rate) {
+//#pragma region 1. Common initialization
+  //Normalize initialization
+  bool skip_Normalize_stage_flag = false;
+  if (this->param_.iter_size() == 1) { skip_Normalize_stage_flag = true; }
+
+  // Learnable parameter blobs; param_id indexes into this list.
+  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
+
+  //Regularize initialization
+  const vector<float>& net_params_weight_decay =
+    this->net_->params_weight_decay();
+  Dtype weight_decay = this->param_.weight_decay();
+  string regularization_type = this->param_.regularization_type();
+  Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
+
+  //ComputeUpdateValue  initialization
+  const vector<float>& net_params_lr = this->net_->params_lr();
+  Dtype momentum = this->param_.momentum();
+  Dtype local_rate = rate * net_params_lr[param_id];
+//#pragma endregion
+
+//#pragma region 2. Common condition judgement
+  bool prv_diff_condition_flag = false;
+  if (net_params[param_id]->prv_diff()
+    && (net_params[param_id]->prv_diff_count()
+    == net_params[param_id]->count())) {
+      prv_diff_condition_flag = true;
+  }
+//#pragma endregion
+
+//#pragma region 3. Normalize stage
+  if (skip_Normalize_stage_flag == false)
+  {
+    const Dtype accum_normalization = Dtype(1.) / this->param_.iter_size();
+
+    if (prv_diff_condition_flag) {
+      caffe_scal(net_params[param_id]->prv_diff_count(), accum_normalization,
+        net_params[param_id]->mutable_prv_diff());
+    }
+    else {
+      caffe_scal(net_params[param_id]->count(), accum_normalization,
+        net_params[param_id]->mutable_cpu_diff());
+    }
+  }
+//#pragma endregion
+
+//For most common topologies from BVLC, all skipped the Normalize stage, and use L2 regularization
+//Case (1): prv_diff and prv_data are both usable   -> fused update on the private buffers
+//Case (2): neither prv_diff nor prv_data is usable -> fused update on the cpu buffers
+//Case (3): local_decay == 0                        -> no Regularize stage, separate ComputeUpdateValue + Update
+//Any other (mixed) L2 combination falls back to the unfused Regularize/ComputeUpdateValue/Update calls
+//L1 regularization fuses Regularize + ComputeUpdateValue through axpy_axpby_copy, Update stays separate
+//L2 regularization additionally fuses the Update stage through axpy_axpby_copy_axpy
+//is_separate_ComputeUpdateValue_Update tells whether the separate ComputeUpdateValue + Update stage
+//at the end of this function still has to run
+  bool is_separate_ComputeUpdateValue_Update = true;
+  //Regularize stage (Fused ComputeUpdateValue_stage in some situations)
+  if (local_decay) {
+    if (regularization_type == "L2") {
+      // add weight decay
+      const bool prv_data_condition_flag = net_params[param_id]->prv_data()
+        && (net_params[param_id]->prv_data_count()
+        == net_params[param_id]->count());
+      if (prv_data_condition_flag && prv_diff_condition_flag) {
+        CHECK_EQ(true,
+          net_params[param_id]->get_prv_data_descriptor()->layout_compare(
+          net_params[param_id]->get_prv_diff_descriptor()));
+        axpy_axpby_copy_axpy(net_params[param_id]->prv_data_count(), local_decay,
+                             net_params[param_id]->mutable_prv_data(), net_params[param_id]->mutable_prv_diff(),
+                             local_rate, momentum, history_[param_id]->mutable_cpu_data(), Dtype(-1));
+        is_separate_ComputeUpdateValue_Update = false;
+      } else if (!prv_data_condition_flag && !prv_diff_condition_flag) {
+        axpy_axpby_copy_axpy(net_params[param_id]->count(), local_decay,
+                             net_params[param_id]->mutable_cpu_data(), net_params[param_id]->mutable_cpu_diff(),
+                             local_rate, momentum, history_[param_id]->mutable_cpu_data(), Dtype(-1));
+        is_separate_ComputeUpdateValue_Update = false;
+      } else {
+        // Mixed layouts (only one of data/diff is private): run the unfused stages so weight decay is not dropped.
+        Regularize(param_id);
+        ComputeUpdateValue(param_id, rate);
+        net_params[param_id]->Update();
+        is_separate_ComputeUpdateValue_Update = false;
+      }
+    } else if (regularization_type == "L1") {
+      caffe_cpu_sign(net_params[param_id]->count(),
+                      net_params[param_id]->cpu_data(),
+                      temp_[param_id]->mutable_cpu_data());
+
+      axpy_axpby_copy(net_params[param_id]->count(), local_decay,
+                                temp_[param_id]->cpu_data(), net_params[param_id]->mutable_cpu_diff(),
+                                local_rate, momentum, history_[param_id]->mutable_cpu_data());
+
+      is_separate_ComputeUpdateValue_Update = false;
+
+      //Update stage (separate)
+      net_params[param_id]->Update();
+    } else {
+      LOG(FATAL) << "Unknown regularization type: " << regularization_type;
+    }
+  }
+
+  //ComputeUpdateValue_Update stage (separate)
+  if (is_separate_ComputeUpdateValue_Update == true)
+  {
+    //Reached when local_decay == 0 (situation (3)): there is no Regularize stage,
+    //only the ComputeUpdateValue stage followed by Update
+    //(an unknown regularization_type has already hit LOG(FATAL) above)
+    //ComputeUpdateValue stage
+    if (prv_diff_condition_flag) {
+      caffe_cpu_axpby(net_params[param_id]->prv_diff_count(), local_rate,
+                      net_params[param_id]->prv_diff(), momentum,
+                      history_[param_id]->mutable_cpu_data());
+
+      caffe_copy(net_params[param_id]->count(),
+                  history_[param_id]->cpu_data(),
+                  net_params[param_id]->mutable_prv_diff());
+    } else {
+      caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
+                      net_params[param_id]->cpu_diff(), momentum,
+                      history_[param_id]->mutable_cpu_data());
+
+      caffe_copy(net_params[param_id]->count(),
+                  history_[param_id]->cpu_data(),
+                  net_params[param_id]->mutable_cpu_diff());
+    }
+
+    //Update stage (separate)
+    net_params[param_id]->Update();
+  }
+}
+#endif /* ENABLE_SGD_FUSION */
+
 template <typename Dtype>
 void SGDSolver<Dtype>::Normalize(int param_id) {
 
-#ifdef USE_MLSL
-  if ((this->param_.iter_size() == 1) && !mn::is_multinode()) {
+  if (this->param_.iter_size() == 1) { 
+    //LOG(INFO) << "Normalize stage: Normalize stage is skipped.";
     return;
   }
-#else /* !USE_MLSL */
-  if (this->param_.iter_size() == 1) { return; }
-#endif /* USE_MLSL */
 
+  //LOG(INFO) << "Normalize stage: Normalize stage is not skipped.";
   // Scale gradient to counterbalance accumulation.
   const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
-
-#ifdef USE_MLSL
-  const Dtype accum_normalization = Dtype(1.) / (this->param_.iter_size() * mn::get_nodes_count());
-#else /* !USE_MLSL */
+  
   const Dtype accum_normalization = Dtype(1.) / this->param_.iter_size();
-#endif /* USE_MLSL */
 
   switch (Caffe::mode()) {
   case Caffe::CPU: {
@@ -250,8 +481,7 @@ void SGDSolver<Dtype>::Normalize(int param_id) {
     if (net_params[param_id]->prv_diff()
         && (net_params[param_id]->prv_diff_count()
             == net_params[param_id]->count())) {
-
-        caffe_scal(net_params[param_id]->count(), accum_normalization,
+        caffe_scal(net_params[param_id]->prv_diff_count(), accum_normalization,
             net_params[param_id]->mutable_prv_diff());
     }
     else {
@@ -295,7 +525,7 @@ void SGDSolver<Dtype>::Regularize(int param_id) {
             net_params[param_id]->get_prv_data_descriptor()->layout_compare(
             net_params[param_id]->get_prv_diff_descriptor()));
 
-          caffe_axpy(net_params[param_id]->count(),
+          caffe_axpy(net_params[param_id]->prv_data_count(),
                      local_decay,
                      net_params[param_id]->prv_data(),
                      net_params[param_id]->mutable_prv_diff());
@@ -376,8 +606,7 @@ void SGDSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
     if (net_params[param_id]->prv_diff()
         && (net_params[param_id]->prv_diff_count()
             == net_params[param_id]->count())) {
-
-      caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
+      caffe_cpu_axpby(net_params[param_id]->prv_diff_count(), local_rate,
                       net_params[param_id]->prv_diff(), momentum,
                       history_[param_id]->mutable_cpu_data());
 
@@ -392,6 +621,12 @@ void SGDSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
       caffe_copy(net_params[param_id]->count(),
                  history_[param_id]->cpu_data(),
                  net_params[param_id]->mutable_cpu_diff());
+
+      if (net_params[param_id]->prv_diff() 
+          && (net_params[param_id]->prv_diff_count()
+              != net_params[param_id]->count())) {
+          net_params[param_id]->mutable_prv_diff();
+      }
     }
     break;
   }
diff --git a/src/caffe/util/apply_bn_stats_batch_size.cpp b/src/caffe/util/apply_bn_stats_batch_size.cpp
new file mode 100644
index 000000000..078cf6bc5
--- /dev/null
+++ b/src/caffe/util/apply_bn_stats_batch_size.cpp
@@ -0,0 +1,57 @@
+/*
+All modification made by Intel Corporation: © 2017 Intel Corporation
+
+All contributions by the University of California:
+Copyright (c) 2014, 2015, The Regents of the University of California (Regents)
+All rights reserved.
+
+All other contributions:
+Copyright (c) 2014, 2015, the respective contributors
+All rights reserved.
+For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md
+
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <string>
+#include "caffe/common.hpp"
+#include "caffe/util/apply_bn_stats_batch_size.hpp"
+
+namespace caffe {
+// Copies param into the output (param must have bn_stats_batch_size set) and
+// writes that value into every "BatchNorm" layer. Layers are patched in place
+// after one CopyFrom: no double copy, and safe if the output aliases param.
+void ApplyBnStatsBatchSize(const NetParameter& param,
+    NetParameter* param_with_stats_batch_size) {
+  CHECK(param.has_bn_stats_batch_size());
+  param_with_stats_batch_size->CopyFrom(param);
+  const int batch_size = param.bn_stats_batch_size();
+  for (int i = 0; i < param_with_stats_batch_size->layer_size(); i++) {
+    LayerParameter* layer = param_with_stats_batch_size->mutable_layer(i);
+    if (layer->type() != "BatchNorm") continue;  // only BatchNorm is touched
+    layer->mutable_batch_norm_param()->set_stats_batch_size(batch_size);
+  }
+}
+}  // namespace caffe
diff --git a/tools/caffe.cpp b/tools/caffe.cpp
index 231209127..5d0ea7f49 100644
--- a/tools/caffe.cpp
+++ b/tools/caffe.cpp
@@ -547,6 +547,22 @@ int time() {
   const vector<vector<Blob<float>*> >& top_vecs = caffe_net.top_vecs();
   const vector<vector<bool> >& bottom_need_backward =
       caffe_net.bottom_need_backward();
+
+  // Run 5 untimed warm-up iterations first, because the first several
+  // iterations show huge timing variance on some machines.
+  int warmup_iterations = 5;
+  for (int j = 0; j < warmup_iterations; ++j) {
+    for (int i = 0; i < layers.size(); ++i) {
+      layers[i]->Forward(bottom_vecs[i], top_vecs[i]);
+    }
+    if (!FLAGS_forward_only) {
+      for (int i = layers.size() - 1; i >= 0; --i) {
+        layers[i]->Backward(top_vecs[i], bottom_need_backward[i],
+                            bottom_vecs[i]);
+      }
+    }
+  }
+
   LOG(INFO) << "*** Benchmark begins ***";
   LOG(INFO) << "Testing for " << FLAGS_iterations << " iterations.";
   Timer total_timer;