
Commit 1ff5a10

Committed Aug 11, 2020
Add support for dynamic batch size
1 parent 6d015a4 commit 1ff5a10

8 files changed: +226 −141 lines
 

‎README.md

+32 −13

@@ -89,7 +89,6 @@ See following sections for more details of conversions.
 | ------------------- | ----------: | ----------: | ----------: | ----------: | ----------: | ----------: |
 | DarkNet (YOLOv4 paper)| 0.471 | 0.710 | 0.510 | 0.278 | 0.525 | 0.636 |
 | Pytorch (TianXiaomo)| 0.466 | 0.704 | 0.505 | 0.267 | 0.524 | 0.629 |
-| ONNX | incoming | incoming | incoming | incoming | incoming | incoming |
 | TensorRT FP32 + BatchedNMSPlugin | 0.472| 0.708 | 0.511 | 0.273 | 0.530 | 0.637 |
 | TensorRT FP16 + BatchedNMSPlugin | 0.472| 0.708 | 0.511 | 0.273 | 0.530 | 0.636 |

@@ -99,7 +98,6 @@ See following sections for more details of conversions.
 | ------------------- | ----------: | ----------: | ----------: | ----------: | ----------: | ----------: |
 | DarkNet (YOLOv4 paper)| 0.412 | 0.628 | 0.443 | 0.204 | 0.444 | 0.560 |
 | Pytorch (TianXiaomo)| 0.404 | 0.615 | 0.436 | 0.196 | 0.438 | 0.552 |
-| ONNX | incoming | incoming | incoming | incoming | incoming | incoming |
 | TensorRT FP32 + BatchedNMSPlugin | 0.412| 0.625 | 0.445 | 0.200 | 0.446 | 0.564 |
 | TensorRT FP16 + BatchedNMSPlugin | 0.412| 0.625 | 0.445 | 0.200 | 0.446 | 0.563 |

@@ -163,10 +161,11 @@ Until now, still a small piece of post-processing including NMS is required. We
 python demo_darknet2onnx.py <cfgFile> <weightFile> <imageFile> <batchSize>
 ```

-This script will generate 2 ONNX models.
+## 3.1 Dynamic or static batch size

-- One is for running the demo (batch_size=1)
-- The other one is what you want to generate (batch_size=batchSize)
+- **A positive batch size generates an ONNX model with a static batch size; a batch size of 0 or less generates a model with a dynamic batch size**
+- A dynamic batch size generates only one ONNX model
+- A static batch size generates 2 ONNX models: one for running the demo (batch_size=1) and one with the requested batchSize

 # 4. Pytorch2ONNX (Evolving)

@@ -195,34 +194,54 @@ Until now, still a small piece of post-processing including NMS is required. We
 python demo_pytorch2onnx.py yolov4.pth dog.jpg 8 80 416 416
 ```

-This script will generate 2 ONNX models.
+## 4.1 Dynamic or static batch size

-- One is for running the demo (batch_size=1)
-- The other one is what you want to generate (batch_size=batch_size)
+- **A positive batch size generates an ONNX model with a static batch size; a batch size of 0 or less generates a model with a dynamic batch size**
+- A dynamic batch size generates only one ONNX model
+- A static batch size generates 2 ONNX models: one for running the demo (batch_size=1) and one with the requested batch_size


 # 5. ONNX2TensorRT (Evolving)

 - **TensorRT version recommended: 7.0, 7.1**

+## 5.1 Convert from ONNX with a static batch size
+
 - **Run the following command to convert the YOLOv4 ONNX model into a TensorRT engine**

 ```sh
 trtexec --onnx=<onnx_file> --explicitBatch --saveEngine=<tensorRT_engine_file> --workspace=<size_in_megabytes> --fp16
 ```
 - Note: If you want to use int8 mode in conversion, extra int8 calibration is needed.

-- **Run the demo**
+## 5.2 Convert from ONNX with a dynamic batch size
+
+- **Run the following command to convert the YOLOv4 ONNX model into a TensorRT engine**

 ```sh
-python demo_trt.py <tensorRT_engine_file> <input_image> <input_H> <input_W>
+trtexec --onnx=<onnx_file> \
+--minShapes=input:<shape_of_min_batch> --optShapes=input:<shape_of_opt_batch> --maxShapes=input:<shape_of_max_batch> \
+--workspace=<size_in_megabytes> --saveEngine=<tensorRT_engine_file> --fp16
 ```
+- For example:
+
+```sh
+trtexec --onnx=yolov4_-1_3_320_512_dynamic.onnx \
+--minShapes=input:1x3x320x512 --optShapes=input:4x3x320x512 --maxShapes=input:8x3x320x512 \
+--workspace=2048 --saveEngine=yolov4_-1_3_320_512_dynamic.engine --fp16
+```
+
+## 5.3 Run the demo
+
+```sh
+python demo_trt.py <tensorRT_engine_file> <input_image> <input_H> <input_W>
+```

-- This demo here only works when batchSize=1, but you can update this demo a little for batched inputs.
+- This demo only works with a batch size of 1, either a static batch size of 1 or a dynamic batch size whose range includes 1; with small changes it can handle other static or dynamic batch sizes.

-- Note1: input_H and input_W should agree with the input size in the original ONNX file.
+- Note1: input_H and input_W should agree with the input size in the original ONNX file.

-- Note2: extra NMS operations are needed for the tensorRT output. This demo uses python NMS code from `tool/utils.py`.
+- Note2: extra NMS operations are needed for the TensorRT output. This demo uses the Python NMS code from `tool/utils.py`.


 # 6. ONNX2Tensorflow
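
A quick way to see what the dynamic-batch export described above produces is to run the generated ONNX file at several batch sizes with onnxruntime. The sketch below is not part of this commit; the file name and the 320x512 input size are assumptions taken from the trtexec example in the README.

```python
# Sanity check (not part of this commit): feed several batch sizes to a dynamic-batch export.
import numpy as np
import onnxruntime

session = onnxruntime.InferenceSession("yolov4_-1_3_320_512_dynamic.onnx")
input_name = session.get_inputs()[0].name

for batch in (1, 4, 8):
    x = np.random.rand(batch, 3, 320, 512).astype(np.float32)
    boxes, confs = session.run(None, {input_name: x})
    # boxes: [batch, num_anchors * H * W, 1, 4], confs: [batch, num_anchors * H * W, num_classes]
    print(batch, boxes.shape, confs.shape)
```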

‎demo_darknet2onnx.py

+7 −4

@@ -12,10 +12,13 @@

 def main(cfg_file, weight_file, image_path, batch_size):

-    # Transform to onnx as specified batch size
-    transform_to_onnx(cfg_file, weight_file, batch_size)
-    # Transform to onnx for demo
-    onnx_path_demo = transform_to_onnx(cfg_file, weight_file, 1)
+    if batch_size <= 0:
+        onnx_path_demo = transform_to_onnx(cfg_file, weight_file, batch_size)
+    else:
+        # Transform to onnx as specified batch size
+        transform_to_onnx(cfg_file, weight_file, batch_size)
+        # Transform to onnx as demo
+        onnx_path_demo = transform_to_onnx(cfg_file, weight_file, 1)

     session = onnxruntime.InferenceSession(onnx_path_demo)
     # session = onnx.load(onnx_path)
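
The dispatch above can also be driven directly from Python rather than through the script. A minimal sketch, assuming a Darknet config and weight file are available locally (the paths below are placeholders, not files shipped with this commit):

```python
# Minimal sketch: call the converter from tool/darknet2onnx.py directly.
# batch_size <= 0 -> one ONNX file with a dynamic batch dimension;
# batch_size  > 0 -> a static export (the demo script additionally exports a batch_size=1 model).
from tool.darknet2onnx import transform_to_onnx

dynamic_onnx = transform_to_onnx('cfg/yolov4.cfg', 'yolov4.weights', -1)  # yolov4_-1_3_H_W_dynamic.onnx
static_onnx = transform_to_onnx('cfg/yolov4.cfg', 'yolov4.weights', 4)    # yolov4_4_3_H_W_static.onnx
print(dynamic_onnx, static_onnx)
```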

‎demo_pytorch2onnx.py

+48 −21

@@ -19,32 +19,59 @@ def transform_to_onnx(weight_file, batch_size, n_classes, IN_IMAGE_H, IN_IMAGE_W
     pretrained_dict = torch.load(weight_file, map_location=torch.device('cuda'))
     model.load_state_dict(pretrained_dict)

-    x = torch.randn((batch_size, 3, IN_IMAGE_H, IN_IMAGE_W), requires_grad=True) # .cuda()
-
-    onnx_file_name = "yolov4_{}_3_{}_{}.onnx".format(batch_size, IN_IMAGE_H, IN_IMAGE_W)
-
-    # Export the model
-    print('Export the onnx model ...')
-    torch.onnx.export(model,
-                      x,
-                      onnx_file_name,
-                      export_params=True,
-                      opset_version=11,
-                      do_constant_folding=True,
-                      input_names=['input'], output_names=['boxes', 'confs'],
-                      dynamic_axes=None)
-
-    print('Onnx model exporting done')
-    return onnx_file_name
+    input_names = ["input"]
+    output_names = ['boxes', 'confs']
+
+    dynamic = False
+    if batch_size <= 0:
+        dynamic = True
+
+    if dynamic:
+        x = torch.randn((1, 3, IN_IMAGE_H, IN_IMAGE_W), requires_grad=True)
+        onnx_file_name = "yolov4_-1_3_{}_{}_dynamic.onnx".format(IN_IMAGE_H, IN_IMAGE_W)
+        dynamic_axes = {"input": {0: "batch_size"}, "boxes": {0: "batch_size"}, "confs": {0: "batch_size"}}
+        # Export the model
+        print('Export the onnx model ...')
+        torch.onnx.export(model,
+                          x,
+                          onnx_file_name,
+                          export_params=True,
+                          opset_version=11,
+                          do_constant_folding=True,
+                          input_names=input_names, output_names=output_names,
+                          dynamic_axes=dynamic_axes)
+
+        print('Onnx model exporting done')
+        return onnx_file_name
+
+    else:
+        x = torch.randn((batch_size, 3, IN_IMAGE_H, IN_IMAGE_W), requires_grad=True)
+        onnx_file_name = "yolov4_{}_3_{}_{}_static.onnx".format(batch_size, IN_IMAGE_H, IN_IMAGE_W)
+        # Export the model
+        print('Export the onnx model ...')
+        torch.onnx.export(model,
+                          x,
+                          onnx_file_name,
+                          export_params=True,
+                          opset_version=11,
+                          do_constant_folding=True,
+                          input_names=input_names, output_names=output_names,
+                          dynamic_axes=None)
+
+        print('Onnx model exporting done')
+        return onnx_file_name




 def main(weight_file, image_path, batch_size, n_classes, IN_IMAGE_H, IN_IMAGE_W):

-    # Transform to onnx as specified batch size
-    transform_to_onnx(weight_file, batch_size, n_classes, IN_IMAGE_H, IN_IMAGE_W)
-    # Transform to onnx for demo
-    onnx_path_demo = transform_to_onnx(weight_file, 1, n_classes, IN_IMAGE_H, IN_IMAGE_W)
+    if batch_size <= 0:
+        onnx_path_demo = transform_to_onnx(weight_file, batch_size, n_classes, IN_IMAGE_H, IN_IMAGE_W)
+    else:
+        # Transform to onnx as specified batch size
+        transform_to_onnx(weight_file, batch_size, n_classes, IN_IMAGE_H, IN_IMAGE_W)
+        # Transform to onnx for demo
+        onnx_path_demo = transform_to_onnx(weight_file, 1, n_classes, IN_IMAGE_H, IN_IMAGE_W)

     session = onnxruntime.InferenceSession(onnx_path_demo)
     # session = onnx.load(onnx_path)
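
One way to confirm that the dynamic_axes argument above took effect is to load the exported file with the onnx package and look at the recorded input and output shapes; the symbolic name "batch_size" should appear where a fixed integer used to be. A small sketch (not part of this commit), assuming a 416x416 dynamic export named after the pattern used above:

```python
# Inspect the exported graph: the batch dimension should be the symbolic "batch_size".
import onnx

model = onnx.load("yolov4_-1_3_416_416_dynamic.onnx")
for value in list(model.graph.input) + list(model.graph.output):
    dims = [d.dim_param or d.dim_value for d in value.type.tensor_type.shape.dim]
    print(value.name, dims)   # e.g. input ['batch_size', 3, 416, 416]
```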

‎demo_trt.py

+13 −3

@@ -73,13 +73,20 @@ def __repr__(self):
         return self.__str__()

 # Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
-def allocate_buffers(engine):
+def allocate_buffers(engine, batch_size):
     inputs = []
     outputs = []
     bindings = []
     stream = cuda.Stream()
     for binding in engine:
-        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
+
+        size = trt.volume(engine.get_binding_shape(binding)) * batch_size
+        dims = engine.get_binding_shape(binding)
+
+        # in case batch dimension is -1 (dynamic)
+        if dims[0] < 0:
+            size *= -1
+
         dtype = trt.nptype(engine.get_binding_dtype(binding))
         # Allocate host and device buffers
         host_mem = cuda.pagelocked_empty(size, dtype)

@@ -112,7 +119,10 @@ def do_inference(context, bindings, inputs, outputs, stream):

 def main(engine_path, image_path, image_size):
     with get_engine(engine_path) as engine, engine.create_execution_context() as context:
-        buffers = allocate_buffers(engine)
+        buffers = allocate_buffers(engine, 1)
+        IN_IMAGE_H, IN_IMAGE_W = image_size
+        context.set_binding_shape(0, (1, 3, IN_IMAGE_H, IN_IMAGE_W))
+
         image_src = cv2.imread(image_path)

         num_classes = 80
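
main() above fixes the binding shape for a batch of 1; a hypothetical adaptation for larger batches might look like the sketch below. It assumes an engine built with maxShapes covering the requested batch (README section 5.2), that allocate_buffers() returns the (inputs, outputs, bindings, stream) lists built above, that do_inference() has the signature shown in the hunk header, and that the buffer objects expose a .host field in the usual TensorRT sample style; none of this is confirmed by the commit itself.

```python
import numpy as np

def run_batch(engine_path, batch_images, in_h, in_w):
    """Hypothetical helper: run a dynamic-batch TensorRT engine on preprocessed images.

    batch_images: list of float32 arrays shaped (3, in_h, in_w), already normalized.
    """
    batch_size = len(batch_images)
    with get_engine(engine_path) as engine, engine.create_execution_context() as context:
        inputs, outputs, bindings, stream = allocate_buffers(engine, batch_size)
        context.set_binding_shape(0, (batch_size, 3, in_h, in_w))

        # Copy the whole batch into the input host buffer, then run as main() does for batch 1.
        inputs[0].host = np.ascontiguousarray(np.stack(batch_images)).ravel()
        return do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
```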

‎models.py

+10 −7

@@ -20,17 +20,20 @@ def __init__(self):

     def forward(self, x, target_size, inference=False):
         assert (x.data.dim() == 4)
-        _, _, tH, tW = target_size
+        # _, _, tH, tW = target_size

         if inference:
-            B = x.data.size(0)
-            C = x.data.size(1)
-            H = x.data.size(2)
-            W = x.data.size(3)

-            return x.view(B, C, H, 1, W, 1).expand(B, C, H, tH // H, W, tW // W).contiguous().view(B, C, tH, tW)
+            #B = x.data.size(0)
+            #C = x.data.size(1)
+            #H = x.data.size(2)
+            #W = x.data.size(3)
+
+            return x.view(x.size(0), x.size(1), x.size(2), 1, x.size(3), 1).\
+                expand(x.size(0), x.size(1), x.size(2), target_size[2] // x.size(2), x.size(3), target_size[3] // x.size(3)).\
+                contiguous().view(x.size(0), x.size(1), target_size[2], target_size[3])
         else:
-            return F.interpolate(x, size=(tH, tW), mode='nearest')
+            return F.interpolate(x, size=(target_size[2], target_size[3]), mode='nearest')


 class Conv_Bn_Activation(nn.Module):
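
The inference branch above keeps the nearest-neighbour upsample as a view/expand chain and now reads every size inline from the tensors instead of caching them in Python variables. A quick numeric check (not from the repo) that the view/expand chain matches F.interpolate for an integer scale factor:

```python
# The view/expand formulation reproduces nearest-neighbour upsampling exactly.
import torch
import torch.nn.functional as F

x = torch.randn(2, 3, 5, 7)
target_h, target_w = 10, 14   # 2x upsample, illustrative sizes

expanded = x.view(x.size(0), x.size(1), x.size(2), 1, x.size(3), 1).\
    expand(x.size(0), x.size(1), x.size(2), target_h // x.size(2), x.size(3), target_w // x.size(3)).\
    contiguous().view(x.size(0), x.size(1), target_h, target_w)

assert torch.equal(expanded, F.interpolate(x, size=(target_h, target_w), mode='nearest'))
print(expanded.shape)   # torch.Size([2, 3, 10, 14])
```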

‎tool/darknet2onnx.py

+10 −9

@@ -3,23 +3,23 @@
 from tool.darknet2pytorch import Darknet


-def transform_to_onnx(cfgfile, weightfile, batch_size=1, dynamic=False):
+def transform_to_onnx(cfgfile, weightfile, batch_size=1):
     model = Darknet(cfgfile)

     model.print_network()
     model.load_weights(weightfile)
     print('Loading weights from %s... Done!' % (weightfile))

-    # model.cuda()
+    dynamic = False
+    if batch_size <= 0:
+        dynamic = True

-    x = torch.randn((batch_size, 3, model.height, model.width), requires_grad=True) # .cuda()
+    input_names = ["input"]
+    output_names = ['boxes', 'confs']

     if dynamic:
-
-        onnx_file_name = "yolov4_{}_3_{}_{}_dyna.onnx".format(batch_size, model.height, model.width)
-        input_names = ["input"]
-        output_names = ['boxes', 'confs']
-
+        x = torch.randn((1, 3, model.height, model.width), requires_grad=True)
+        onnx_file_name = "yolov4_-1_3_{}_{}_dynamic.onnx".format(model.height, model.width)
         dynamic_axes = {"input": {0: "batch_size"}, "boxes": {0: "batch_size"}, "confs": {0: "batch_size"}}
         # Export the model
         print('Export the onnx model ...')

@@ -36,14 +36,15 @@ def transform_to_onnx(cfgfile, weightfile, batch_size=1, dynamic=False):
         return onnx_file_name

     else:
+        x = torch.randn((batch_size, 3, model.height, model.width), requires_grad=True)
         onnx_file_name = "yolov4_{}_3_{}_{}_static.onnx".format(batch_size, model.height, model.width)
         torch.onnx.export(model,
                           x,
                           onnx_file_name,
                           export_params=True,
                           opset_version=11,
                           do_constant_folding=True,
-                          input_names=['input'], output_names=['boxes', 'confs'],
+                          input_names=input_names, output_names=output_names,
                           dynamic_axes=None)

         print('Onnx model exporting done')

‎tool/darknet2pytorch.py

+13 −21

@@ -55,15 +55,12 @@ def __init__(self, stride=2):
         self.stride = stride

     def forward(self, x):
-        stride = self.stride
         assert (x.data.dim() == 4)
-        B = x.data.size(0)
-        C = x.data.size(1)
-        H = x.data.size(2)
-        W = x.data.size(3)
-        ws = stride
-        hs = stride
-        x = x.view(B, C, H, 1, W, 1).expand(B, C, H, stride, W, stride).contiguous().view(B, C, H * stride, W * stride)
+
+        x = x.view(x.size(0), x.size(1), x.size(2), 1, x.size(3), 1).\
+            expand(x.size(0), x.size(1), x.size(2), self.stride, x.size(3), self.stride).contiguous().\
+            view(x.size(0), x.size(1), x.size(2) * self.stride, x.size(3) * self.stride)
+
         return x


@@ -73,14 +70,9 @@ def __init__(self, stride):
         self.stride = stride

     def forward(self, x):
-        x_numpy = x.cpu().detach().numpy()
-        H = x_numpy.shape[2]
-        W = x_numpy.shape[3]
-
-        H = H * self.stride
-        W = W * self.stride
+        assert (x.data.dim() == 4)

-        out = F.interpolate(x, size=(H, W), mode='nearest')
+        out = F.interpolate(x, size=(x.size(2) * self.stride, x.size(3) * self.stride), mode='nearest')
         return out


@@ -246,15 +238,15 @@ def create_network(self, blocks):
         conv_id = 0
         for block in blocks:
             if block['type'] == 'net':
-                prev_filters = int(float(block['channels']))
+                prev_filters = int(block['channels'])
                 continue
             elif block['type'] == 'convolutional':
                 conv_id = conv_id + 1
-                batch_normalize = int(float(block['batch_normalize']))
-                filters = int(float(block['filters']))
-                kernel_size = int(float(block['size']))
-                stride = int(float(block['stride']))
-                is_pad = int(float(block['pad']))
+                batch_normalize = int(block['batch_normalize'])
+                filters = int(block['filters'])
+                kernel_size = int(block['size'])
+                stride = int(block['stride'])
+                is_pad = int(block['pad'])
                 pad = (kernel_size - 1) // 2 if is_pad else 0
                 activation = block['activation']
                 model = nn.Sequential()

‎tool/yolo_layer.py

+93 −63

@@ -2,8 +2,7 @@
 import torch.nn.functional as F
 from tool.torch_utils import *

-
-def yolo_forward_alternative(output, conf_thresh, num_classes, anchors, num_anchors, scale_x_y, only_objectness=1,
+def yolo_forward(output, conf_thresh, num_classes, anchors, num_anchors, scale_x_y, only_objectness=1,
                  validation=False):
     # Output would be invalid if it does not satisfy this assert
     # assert (output.size(1) == (5 + num_classes) * num_anchors)

@@ -18,32 +17,6 @@ def yolo_forward_alternative(output, conf_thresh, num_classes, anchors, num_anch
     H = output.size(2)
     W = output.size(3)

-    device = None
-    cuda_check = output.is_cuda
-    if cuda_check:
-        device = output.get_device()
-
-
-    # Prepare C-x, C-y, P-w, P-h (None of them are torch related)
-    grid_x = np.expand_dims(np.linspace(0, W - 1, W), axis=0).repeat(H, 0).reshape(1, 1, H * W).repeat(batch, 0).repeat(num_anchors, 1)
-    grid_y = np.expand_dims(np.linspace(0, H - 1, H), axis=1).repeat(W, 1).reshape(1, 1, H * W).repeat(batch, 0).repeat(num_anchors, 1)
-    # Shape: [batch, num_anchors, H * W]
-    grid_x_tensor = torch.tensor(grid_x, device=device, dtype=torch.float32)
-    grid_y_tensor = torch.tensor(grid_y, device=device, dtype=torch.float32)
-
-    anchor_array = np.array(anchors).reshape(1, num_anchors, 2)
-    anchor_array = anchor_array.repeat(batch, 0)
-    anchor_array = np.expand_dims(anchor_array, axis=3).repeat(H * W, 3)
-    # Shape: [batch, num_anchors, 2, H * W]
-    anchor_tensor = torch.tensor(anchor_array, device=device, dtype=torch.float32)
-
-    # normalize coordinates to [0, 1]
-    normal_array = np.array([1.0 / W, 1.0 / H, 1.0 / W, 1.0 / H], dtype=np.float32).reshape(1, 1, 4)
-    normal_array = normal_array.repeat(batch, 0)
-    normal_array = normal_array.repeat(num_anchors * H * W, 1)
-    # Shape: [batch, num_anchors * H * W, 4]
-    normal_tensor = torch.tensor(normal_array, device=device, dtype=torch.float32)
-
     bxy_list = []
     bwh_list = []
     det_confs_list = []

@@ -77,32 +50,91 @@ def yolo_forward_alternative(output, conf_thresh, num_classes, anchors, num_anch

     # Apply sigmoid(), exp() and softmax() to slices
     #
-    bxy = torch.sigmoid(bxy)
+    bxy = torch.sigmoid(bxy) * scale_x_y - 0.5 * (scale_x_y - 1)
     bwh = torch.exp(bwh)
     det_confs = torch.sigmoid(det_confs)
     cls_confs = torch.sigmoid(cls_confs)

-    # Shape: [batch, num_anchors, 2, H * W]
-    bxy = bxy.view(batch, num_anchors, 2, H * W)
-    # Shape: [batch, num_anchors, 2, H * W]
-    bwh = bwh.view(batch, num_anchors, 2, H * W)
+    # Prepare C-x, C-y, P-w, P-h (None of them are torch related)
+    grid_x = np.expand_dims(np.expand_dims(np.expand_dims(np.linspace(0, W - 1, W), axis=0).repeat(H, 0), axis=0), axis=0)
+    grid_y = np.expand_dims(np.expand_dims(np.expand_dims(np.linspace(0, H - 1, H), axis=1).repeat(W, 1), axis=0), axis=0)
+    # grid_x = torch.linspace(0, W - 1, W).reshape(1, 1, 1, W).repeat(1, 1, H, 1)
+    # grid_y = torch.linspace(0, H - 1, H).reshape(1, 1, H, 1).repeat(1, 1, 1, W)
+
+    anchor_w = []
+    anchor_h = []
+    for i in range(num_anchors):
+        anchor_w.append(anchors[i * 2])
+        anchor_h.append(anchors[i * 2 + 1])
+
+    device = None
+    cuda_check = output.is_cuda
+    if cuda_check:
+        device = output.get_device()
+
+    bx_list = []
+    by_list = []
+    bw_list = []
+    bh_list = []

     # Apply C-x, C-y, P-w, P-h
-    bxy[:, :, 0] += grid_x_tensor
-    bxy[:, :, 1] += grid_y_tensor
+    for i in range(num_anchors):
+        ii = i * 2
+        # Shape: [batch, 1, H, W]
+        bx = bxy[:, ii : ii + 1] + torch.tensor(grid_x, device=device, dtype=torch.float32)  # grid_x.to(device=device, dtype=torch.float32)
+        # Shape: [batch, 1, H, W]
+        by = bxy[:, ii + 1 : ii + 2] + torch.tensor(grid_y, device=device, dtype=torch.float32)  # grid_y.to(device=device, dtype=torch.float32)
+        # Shape: [batch, 1, H, W]
+        bw = bwh[:, ii : ii + 1] * anchor_w[i]
+        # Shape: [batch, 1, H, W]
+        bh = bwh[:, ii + 1 : ii + 2] * anchor_h[i]

-    print(anchor_tensor.size())
-    bwh *= anchor_tensor
+        bx_list.append(bx)
+        by_list.append(by)
+        bw_list.append(bw)
+        bh_list.append(bh)

-    bx1y1 = bxy - bwh * 0.5
-    bx2y2 = bxy + bwh

-    # Shape: [batch, num_anchors, 4, H * W] --> [batch, num_anchors * H * W, 1, 4]
-    boxes = torch.cat((bx1y1, bx2y2), dim=2).permute(0, 1, 3, 2).reshape(batch, num_anchors * H * W, 1, 4)
+    ########################################
+    # Figure out bboxes from slices        #
+    ########################################
+
+    # Shape: [batch, num_anchors, H, W]
+    bx = torch.cat(bx_list, dim=1)
+    # Shape: [batch, num_anchors, H, W]
+    by = torch.cat(by_list, dim=1)
+    # Shape: [batch, num_anchors, H, W]
+    bw = torch.cat(bw_list, dim=1)
+    # Shape: [batch, num_anchors, H, W]
+    bh = torch.cat(bh_list, dim=1)
+
+    # Shape: [batch, 2 * num_anchors, H, W]
+    bx_bw = torch.cat((bx, bw), dim=1)
+    # Shape: [batch, 2 * num_anchors, H, W]
+    by_bh = torch.cat((by, bh), dim=1)
+
+    # normalize coordinates to [0, 1]
+    bx_bw /= W
+    by_bh /= H
+
+    # Shape: [batch, num_anchors * H * W, 1]
+    bx = bx_bw[:, :num_anchors].view(batch, num_anchors * H * W, 1)
+    by = by_bh[:, :num_anchors].view(batch, num_anchors * H * W, 1)
+    bw = bx_bw[:, num_anchors:].view(batch, num_anchors * H * W, 1)
+    bh = by_bh[:, num_anchors:].view(batch, num_anchors * H * W, 1)
+
+    bx1 = bx - bw * 0.5
+    by1 = by - bh * 0.5
+    bx2 = bx1 + bw
+    by2 = by1 + bh
+
+    # Shape: [batch, num_anchors * h * w, 4] -> [batch, num_anchors * h * w, 1, 4]
+    boxes = torch.cat((bx1, by1, bx2, by2), dim=2).view(batch, num_anchors * H * W, 1, 4)
     # boxes = boxes.repeat(1, 1, num_classes, 1)

-    print(normal_tensor.size())
-    boxes *= normal_tensor
+    # boxes:     [batch, num_anchors * H * W, 1, 4]
+    # cls_confs: [batch, num_anchors * H * W, num_classes]
+    # det_confs: [batch, num_anchors * H * W]

     det_confs = det_confs.view(batch, num_anchors * H * W, 1)
     confs = cls_confs * det_confs

@@ -113,8 +145,7 @@ def yolo_forward_alternative(output, conf_thresh, num_classes, anchors, num_anch
     return boxes, confs


-
-def yolo_forward(output, conf_thresh, num_classes, anchors, num_anchors, scale_x_y, only_objectness=1,
+def yolo_forward_dynamic(output, conf_thresh, num_classes, anchors, num_anchors, scale_x_y, only_objectness=1,
                  validation=False):
     # Output would be invalid if it does not satisfy this assert
     # assert (output.size(1) == (5 + num_classes) * num_anchors)

@@ -125,9 +156,9 @@ def yolo_forward(output, conf_thresh, num_classes, anchors, num_anchors, scale_x
     # [ 2, 2, 1, num_classes, 2, 2, 1, num_classes, 2, 2, 1, num_classes ]
     # And then into
     # bxy = [ 6 ] bwh = [ 6 ] det_conf = [ 3 ] cls_conf = [ num_classes * 3 ]
-    batch = output.size(0)
-    H = output.size(2)
-    W = output.size(3)
+    # batch = output.size(0)
+    # H = output.size(2)
+    # W = output.size(3)

     bxy_list = []
     bwh_list = []

@@ -151,14 +182,14 @@ def yolo_forward(output, conf_thresh, num_classes, anchors, num_anchors, scale_x
     # Shape: [batch, num_anchors, H, W]
     det_confs = torch.cat(det_confs_list, dim=1)
     # Shape: [batch, num_anchors * H * W]
-    det_confs = det_confs.view(batch, num_anchors * H * W)
+    det_confs = det_confs.view(output.size(0), num_anchors * output.size(2) * output.size(3))

     # Shape: [batch, num_anchors * num_classes, H, W]
     cls_confs = torch.cat(cls_confs_list, dim=1)
     # Shape: [batch, num_anchors, num_classes, H * W]
-    cls_confs = cls_confs.view(batch, num_anchors, num_classes, H * W)
+    cls_confs = cls_confs.view(output.size(0), num_anchors, num_classes, output.size(2) * output.size(3))
     # Shape: [batch, num_anchors, num_classes, H * W] --> [batch, num_anchors * H * W, num_classes]
-    cls_confs = cls_confs.permute(0, 1, 3, 2).reshape(batch, num_anchors * H * W, num_classes)
+    cls_confs = cls_confs.permute(0, 1, 3, 2).reshape(output.size(0), num_anchors * output.size(2) * output.size(3), num_classes)

     # Apply sigmoid(), exp() and softmax() to slices
     #

@@ -168,8 +199,8 @@ def yolo_forward(output, conf_thresh, num_classes, anchors, num_anchors, scale_x
     cls_confs = torch.sigmoid(cls_confs)

     # Prepare C-x, C-y, P-w, P-h (None of them are torch related)
-    grid_x = np.expand_dims(np.expand_dims(np.expand_dims(np.linspace(0, W - 1, W), axis=0).repeat(H, 0), axis=0), axis=0)
-    grid_y = np.expand_dims(np.expand_dims(np.expand_dims(np.linspace(0, H - 1, H), axis=1).repeat(W, 1), axis=0), axis=0)
+    grid_x = np.expand_dims(np.expand_dims(np.expand_dims(np.linspace(0, output.size(3) - 1, output.size(3)), axis=0).repeat(output.size(2), 0), axis=0), axis=0)
+    grid_y = np.expand_dims(np.expand_dims(np.expand_dims(np.linspace(0, output.size(2) - 1, output.size(2)), axis=1).repeat(output.size(3), 1), axis=0), axis=0)
     # grid_x = torch.linspace(0, W - 1, W).reshape(1, 1, 1, W).repeat(1, 1, H, 1)
     # grid_y = torch.linspace(0, H - 1, H).reshape(1, 1, H, 1).repeat(1, 1, 1, W)


@@ -226,37 +257,36 @@ def yolo_forward(output, conf_thresh, num_classes, anchors, num_anchors, scale_x
     by_bh = torch.cat((by, bh), dim=1)

     # normalize coordinates to [0, 1]
-    bx_bw /= W
-    by_bh /= H
+    bx_bw /= output.size(3)
+    by_bh /= output.size(2)

     # Shape: [batch, num_anchors * H * W, 1]
-    bx = bx_bw[:, :num_anchors].view(batch, num_anchors * H * W, 1)
-    by = by_bh[:, :num_anchors].view(batch, num_anchors * H * W, 1)
-    bw = bx_bw[:, num_anchors:].view(batch, num_anchors * H * W, 1)
-    bh = by_bh[:, num_anchors:].view(batch, num_anchors * H * W, 1)
+    bx = bx_bw[:, :num_anchors].view(output.size(0), num_anchors * output.size(2) * output.size(3), 1)
+    by = by_bh[:, :num_anchors].view(output.size(0), num_anchors * output.size(2) * output.size(3), 1)
+    bw = bx_bw[:, num_anchors:].view(output.size(0), num_anchors * output.size(2) * output.size(3), 1)
+    bh = by_bh[:, num_anchors:].view(output.size(0), num_anchors * output.size(2) * output.size(3), 1)

     bx1 = bx - bw * 0.5
     by1 = by - bh * 0.5
     bx2 = bx1 + bw
     by2 = by1 + bh

     # Shape: [batch, num_anchors * h * w, 4] -> [batch, num_anchors * h * w, 1, 4]
-    boxes = torch.cat((bx1, by1, bx2, by2), dim=2).view(batch, num_anchors * H * W, 1, 4)
+    boxes = torch.cat((bx1, by1, bx2, by2), dim=2).view(output.size(0), num_anchors * output.size(2) * output.size(3), 1, 4)
     # boxes = boxes.repeat(1, 1, num_classes, 1)

     # boxes: [batch, num_anchors * H * W, 1, 4]
     # cls_confs: [batch, num_anchors * H * W, num_classes]
     # det_confs: [batch, num_anchors * H * W]

-    det_confs = det_confs.view(batch, num_anchors * H * W, 1)
+    det_confs = det_confs.view(output.size(0), num_anchors * output.size(2) * output.size(3), 1)
     confs = cls_confs * det_confs

     # boxes: [batch, num_anchors * H * W, 1, 4]
     # confs: [batch, num_anchors * H * W, num_classes]

     return boxes, confs

-
 class YoloLayer(nn.Module):
     ''' Yolo layer
         model_out: while inference,is post-processing inside or outside the model

@@ -288,5 +318,5 @@ def forward(self, output, target=None):
             masked_anchors += self.anchors[m * self.anchor_step:(m + 1) * self.anchor_step]
         masked_anchors = [anchor / self.stride for anchor in masked_anchors]

-        return yolo_forward(output, self.thresh, self.num_classes, masked_anchors, len(self.anchor_mask),scale_x_y=self.scale_x_y)
+        return yolo_forward_dynamic(output, self.thresh, self.num_classes, masked_anchors, len(self.anchor_mask),scale_x_y=self.scale_x_y)
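
Because yolo_forward_dynamic reads every shape from the output tensor rather than from the now commented-out batch/H/W variables, its output sizes simply follow whatever batch it is given. A small shape check on random data, not part of the commit (the anchor values and the 19x19 grid below are illustrative, not taken from a config file):

```python
# Shape check for yolo_forward_dynamic on random data.
import torch
from tool.yolo_layer import yolo_forward_dynamic

num_classes, num_anchors = 80, 3
anchors = [12, 16, 19, 36, 40, 28]   # flat (w, h) pairs, already divided by the stride

for batch in (1, 4):
    head_out = torch.randn(batch, (5 + num_classes) * num_anchors, 19, 19)
    boxes, confs = yolo_forward_dynamic(head_out, 0.4, num_classes, anchors, num_anchors, scale_x_y=1.2)
    print(boxes.shape, confs.shape)   # (batch, 3*19*19, 1, 4) and (batch, 3*19*19, 80)
```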
