WenmuZhou
diff --git a/‎base/base_trainer.py
+2-2 b/‎base/base_trainer.py
+2-2
diff --git a/‎config.json
+7-8 b/‎config.json
+7-8
diff --git a/‎config/default.py
+1 b/‎config/default.py
+1
diff --git a/‎data_loader/augment.py
+11-4 b/‎data_loader/augment.py
+11-4
diff --git a/‎data_loader/data_utils.py
+5-6 b/‎data_loader/data_utils.py
+5-6
diff --git a/‎data_loader/dataset.py
+4-3 b/‎data_loader/dataset.py
+4-3
diff --git a/‎models/__init__.py
+5 b/‎models/__init__.py
+5
diff --git a/‎models/loss.py
+18-16 b/‎models/loss.py
+18-16
diff --git a/‎models/model_pse1.py
+92 b/‎models/model_pse1.py
+92
@@ -74,7 +74,7 @@ def __init__(self, config, model, criterion, weights_init):
         else:
             if weights_init is not None:
                 model.apply(weights_init)
-        # self.scheduler = self._initialize('lr_scheduler', torch.optim.lr_scheduler, self.optimizer)
+        self.scheduler = self._initialize('lr_scheduler', torch.optim.lr_scheduler, self.optimizer)
 
         # 单机多卡
         num_gpus = torch.cuda.device_count()
@@ -102,8 +102,8 @@ def train(self):
         """
         for epoch in range(self.start_epoch, self.epochs + 1):
             try:
+                self.scheduler.step()
                 self.epoch_result = self._train_epoch(epoch)
-
                 self._on_epoch_finish()
             except torch.cuda.CudaError:
                 self._log_memory_usage()
 
@@ -1,13 +1,13 @@
 {
-    "name": "PAN",
+    "name": "PAN_pred_mask",
     "data_loader": {
         "type": "ImageDataset",
         "args": {
             "alphabet": "alphabet.npy",
             "dataset": {
                 "train_data_path": [
                     [
-                        "E:\\zj\\dataset\\icdar2015\\train\\train.txt"
+                        "/data1/zj/ocr/icdar2015/train/train.txt"
                     ]
                 ],
                 "train_data_ratio": [
@@ -32,7 +32,7 @@
     "arch": {
         "type": "PANModel",
         "args": {
-            "backbone": "resnet18",
+            "backbone": "resnet50",
             "fpem_repeat": 2,
             "pretrained": true
         }
@@ -50,9 +50,7 @@
     "optimizer": {
         "type": "Adam",
         "args": {
-            "lr": 0.001,
-            "weight_decay": 0,
-            "amsgrad": true
+            "lr": 0.001
         }
     },
     "lr_scheduler": {
@@ -65,15 +63,16 @@
     "trainer": {
         "seed": 2,
         "gpus": [
-            0
+            3
         ],
         "epochs": 600,
         "display_interval": 10,
+        "show_images_interval": 50,
         "resume": {
             "restart_training": true,
             "checkpoint": ""
         },
         "output_dir": "output",
         "tensorboard": true
     }
-}
+}
@@ -74,6 +74,7 @@
     'gpus': [0],
     'epochs': 100,
     'display_interval': 10,
+    'show_images_interval': 50,
     'resume': resume,
     'output_dir': 'output',
     'tensorboard': True
 
@@ -121,17 +121,24 @@ def random_crop(self, imgs, img_size):
             return imgs
 
         # label中存在文本实例，并且按照概率进行裁剪
-        if np.max(imgs[1][:, :, -1]) > 0 and random.random() > 3.0 / 8.0:
+        if np.max(imgs[1][:, :, 0]) > 0 and random.random() > 3.0 / 8.0:
             # 文本实例的top left点
-            tl = np.min(np.where(imgs[1][:, :, -1] > 0), axis=1) - img_size
+            tl = np.min(np.where(imgs[1][:, :, 0] > 0), axis=1) - img_size
             tl[tl < 0] = 0
             # 文本实例的 bottom right 点
-            br = np.max(np.where(imgs[1][:, :, -1] > 0), axis=1) - img_size
+            br = np.max(np.where(imgs[1][:, :, 0] > 0), axis=1) - img_size
             br[br < 0] = 0
             # 保证选到右下角点是，有足够的距离进行crop
             br[0] = min(br[0], h - th)
             br[1] = min(br[1], w - tw)
-
+            for _ in range(50000):
+                i = random.randint(tl[0], br[0])
+                j = random.randint(tl[1], br[1])
+                # 保证最小的图有文本
+                if imgs[1][:, :, -1][i:i + th, j:j + tw].sum() <= 0:
+                    continue
+                else:
+                    break
             i = random.randint(tl[0], br[0])
             j = random.randint(tl[1], br[1])
         else:
 
@@ -52,25 +52,24 @@ def generate_rbox(im_size, text_polys, text_tags,training_mask, shrink_ratio):
         pco.AddPath(poly, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
         shrinked_poly = np.array(pco.Execute(-d_i))
         cv2.fillPoly(score_map, shrinked_poly, i + 1)
-        if tag:
+        if not tag:
             cv2.fillPoly(training_mask, shrinked_poly, 0)
     return score_map, training_mask
 
 
-def augmentation(im: np.ndarray, text_polys: np.ndarray, scales: np.ndarray, degrees: int, input_size: int) -> tuple:
+def augmentation(im: np.ndarray, text_polys: np.ndarray, scales: np.ndarray, degrees: int) -> tuple:
+    """Randomly scale, flip and rotate an image together with its text polygons.
+
+    Scaling is always applied; the horizontal flip and the rotation are each
+    applied independently with probability 0.5.
+
+    :param im: image handed to the ``data_aug`` transforms
+    :param text_polys: text polygons transformed alongside the image
+    :param scales: candidate scales passed to ``data_aug.random_scale``
+    :param degrees: rotation argument passed to ``data_aug.random_rotate_img_bbox``
+    :return: the transformed ``(im, text_polys)`` pair
+    """
     # the images are rescaled with ratio {0.5, 1.0, 2.0, 3.0} randomly
     im, text_polys = data_aug.random_scale(im, text_polys, scales)
     # the images are horizontally fliped and rotated in range [−10◦, 10◦] randomly
     if random.random() < 0.5:
         im, text_polys = data_aug.horizontal_flip(im, text_polys)
     if random.random() < 0.5:
         im, text_polys = data_aug.random_rotate_img_bbox(im, text_polys, degrees)
-
     return im, text_polys
 
 
 def image_label(im: np.ndarray, text_polys: np.ndarray, text_tags: list, input_size: int = 640,
-                shrink_ratio: float = 0.5, defrees: int = 10,
+                shrink_ratio: float = 0.5, degrees: int = 10,
                 scales: np.ndarray = np.array([0.5, 1, 2.0, 3.0])) -> tuple:
     """
     读取图片并生成label
@@ -79,14 +78,14 @@ def image_label(im: np.ndarray, text_polys: np.ndarray, text_tags: list, input_s
     :param text_tags: 是否忽略文本的标致：true 忽略, false 不忽略
     :param input_size: 输出图像的尺寸
     :param shrink_ratio: gt收缩的比例
-    :param defrees: 随机旋转的角度
+    :param degrees: 随机旋转的角度
     :param scales: 随机缩放的尺度
     :return:
     """
     h, w, _ = im.shape
     # 检查越界
     text_polys = check_and_validate_polys(text_polys, (h, w))
-    # im, text_polys, = augmentation(im, text_polys, scales, defrees, input_size)
+    im, text_polys = augmentation(im, text_polys, scales, degrees)
 
     h, w, _ = im.shape
     short_edge = min(h, w)
 
@@ -53,9 +53,9 @@ def _get_annotation(self, label_path: str) -> tuple:
                 try:
                     label = params[8]
                     if label == '*' or label == '###':
-                        text_tags.append(True)
-                    else:
                         text_tags.append(False)
+                    else:
+                        text_tags.append(True)
                     # if label == '*' or label == '###':
                     x1, y1, x2, y2, x3, y3, x4, y4 = list(map(float, params[:8]))
                     boxes.append([[x1, y1], [x2, y2], [x3, y3], [x4, y4]])
@@ -135,9 +135,10 @@ def __next__(self):
     import matplotlib.pyplot as plt
     from torchvision import transforms
 
+
     train_data = ImageDataset(
         data_list=[
-            (r'E:\zj\dataset\icdar2015\train\img\img_828.jpg', r'E:\zj\dataset\icdar2015\train\gt\gt_img_828.txt')],
+            (r'/data1/zj/ocr/icdar2015/train/img/img_713.jpg','/data1/zj/ocr/icdar2015/train/gt/gt_img_713.txt')],
         input_size=640,
         img_channel=3,
         shrink_ratio=0.5,
 
@@ -2,6 +2,7 @@
 # @Time    : 2019/8/23 21:55
 # @Author  : zhoujun
 from .model import PAN
+from .model_pse1 import PSENet
 from .loss import PANLoss
 
 
@@ -11,6 +12,10 @@ def get_model(config):
     pretrained = config['arch']['args']['pretrained']
     return PAN(backbone=backbone, fpem_repeat=fpem_repeat, pretrained=pretrained)
 
+def get_model_pse1(config):
+    """Build a PSENet from the ``arch.args`` section of ``config``."""
+    arch_args = config['arch']['args']
+    return PSENet(backbone=arch_args['backbone'], pretrained=arch_args['pretrained'])
 
 def get_loss(config):
     alpha = config['loss']['args']['alpha']
 
@@ -28,34 +28,30 @@ def __init__(self, alpha=0.5, beta=0.25, delta_agg=0.5, delta_dis=3, ohem_ratio=
         self.reduction = reduction
 
     def forward(self, outputs, labels, training_masks):
-        batch_size = outputs.size()[0]
         texts = outputs[:, 0, :, :]
         kernels = outputs[:, 1, :, :]
         gt_texts = labels[:, 0, :, :]
         gt_kernels = labels[:, 1, :, :]
 
+
+        # 计算 agg loss 和 dis loss
+        similarity_vectors = outputs[:, 2:, :, :]
+        loss_aggs, loss_diss = self.agg_dis_loss(texts, kernels, gt_texts, gt_kernels, similarity_vectors)
+
         # 计算 text loss
         selected_masks = self.ohem_batch(texts, gt_texts, training_masks)
         selected_masks = selected_masks.to(outputs.device)
 
         loss_texts = self.dice_loss(texts, gt_texts, selected_masks)
 
         # 计算 kernel loss
-        selected_masks = ((gt_texts > 0.5) & (training_masks > 0.5)).float()
-
+        # selected_masks = ((gt_texts > 0.5) & (training_masks > 0.5)).float()
+        mask0 = torch.sigmoid(texts).detach().cpu().numpy()
+        mask1 = training_masks.data.cpu().numpy()
+        selected_masks = ((mask0 > 0.5) & (mask1 > 0.5)).astype('float32')
+        selected_masks = torch.from_numpy(selected_masks).float().to(texts.device)
         loss_kernels = self.dice_loss(kernels, gt_kernels, selected_masks)
 
-        # 计算 agg loss 和 dis loss
-        similarity_vectors = outputs[:, 2:, :, :]
-
-        texts = texts.contiguous().reshape(batch_size, -1)
-        kernels = kernels.contiguous().reshape(batch_size, -1)
-        gt_texts = gt_texts.contiguous().reshape(batch_size, -1)
-        gt_kernels = gt_kernels.contiguous().reshape(batch_size, -1)
-        similarity_vectors = similarity_vectors.contiguous().view(batch_size, 4, -1)
-
-        loss_aggs, loss_diss = self.agg_dis_loss(texts, kernels, gt_texts, gt_kernels, similarity_vectors)
-
         # mean or sum
         if self.reduction == 'mean':
             loss_text = loss_texts.mean()
@@ -83,7 +79,12 @@ def agg_dis_loss(self, texts, kernels, gt_texts, gt_kernels, similarity_vectors)
         :param similarity_vectors: 相似度向量的分割结果 batch_size * 4 *(w*h)
         :return:
         """
-
+        batch_size = texts.size()[0]
+        texts = texts.contiguous().reshape(batch_size, -1)
+        kernels = kernels.contiguous().reshape(batch_size, -1)
+        gt_texts = gt_texts.contiguous().reshape(batch_size, -1)
+        gt_kernels = gt_kernels.contiguous().reshape(batch_size, -1)
+        similarity_vectors = similarity_vectors.contiguous().view(batch_size, 4, -1)
         loss_aggs = []
         loss_diss = []
         for text_i, kernel_i, gt_text_i, gt_kernel_i, similarity_vector in zip(texts, kernels, gt_texts, gt_kernels,
@@ -133,7 +134,8 @@ def agg_dis_loss(self, texts, kernels, gt_texts, gt_kernels, similarity_vectors)
 
     def dice_loss(self, input, target, mask):
         input = torch.sigmoid(input)
-
+        target[target <= 0.5] = 0
+        target[target > 0.5] = 1
         input = input.contiguous().view(input.size()[0], -1)
         target = target.contiguous().view(target.size()[0], -1)
         mask = mask.contiguous().view(mask.size()[0], -1)
 
@@ -0,0 +1,92 @@
+# -*- coding: utf-8 -*-
+# @Time    : 2019/8/23 21:57
+# @Author  : zhoujun
+
+import torch
+from torch import nn
+import torch.nn.functional as F
+from models.modules import *
+
+# Supported backbones, keyed by name. 'models' is the constructor that PSENet
+# calls with `pretrained=...`; 'out' lists the four channel counts PSENet uses
+# as input widths for its 1x1 lateral/top convolutions.
+backbone_dict = {'resnet18': {'models': resnet18, 'out': [64, 128, 256, 512]},
+                 'resnet34': {'models': resnet34, 'out': [64, 128, 256, 512]},
+                 'resnet50': {'models': resnet50, 'out': [256, 512, 1024, 2048]},
+                 'resnet101': {'models': resnet101, 'out': [256, 512, 1024, 2048]},
+                 'resnet152': {'models': resnet152, 'out': [256, 512, 1024, 2048]},
+                 'resnext50_32x4d': {'models': resnext50_32x4d, 'out': [256, 512, 1024, 2048]},
+                 'resnext101_32x8d': {'models': resnext101_32x8d, 'out': [256, 512, 1024, 2048]}
+                 }
+
+
+# Backbone entries that are currently disabled (not part of backbone_dict):
+# 'MobileNetV3_Large': {'models': MobileNetV3_Large, 'out': [24, 40, 160, 160]},
+# 'MobileNetV3_Small': {'models': MobileNetV3_Small, 'out': [16, 24, 48, 96]},
+# 'shufflenetv2': {'models': shufflenet_v2_x1_0, 'out': [24, 116, 232, 464]}}
+
+# `inplace` flag passed to the ReLU in PSENet's fusion conv block.
+inplace = True
+
+class PSENet(nn.Module):
+    """Backbone plus top-down pathway with lateral connections and a conv head.
+
+    The four backbone feature maps are each reduced to 256 channels, merged
+    top-down, resized to the size of the finest merged level, concatenated and
+    projected to ``result_num`` output channels.
+    """
+    def __init__(self, backbone, result_num=6, scale: int = 1, pretrained=False):
+        """Create the backbone, the pyramid convolutions and the output head.
+
+        :param backbone: key of ``backbone_dict`` selecting the feature extractor
+        :param result_num: number of channels produced by the output convolution
+        :param scale: divisor applied to the input height/width to get the output size
+        :param pretrained: forwarded to the backbone constructor
+        """
+        super(PSENet, self).__init__()
+        assert backbone in backbone_dict, 'backbone must in: {}'.format(backbone_dict)
+        self.name = backbone
+        self.scale = scale
+        # Common channel width shared by every pyramid level.
+        conv_out = 256
+        backbone_model, backbone_out = backbone_dict[backbone]['models'], backbone_dict[backbone]['out']
+        self.backbone = backbone_model(pretrained=pretrained)
+
+        # Top layer
+        self.toplayer = nn.Conv2d(backbone_out[3], conv_out, kernel_size=1, stride=1, padding=0)  # Reduce channels
+        # Lateral layers
+        self.latlayer1 = nn.Conv2d(backbone_out[2], conv_out, kernel_size=1, stride=1, padding=0)
+        self.latlayer2 = nn.Conv2d(backbone_out[1], conv_out, kernel_size=1, stride=1, padding=0)
+        self.latlayer3 = nn.Conv2d(backbone_out[0], conv_out, kernel_size=1, stride=1, padding=0)
+
+        # Smooth layers
+        self.smooth1 = nn.Conv2d(conv_out, conv_out, kernel_size=3, stride=1, padding=1)
+        self.smooth2 = nn.Conv2d(conv_out, conv_out, kernel_size=3, stride=1, padding=1)
+        self.smooth3 = nn.Conv2d(conv_out, conv_out, kernel_size=3, stride=1, padding=1)
+
+        # Fusion block: takes the four concatenated levels (4 * conv_out channels).
+        self.conv = nn.Sequential(
+            nn.Conv2d(conv_out * 4, conv_out, kernel_size=3, padding=1, stride=1),
+            nn.BatchNorm2d(conv_out),
+            nn.ReLU(inplace=inplace)
+        )
+        self.out_conv = nn.Conv2d(conv_out, result_num, kernel_size=1, stride=1)
+
+    def forward(self, input: torch.Tensor):
+        """Run the network on a 4-D input and return the ``result_num``-channel map.
+
+        The result is resized to ``(H // scale, W // scale)``, where ``H`` and
+        ``W`` are the last two dimensions of ``input``.
+        """
+        _, _, H, W = input.size()
+        # The backbone must return four feature maps; presumably ordered from
+        # highest to lowest resolution -- TODO confirm against models.modules.
+        c2, c3, c4, c5 = self.backbone(input)
+        # Top-down
+        p5 = self.toplayer(c5)
+        p4 = self._upsample_add(p5, self.latlayer1(c4))
+        p3 = self._upsample_add(p4, self.latlayer2(c3))
+        p2 = self._upsample_add(p3, self.latlayer3(c2))
+        # Smooth
+        # Only p4, p3 and p2 pass through a smoothing conv; p5 is used as-is.
+        p4 = self.smooth1(p4)
+        p3 = self.smooth2(p3)
+        p2 = self.smooth3(p2)
+
+        x = self._upsample_cat(p2, p3, p4, p5)
+        x = self.conv(x)
+        x = self.out_conv(x)
+
+        # NOTE(review): this resize uses align_corners=True while the helper
+        # methods below use align_corners=False -- confirm this is intentional.
+        x = F.interpolate(x, size=(H // self.scale, W // self.scale), mode='bilinear', align_corners=True)
+        return x
+
+    def _upsample_add(self, x, y):
+        """Bilinearly resize ``x`` to the spatial size of ``y`` and add ``y``."""
+        return F.interpolate(x, size=y.size()[2:], mode='bilinear', align_corners=False) + y
+
+    def _upsample_cat(self, p2, p3, p4, p5):
+        """Resize ``p3``, ``p4`` and ``p5`` to ``p2``'s spatial size and concatenate all four on dim 1."""
+        h, w = p2.size()[2:]
+        p3 = F.interpolate(p3, size=(h, w), mode='bilinear', align_corners=False)
+        p4 = F.interpolate(p4, size=(h, w), mode='bilinear', align_corners=False)
+        p5 = F.interpolate(p5, size=(h, w), mode='bilinear', align_corners=False)
+        return torch.cat([p2, p3, p4, p5], dim=1)
+
+if __name__ == '__main__':
+    # Smoke test: push one blank 640x640 RGB image through the network on CPU
+    # and print the output shape.
+    device = torch.device('cpu')
+    x = torch.zeros(1, 3, 640, 640).to(device)
+
+    # Build the model this module defines. The previous call constructed `PAN`
+    # with `fpem_repeat`, an argument PSENet.__init__ does not accept.
+    model = PSENet(backbone='resnet18', pretrained=True).to(device)
+    y = model(x)
+    print(y.shape)
+    # torch.save(model.state_dict(), 'PSENet.pth')