From 8222dc03c219717a0e6fc3c584d0b0da5048e183 Mon Sep 17 00:00:00 2001
From: Olivier Dulcy <olivier.dulcy@mindee.co>
Date: Fri, 5 Jan 2024 13:25:27 +0100
Subject: [PATCH 01/39] add try except around datasets to train on broken
 datasets

---
 doctr/datasets/datasets/base.py | 54 +++++++++++++++++++--------------
 doctr/datasets/detection.py     | 16 +++++++---
 doctr/datasets/recognition.py   | 13 ++++++--
 3 files changed, 53 insertions(+), 30 deletions(-)

diff --git a/doctr/datasets/datasets/base.py b/doctr/datasets/datasets/base.py
index 58f1ca29f..7ef924ffd 100644
--- a/doctr/datasets/datasets/base.py
+++ b/doctr/datasets/datasets/base.py
@@ -5,6 +5,7 @@
 
 import os
 import shutil
+import traceback
 from pathlib import Path
 from typing import Any, Callable, List, Optional, Tuple, Union
 
@@ -46,28 +47,37 @@ def _read_sample(self, index: int) -> Tuple[Any, Any]:
 
     def __getitem__(self, index: int) -> Tuple[Any, Any]:
         # Read image
-        img, target = self._read_sample(index)
-        # Pre-transforms (format conversion at run-time etc.)
-        if self._pre_transforms is not None:
-            img, target = self._pre_transforms(img, target)
-
-        if self.img_transforms is not None:
-            # typing issue cf. https://github.com/python/mypy/issues/5485
-            img = self.img_transforms(img)
-
-        if self.sample_transforms is not None:
-            # Conditions to assess it is detection model with multiple classes and avoid confusion with other tasks.
-            if (
-                isinstance(target, dict)
-                and all(isinstance(item, np.ndarray) for item in target.values())
-                and set(target.keys()) != {"boxes", "labels"}  # avoid confusion with obj detection target
-            ):
-                img_transformed = _copy_tensor(img)
-                for class_name, bboxes in target.items():
-                    img_transformed, target[class_name] = self.sample_transforms(img, bboxes)
-                img = img_transformed
-            else:
-                img, target = self.sample_transforms(img, target)
+        try:
+            img, target = self._read_sample(index)
+            # Pre-transforms (format conversion at run-time etc.)
+            if self._pre_transforms is not None:
+                img, target = self._pre_transforms(img, target)
+
+            if self.img_transforms is not None:
+                # typing issue cf. https://github.com/python/mypy/issues/5485
+                img = self.img_transforms(img)
+
+            if self.sample_transforms is not None:
+                # Conditions to assess it is detection model with multiple classes and avoid confusion with other tasks.
+                if (
+                    isinstance(target, dict)
+                    and all(isinstance(item, np.ndarray) for item in target.values())
+                    and set(target.keys()) != {"boxes", "labels"}  # avoid confusion with obj detection target
+                ):
+                    img_transformed = _copy_tensor(img)
+                    for class_name, bboxes in target.items():
+                        img_transformed, target[class_name] = self.sample_transforms(img, bboxes)
+                    img = img_transformed
+                else:
+                    img, target = self.sample_transforms(img, target)
+        except Exception:
+            img_name = self.data[index][0]
+            # Best effort: report the failing sample on stdout, then substitute another sample
+            print()
+            print(f"!!!ERROR in Dataset on filename {img_name}")
+            traceback.print_exc()
+            print()
+            return self.__getitem__((index + 1) % len(self.data))  # next sample: index 0 may itself be broken
 
         return img, target
 
diff --git a/doctr/datasets/detection.py b/doctr/datasets/detection.py
index 0000704df..0e9c0bbf7 100644
--- a/doctr/datasets/detection.py
+++ b/doctr/datasets/detection.py
@@ -55,14 +55,20 @@ def __init__(
 
         self.data: List[Tuple[str, Tuple[np.ndarray, List[str]]]] = []
         np_dtype = np.float32
+        missing_files = []
         for img_name, label in labels.items():
             # File existence check
             if not os.path.exists(os.path.join(self.root, img_name)):
-                raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_name)}")
-
-            geoms, polygons_classes = self.format_polygons(label["polygons"], use_polygons, np_dtype)
-
-            self.data.append((img_name, (np.asarray(geoms, dtype=np_dtype), polygons_classes)))
+                missing_files.append(img_name)
+                # raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_name)}")
+            else:
+                geoms, polygons_classes = self.format_polygons(label["polygons"], use_polygons, np_dtype)
+                self.data.append((img_name, (np.asarray(geoms, dtype=np_dtype), polygons_classes)))
+        print("List of missing files:")
+        print(f"MISSING FILES: {len(missing_files)}")
+        from pprint import pprint
+
+        pprint(missing_files)
 
     def format_polygons(
         self, polygons: Union[List, Dict], use_polygons: bool, np_dtype: Type
diff --git a/doctr/datasets/recognition.py b/doctr/datasets/recognition.py
index ebf37a20a..381776138 100644
--- a/doctr/datasets/recognition.py
+++ b/doctr/datasets/recognition.py
@@ -40,11 +40,18 @@ def __init__(
         with open(labels_path, encoding="utf-8") as f:
             labels = json.load(f)
 
+        missing_files = []
         for img_name, label in labels.items():
             if not os.path.exists(os.path.join(self.root, img_name)):
-                raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_name)}")
-
-            self.data.append((img_name, label))
+                missing_files.append(img_name)
+                # raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_name)}")
+            else:
+                self.data.append((img_name, label))
+        print("List of missing files:")
+        print(f"MISSING FILES: {len(missing_files)}")
+        from pprint import pprint
+
+        pprint(missing_files)
 
     def merge_dataset(self, ds: AbstractDataset) -> None:
         # Update data with new root for self

From e6faaf6f1d30b12ba751a2075bf70b3cbf315467 Mon Sep 17 00:00:00 2001
From: Olivier Dulcy <olivier.dulcy@mindee.co>
Date: Sat, 6 Jan 2024 16:33:12 +0100
Subject: [PATCH 02/39] fix in collate_fn

---
 doctr/datasets/datasets/tensorflow.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/doctr/datasets/datasets/tensorflow.py b/doctr/datasets/datasets/tensorflow.py
index 86b7b7928..da7890f97 100644
--- a/doctr/datasets/datasets/tensorflow.py
+++ b/doctr/datasets/datasets/tensorflow.py
@@ -49,10 +49,18 @@ def _read_sample(self, index: int) -> Tuple[tf.Tensor, Any]:
 
     @staticmethod
     def collate_fn(samples: List[Tuple[tf.Tensor, Any]]) -> Tuple[tf.Tensor, List[Any]]:
-        images, targets = zip(*samples)
+        # FIXME: workaround, samples are silently dropped so a batch may hold fewer items than `samples`
+        # Keep only the samples whose image has a last dimension equal to 3, skip the others
+        images, targets = [], []
+        for sample in samples:
+            if sample[0].shape[-1] == 3:
+                images.append(sample[0])
+                targets.append(sample[1])
+
+        # NOTE(review): if every sample is skipped, `images` is an empty list here - confirm tf.stack copes
         images = tf.stack(images, axis=0)
 
-        return images, list(targets)
+        return images, targets
 
 
 class VisionDataset(AbstractDataset, _VisionDataset):  # noqa: D101

From b5a41bdbb37cf4738b3904e0603d0428cba3524a Mon Sep 17 00:00:00 2001
From: Olivier Dulcy <olivier.dulcy@mindee.co>
Date: Sat, 6 Jan 2024 17:04:21 +0100
Subject: [PATCH 03/39] Problems with augmentations involving _gaussian_filter

---
 references/detection/train_tensorflow.py | 28 ++++++++++++++----------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/references/detection/train_tensorflow.py b/references/detection/train_tensorflow.py
index 05ee7c890..21b27d1be 100644
--- a/references/detection/train_tensorflow.py
+++ b/references/detection/train_tensorflow.py
@@ -178,9 +178,11 @@ def main(args):
     with open(os.path.join(args.val_path, "labels.json"), "rb") as f:
         val_hash = hashlib.sha256(f.read()).hexdigest()
 
-    batch_transforms = T.Compose([
-        T.Normalize(mean=(0.798, 0.785, 0.772), std=(0.264, 0.2749, 0.287)),
-    ])
+    batch_transforms = T.Compose(
+        [
+            T.Normalize(mean=(0.798, 0.785, 0.772), std=(0.264, 0.2749, 0.287)),
+        ]
+    )
 
     # Load doctr model
     model = detection.__dict__[args.arch](
@@ -223,9 +225,9 @@ def main(args):
             # Augmentations
             T.RandomApply(T.ColorInversion(), 0.1),
             T.RandomJpegQuality(60),
-            T.RandomApply(T.GaussianNoise(mean=0.1, std=0.1), 0.1),
-            T.RandomApply(T.RandomShadow(), 0.1),
-            T.RandomApply(T.GaussianBlur(kernel_shape=3, std=(0.1, 0.1)), 0.1),
+            #T.RandomApply(T.GaussianNoise(mean=0.1, std=0.1), 0.1),
+            #T.RandomApply(T.RandomShadow(), 0.1),
+            #T.RandomApply(T.GaussianBlur(kernel_shape=3, std=(0.1, 0.1)), 0.1),
             T.RandomSaturation(0.3),
             T.RandomContrast(0.3),
             T.RandomBrightness(0.3),
@@ -356,12 +358,14 @@ def main(args):
         print(log_msg)
         # W&B
         if args.wb:
-            wandb.log({
-                "val_loss": val_loss,
-                "recall": recall,
-                "precision": precision,
-                "mean_iou": mean_iou,
-            })
+            wandb.log(
+                {
+                    "val_loss": val_loss,
+                    "recall": recall,
+                    "precision": precision,
+                    "mean_iou": mean_iou,
+                }
+            )
 
         # ClearML
         if args.clearml:

From fbef2cfda880b497a438ab6b0ccd95e21f824adc Mon Sep 17 00:00:00 2001
From: Olivier Dulcy <olivier.dulcy@mindee.co>
Date: Thu, 1 Feb 2024 23:19:57 +0100
Subject: [PATCH 04/39] from https://github.com/mindee/doctr/pull/1444

---
 .../differentiable_binarization/pytorch.py    | 38 ++++++++++---------
 references/detection/train_pytorch.py         | 34 +++++++++--------
 references/detection/train_tensorflow.py      | 26 +++++++------
 3 files changed, 54 insertions(+), 44 deletions(-)

diff --git a/doctr/models/detection/differentiable_binarization/pytorch.py b/doctr/models/detection/differentiable_binarization/pytorch.py
index 9e4b81ef9..17686bb28 100644
--- a/doctr/models/detection/differentiable_binarization/pytorch.py
+++ b/doctr/models/detection/differentiable_binarization/pytorch.py
@@ -57,24 +57,28 @@ def __init__(
 
         conv_layer = DeformConv2d if deform_conv else nn.Conv2d
 
-        self.in_branches = nn.ModuleList([
-            nn.Sequential(
-                conv_layer(chans, out_channels, 1, bias=False),
-                nn.BatchNorm2d(out_channels),
-                nn.ReLU(inplace=True),
-            )
-            for idx, chans in enumerate(in_channels)
-        ])
+        self.in_branches = nn.ModuleList(
+            [
+                nn.Sequential(
+                    conv_layer(chans, out_channels, 1, bias=False),
+                    nn.BatchNorm2d(out_channels),
+                    nn.ReLU(inplace=True),
+                )
+                for idx, chans in enumerate(in_channels)
+            ]
+        )
         self.upsample = nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True)
-        self.out_branches = nn.ModuleList([
-            nn.Sequential(
-                conv_layer(out_channels, out_chans, 3, padding=1, bias=False),
-                nn.BatchNorm2d(out_chans),
-                nn.ReLU(inplace=True),
-                nn.Upsample(scale_factor=2**idx, mode="bilinear", align_corners=True),
-            )
-            for idx, chans in enumerate(in_channels)
-        ])
+        self.out_branches = nn.ModuleList(
+            [
+                nn.Sequential(
+                    conv_layer(out_channels, out_chans, 3, padding=1, bias=False),
+                    nn.BatchNorm2d(out_chans),
+                    nn.ReLU(inplace=True),
+                    nn.Upsample(scale_factor=2**idx, mode="bilinear", align_corners=True),
+                )
+                for idx, chans in enumerate(in_channels)
+            ]
+        )
 
     def forward(self, x: List[torch.Tensor]) -> torch.Tensor:
         if len(x) != len(self.out_branches):
diff --git a/references/detection/train_pytorch.py b/references/detection/train_pytorch.py
index 4f6401151..e3fe2c9f8 100644
--- a/references/detection/train_pytorch.py
+++ b/references/detection/train_pytorch.py
@@ -266,15 +266,17 @@ def main(args):
     train_set = DetectionDataset(
         img_folder=os.path.join(args.train_path, "images"),
         label_path=os.path.join(args.train_path, "labels.json"),
-        img_transforms=Compose([
-            # Augmentations
-            T.RandomApply(T.ColorInversion(), 0.1),
-            T.RandomApply(T.GaussianNoise(mean=0.1, std=0.1), 0.1),
-            T.RandomApply(T.RandomShadow(), 0.1),
-            T.RandomApply(GaussianBlur(kernel_size=3), 0.1),
-            RandomPhotometricDistort(p=0.05),
-            RandomGrayscale(p=0.05),
-        ]),
+        img_transforms=Compose(
+            [
+                # Augmentations
+                T.RandomApply(T.ColorInversion(), 0.1),
+                T.RandomApply(T.GaussianNoise(mean=0.1, std=0.1), 0.1),
+                T.RandomApply(T.RandomShadow(), 0.1),
+                T.RandomApply(GaussianBlur(kernel_size=3), 0.1),
+                RandomPhotometricDistort(p=0.05),
+                RandomGrayscale(p=0.05),
+            ]
+        ),
         sample_transforms=T.SampleCompose(
             (
                 [T.Resize((args.input_size, args.input_size), preserve_aspect_ratio=True, symmetric_pad=True)]
@@ -390,12 +392,14 @@ def main(args):
         print(log_msg)
         # W&B
         if args.wb:
-            wandb.log({
-                "val_loss": val_loss,
-                "recall": recall,
-                "precision": precision,
-                "mean_iou": mean_iou,
-            })
+            wandb.log(
+                {
+                    "val_loss": val_loss,
+                    "recall": recall,
+                    "precision": precision,
+                    "mean_iou": mean_iou,
+                }
+            )
         if args.early_stop and early_stopper.early_stop(val_loss):
             print("Training halted early due to reaching patience limit.")
             break
diff --git a/references/detection/train_tensorflow.py b/references/detection/train_tensorflow.py
index 21b27d1be..fd0271c18 100644
--- a/references/detection/train_tensorflow.py
+++ b/references/detection/train_tensorflow.py
@@ -221,18 +221,20 @@ def main(args):
     train_set = DetectionDataset(
         img_folder=os.path.join(args.train_path, "images"),
         label_path=os.path.join(args.train_path, "labels.json"),
-        img_transforms=T.Compose([
-            # Augmentations
-            T.RandomApply(T.ColorInversion(), 0.1),
-            T.RandomJpegQuality(60),
-            #T.RandomApply(T.GaussianNoise(mean=0.1, std=0.1), 0.1),
-            #T.RandomApply(T.RandomShadow(), 0.1),
-            #T.RandomApply(T.GaussianBlur(kernel_shape=3, std=(0.1, 0.1)), 0.1),
-            T.RandomSaturation(0.3),
-            T.RandomContrast(0.3),
-            T.RandomBrightness(0.3),
-            T.RandomApply(T.ToGray(num_output_channels=3), 0.05),
-        ]),
+        img_transforms=T.Compose(
+            [
+                # Augmentations
+                T.RandomApply(T.ColorInversion(), 0.1),
+                T.RandomJpegQuality(60),
+                # T.RandomApply(T.GaussianNoise(mean=0.1, std=0.1), 0.1),
+                # T.RandomApply(T.RandomShadow(), 0.1),
+                # T.RandomApply(T.GaussianBlur(kernel_shape=3, std=(0.1, 0.1)), 0.1),
+                T.RandomSaturation(0.3),
+                T.RandomContrast(0.3),
+                T.RandomBrightness(0.3),
+                T.RandomApply(T.ToGray(num_output_channels=3), 0.05),
+            ]
+        ),
         sample_transforms=T.SampleCompose(
             (
                 [T.Resize((args.input_size, args.input_size), preserve_aspect_ratio=True, symmetric_pad=True)]

From c350bd7c4b3b4ede2b9fcbf17567f37a970b3e35 Mon Sep 17 00:00:00 2001
From: Olivier Dulcy <olivier.dulcy@mindee.co>
Date: Tue, 6 Feb 2024 13:30:50 +0100
Subject: [PATCH 05/39] send message on slack (pytorch script)

---
 references/detection/train_pytorch.py | 45 ++++++++++++++++++++++++++-
 1 file changed, 44 insertions(+), 1 deletion(-)

diff --git a/references/detection/train_pytorch.py b/references/detection/train_pytorch.py
index e3fe2c9f8..d23111f18 100644
--- a/references/detection/train_pytorch.py
+++ b/references/detection/train_pytorch.py
@@ -12,6 +12,7 @@
 import logging
 import multiprocessing as mp
 import time
+from pathlib import Path
 
 import numpy as np
 import psutil
@@ -28,6 +29,32 @@
 from doctr.utils.metrics import LocalizationConfusion
 from utils import EarlyStopper, plot_recorder, plot_samples
 
+SLACK_WEBHOOK_URL = None
+SLACK_WEBHOOK_PATH = Path(os.path.join(os.path.expanduser("~"), ".config", "doctr", "slack_webhook_url.txt"))
+if SLACK_WEBHOOK_PATH.exists():
+    SLACK_WEBHOOK_URL = SLACK_WEBHOOK_PATH.read_text().strip()
+else:
+    print(f"{SLACK_WEBHOOK_PATH} does not exist, skip Slack integration configuration...")
+
+
+def send_on_slack(text: str):
+    """Send a timestamped message on Slack (best effort: a failure is only printed, never raised).
+
+    Args:
+        text (str): message to send on Slack
+    """
+    if SLACK_WEBHOOK_URL:
+        try:
+            import requests
+
+            requests.post(
+                url=SLACK_WEBHOOK_URL,
+                json={"text": f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]: {text}"},
+                timeout=10,  # seconds; without it a stalled webhook would block the training loop
+            )
+        except Exception:
+            print("Impossible to send message on Slack, continue...")
+
 
 def record_lr(
     model: torch.nn.Module,
@@ -106,6 +133,8 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, a
 
     model.train()
     # Iterate over the batches of the dataset
+    last_progress = 0
+    interval_progress = 5
     pbar = tqdm(train_loader, position=1)
     for images, targets in pbar:
         if torch.cuda.is_available():
@@ -130,8 +159,12 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, a
             optimizer.step()
 
         scheduler.step()
-
         pbar.set_description(f"Training loss: {train_loss.item():.6}")
+        current_progress = pbar.n / pbar.total * 100
+        if current_progress - last_progress > interval_progress:
+            send_on_slack(str(pbar))
+            last_progress = int(current_progress)
+    send_on_slack(f"Final training loss: {train_loss.item():.6}")
 
 
 @torch.no_grad()
@@ -170,6 +203,7 @@ def evaluate(model, val_loader, batch_transforms, val_metric, amp=False):
 
 def main(args):
     print(args)
+    send_on_slack(f"Start training: {args}")
 
     if args.push_to_hub:
         login_to_hub()
@@ -212,6 +246,9 @@ def main(args):
         collate_fn=val_set.collate_fn,
     )
     print(f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in " f"{len(val_loader)} batches)")
+    send_on_slack(
+        f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in " f"{len(val_loader)} batches)"
+    )
     with open(os.path.join(args.val_path, "labels.json"), "rb") as f:
         val_hash = hashlib.sha256(f.read()).hexdigest()
 
@@ -227,6 +264,7 @@ def main(args):
     # Resume weights
     if isinstance(args.resume, str):
         print(f"Resuming {args.resume}")
+        send_on_slack(f"Resuming {args.resume}")
         checkpoint = torch.load(args.resume, map_location="cpu")
         model.load_state_dict(checkpoint)
 
@@ -306,6 +344,9 @@ def main(args):
         collate_fn=train_set.collate_fn,
     )
     print(f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in " f"{len(train_loader)} batches)")
+    send_on_slack(
+        f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in " f"{len(train_loader)} batches)"
+    )
     with open(os.path.join(args.train_path, "labels.json"), "rb") as f:
         train_hash = hashlib.sha256(f.read()).hexdigest()
 
@@ -379,6 +420,7 @@ def main(args):
         val_loss, recall, precision, mean_iou = evaluate(model, val_loader, batch_transforms, val_metric, amp=args.amp)
         if val_loss < min_loss:
             print(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...")
+            send_on_slack(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...")
             torch.save(model.state_dict(), f"./{exp_name}.pt")
             min_loss = val_loss
         if args.save_interval_epoch:
@@ -390,6 +432,7 @@ def main(args):
         else:
             log_msg += f"(Recall: {recall:.2%} | Precision: {precision:.2%} | Mean IoU: {mean_iou:.2%})"
         print(log_msg)
+        send_on_slack(log_msg)
         # W&B
         if args.wb:
             wandb.log(

From f16b19232803b77d7220220e97fcfc11bf7be51c Mon Sep 17 00:00:00 2001
From: Olivier Dulcy <olivier.dulcy@mindee.co>
Date: Tue, 6 Feb 2024 13:35:21 +0100
Subject: [PATCH 06/39] exclude l1_loss in db model

---
 doctr/models/detection/differentiable_binarization/pytorch.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/doctr/models/detection/differentiable_binarization/pytorch.py b/doctr/models/detection/differentiable_binarization/pytorch.py
index 17686bb28..8207b9a54 100644
--- a/doctr/models/detection/differentiable_binarization/pytorch.py
+++ b/doctr/models/detection/differentiable_binarization/pytorch.py
@@ -287,7 +287,8 @@ def compute_loss(
         if torch.any(thresh_mask):
             l1_loss = (torch.abs(thresh_map - thresh_target) * thresh_mask).sum() / (thresh_mask.sum() + eps)
 
-        return l1_loss + focal_scale * focal_loss + dice_loss
+        # return l1_loss + focal_scale * focal_loss + dice_loss
+        return focal_scale * focal_loss + dice_loss
 
 
 def _dbnet(

From 9b0ae9226623b3608285707d0256647e6bad5a30 Mon Sep 17 00:00:00 2001
From: Olivier Dulcy <olivier.dulcy@mindee.co>
Date: Tue, 13 Feb 2024 10:15:54 +0100
Subject: [PATCH 07/39] send_on_slack tf

---
 references/detection/train_tensorflow.py | 46 ++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/references/detection/train_tensorflow.py b/references/detection/train_tensorflow.py
index fd0271c18..bed759187 100644
--- a/references/detection/train_tensorflow.py
+++ b/references/detection/train_tensorflow.py
@@ -12,6 +12,7 @@
 import hashlib
 import multiprocessing as mp
 import time
+from pathlib import Path
 
 import numpy as np
 import psutil
@@ -31,6 +32,32 @@
 from doctr.utils.metrics import LocalizationConfusion
 from utils import EarlyStopper, load_backbone, plot_recorder, plot_samples
 
+SLACK_WEBHOOK_URL = None
+SLACK_WEBHOOK_PATH = Path(os.path.join(os.path.expanduser("~"), ".config", "doctr", "slack_webhook_url.txt"))
+if SLACK_WEBHOOK_PATH.exists():
+    SLACK_WEBHOOK_URL = SLACK_WEBHOOK_PATH.read_text().strip()
+else:
+    print(f"{SLACK_WEBHOOK_PATH} does not exist, skip Slack integration configuration...")
+
+
+def send_on_slack(text: str):
+    """Send a timestamped message on Slack (best effort: a failure is only printed, never raised).
+
+    Args:
+        text (str): message to send on Slack
+    """
+    if SLACK_WEBHOOK_URL:
+        try:
+            import requests
+
+            requests.post(
+                url=SLACK_WEBHOOK_URL,
+                json={"text": f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]: {text}"},
+                timeout=10,  # seconds; without it a stalled webhook would block the training loop
+            )
+        except Exception:
+            print("Impossible to send message on Slack, continue...")
+
 
 def record_lr(
     model: tf.keras.Model,
@@ -87,6 +114,8 @@ def record_lr(
 def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False):
     train_iter = iter(train_loader)
     # Iterate over the batches of the dataset
+    last_progress = 0
+    interval_progress = 5
     pbar = tqdm(train_iter, position=1)
     for images, targets in pbar:
         images = batch_transforms(images)
@@ -99,6 +128,11 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False):
         optimizer.apply_gradients(zip(grads, model.trainable_weights))
 
         pbar.set_description(f"Training loss: {train_loss.numpy():.6}")
+        current_progress = pbar.n / pbar.total * 100
+        if current_progress - last_progress > interval_progress:
+            send_on_slack(str(pbar))
+            last_progress = int(current_progress)
+    send_on_slack(f"Final training loss: {train_loss.item():.6}")
 
 
 def evaluate(model, val_loader, batch_transforms, val_metric):
@@ -129,6 +163,7 @@ def evaluate(model, val_loader, batch_transforms, val_metric):
 
 def main(args):
     print(args)
+    send_on_slack(f"Start training: {args}")
 
     if args.push_to_hub:
         login_to_hub()
@@ -175,6 +210,10 @@ def main(args):
         f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in "
         f"{val_loader.num_batches} batches)"
     )
+    send_on_slack(
+        f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in "
+        f"{val_loader.num_batches} batches)"
+    )
     with open(os.path.join(args.val_path, "labels.json"), "rb") as f:
         val_hash = hashlib.sha256(f.read()).hexdigest()
 
@@ -264,6 +303,10 @@ def main(args):
         f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in "
         f"{train_loader.num_batches} batches)"
     )
+    send_on_slack(
+        f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in "
+        f"{train_loader.num_batches} batches)"
+    )
     with open(os.path.join(args.train_path, "labels.json"), "rb") as f:
         train_hash = hashlib.sha256(f.read()).hexdigest()
 
@@ -347,10 +390,12 @@ def main(args):
         val_loss, recall, precision, mean_iou = evaluate(model, val_loader, batch_transforms, val_metric)
         if val_loss < min_loss:
             print(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...")
+            send_on_slack(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...")
             model.save_weights(f"./{exp_name}/weights")
             min_loss = val_loss
         if args.save_interval_epoch:
             print(f"Saving state at epoch: {epoch + 1}")
+            send_on_slack(f"Saving state at epoch: {epoch + 1}")
             model.save_weights(f"./{exp_name}_{epoch + 1}/weights")
         log_msg = f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} "
         if any(val is None for val in (recall, precision, mean_iou)):
@@ -358,6 +403,7 @@ def main(args):
         else:
             log_msg += f"(Recall: {recall:.2%} | Precision: {precision:.2%} | Mean IoU: {mean_iou:.2%})"
         print(log_msg)
+        send_on_slack(log_msg)
         # W&B
         if args.wb:
             wandb.log(

From 20365993bb82ee43033c21e0782d7b56cef4604f Mon Sep 17 00:00:00 2001
From: Olivier Dulcy <olivier.dulcy@mindee.co>
Date: Tue, 13 Feb 2024 10:59:40 +0100
Subject: [PATCH 08/39] Revert "fix test"

This reverts commit 548772d131d88cea0fd4c2c650230a74ad2e9211.
---
 tests/pytorch/test_models_zoo_pt.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/pytorch/test_models_zoo_pt.py b/tests/pytorch/test_models_zoo_pt.py
index 5bcd10ee6..3c6267ab7 100644
--- a/tests/pytorch/test_models_zoo_pt.py
+++ b/tests/pytorch/test_models_zoo_pt.py
@@ -222,9 +222,9 @@ def test_trained_kie_predictor(mock_payslip):
     geometry_mr = np.array([[0.1083984375, 0.0634765625], [0.1494140625, 0.0859375]])
     assert np.allclose(np.array(out.pages[0].predictions[CLASS_NAME][0].geometry), geometry_mr, rtol=0.05)
 
-    assert out.pages[0].predictions[CLASS_NAME][4].value == "revised"
+    assert out.pages[0].predictions[CLASS_NAME][6].value == "revised"
     geometry_revised = np.array([[0.7548828125, 0.126953125], [0.8388671875, 0.1484375]])
-    assert np.allclose(np.array(out.pages[0].predictions[CLASS_NAME][4].geometry), geometry_revised, rtol=0.05)
+    assert np.allclose(np.array(out.pages[0].predictions[CLASS_NAME][6].geometry), geometry_revised, rtol=0.05)
 
     det_predictor = detection_predictor(
         "db_resnet50",

From c02a4779608e1585f98538b1bc33f951c624fab9 Mon Sep 17 00:00:00 2001
From: Olivier Dulcy <olivier.dulcy@mindee.co>
Date: Wed, 14 Feb 2024 09:47:11 +0100
Subject: [PATCH 09/39] fix: use train_loss.numpy() for the final loss in the TF script

---
 references/detection/train_tensorflow.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/references/detection/train_tensorflow.py b/references/detection/train_tensorflow.py
index bed759187..0b61c4c29 100644
--- a/references/detection/train_tensorflow.py
+++ b/references/detection/train_tensorflow.py
@@ -132,7 +132,7 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False):
         if current_progress - last_progress > interval_progress:
             send_on_slack(str(pbar))
             last_progress = int(current_progress)
-    send_on_slack(f"Final training loss: {train_loss.item():.6}")
+    send_on_slack(f"Final training loss: {train_loss.numpy():.6}")
 
 
 def evaluate(model, val_loader, batch_transforms, val_metric):

From 2602a5da533ea8be1d4af9f7c61a57cbf341dc77 Mon Sep 17 00:00:00 2001
From: Olivier Dulcy <olivier.dulcy@mindee.co>
Date: Mon, 19 Feb 2024 10:18:52 +0100
Subject: [PATCH 10/39] Display pbar before starting training

---
 references/detection/train_pytorch.py    | 1 +
 references/detection/train_tensorflow.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/references/detection/train_pytorch.py b/references/detection/train_pytorch.py
index d23111f18..fccc092f1 100644
--- a/references/detection/train_pytorch.py
+++ b/references/detection/train_pytorch.py
@@ -136,6 +136,7 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, a
     last_progress = 0
     interval_progress = 5
     pbar = tqdm(train_loader, position=1)
+    send_on_slack(str(pbar))
     for images, targets in pbar:
         if torch.cuda.is_available():
             images = images.cuda()
diff --git a/references/detection/train_tensorflow.py b/references/detection/train_tensorflow.py
index 0b61c4c29..62788331f 100644
--- a/references/detection/train_tensorflow.py
+++ b/references/detection/train_tensorflow.py
@@ -117,6 +117,7 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False):
     last_progress = 0
     interval_progress = 5
     pbar = tqdm(train_iter, position=1)
+    send_on_slack(str(pbar))
     for images, targets in pbar:
         images = batch_transforms(images)
 

From 578d9b82a34b13c724645c3448ea123ffdd58240 Mon Sep 17 00:00:00 2001
From: Olivier Dulcy <olivier.dulcy@mindee.co>
Date: Wed, 6 Mar 2024 18:01:52 +0100
Subject: [PATCH 11/39] temp eval with cord funsd from felix

---
 references/detection/train_pytorch.py | 117 +++++++++++++++++++++++++-
 1 file changed, 116 insertions(+), 1 deletion(-)

diff --git a/references/detection/train_pytorch.py b/references/detection/train_pytorch.py
index fccc092f1..dc5263c1a 100644
--- a/references/detection/train_pytorch.py
+++ b/references/detection/train_pytorch.py
@@ -4,6 +4,10 @@
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import os
+from pathlib import Path
+from doctr.file_utils import CLASS_NAME
+from doctr import datasets
+
 
 os.environ["USE_TORCH"] = "1"
 
@@ -201,6 +205,38 @@ def evaluate(model, val_loader, batch_transforms, val_metric, amp=False):
     recall, precision, mean_iou = val_metric.summary()
     return val_loss, recall, precision, mean_iou
 
+@torch.no_grad()
+def sec_evaluate(model, val_loader, batch_transforms, val_metric, amp=False):
+    # Model in eval mode
+    model.eval()
+    # Reset val metric
+    val_metric.reset()
+    # Validation loop
+    val_loss, batch_cnt = 0, 0
+    for images, targets in tqdm(val_loader):
+        if torch.cuda.is_available():
+            images = images.cuda()
+        images = batch_transforms(images)
+        targets = [{CLASS_NAME: t["boxes"]} for t in targets]
+        if amp:
+            with torch.cuda.amp.autocast():
+                out = model(images, targets, return_preds=True)
+        else:
+            out = model(images, targets, return_preds=True)
+        # Compute metric
+        loc_preds = out["preds"]
+        for target, loc_pred in zip(targets, loc_preds):
+            for boxes_gt, boxes_pred in zip(target.values(), loc_pred.values()):
+                # Remove scores
+                val_metric.update(gts=boxes_gt, preds=boxes_pred[:, :-1])
+
+        val_loss += out["loss"].item()
+        batch_cnt += 1
+
+    val_loss /= batch_cnt
+    recall, precision, mean_iou = val_metric.summary()
+    return val_loss, recall, precision, mean_iou
+
 
 def main(args):
     print(args)
@@ -255,6 +291,67 @@ def main(args):
 
     batch_transforms = Normalize(mean=(0.798, 0.785, 0.772), std=(0.264, 0.2749, 0.287))
 
+    funsd_ds = datasets.FUNSD(
+        train=True,
+        download=True,
+        use_polygons=args.rotation,
+        sample_transforms=T.Resize((args.input_size, args.input_size)),
+    )
+    # Monkeypatch
+    subfolder = funsd_ds.root.split("/")[-2:]
+    funsd_ds.root = str(Path(funsd_ds.root).parent.parent)
+    funsd_ds.data = [(os.path.join(*subfolder, name), target) for name, target in funsd_ds.data]
+    _funsd_ds = datasets.FUNSD(
+        train=False,
+        download=True,
+        use_polygons=args.rotation,
+        sample_transforms=T.Resize((args.input_size, args.input_size)),
+    )
+    subfolder = _funsd_ds.root.split("/")[-2:]
+    funsd_ds.data.extend([(os.path.join(*subfolder, name), target) for name, target in _funsd_ds.data])
+
+    funsd_test_loader = DataLoader(
+        funsd_ds,
+        batch_size=args.batch_size,
+        drop_last=False,
+        num_workers=args.workers,
+        sampler=SequentialSampler(funsd_ds),
+        pin_memory=torch.cuda.is_available(),
+        collate_fn=funsd_ds.collate_fn,
+    )
+    print(f"FUNSD Test set loaded in {time.time() - st:.4}s ({len(funsd_ds)} samples in " f"{len(funsd_test_loader)} batches)")
+
+
+    cord_ds = datasets.CORD(
+        train=True,
+        download=True,
+        use_polygons=args.rotation,
+        sample_transforms=T.Resize((args.input_size, args.input_size)),
+    )
+    # Monkeypatch
+    subfolder = cord_ds.root.split("/")[-2:]
+    cord_ds.root = str(Path(cord_ds.root).parent.parent)
+    cord_ds.data = [(os.path.join(*subfolder, name), target) for name, target in cord_ds.data]
+    _cord_ds = datasets.CORD(
+        train=False,
+        download=True,
+        use_polygons=args.rotation,
+        sample_transforms=T.Resize((args.input_size, args.input_size)),
+    )
+    subfolder = _cord_ds.root.split("/")[-2:]
+    cord_ds.data.extend([(os.path.join(*subfolder, name), target) for name, target in _cord_ds.data])
+
+    cord_test_loader = DataLoader(
+        cord_ds,
+        batch_size=args.batch_size,
+        drop_last=False,
+        num_workers=args.workers,
+        sampler=SequentialSampler(cord_ds),
+        pin_memory=torch.cuda.is_available(),
+        collate_fn=cord_ds.collate_fn,
+    )
+    print(f"CORD Test set loaded in {time.time() - st:.4}s ({len(cord_ds)} samples in " f"{len(funsd_test_loader)} batches)")
+
     # Load doctr model
     model = detection.__dict__[args.arch](
         pretrained=args.pretrained,
@@ -290,6 +387,16 @@ def main(args):
         mask_shape=(args.input_size, args.input_size),
         use_broadcasting=True if system_available_memory > 62 else False,
     )
+    funsd_val_metric = LocalizationConfusion(
+        use_polygons=args.rotation and not args.eval_straight,
+        mask_shape=(args.input_size, args.input_size),
+        use_broadcasting=True if system_available_memory > 62 else False,
+    )
+    cord_val_metric = LocalizationConfusion(
+        use_polygons=args.rotation and not args.eval_straight,
+        mask_shape=(args.input_size, args.input_size),
+        use_broadcasting=True if system_available_memory > 62 else False,
+    )
 
     if args.test_only:
         print("Running evaluation")
@@ -419,6 +526,12 @@ def main(args):
         fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=args.amp)
         # Validation loop at the end of each epoch
         val_loss, recall, precision, mean_iou = evaluate(model, val_loader, batch_transforms, val_metric, amp=args.amp)
+        _, funsd_recall, funsd_precision, funsd_mean_iou = sec_evaluate(
+            model, funsd_test_loader, batch_transforms, funsd_val_metric, amp=args.amp
+        )
+        _, cord_recall, cord_precision, cord_mean_iou = sec_evaluate(
+            model, cord_test_loader, batch_transforms, cord_val_metric, amp=args.amp
+        )
         if val_loss < min_loss:
             print(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...")
             send_on_slack(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...")
@@ -431,7 +544,9 @@ def main(args):
         if any(val is None for val in (recall, precision, mean_iou)):
             log_msg += "(Undefined metric value, caused by empty GTs or predictions)"
         else:
-            log_msg += f"(Recall: {recall:.2%} | Precision: {precision:.2%} | Mean IoU: {mean_iou:.2%})"
+            log_msg += f"(Recall: {recall:.2%} | Precision: {precision:.2%} | Mean IoU: {mean_iou:.2%})\n"
+            log_msg += f"FUNSD: Recall: {funsd_recall:.2%} | Precision: {funsd_precision:.2%} | Mean IoU: {funsd_mean_iou:.2%}\n"
+            log_msg += f"CORD: Recall: {cord_recall:.2%} | Precision: {cord_precision:.2%} | Mean IoU: {cord_mean_iou:.2%}"
         print(log_msg)
         send_on_slack(log_msg)
         # W&B

From ccf19d9aa62bb47b913a9108212fba784147463c Mon Sep 17 00:00:00 2001
From: Olivier Dulcy <olivier.dulcy@mindee.co>
Date: Fri, 8 Mar 2024 14:15:49 +0100
Subject: [PATCH 12/39] wrap sec_evaluate calls in try/except

---
 references/detection/train_pytorch.py | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/references/detection/train_pytorch.py b/references/detection/train_pytorch.py
index dc5263c1a..d4c2510db 100644
--- a/references/detection/train_pytorch.py
+++ b/references/detection/train_pytorch.py
@@ -526,12 +526,20 @@ def main(args):
         fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=args.amp)
         # Validation loop at the end of each epoch
         val_loss, recall, precision, mean_iou = evaluate(model, val_loader, batch_transforms, val_metric, amp=args.amp)
-        _, funsd_recall, funsd_precision, funsd_mean_iou = sec_evaluate(
-            model, funsd_test_loader, batch_transforms, funsd_val_metric, amp=args.amp
-        )
-        _, cord_recall, cord_precision, cord_mean_iou = sec_evaluate(
-            model, cord_test_loader, batch_transforms, cord_val_metric, amp=args.amp
-        )
+        funsd_recall, funsd_precision, funsd_mean_iou = 0.0, 0.0, 0.0
+        cord_recall, cord_precision, cord_mean_iou = 0.0, 0.0, 0.0
+        try:
+            _, funsd_recall, funsd_precision, funsd_mean_iou = sec_evaluate(
+                model, funsd_test_loader, batch_transforms, funsd_val_metric, amp=args.amp
+            )
+        except Exception:
+            pass
+        try:
+            _, cord_recall, cord_precision, cord_mean_iou = sec_evaluate(
+                model, cord_test_loader, batch_transforms, cord_val_metric, amp=args.amp
+            )
+        except Exception:
+            pass
         if val_loss < min_loss:
             print(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...")
             send_on_slack(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...")

From a5a6101a8ba76c2a6bc346a4954d031551ce3ef3 Mon Sep 17 00:00:00 2001
From: Olivier Dulcy <olivier.dulcy@mindee.co>
Date: Fri, 8 Mar 2024 14:19:44 +0100
Subject: [PATCH 13/39] send evaluate() progress bar to Slack

---
 references/detection/train_pytorch.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/references/detection/train_pytorch.py b/references/detection/train_pytorch.py
index d4c2510db..d33e932f6 100644
--- a/references/detection/train_pytorch.py
+++ b/references/detection/train_pytorch.py
@@ -178,9 +178,13 @@ def evaluate(model, val_loader, batch_transforms, val_metric, amp=False):
     model.eval()
     # Reset val metric
     val_metric.reset()
+    last_progress = 0
+    interval_progress = 5
+    pbar = tqdm(val_loader)
+    send_on_slack(str(pbar))
     # Validation loop
     val_loss, batch_cnt = 0, 0
-    for images, targets in tqdm(val_loader):
+    for images, targets in pbar:
         if torch.cuda.is_available():
             images = images.cuda()
         images = batch_transforms(images)
@@ -198,6 +202,10 @@ def evaluate(model, val_loader, batch_transforms, val_metric, amp=False):
                     boxes_pred = np.concatenate((boxes_pred.min(axis=1), boxes_pred.max(axis=1)), axis=-1)
                 val_metric.update(gts=boxes_gt, preds=boxes_pred[:, :4])
 
+        current_progress = pbar.n / pbar.total * 100
+        if current_progress - last_progress > interval_progress:
+            send_on_slack(str(pbar))
+            last_progress = int(current_progress)
         val_loss += out["loss"].item()
         batch_cnt += 1
 

From cc795f604854ef66d05e60bacf01684ac30cce97 Mon Sep 17 00:00:00 2001
From: Olivier Dulcy <olivier.dulcy@mindee.co>
Date: Fri, 8 Mar 2024 14:22:02 +0100
Subject: [PATCH 14/39] send sec_evaluate() progress bar to Slack

---
 references/detection/train_pytorch.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/references/detection/train_pytorch.py b/references/detection/train_pytorch.py
index d33e932f6..2add7873c 100644
--- a/references/detection/train_pytorch.py
+++ b/references/detection/train_pytorch.py
@@ -219,9 +219,13 @@ def sec_evaluate(model, val_loader, batch_transforms, val_metric, amp=False):
     model.eval()
     # Reset val metric
     val_metric.reset()
+    last_progress = 0
+    interval_progress = 5
+    pbar = tqdm(val_loader)
+    send_on_slack(str(pbar))
     # Validation loop
     val_loss, batch_cnt = 0, 0
-    for images, targets in tqdm(val_loader):
+    for images, targets in pbar:
         if torch.cuda.is_available():
             images = images.cuda()
         images = batch_transforms(images)
@@ -238,6 +242,10 @@ def sec_evaluate(model, val_loader, batch_transforms, val_metric, amp=False):
                 # Remove scores
                 val_metric.update(gts=boxes_gt, preds=boxes_pred[:, :-1])
 
+        current_progress = pbar.n / pbar.total * 100
+        if current_progress - last_progress > interval_progress:
+            send_on_slack(str(pbar))
+            last_progress = int(current_progress)
         val_loss += out["loss"].item()
         batch_cnt += 1
 

From 34e32ebea31b6626dbe61e576b8b22fe157554df Mon Sep 17 00:00:00 2001
From: Olivier Dulcy <olivier.dulcy@mindee.co>
Date: Mon, 11 Mar 2024 19:04:10 +0100
Subject: [PATCH 15/39] apply patch from
 https://github.com/felixdittrich92/doctr/commit/27bc838a44784f1a6868693d55b12b3c5216d81c

---
 references/detection/train_pytorch.py | 120 ++++++++++----------------
 1 file changed, 44 insertions(+), 76 deletions(-)

diff --git a/references/detection/train_pytorch.py b/references/detection/train_pytorch.py
index 2add7873c..d43f610ab 100644
--- a/references/detection/train_pytorch.py
+++ b/references/detection/train_pytorch.py
@@ -213,46 +213,6 @@ def evaluate(model, val_loader, batch_transforms, val_metric, amp=False):
     recall, precision, mean_iou = val_metric.summary()
     return val_loss, recall, precision, mean_iou
 
-@torch.no_grad()
-def sec_evaluate(model, val_loader, batch_transforms, val_metric, amp=False):
-    # Model in eval mode
-    model.eval()
-    # Reset val metric
-    val_metric.reset()
-    last_progress = 0
-    interval_progress = 5
-    pbar = tqdm(val_loader)
-    send_on_slack(str(pbar))
-    # Validation loop
-    val_loss, batch_cnt = 0, 0
-    for images, targets in pbar:
-        if torch.cuda.is_available():
-            images = images.cuda()
-        images = batch_transforms(images)
-        targets = [{CLASS_NAME: t["boxes"]} for t in targets]
-        if amp:
-            with torch.cuda.amp.autocast():
-                out = model(images, targets, return_preds=True)
-        else:
-            out = model(images, targets, return_preds=True)
-        # Compute metric
-        loc_preds = out["preds"]
-        for target, loc_pred in zip(targets, loc_preds):
-            for boxes_gt, boxes_pred in zip(target.values(), loc_pred.values()):
-                # Remove scores
-                val_metric.update(gts=boxes_gt, preds=boxes_pred[:, :-1])
-
-        current_progress = pbar.n / pbar.total * 100
-        if current_progress - last_progress > interval_progress:
-            send_on_slack(str(pbar))
-            last_progress = int(current_progress)
-        val_loss += out["loss"].item()
-        batch_cnt += 1
-
-    val_loss /= batch_cnt
-    recall, precision, mean_iou = val_metric.summary()
-    return val_loss, recall, precision, mean_iou
-
 
 def main(args):
     print(args)
@@ -307,24 +267,27 @@ def main(args):
 
     batch_transforms = Normalize(mean=(0.798, 0.785, 0.772), std=(0.264, 0.2749, 0.287))
 
-    funsd_ds = datasets.FUNSD(
-        train=True,
-        download=True,
-        use_polygons=args.rotation,
-        sample_transforms=T.Resize((args.input_size, args.input_size)),
-    )
-    # Monkeypatch
-    subfolder = funsd_ds.root.split("/")[-2:]
-    funsd_ds.root = str(Path(funsd_ds.root).parent.parent)
-    funsd_ds.data = [(os.path.join(*subfolder, name), target) for name, target in funsd_ds.data]
-    _funsd_ds = datasets.FUNSD(
-        train=False,
-        download=True,
-        use_polygons=args.rotation,
-        sample_transforms=T.Resize((args.input_size, args.input_size)),
+    funsd_ds = DetectionDataset(
+        img_folder=os.path.join(args.funsd_path, "images"),
+        label_path=os.path.join(args.funsd_path, "labels.json"),
+        sample_transforms=T.SampleCompose(
+            (
+                [T.Resize((args.input_size, args.input_size), preserve_aspect_ratio=True, symmetric_pad=True)]
+                if not args.rotation or args.eval_straight
+                else []
+            )
+            + (
+                [
+                    T.Resize(args.input_size, preserve_aspect_ratio=True),  # This does not pad
+                    T.RandomApply(T.RandomRotate(90, expand=True), 0.5),
+                    T.Resize((args.input_size, args.input_size), preserve_aspect_ratio=True, symmetric_pad=True),
+                ]
+                if args.rotation and not args.eval_straight
+                else []
+            )
+        ),
+        use_polygons=args.rotation and not args.eval_straight,
     )
-    subfolder = _funsd_ds.root.split("/")[-2:]
-    funsd_ds.data.extend([(os.path.join(*subfolder, name), target) for name, target in _funsd_ds.data])
 
     funsd_test_loader = DataLoader(
         funsd_ds,
@@ -338,24 +301,27 @@ def main(args):
     print(f"FUNSD Test set loaded in {time.time() - st:.4}s ({len(funsd_ds)} samples in " f"{len(funsd_test_loader)} batches)")
 
 
-    cord_ds = datasets.CORD(
-        train=True,
-        download=True,
-        use_polygons=args.rotation,
-        sample_transforms=T.Resize((args.input_size, args.input_size)),
-    )
-    # Monkeypatch
-    subfolder = cord_ds.root.split("/")[-2:]
-    cord_ds.root = str(Path(cord_ds.root).parent.parent)
-    cord_ds.data = [(os.path.join(*subfolder, name), target) for name, target in cord_ds.data]
-    _cord_ds = datasets.CORD(
-        train=False,
-        download=True,
-        use_polygons=args.rotation,
-        sample_transforms=T.Resize((args.input_size, args.input_size)),
+    cord_ds = DetectionDataset(
+        img_folder=os.path.join(args.cord_path, "images"),
+        label_path=os.path.join(args.cord_path, "labels.json"),
+        sample_transforms=T.SampleCompose(
+            (
+                [T.Resize((args.input_size, args.input_size), preserve_aspect_ratio=True, symmetric_pad=True)]
+                if not args.rotation or args.eval_straight
+                else []
+            )
+            + (
+                [
+                    T.Resize(args.input_size, preserve_aspect_ratio=True),  # This does not pad
+                    T.RandomApply(T.RandomRotate(90, expand=True), 0.5),
+                    T.Resize((args.input_size, args.input_size), preserve_aspect_ratio=True, symmetric_pad=True),
+                ]
+                if args.rotation and not args.eval_straight
+                else []
+            )
+        ),
+        use_polygons=args.rotation and not args.eval_straight,
     )
-    subfolder = _cord_ds.root.split("/")[-2:]
-    cord_ds.data.extend([(os.path.join(*subfolder, name), target) for name, target in _cord_ds.data])
 
     cord_test_loader = DataLoader(
         cord_ds,
@@ -545,13 +511,13 @@ def main(args):
         funsd_recall, funsd_precision, funsd_mean_iou = 0.0, 0.0, 0.0
         cord_recall, cord_precision, cord_mean_iou = 0.0, 0.0, 0.0
         try:
-            _, funsd_recall, funsd_precision, funsd_mean_iou = sec_evaluate(
+            _, funsd_recall, funsd_precision, funsd_mean_iou = evaluate(
                 model, funsd_test_loader, batch_transforms, funsd_val_metric, amp=args.amp
             )
         except Exception:
             pass
         try:
-            _, cord_recall, cord_precision, cord_mean_iou = sec_evaluate(
+            _, cord_recall, cord_precision, cord_mean_iou = evaluate(
                 model, cord_test_loader, batch_transforms, cord_val_metric, amp=args.amp
             )
         except Exception:
@@ -603,6 +569,8 @@ def parse_args():
 
     parser.add_argument("train_path", type=str, help="path to training data folder")
     parser.add_argument("val_path", type=str, help="path to validation data folder")
+    parser.add_argument("funsd_path", type=str, help="path to FUNSD data folder")
+    parser.add_argument("cord_path", type=str, help="path to Cord data folder")
     parser.add_argument("arch", type=str, help="text-detection model to train")
     parser.add_argument("--name", type=str, default=None, help="Name of your training experiment")
     parser.add_argument("--epochs", type=int, default=10, help="number of epochs to train the model on")

From 2e90794343b7f9144a36908bb3b780db9795e080 Mon Sep 17 00:00:00 2001
From: Olivier Dulcy <olivier.dulcy@mindee.co>
Date: Wed, 20 Mar 2024 11:59:09 +0100
Subject: [PATCH 16/39] comment out FUNSD/CORD secondary validation

---
 references/detection/train_pytorch.py | 178 +++++++++++++-------------
 1 file changed, 88 insertions(+), 90 deletions(-)

diff --git a/references/detection/train_pytorch.py b/references/detection/train_pytorch.py
index d43f610ab..4cde7df4b 100644
--- a/references/detection/train_pytorch.py
+++ b/references/detection/train_pytorch.py
@@ -267,72 +267,72 @@ def main(args):
 
     batch_transforms = Normalize(mean=(0.798, 0.785, 0.772), std=(0.264, 0.2749, 0.287))
 
-    funsd_ds = DetectionDataset(
-        img_folder=os.path.join(args.funsd_path, "images"),
-        label_path=os.path.join(args.funsd_path, "labels.json"),
-        sample_transforms=T.SampleCompose(
-            (
-                [T.Resize((args.input_size, args.input_size), preserve_aspect_ratio=True, symmetric_pad=True)]
-                if not args.rotation or args.eval_straight
-                else []
-            )
-            + (
-                [
-                    T.Resize(args.input_size, preserve_aspect_ratio=True),  # This does not pad
-                    T.RandomApply(T.RandomRotate(90, expand=True), 0.5),
-                    T.Resize((args.input_size, args.input_size), preserve_aspect_ratio=True, symmetric_pad=True),
-                ]
-                if args.rotation and not args.eval_straight
-                else []
-            )
-        ),
-        use_polygons=args.rotation and not args.eval_straight,
-    )
-
-    funsd_test_loader = DataLoader(
-        funsd_ds,
-        batch_size=args.batch_size,
-        drop_last=False,
-        num_workers=args.workers,
-        sampler=SequentialSampler(funsd_ds),
-        pin_memory=torch.cuda.is_available(),
-        collate_fn=funsd_ds.collate_fn,
-    )
-    print(f"FUNSD Test set loaded in {time.time() - st:.4}s ({len(funsd_ds)} samples in " f"{len(funsd_test_loader)} batches)")
-
-
-    cord_ds = DetectionDataset(
-        img_folder=os.path.join(args.cord_path, "images"),
-        label_path=os.path.join(args.cord_path, "labels.json"),
-        sample_transforms=T.SampleCompose(
-            (
-                [T.Resize((args.input_size, args.input_size), preserve_aspect_ratio=True, symmetric_pad=True)]
-                if not args.rotation or args.eval_straight
-                else []
-            )
-            + (
-                [
-                    T.Resize(args.input_size, preserve_aspect_ratio=True),  # This does not pad
-                    T.RandomApply(T.RandomRotate(90, expand=True), 0.5),
-                    T.Resize((args.input_size, args.input_size), preserve_aspect_ratio=True, symmetric_pad=True),
-                ]
-                if args.rotation and not args.eval_straight
-                else []
-            )
-        ),
-        use_polygons=args.rotation and not args.eval_straight,
-    )
-
-    cord_test_loader = DataLoader(
-        cord_ds,
-        batch_size=args.batch_size,
-        drop_last=False,
-        num_workers=args.workers,
-        sampler=SequentialSampler(cord_ds),
-        pin_memory=torch.cuda.is_available(),
-        collate_fn=cord_ds.collate_fn,
-    )
-    print(f"CORD Test set loaded in {time.time() - st:.4}s ({len(cord_ds)} samples in " f"{len(funsd_test_loader)} batches)")
+    #funsd_ds = DetectionDataset(
+    #    img_folder=os.path.join(args.funsd_path, "images"),
+    #    label_path=os.path.join(args.funsd_path, "labels.json"),
+    #    sample_transforms=T.SampleCompose(
+    #        (
+    #            [T.Resize((args.input_size, args.input_size), preserve_aspect_ratio=True, symmetric_pad=True)]
+    #            if not args.rotation or args.eval_straight
+    #            else []
+    #        )
+    #        + (
+    #            [
+    #                T.Resize(args.input_size, preserve_aspect_ratio=True),  # This does not pad
+    #                T.RandomApply(T.RandomRotate(90, expand=True), 0.5),
+    #                T.Resize((args.input_size, args.input_size), preserve_aspect_ratio=True, symmetric_pad=True),
+    #            ]
+    #            if args.rotation and not args.eval_straight
+    #            else []
+    #        )
+    #    ),
+    #    use_polygons=args.rotation and not args.eval_straight,
+    #)
+
+    #funsd_test_loader = DataLoader(
+    #    funsd_ds,
+    #    batch_size=args.batch_size,
+    #    drop_last=False,
+    #    num_workers=args.workers,
+    #    sampler=SequentialSampler(funsd_ds),
+    #    pin_memory=torch.cuda.is_available(),
+    #    collate_fn=funsd_ds.collate_fn,
+    #)
+    #print(f"FUNSD Test set loaded in {time.time() - st:.4}s ({len(funsd_ds)} samples in " f"{len(funsd_test_loader)} batches)")
+
+
+    #cord_ds = DetectionDataset(
+    #    img_folder=os.path.join(args.cord_path, "images"),
+    #    label_path=os.path.join(args.cord_path, "labels.json"),
+    #    sample_transforms=T.SampleCompose(
+    #        (
+    #            [T.Resize((args.input_size, args.input_size), preserve_aspect_ratio=True, symmetric_pad=True)]
+    #            if not args.rotation or args.eval_straight
+    #            else []
+    #        )
+    #        + (
+    #            [
+    #                T.Resize(args.input_size, preserve_aspect_ratio=True),  # This does not pad
+    #                T.RandomApply(T.RandomRotate(90, expand=True), 0.5),
+    #                T.Resize((args.input_size, args.input_size), preserve_aspect_ratio=True, symmetric_pad=True),
+    #            ]
+    #            if args.rotation and not args.eval_straight
+    #            else []
+    #        )
+    #    ),
+    #    use_polygons=args.rotation and not args.eval_straight,
+    #)
+
+    #cord_test_loader = DataLoader(
+    #    cord_ds,
+    #    batch_size=args.batch_size,
+    #    drop_last=False,
+    #    num_workers=args.workers,
+    #    sampler=SequentialSampler(cord_ds),
+    #    pin_memory=torch.cuda.is_available(),
+    #    collate_fn=cord_ds.collate_fn,
+    #)
+    #print(f"CORD Test set loaded in {time.time() - st:.4}s ({len(cord_ds)} samples in " f"{len(funsd_test_loader)} batches)")
 
     # Load doctr model
     model = detection.__dict__[args.arch](
@@ -369,16 +369,16 @@ def main(args):
         mask_shape=(args.input_size, args.input_size),
         use_broadcasting=True if system_available_memory > 62 else False,
     )
-    funsd_val_metric = LocalizationConfusion(
-        use_polygons=args.rotation and not args.eval_straight,
-        mask_shape=(args.input_size, args.input_size),
-        use_broadcasting=True if system_available_memory > 62 else False,
-    )
-    cord_val_metric = LocalizationConfusion(
-        use_polygons=args.rotation and not args.eval_straight,
-        mask_shape=(args.input_size, args.input_size),
-        use_broadcasting=True if system_available_memory > 62 else False,
-    )
+    #funsd_val_metric = LocalizationConfusion(
+    #    use_polygons=args.rotation and not args.eval_straight,
+    #    mask_shape=(args.input_size, args.input_size),
+    #    use_broadcasting=True if system_available_memory > 62 else False,
+    #)
+    #cord_val_metric = LocalizationConfusion(
+    #    use_polygons=args.rotation and not args.eval_straight,
+    #    mask_shape=(args.input_size, args.input_size),
+    #    use_broadcasting=True if system_available_memory > 62 else False,
+    #)
 
     if args.test_only:
         print("Running evaluation")
@@ -510,18 +510,18 @@ def main(args):
         val_loss, recall, precision, mean_iou = evaluate(model, val_loader, batch_transforms, val_metric, amp=args.amp)
         funsd_recall, funsd_precision, funsd_mean_iou = 0.0, 0.0, 0.0
         cord_recall, cord_precision, cord_mean_iou = 0.0, 0.0, 0.0
-        try:
-            _, funsd_recall, funsd_precision, funsd_mean_iou = evaluate(
-                model, funsd_test_loader, batch_transforms, funsd_val_metric, amp=args.amp
-            )
-        except Exception:
-            pass
-        try:
-            _, cord_recall, cord_precision, cord_mean_iou = evaluate(
-                model, cord_test_loader, batch_transforms, cord_val_metric, amp=args.amp
-            )
-        except Exception:
-            pass
+        #try:
+        #    _, funsd_recall, funsd_precision, funsd_mean_iou = evaluate(
+        #        model, funsd_test_loader, batch_transforms, funsd_val_metric, amp=args.amp
+        #    )
+        #except Exception:
+        #    pass
+        #try:
+        #    _, cord_recall, cord_precision, cord_mean_iou = evaluate(
+        #        model, cord_test_loader, batch_transforms, cord_val_metric, amp=args.amp
+        #    )
+        #except Exception:
+        #    pass
         if val_loss < min_loss:
             print(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...")
             send_on_slack(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...")
@@ -569,8 +569,6 @@ def parse_args():
 
     parser.add_argument("train_path", type=str, help="path to training data folder")
     parser.add_argument("val_path", type=str, help="path to validation data folder")
-    parser.add_argument("funsd_path", type=str, help="path to FUNSD data folder")
-    parser.add_argument("cord_path", type=str, help="path to Cord data folder")
     parser.add_argument("arch", type=str, help="text-detection model to train")
     parser.add_argument("--name", type=str, default=None, help="Name of your training experiment")
     parser.add_argument("--epochs", type=int, default=10, help="number of epochs to train the model on")

From e6d393b015906f7e1d1ad5a6d86803dd74d7f86a Mon Sep 17 00:00:00 2001
From: Olivier Dulcy <olivier.dulcy@mindee.co>
Date: Mon, 15 Apr 2024 23:09:36 +0200
Subject: [PATCH 17/39] add send_on_slack to train_pytorch_orientation

---
 .../train_pytorch_orientation.py              | 57 ++++++++++++++++++-
 1 file changed, 56 insertions(+), 1 deletion(-)

diff --git a/references/classification/train_pytorch_orientation.py b/references/classification/train_pytorch_orientation.py
index 688e48564..82c2bd46a 100644
--- a/references/classification/train_pytorch_orientation.py
+++ b/references/classification/train_pytorch_orientation.py
@@ -11,6 +11,7 @@
 import logging
 import multiprocessing as mp
 import time
+from pathlib import Path
 
 import numpy as np
 import torch
@@ -35,6 +36,33 @@
 from doctr.models.utils import export_model_to_onnx
 from utils import EarlyStopper, plot_recorder, plot_samples
 
+SLACK_WEBHOOK_URL = None
+SLACK_WEBHOOK_PATH = Path(os.path.join(os.path.expanduser("~"), ".config", "doctr", "slack_webhook_url.txt"))
+if SLACK_WEBHOOK_PATH.exists():
+    with open(SLACK_WEBHOOK_PATH) as f:
+        SLACK_WEBHOOK_URL = f.read().strip()
+else:
+    print(f"{SLACK_WEBHOOK_PATH} does not exist, skip Slack integration configuration...")
+
+
+def send_on_slack(text: str):
+    """Send a message on Slack.
+
+    Args:
+        text (str): message to send on Slack
+    """
+    if SLACK_WEBHOOK_URL:
+        try:
+            import requests
+
+            requests.post(
+                url=SLACK_WEBHOOK_URL,
+                json={"text": f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]: {text}"},
+            )
+        except Exception:
+            print("Impossible to send message on Slack, continue...")
+
+
 CLASSES = [0, 90, 180, 270]
 
 
@@ -121,7 +149,10 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, a
 
     model.train()
     # Iterate over the batches of the dataset
+    last_progress = 0
+    interval_progress = 5
     pbar = tqdm(train_loader, position=1)
+    send_on_slack(str(pbar))
     for images, targets in pbar:
         if torch.cuda.is_available():
             images = images.cuda()
@@ -146,15 +177,24 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, a
         scheduler.step()
 
         pbar.set_description(f"Training loss: {train_loss.item():.6}")
+        current_progress = pbar.n / pbar.total * 100
+        if current_progress - last_progress > interval_progress:
+            send_on_slack(str(pbar))
+            last_progress = int(current_progress)
+    send_on_slack(f"Final training loss: {train_loss.item():.6}")
 
 
 @torch.no_grad()
 def evaluate(model, val_loader, batch_transforms, amp=False):
     # Model in eval mode
     model.eval()
+    last_progress = 0
+    interval_progress = 5
+    pbar = tqdm(val_loader)
+    send_on_slack(str(pbar))
     # Validation loop
     val_loss, correct, samples, batch_cnt = 0.0, 0.0, 0.0, 0.0
-    for images, targets in tqdm(val_loader):
+    for images, targets in pbar:
         images = batch_transforms(images)
 
         if torch.cuda.is_available():
@@ -175,6 +215,11 @@ def evaluate(model, val_loader, batch_transforms, amp=False):
         batch_cnt += 1
         samples += images.shape[0]
 
+        current_progress = pbar.n / pbar.total * 100
+        if current_progress - last_progress > interval_progress:
+            send_on_slack(str(pbar))
+            last_progress = int(current_progress)
+
     val_loss /= batch_cnt
     acc = correct / samples
     return val_loss, acc
@@ -214,6 +259,9 @@ def main(args):
         pin_memory=torch.cuda.is_available(),
     )
     print(f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in " f"{len(val_loader)} batches)")
+    send_on_slack(
+        f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in " f"{len(val_loader)} batches)"
+    )
 
     batch_transforms = Normalize(mean=(0.694, 0.695, 0.693), std=(0.299, 0.296, 0.301))
 
@@ -223,6 +271,7 @@ def main(args):
     # Resume weights
     if isinstance(args.resume, str):
         print(f"Resuming {args.resume}")
+        send_on_slack(f"Resuming {args.resume}")
         checkpoint = torch.load(args.resume, map_location="cpu")
         model.load_state_dict(checkpoint)
 
@@ -276,6 +325,9 @@ def main(args):
         pin_memory=torch.cuda.is_available(),
     )
     print(f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in " f"{len(train_loader)} batches)")
+    send_on_slack(
+        f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in " f"{len(train_loader)} batches)"
+    )
 
     if args.show_samples:
         x, target = next(iter(train_loader))
@@ -338,9 +390,11 @@ def main(args):
         val_loss, acc = evaluate(model, val_loader, batch_transforms)
         if val_loss < min_loss:
             print(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...")
+            send_on_slack(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...")
             torch.save(model.state_dict(), f"./{exp_name}.pt")
             min_loss = val_loss
         print(f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} (Acc: {acc:.2%})")
+        send_on_slack(f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} (Acc: {acc:.2%})")
         # W&B
         if args.wb:
             wandb.log({
@@ -349,6 +403,7 @@ def main(args):
             })
         if args.early_stop and early_stopper.early_stop(val_loss):
             print("Training halted early due to reaching patience limit.")
+            send_on_slack("Training halted early due to reaching patience limit.")
             break
     if args.wb:
         run.finish()

From 05a677970e0e1c4bdc96410bac070c8cd578a1cc Mon Sep 17 00:00:00 2001
From: Olivier Dulcy <olivier.dulcy@mindee.co>
Date: Mon, 15 Apr 2024 23:13:45 +0200
Subject: [PATCH 18/39] feat: :sparkles: orientation dataset walk

---
 doctr/datasets/orientation.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/doctr/datasets/orientation.py b/doctr/datasets/orientation.py
index 10bd55444..11ebd806f 100644
--- a/doctr/datasets/orientation.py
+++ b/doctr/datasets/orientation.py
@@ -3,7 +3,7 @@
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-import os
+from pathlib import Path
 from typing import Any, List, Tuple
 
 import numpy as np
@@ -37,4 +37,6 @@ def __init__(
         )
 
         # initialize dataset with 0 degree rotation targets
-        self.data: List[Tuple[str, np.ndarray]] = [(img_name, np.array([0])) for img_name in os.listdir(self.root)]
+        self.data: List[Tuple[str, np.ndarray]] = [
+            (img_name, np.array([0])) for img_name in Path(self.root).rglob("*.jpg")
+        ]

From bd18864bac054d11329809b12494d4a247bc90c3 Mon Sep 17 00:00:00 2001
From: Olivier Dulcy <olivier.dulcy@mindee.co>
Date: Tue, 16 Apr 2024 19:01:19 +0200
Subject: [PATCH 19/39] (32, 32) -> (128, 128)

---
 references/classification/train_pytorch_orientation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/references/classification/train_pytorch_orientation.py b/references/classification/train_pytorch_orientation.py
index 82c2bd46a..7f57a3dff 100644
--- a/references/classification/train_pytorch_orientation.py
+++ b/references/classification/train_pytorch_orientation.py
@@ -236,7 +236,7 @@ def main(args):
 
     torch.backends.cudnn.benchmark = True
 
-    input_size = (256, 256) if args.type == "page" else (32, 32)
+    input_size = (256, 256) if args.type == "page" else (128, 128)
 
     # Load val data generator
     st = time.time()

From ea587106ad8dc6ead0314ecd458d22228e6b05ac Mon Sep 17 00:00:00 2001
From: Olivier Dulcy <olivier.dulcy@mindee.co>
Date: Tue, 16 Apr 2024 19:02:37 +0200
Subject: [PATCH 20/39] (256, 256) -> (512, 512)

---
 references/classification/train_pytorch_orientation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/references/classification/train_pytorch_orientation.py b/references/classification/train_pytorch_orientation.py
index 7f57a3dff..2dbc5fc59 100644
--- a/references/classification/train_pytorch_orientation.py
+++ b/references/classification/train_pytorch_orientation.py
@@ -236,7 +236,7 @@ def main(args):
 
     torch.backends.cudnn.benchmark = True
 
-    input_size = (256, 256) if args.type == "page" else (128, 128)
+    input_size = (512, 512) if args.type == "page" else (128, 128)
 
     # Load val data generator
     st = time.time()

From 1049fab08bd308e9ff68b015a16eb1de07e75634 Mon Sep 17 00:00:00 2001
From: Olivier Dulcy <olivier.dulcy@mindee.co>
Date: Tue, 16 Apr 2024 19:04:29 +0200
Subject: [PATCH 21/39] train_tensorflow_orientation.py: size for crop

---
 references/classification/train_tensorflow_orientation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/references/classification/train_tensorflow_orientation.py b/references/classification/train_tensorflow_orientation.py
index ed0479172..c01ce8371 100644
--- a/references/classification/train_tensorflow_orientation.py
+++ b/references/classification/train_tensorflow_orientation.py
@@ -147,7 +147,7 @@ def main(args):
     if not isinstance(args.workers, int):
         args.workers = min(16, mp.cpu_count())
 
-    input_size = (256, 256) if args.type == "page" else (32, 32)
+    input_size = (512, 512) if args.type == "page" else (128, 128)
 
     # AMP
     if args.amp:

From 53aa99bdf42beb56ba115a7ab3bdbf35f8f52df8 Mon Sep 17 00:00:00 2001
From: Olivier Dulcy <olivier.dulcy@mindee.co>
Date: Mon, 22 Apr 2024 16:20:37 +0200
Subject: [PATCH 22/39] slack display args on train_pytorch_orientation

---
 references/classification/train_pytorch_orientation.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/references/classification/train_pytorch_orientation.py b/references/classification/train_pytorch_orientation.py
index 5b51fdc69..feca568e5 100644
--- a/references/classification/train_pytorch_orientation.py
+++ b/references/classification/train_pytorch_orientation.py
@@ -227,6 +227,7 @@ def evaluate(model, val_loader, batch_transforms, amp=False):
 
 def main(args):
     print(args)
+    send_on_slack(f"Start training: {args}")
 
     if args.push_to_hub:
         login_to_hub()

From 4d751f1f20d99cefbd675651bb33125632871d57 Mon Sep 17 00:00:00 2001
From: Olivier Dulcy <olivier.dulcy@mindee.co>
Date: Tue, 30 Apr 2024 16:15:57 +0200
Subject: [PATCH 23/39] pbar `train_tensorflow_orientation`

---
 .../train_tensorflow_orientation.py           | 58 ++++++++++++++++++-
 1 file changed, 57 insertions(+), 1 deletion(-)

diff --git a/references/classification/train_tensorflow_orientation.py b/references/classification/train_tensorflow_orientation.py
index a37ce8e1d..e7116595f 100644
--- a/references/classification/train_tensorflow_orientation.py
+++ b/references/classification/train_tensorflow_orientation.py
@@ -11,6 +11,7 @@
 import datetime
 import multiprocessing as mp
 import time
+from pathlib import Path
 
 import numpy as np
 import tensorflow as tf
@@ -30,6 +31,32 @@
 from doctr.transforms.functional import rotated_img_tensor
 from utils import EarlyStopper, plot_recorder, plot_samples
 
+# Optional Slack integration: the webhook URL is read from a per-user config file (None when absent)
+SLACK_WEBHOOK_PATH = Path(os.path.expanduser("~"), ".config", "doctr", "slack_webhook_url.txt")
+if SLACK_WEBHOOK_PATH.exists():
+    SLACK_WEBHOOK_URL = SLACK_WEBHOOK_PATH.read_text().strip()
+else:
+    SLACK_WEBHOOK_URL = None
+    print(f"{SLACK_WEBHOOK_PATH} does not exist, skip Slack integration configuration...")
+
+
+def send_on_slack(text: str):
+    """Best-effort Slack notification; does nothing when no webhook URL is configured.
+
+    Args:
+        text (str): message to send on Slack (it is prefixed with a local timestamp)
+    """
+    if SLACK_WEBHOOK_URL:
+        try:
+            import requests
+
+            stamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+            # Bounded wait: without a timeout, a stalled webhook would block the training loop indefinitely
+            requests.post(url=SLACK_WEBHOOK_URL, json={"text": f"[{stamp}]: {text}"}, timeout=10)
+        except Exception:
+            # Notifications are optional: a failure here must never abort a training run
+            print("Impossible to send message on Slack, continue...")
+
 CLASSES = [0, -90, 180, 90]
 
 
@@ -99,7 +126,10 @@ def record_lr(
 
 def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False):
     # Iterate over the batches of the dataset
+    last_progress = 0
+    interval_progress = 5
     pbar = tqdm(train_loader, position=1)
+    send_on_slack(str(pbar))
     for images, targets in pbar:
         images = batch_transforms(images)
 
@@ -112,13 +142,22 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False):
         optimizer.apply_gradients(zip(grads, model.trainable_weights))
 
         pbar.set_description(f"Training loss: {train_loss.numpy().mean():.6}")
+        current_progress = pbar.n / pbar.total * 100
+        if current_progress - last_progress > interval_progress:
+            send_on_slack(str(pbar))
+            last_progress = int(current_progress)
+    send_on_slack(f"Final training loss: {train_loss.item():.6}")
 
 
 def evaluate(model, val_loader, batch_transforms):
     # Validation loop
+    last_progress = 0
+    interval_progress = 5
     val_loss, correct, samples, batch_cnt = 0.0, 0.0, 0.0, 0.0
     val_iter = iter(val_loader)
-    for images, targets in tqdm(val_iter):
+    pbar = tqdm(val_iter)
+    send_on_slack(str(pbar))
+    for images, targets in pbar:
         images = batch_transforms(images)
         out = model(images, training=False)
         loss = tf.nn.sparse_softmax_cross_entropy_with_logits(targets, out)
@@ -129,6 +168,11 @@ def evaluate(model, val_loader, batch_transforms):
         batch_cnt += 1
         samples += images.shape[0]
 
+        current_progress = pbar.n / pbar.total * 100
+        if current_progress - last_progress > interval_progress:
+            send_on_slack(str(pbar))
+            last_progress = int(current_progress)
+
     val_loss /= batch_cnt
     acc = correct / samples
     return val_loss, acc
@@ -143,6 +187,7 @@ def collate_fn(samples):
 
 def main(args):
     print(args)
+    send_on_slack(f"Start training: {args}")
 
     if args.push_to_hub:
         login_to_hub()
@@ -180,6 +225,10 @@ def main(args):
         f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in "
         f"{val_loader.num_batches} batches)"
     )
+    send_on_slack(
+        f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in "
+        f"{val_loader.num_batches} batches)"
+    )
 
     # Load doctr model
     model = classification.__dict__[args.arch](
@@ -236,6 +285,10 @@ def main(args):
         f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in "
         f"{train_loader.num_batches} batches)"
     )
+    send_on_slack(
+        f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in "
+        f"{train_loader.num_batches} batches)"
+    )
 
     if args.show_samples:
         x, target = next(iter(train_loader))
@@ -307,9 +360,11 @@ def main(args):
         val_loss, acc = evaluate(model, val_loader, batch_transforms)
         if val_loss < min_loss:
             print(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...")
+            send_on_slack(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...")
             model.save_weights(f"./{exp_name}/weights")
             min_loss = val_loss
         print(f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} (Acc: {acc:.2%})")
+        send_on_slack(f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} (Acc: {acc:.2%})")
         # W&B
         if args.wb:
             wandb.log({
@@ -326,6 +381,7 @@ def main(args):
             logger.report_scalar(title="Accuracy", series="acc", value=acc, iteration=epoch)
         if args.early_stop and early_stopper.early_stop(val_loss):
             print("Training halted early due to reaching patience limit.")
+            send_on_slack("Training halted early due to reaching patience limit.")
             break
     if args.wb:
         run.finish()

From 9f034cb83f5b90d09815e12eb2cb2da4957ee2c1 Mon Sep 17 00:00:00 2001
From: Olivier Dulcy <olivier.dulcy@mindee.co>
Date: Wed, 22 May 2024 11:00:18 +0200
Subject: [PATCH 24/39] Frankenstein script to train TF model with Torch
 DataLoader

---
 ...train_tensorflow_orientation_from_torch.py | 471 ++++++++++++++++++
 1 file changed, 471 insertions(+)
 create mode 100644 references/classification/train_tensorflow_orientation_from_torch.py

diff --git a/references/classification/train_tensorflow_orientation_from_torch.py b/references/classification/train_tensorflow_orientation_from_torch.py
new file mode 100644
index 000000000..a4dec1286
--- /dev/null
+++ b/references/classification/train_tensorflow_orientation_from_torch.py
@@ -0,0 +1,471 @@
+# Copyright (C) 2021-2024, Mindee.
+
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
+
+import os
+
+os.environ["USE_TORCH"] = "1"
+
+import datetime
+import logging
+import multiprocessing as mp
+import time
+from pathlib import Path
+
+import numpy as np
+import torch
+import wandb
+from torch.nn.functional import cross_entropy
+from torch.optim.lr_scheduler import CosineAnnealingLR, MultiplicativeLR, OneCycleLR
+from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
+from torchvision.transforms import functional as F
+from torchvision.transforms.v2 import (
+    Compose,
+    GaussianBlur,
+    Normalize,
+    RandomGrayscale,
+    RandomPerspective,
+    RandomPhotometricDistort,
+)
+from tqdm.auto import tqdm
+
+from doctr import transforms as T
+from doctr.datasets import OrientationDataset
+from doctr.models import classification, login_to_hub, push_to_hf_hub
+from doctr.models.utils import export_model_to_onnx
+from utils import EarlyStopper, plot_recorder, plot_samples
+
+# Optional Slack integration: the webhook URL is read from a per-user config file (None when absent)
+SLACK_WEBHOOK_PATH = Path(os.path.expanduser("~"), ".config", "doctr", "slack_webhook_url.txt")
+if SLACK_WEBHOOK_PATH.exists():
+    SLACK_WEBHOOK_URL = SLACK_WEBHOOK_PATH.read_text().strip()
+else:
+    SLACK_WEBHOOK_URL = None
+    print(f"{SLACK_WEBHOOK_PATH} does not exist, skip Slack integration configuration...")
+
+
+def send_on_slack(text: str):
+    """Best-effort Slack notification; does nothing when no webhook URL is configured.
+
+    Args:
+        text (str): message to send on Slack (it is prefixed with a local timestamp)
+    """
+    if SLACK_WEBHOOK_URL:
+        try:
+            import requests
+
+            stamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+            # Bounded wait: without a timeout, a stalled webhook would block the training loop indefinitely
+            requests.post(url=SLACK_WEBHOOK_URL, json={"text": f"[{stamp}]: {text}"}, timeout=10)
+        except Exception:
+            # Notifications are optional: a failure here must never abort a training run
+            print("Impossible to send message on Slack, continue...")
+
+
+CLASSES = [0, -90, 180, 90]
+
+
+def rnd_rotate(img: torch.Tensor, target):
+    """Rotate `img` by a random angle from CLASSES, jittered half of the time; return (image, class index)."""
+    base_angle = int(np.random.choice(CLASSES))
+    class_idx = CLASSES.index(base_angle)
+    # with probability 0.5, offset the angle by a random multiple of 5 degrees taken from [-25, 20]
+    jitter = float(np.random.choice(np.arange(-25, 25, 5))) if np.random.rand() < 0.5 else 0
+    angle = base_angle + jitter
+    return F.rotate(img, angle=-angle, fill=0, expand=angle not in CLASSES)[:3], class_idx
+
+
+def record_lr(
+    model: torch.nn.Module,
+    train_loader: DataLoader,
+    batch_transforms,
+    optimizer,
+    start_lr: float = 1e-7,
+    end_lr: float = 1,
+    num_it: int = 100,
+    amp: bool = False,
+):
+    """Sweep the learning rate geometrically from `start_lr` to `end_lr` over `num_it` batches and record the loss.
+    Adapted from https://github.com/frgfm/Holocron/blob/master/holocron/trainer/core.py
+    """
+    if num_it > len(train_loader):
+        raise ValueError("the value of `num_it` needs to be lower than the number of available batches")
+
+    model = model.train()  # NOTE(review): torch-style API, while main() in this script builds a TF model — confirm
+    # Update param groups & LR
+    optimizer.defaults["lr"] = start_lr
+    for pgroup in optimizer.param_groups:
+        pgroup["lr"] = start_lr
+
+    gamma = (end_lr / start_lr) ** (1 / (num_it - 1))  # constant per-step multiplier of the sweep
+    scheduler = MultiplicativeLR(optimizer, lambda step: gamma)
+
+    lr_recorder = [start_lr * gamma**idx for idx in range(num_it)]  # LR expected at each iteration
+    loss_recorder = []
+
+    if amp:
+        scaler = torch.cuda.amp.GradScaler()
+
+    for batch_idx, (images, targets) in enumerate(train_loader):
+        if torch.cuda.is_available():
+            images = images.cuda()
+            targets = targets.cuda()
+
+        images = batch_transforms(images)
+
+        # Forward, Backward & update
+        optimizer.zero_grad()
+        if amp:
+            with torch.cuda.amp.autocast():
+                out = model(images)
+                train_loss = cross_entropy(out, targets)
+            scaler.scale(train_loss).backward()
+            # Update the params
+            scaler.step(optimizer)
+            scaler.update()
+        else:
+            out = model(images)
+            train_loss = cross_entropy(out, targets)
+            train_loss.backward()
+            optimizer.step()
+        # Update LR
+        scheduler.step()
+
+        # Record
+        if not torch.isfinite(train_loss):
+            if batch_idx == 0:
+                raise ValueError("loss value is NaN or inf.")
+            else:
+                break  # loss diverged: stop the sweep and keep what was recorded so far
+        loss_recorder.append(train_loss.item())
+        # Stop after the number of iterations
+        if batch_idx + 1 == num_it:
+            break
+
+    return lr_recorder[: len(loss_recorder)], loss_recorder  # LRs are truncated to the steps actually recorded
+
+
+def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False):
+    """Train the TF `model` for one pass over the torch `train_loader`, posting progress updates on Slack."""
+    import tensorflow as tf  # local import: this module sets USE_TORCH=1 for doctr at import time
+
+    last_progress = 0
+    interval_progress = 5  # minimum progress (in %) between two Slack updates
+    pbar = tqdm(train_loader, position=1)
+    send_on_slack(str(pbar))
+    for images, targets in pbar:
+        images = batch_transforms(images)
+        # torch batches are channels-first (N, C, H, W); move channels last, (N, H, W, C), without swapping H and W
+        images = tf.transpose(tf.convert_to_tensor(images), (0, 2, 3, 1))
+        with tf.GradientTape() as tape:
+            out = model(images, training=True)
+            train_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(targets, out)
+        grads = tape.gradient(train_loss, model.trainable_weights)
+        if amp:
+            grads = optimizer.get_unscaled_gradients(grads)
+        optimizer.apply_gradients(zip(grads, model.trainable_weights))
+
+        pbar.set_description(f"Training loss: {train_loss.numpy().mean():.6}")
+        current_progress = pbar.n / pbar.total * 100
+        if current_progress - last_progress > interval_progress:
+            send_on_slack(str(pbar))
+            last_progress = int(current_progress)
+    # Final state of the bar (its description carries the last training loss)
+    send_on_slack(str(pbar))
+
+
+def evaluate(model, val_loader, batch_transforms):
+    """Run the TF `model` over `val_loader` and return (loss averaged over batches, accuracy)."""
+    import tensorflow as tf  # local import: this module sets USE_TORCH=1 for doctr at import time
+
+    last_progress = 0
+    interval_progress = 5  # minimum progress (in %) between two Slack updates
+    val_loss, correct, samples, batch_cnt = 0.0, 0.0, 0.0, 0.0
+    pbar = tqdm(iter(val_loader))
+    send_on_slack(str(pbar))
+    for images, targets in pbar:
+        images = batch_transforms(images)
+        # Same layout conversion as in training: (N, C, H, W) -> (N, H, W, C)
+        images = tf.transpose(tf.convert_to_tensor(images), (0, 2, 3, 1))
+        out = model(images, training=False)
+        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(targets, out)
+        # Compute metric
+        correct += int((out.numpy().argmax(1) == targets.numpy()).sum())
+
+        val_loss += loss.numpy().mean()
+        batch_cnt += 1
+        samples += images.shape[0]
+
+        current_progress = pbar.n / pbar.total * 100
+        if current_progress - last_progress > interval_progress:
+            send_on_slack(str(pbar))
+            last_progress = int(current_progress)
+
+    val_loss /= batch_cnt
+    acc = correct / samples
+    return val_loss, acc
+
+
+def main(args):
+    """Train (or only evaluate) a TF orientation classifier fed by torch DataLoaders, as configured by `args`."""
+    print(args)
+    send_on_slack(f"Start training: {args}")
+    if args.push_to_hub:
+        login_to_hub()
+
+    if not isinstance(args.workers, int):
+        args.workers = min(16, mp.cpu_count())
+
+    torch.backends.cudnn.benchmark = True
+
+    input_size = (512, 512) if args.type == "page" else (256, 256)
+
+    # Load val data generator
+    st = time.time()
+    val_set = OrientationDataset(
+        img_folder=os.path.join(args.val_path, "images"),
+        img_transforms=Compose([
+            T.Resize(input_size, preserve_aspect_ratio=True, symmetric_pad=True),
+        ]),
+        sample_transforms=T.SampleCompose([
+            lambda x, y: rnd_rotate(x, y),
+            T.Resize(input_size),
+        ]),
+    )
+    val_loader = DataLoader(
+        val_set,
+        batch_size=args.batch_size,
+        drop_last=False,
+        num_workers=args.workers,
+        sampler=SequentialSampler(val_set),
+        pin_memory=torch.cuda.is_available(),
+    )
+    print(f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in " f"{len(val_loader)} batches)")
+    send_on_slack(
+        f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in " f"{len(val_loader)} batches)"
+    )
+
+    batch_transforms = Normalize(mean=(0.694, 0.695, 0.693), std=(0.299, 0.296, 0.301))
+
+    # Load doctr model
+    import doctr.models.classification.mobilenet.tensorflow as classification_tf  # TF definitions, imported explicitly
+    model = classification_tf.__dict__[args.arch](pretrained=args.pretrained, num_classes=len(CLASSES), classes=CLASSES)
+
+    # Resume weights
+    if isinstance(args.resume, str):
+        print(f"Resuming {args.resume}")
+        send_on_slack(f"Resuming {args.resume}")
+        # The model is a Keras model: restore a checkpoint written by `model.save_weights`, not a torch state dict
+        model.load_weights(args.resume)
+
+    # GPU
+    #if isinstance(args.device, int):
+    #    if not torch.cuda.is_available():
+    #        raise AssertionError("PyTorch cannot access your GPU. Please investigate!")
+    #    if args.device >= torch.cuda.device_count():
+    #        raise ValueError("Invalid device index")
+    ## Silent default switch to GPU if available
+    #elif torch.cuda.is_available():
+    #    args.device = 0
+    #else:
+    #    logging.warning("No accessible GPU, targe device set to CPU.")
+    #if torch.cuda.is_available():
+    #    torch.cuda.set_device(args.device)
+    #    model = model.cuda()
+
+    if args.test_only:
+        print("Running evaluation")
+        val_loss, acc = evaluate(model, val_loader, batch_transforms)
+        print(f"Validation loss: {val_loss:.6} (Acc: {acc:.2%})")
+        return
+
+    st = time.time()
+    train_set = OrientationDataset(
+        img_folder=os.path.join(args.train_path, "images"),
+        img_transforms=Compose([
+            T.Resize(input_size, preserve_aspect_ratio=True, symmetric_pad=True),
+            # Augmentations
+            T.RandomApply(T.ColorInversion(), 0.1),
+            T.RandomApply(T.GaussianNoise(mean=0.1, std=0.1), 0.1),
+            T.RandomApply(T.RandomShadow(), 0.2),
+            T.RandomApply(GaussianBlur(kernel_size=3), 0.1),
+            RandomPhotometricDistort(p=0.1),
+            RandomGrayscale(p=0.1),
+            RandomPerspective(distortion_scale=0.1, p=0.3),
+        ]),
+        sample_transforms=T.SampleCompose([
+            lambda x, y: rnd_rotate(x, y),
+            T.Resize(input_size),
+        ]),
+    )
+
+    train_loader = DataLoader(
+        train_set,
+        batch_size=args.batch_size,
+        drop_last=True,
+        num_workers=args.workers,
+        sampler=RandomSampler(train_set),
+        pin_memory=torch.cuda.is_available(),
+    )
+    print(f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in " f"{len(train_loader)} batches)")
+    send_on_slack(
+        f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in " f"{len(train_loader)} batches)"
+    )
+
+    if args.show_samples:
+        x, target = next(iter(train_loader))
+        plot_samples(x, [CLASSES[t] for t in target])
+        return
+
+    # Optimizer
+    #optimizer = torch.optim.Adam(
+    #    [p for p in model.parameters() if p.requires_grad],
+    #    args.lr,
+    #    betas=(0.95, 0.99),
+    #    eps=1e-6,
+    #    weight_decay=args.weight_decay,
+    #)
+    import tensorflow as tf  # local import: the module sets USE_TORCH=1 for doctr at import time
+    scheduler = tf.keras.optimizers.schedules.ExponentialDecay(
+        args.lr,
+        decay_steps=args.epochs * len(train_loader),
+        decay_rate=1 / (1e3),  # final lr as a fraction of initial lr
+        staircase=False,
+        name="ExponentialDecay",
+    )
+    optimizer = tf.keras.optimizers.Adam(
+        learning_rate=scheduler,
+        beta_1=0.95,
+        beta_2=0.99,
+        epsilon=1e-6,
+    )
+
+    # LR Finder
+    if args.find_lr:  # NOTE(review): record_lr uses torch APIs (model.train(), optimizer.param_groups) — likely broken here
+        lrs, losses = record_lr(model, train_loader, batch_transforms, optimizer, amp=args.amp)
+        plot_recorder(lrs, losses)
+        return
+    ## Scheduler
+    #if args.sched == "cosine":
+    #    scheduler = CosineAnnealingLR(optimizer, args.epochs * len(train_loader), eta_min=args.lr / 25e4)
+    #elif args.sched == "onecycle":
+    #    scheduler = OneCycleLR(optimizer, args.lr, args.epochs * len(train_loader))
+
+    # Training monitoring
+    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
+    exp_name = f"{args.arch}_{current_time}" if args.name is None else args.name
+
+    # W&B
+    if args.wb:
+        run = wandb.init(
+            name=exp_name,
+            project="orientation-classification",
+            config={
+                "learning_rate": args.lr,
+                "epochs": args.epochs,
+                "weight_decay": args.weight_decay,
+                "batch_size": args.batch_size,
+                "architecture": args.arch,
+                "input_size": input_size,
+                "optimizer": "adam",
+                "framework": "pytorch",  # NOTE(review): the trained model is a TF one — confirm the intended label
+                "classes": CLASSES,
+                "scheduler": args.sched,
+                "pretrained": args.pretrained,
+            },
+        )
+
+    # Create loss queue
+    min_loss = np.inf
+    # Training loop
+    if args.early_stop:
+        early_stopper = EarlyStopper(patience=args.early_stop_epochs, min_delta=args.early_stop_delta)
+    for epoch in range(args.epochs):
+        fit_one_epoch(model, train_loader, batch_transforms, optimizer)  # NOTE(review): args.amp is not forwarded
+        model.save_weights(f"./{exp_name}_{epoch}/weights")  # per-epoch checkpoint, kept even if validation fails
+
+        try:
+            # Validation loop at the end of each epoch
+            val_loss, acc = evaluate(model, val_loader, batch_transforms)
+            if val_loss < min_loss:
+                print(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...")
+                send_on_slack(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...")
+                model.save_weights(f"./{exp_name}/weights")
+                min_loss = val_loss
+            print(f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} (Acc: {acc:.2%})")
+            send_on_slack(f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} (Acc: {acc:.2%})")
+        except Exception:  # a failing validation pass is reported but does not stop the training
+            import traceback
+            traceback.print_exc()
+        ## W&B
+        #if args.wb:
+        #    wandb.log({
+        #        "val_loss": val_loss,
+        #        "acc": acc,
+        #    })
+        #if args.early_stop and early_stopper.early_stop(val_loss):
+        #    print("Training halted early due to reaching patience limit.")
+        #    send_on_slack("Training halted early due to reaching patience limit.")
+        #    break
+    if args.wb:
+        run.finish()
+
+    if args.push_to_hub:
+        push_to_hf_hub(model, exp_name, task="classification", run_config=args)
+
+    if args.export_onnx:  # NOTE(review): helper is imported under USE_TORCH=1 and gets a torch input — confirm for a TF model
+        print("Exporting model to ONNX...")
+        dummy_batch = next(iter(val_loader))
+        dummy_input = dummy_batch[0].cuda() if torch.cuda.is_available() else dummy_batch[0]
+        model_path = export_model_to_onnx(model, exp_name, dummy_input)
+        print(f"Exported model saved in {model_path}")
+
+
+def parse_args():
+    """Declare the command-line interface of the training script and return the parsed namespace."""
+    import argparse
+
+    cli = argparse.ArgumentParser(
+        description="DocTR training script for orientation classification (PyTorch)",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    add = cli.add_argument
+
+    add("train_path", type=str, help="path to training data folder")
+    add("val_path", type=str, help="path to validation data folder")
+    add("arch", type=str, help="classification model to train")
+    add("type", type=str, choices=["page", "crop"], help="type of data to train on")
+    add("--name", type=str, default=None, help="Name of your training experiment")
+    add("--epochs", type=int, default=10, help="number of epochs to train the model on")
+    add("-b", "--batch_size", type=int, default=2, help="batch size for training")
+    add("--device", default=None, type=int, help="device")
+    add("--lr", type=float, default=0.001, help="learning rate for the optimizer (Adam)")
+    add("--wd", "--weight-decay", default=0, type=float, help="weight decay", dest="weight_decay")
+    add("-j", "--workers", type=int, default=None, help="number of workers used for dataloading")
+    add("--resume", type=str, default=None, help="Path to your checkpoint")
+    add("--test-only", dest="test_only", action="store_true", help="Run the validation loop")
+    add(
+        "--show-samples", dest="show_samples", action="store_true", help="Display unormalized training samples"
+    )
+    add("--wb", dest="wb", action="store_true", help="Log to Weights & Biases")
+    add("--push-to-hub", dest="push_to_hub", action="store_true", help="Push to Huggingface Hub")
+    add(
+        "--pretrained",
+        dest="pretrained",
+        action="store_true",
+        help="Load pretrained parameters before starting the training",
+    )
+    add("--export-onnx", dest="export_onnx", action="store_true", help="Export the model to ONNX")
+    add("--sched", type=str, default="cosine", help="scheduler to use")
+    add("--amp", dest="amp", help="Use Automatic Mixed Precision", action="store_true")
+    add("--find-lr", action="store_true", help="Gridsearch the optimal LR")
+    add("--early-stop", action="store_true", help="Enable early stopping")
+    add("--early-stop-epochs", type=int, default=5, help="Patience for early stopping")
+    add("--early-stop-delta", type=float, default=0.01, help="Minimum Delta for early stopping")
+    return cli.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)

From e56b7c07d9f46baa289bbc352e10826fd7cf9bb2 Mon Sep 17 00:00:00 2001
From: Olivier Dulcy <olivier.dulcy@mindee.co>
Date: Wed, 5 Jun 2024 07:36:43 +0000
Subject: [PATCH 25/39] fix send_on_slack

---
 references/classification/train_tensorflow_orientation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/references/classification/train_tensorflow_orientation.py b/references/classification/train_tensorflow_orientation.py
index e7116595f..a5ed27c19 100644
--- a/references/classification/train_tensorflow_orientation.py
+++ b/references/classification/train_tensorflow_orientation.py
@@ -146,7 +146,7 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False):
         if current_progress - last_progress > interval_progress:
             send_on_slack(str(pbar))
             last_progress = int(current_progress)
-    send_on_slack(f"Final training loss: {train_loss.item():.6}")
+    send_on_slack(f"Final training loss: {train_loss.numpy():.6}")
 
 
 def evaluate(model, val_loader, batch_transforms):

From 983e815ecf549eb53a7a7c2473cf9605b0ce202b Mon Sep 17 00:00:00 2001
From: Olivier Dulcy <olivier.dulcy@mindee.co>
Date: Wed, 5 Jun 2024 12:00:05 +0000
Subject: [PATCH 26/39] fix send_on_slack: take the mean of the training loss before formatting

---
 references/classification/train_tensorflow_orientation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/references/classification/train_tensorflow_orientation.py b/references/classification/train_tensorflow_orientation.py
index a5ed27c19..16048161f 100644
--- a/references/classification/train_tensorflow_orientation.py
+++ b/references/classification/train_tensorflow_orientation.py
@@ -146,7 +146,7 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False):
         if current_progress - last_progress > interval_progress:
             send_on_slack(str(pbar))
             last_progress = int(current_progress)
-    send_on_slack(f"Final training loss: {train_loss.numpy():.6}")
+    send_on_slack(f"Final training loss: {train_loss.numpy().mean():.6}")
 
 
 def evaluate(model, val_loader, batch_transforms):

From 1efc8ee93a155a8cbe04fc4e66d3b3bc1c4fc77f Mon Sep 17 00:00:00 2001
From: Olivier Dulcy <olivier.dulcy@mindee.co>
Date: Tue, 17 Dec 2024 15:18:37 +0100
Subject: [PATCH 27/39] revert a few changes

---
 doctr/datasets/datasets/tensorflow.py                         | 2 +-
 doctr/models/detection/differentiable_binarization/pytorch.py | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/doctr/datasets/datasets/tensorflow.py b/doctr/datasets/datasets/tensorflow.py
index 203c770eb..f0206cf03 100644
--- a/doctr/datasets/datasets/tensorflow.py
+++ b/doctr/datasets/datasets/tensorflow.py
@@ -52,7 +52,7 @@ def collate_fn(samples: list[tuple[tf.Tensor, Any]]) -> tuple[tf.Tensor, list[An
         images, targets = zip(*samples)
         images = tf.stack(images, axis=0)
 
-        return images, targets
+        return images, list(targets)
 
 
 class VisionDataset(AbstractDataset, _VisionDataset):  # noqa: D101
diff --git a/doctr/models/detection/differentiable_binarization/pytorch.py b/doctr/models/detection/differentiable_binarization/pytorch.py
index 74e68a340..cad6a74aa 100644
--- a/doctr/models/detection/differentiable_binarization/pytorch.py
+++ b/doctr/models/detection/differentiable_binarization/pytorch.py
@@ -286,8 +286,7 @@ def compute_loss(
         if torch.any(thresh_mask):
             l1_loss = (torch.abs(thresh_map - thresh_target) * thresh_mask).sum() / (thresh_mask.sum() + eps)
 
-        # return l1_loss + focal_scale * focal_loss + dice_loss
-        return focal_scale * focal_loss + dice_loss
+        return l1_loss + focal_scale * focal_loss + dice_loss
 
 
 def _dbnet(

From 60a63bbbbfa7b6193aed982e2cdcd9948d7c5690 Mon Sep 17 00:00:00 2001
From: Olivier Dulcy <olivier.dulcy@mindee.co>
Date: Tue, 14 Jan 2025 13:51:16 +0100
Subject: [PATCH 28/39] clean `detection/train_pytorch.py`

---
 references/detection/train_pytorch.py | 40 ++++++---------------------
 1 file changed, 8 insertions(+), 32 deletions(-)

diff --git a/references/detection/train_pytorch.py b/references/detection/train_pytorch.py
index 10e4a3f3f..ccc6f52a4 100644
--- a/references/detection/train_pytorch.py
+++ b/references/detection/train_pytorch.py
@@ -25,20 +25,14 @@
 else:
     from tqdm.auto import tqdm
 
+from slack_sdk import WebClient
+
 from doctr import transforms as T
 from doctr.datasets import DetectionDataset
 from doctr.models import detection, login_to_hub, push_to_hf_hub
 from doctr.utils.metrics import LocalizationConfusion
 from utils import EarlyStopper, plot_recorder, plot_samples
 
-SLACK_WEBHOOK_URL = None
-SLACK_WEBHOOK_PATH = Path(os.path.join(os.path.expanduser("~"), ".config", "doctr", "slack_webhook_url.txt"))
-if SLACK_WEBHOOK_PATH.exists():
-    with open(SLACK_WEBHOOK_PATH) as f:
-        SLACK_WEBHOOK_URL = f.read().strip()
-else:
-    print(f"{SLACK_WEBHOOK_PATH} does not exist, skip Slack integration configuration...")
-
 
 def send_on_slack(text: str):
     """Send a message on Slack.
@@ -46,16 +40,12 @@ def send_on_slack(text: str):
     Args:
         text (str): message to send on Slack
     """
-    if SLACK_WEBHOOK_URL:
-        try:
-            import requests
-
-            requests.post(
-                url=SLACK_WEBHOOK_URL,
-                json={"text": f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]: {text}"},
-            )
-        except Exception:
-            print("Impossible to send message on Slack, continue...")
+    if os.getenv("TQDM_SLACK_TOKEN") and os.getenv("TQDM_SLACK_CHANNEL"):
+        client = WebClient(token=os.getenv("TQDM_SLACK_TOKEN"))
+        client.chat_postMessage(
+            channel=os.getenv("TQDM_SLACK_CHANNEL"),
+            text=f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]: {text}",
+        )
 
 
 def record_lr(
@@ -135,10 +125,7 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, a
 
     model.train()
     # Iterate over the batches of the dataset
-    last_progress = 0
-    interval_progress = 5
     pbar = tqdm(train_loader, position=1)
-    send_on_slack(str(pbar))
     for images, targets in pbar:
         if torch.cuda.is_available():
             images = images.cuda()
@@ -163,10 +150,6 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, a
 
         scheduler.step()
         pbar.set_description(f"Training loss: {train_loss.item():.6}")
-        current_progress = pbar.n / pbar.total * 100
-        if current_progress - last_progress > interval_progress:
-            send_on_slack(str(pbar))
-            last_progress = int(current_progress)
     send_on_slack(f"Final training loss: {train_loss.item():.6}")
 
 
@@ -176,10 +159,7 @@ def evaluate(model, val_loader, batch_transforms, val_metric, amp=False):
     model.eval()
     # Reset val metric
     val_metric.reset()
-    last_progress = 0
-    interval_progress = 5
     pbar = tqdm(val_loader)
-    send_on_slack(str(pbar))
     # Validation loop
     val_loss, batch_cnt = 0, 0
     for images, targets in pbar:
@@ -200,10 +180,6 @@ def evaluate(model, val_loader, batch_transforms, val_metric, amp=False):
                     boxes_pred = np.concatenate((boxes_pred[:, :4].min(axis=1), boxes_pred[:, :4].max(axis=1)), axis=-1)
                 val_metric.update(gts=boxes_gt, preds=boxes_pred[:, :4])
 
-        current_progress = pbar.n / pbar.total * 100
-        if current_progress - last_progress > interval_progress:
-            send_on_slack(str(pbar))
-            last_progress = int(current_progress)
         val_loss += out["loss"].item()
         batch_cnt += 1
 

From 4dcd9724a0878d55ca1321ba3ab28d4ad8b01ade Mon Sep 17 00:00:00 2001
From: Olivier Dulcy <olivier.dulcy@mindee.co>
Date: Tue, 14 Jan 2025 15:43:02 +0100
Subject: [PATCH 29/39] add clearml logging

---
 references/detection/train_pytorch.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/references/detection/train_pytorch.py b/references/detection/train_pytorch.py
index ccc6f52a4..bcfdfee5a 100644
--- a/references/detection/train_pytorch.py
+++ b/references/detection/train_pytorch.py
@@ -496,6 +496,14 @@ def main(args):
             },
         )
 
+    # ClearML
+    if args.clearml:
+        from clearml import Task
+
+        task = Task.init(project_name="docTR/text-detection", task_name=exp_name, reuse_last_task_id=False)
+        task.upload_artifact("config", config)
+
+
     # Create loss queue
     min_loss = np.inf
     if args.early_stop:
@@ -547,6 +555,16 @@ def main(args):
                 "precision": precision,
                 "mean_iou": mean_iou,
             })
+
+        # ClearML
+        if args.clearml:
+            from clearml import Logger
+
+            logger = Logger.current_logger()
+            logger.report_scalar(title="Validation Loss", series="val_loss", value=val_loss, iteration=epoch)
+            logger.report_scalar(title="Precision Recall", series="recall", value=recall, iteration=epoch)
+            logger.report_scalar(title="Precision Recall", series="precision", value=precision, iteration=epoch)
+            logger.report_scalar(title="Mean IoU", series="mean_iou", value=mean_iou, iteration=epoch)
         if args.early_stop and early_stopper.early_stop(val_loss):
             print("Training halted early due to reaching patience limit.")
             break
@@ -589,6 +607,7 @@ def parse_args():
         "--show-samples", dest="show_samples", action="store_true", help="Display unormalized training samples"
     )
     parser.add_argument("--wb", dest="wb", action="store_true", help="Log to Weights & Biases")
+    parser.add_argument("--clearml", dest="clearml", action="store_true", help="Log to ClearML")
     parser.add_argument("--push-to-hub", dest="push_to_hub", action="store_true", help="Push to Huggingface Hub")
     parser.add_argument(
         "--pretrained",

From e182ae7a005e4c0c5ebb3da3f0b52b3ea4118a5d Mon Sep 17 00:00:00 2001
From: Olivier Dulcy <olivier.dulcy@mindee.co>
Date: Tue, 14 Jan 2025 16:19:26 +0100
Subject: [PATCH 30/39] add boto3

---
 references/requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/references/requirements.txt b/references/requirements.txt
index 90e24543d..84d95095b 100644
--- a/references/requirements.txt
+++ b/references/requirements.txt
@@ -1,6 +1,7 @@
 -e .
 tqdm
 slack-sdk
+boto3>=1.9
 wandb>=0.10.31
 clearml>=1.11.1
 matplotlib>=3.1.0

From 6494f87a8b0b862cbc7597941db7abd068b9b567 Mon Sep 17 00:00:00 2001
From: Olivier Dulcy <olivier.dulcy@mindee.co>
Date: Tue, 14 Jan 2025 17:02:40 +0100
Subject: [PATCH 31/39] hoist W&B `config` dict into a shared variable in detection/train_pytorch.py

---
 references/detection/train_pytorch.py | 33 ++++++++++++++-------------
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/references/detection/train_pytorch.py b/references/detection/train_pytorch.py
index bcfdfee5a..7ef20f5dd 100644
--- a/references/detection/train_pytorch.py
+++ b/references/detection/train_pytorch.py
@@ -470,6 +470,22 @@ def main(args):
     # Training monitoring
     current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
     exp_name = f"{args.arch}_{current_time}" if args.name is None else args.name
+    config = {
+        "learning_rate": args.lr,
+        "epochs": args.epochs,
+        "weight_decay": args.weight_decay,
+        "batch_size": args.batch_size,
+        "architecture": args.arch,
+        "input_size": args.input_size,
+        "optimizer": args.optim,
+        "framework": "pytorch",
+        "scheduler": args.sched,
+        "train_hash": train_hash,
+        "val_hash": val_hash,
+        "pretrained": args.pretrained,
+        "rotation": args.rotation,
+        "amp": args.amp,
+    }
 
     # W&B
     if args.wb:
@@ -478,22 +494,7 @@ def main(args):
         run = wandb.init(
             name=exp_name,
             project="text-detection",
-            config={
-                "learning_rate": args.lr,
-                "epochs": args.epochs,
-                "weight_decay": args.weight_decay,
-                "batch_size": args.batch_size,
-                "architecture": args.arch,
-                "input_size": args.input_size,
-                "optimizer": args.optim,
-                "framework": "pytorch",
-                "scheduler": args.sched,
-                "train_hash": train_hash,
-                "val_hash": val_hash,
-                "pretrained": args.pretrained,
-                "rotation": args.rotation,
-                "amp": args.amp,
-            },
+            config=config,
         )
 
     # ClearML

From 2a12cfc497ba2e5b2dbbbe9f0719fc76b95d8ccd Mon Sep 17 00:00:00 2001
From: Olivier Dulcy <olivier.dulcy@mindee.co>
Date: Thu, 16 Jan 2025 18:13:53 +0100
Subject: [PATCH 32/39] Grad accumulation - testing

---
 references/recognition/train_pytorch.py | 41 ++++++++++++++-----------
 1 file changed, 23 insertions(+), 18 deletions(-)

diff --git a/references/recognition/train_pytorch.py b/references/recognition/train_pytorch.py
index 70a841dce..ba443684d 100644
--- a/references/recognition/train_pytorch.py
+++ b/references/recognition/train_pytorch.py
@@ -109,7 +109,7 @@ def record_lr(
     return lr_recorder[: len(loss_recorder)], loss_recorder
 
 
-def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=False, clearml_log=False):
+def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, grad_accumulation_steps, amp=False, clearml_log=False):
     if amp:
         scaler = torch.cuda.amp.GradScaler()
 
@@ -121,37 +121,37 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, a
 
     # Iterate over the batches of the dataset
     pbar = tqdm(train_loader, position=1)
-    for images, targets in pbar:
+    for step, (images, targets) in enumerate(pbar):
         if torch.cuda.is_available():
             images = images.cuda()
         images = batch_transforms(images)
 
-        train_loss = model(images, targets)["loss"]
-
-        optimizer.zero_grad()
         if amp:
             with torch.cuda.amp.autocast():
-                train_loss = model(images, targets)["loss"]
+                train_loss = model(images, targets)["loss"] / grad_accumulation_steps
             scaler.scale(train_loss).backward()
             # Gradient clipping
             scaler.unscale_(optimizer)
             torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
-            # Update the params
-            scaler.step(optimizer)
-            scaler.update()
         else:
-            train_loss = model(images, targets)["loss"]
+            train_loss = model(images, targets)["loss"] / grad_accumulation_steps
             train_loss.backward()
             torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
-            optimizer.step()
 
-        scheduler.step()
+        if (step + 1) % grad_accumulation_steps == 0 or step + 1 == len(train_loader):
+            if amp:
+                scaler.step(optimizer)
+                scaler.update()
+            else:
+                optimizer.step()
+            optimizer.zero_grad()
+            scheduler.step()
 
-        pbar.set_description(f"Training loss: {train_loss.item():.6}")
+        pbar.set_description(f"Training loss: {train_loss.item() * grad_accumulation_steps:.6f}")
         if clearml_log:
             global iteration
             logger.report_scalar(
-                title="Training Loss", series="train_loss", value=train_loss.item(), iteration=iteration
+                title="Training Loss", series="train_loss", value=train_loss.item() * grad_accumulation_steps, iteration=iteration
             )
             iteration += 1
 
@@ -376,12 +376,16 @@ def main(args):
         return
 
     # Scheduler
+    # Effective steps per epoch (due to grad accumulation)
+    grad_steps = args.grad_accumulation
+    effective_steps_per_epoch = len(train_loader) // grad_steps
+    total_steps = args.epochs * effective_steps_per_epoch
     if args.sched == "cosine":
-        scheduler = CosineAnnealingLR(optimizer, args.epochs * len(train_loader), eta_min=args.lr / 25e4)
+        scheduler = CosineAnnealingLR(optimizer, total_steps, eta_min=args.lr / 25e4)
     elif args.sched == "onecycle":
-        scheduler = OneCycleLR(optimizer, args.lr, args.epochs * len(train_loader))
+        scheduler = OneCycleLR(optimizer, args.lr, total_steps)
     elif args.sched == "poly":
-        scheduler = PolynomialLR(optimizer, args.epochs * len(train_loader))
+        scheduler = PolynomialLR(optimizer, total_steps)
 
     # Training monitoring
     current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
@@ -429,7 +433,7 @@ def main(args):
         early_stopper = EarlyStopper(patience=args.early_stop_epochs, min_delta=args.early_stop_delta)
     for epoch in range(args.epochs):
         fit_one_epoch(
-            model, train_loader, batch_transforms, optimizer, scheduler, amp=args.amp, clearml_log=args.clearml
+            model, train_loader, batch_transforms, optimizer, scheduler, grad_accumulation_steps=grad_steps, amp=args.amp, clearml_log=args.clearml
         )
 
         # Validation loop at the end of each epoch
@@ -501,6 +505,7 @@ def parse_args():
     parser.add_argument("--name", type=str, default=None, help="Name of your training experiment")
     parser.add_argument("--epochs", type=int, default=10, help="number of epochs to train the model on")
     parser.add_argument("-b", "--batch_size", type=int, default=64, help="batch size for training")
+    parser.add_argument("--grad_accumulation", type=int, default=1, help="gradient accumulation steps")
     parser.add_argument("--device", default=None, type=int, help="device")
     parser.add_argument("--input_size", type=int, default=32, help="input size H for the model, W = 4*H")
     parser.add_argument("--lr", type=float, default=0.001, help="learning rate for the optimizer (Adam or AdamW)")

From 8f5ccf9986c8bf1447e896715e7f04d20dadb1f1 Mon Sep 17 00:00:00 2001
From: Olivier Dulcy <olivier.dulcy@mindee.co>
Date: Thu, 16 Jan 2025 18:31:20 +0100
Subject: [PATCH 33/39] Revert "Grad accumulation - testing"

This reverts commit 2a12cfc497ba2e5b2dbbbe9f0719fc76b95d8ccd.
---
 references/recognition/train_pytorch.py | 41 +++++++++++--------------
 1 file changed, 18 insertions(+), 23 deletions(-)

diff --git a/references/recognition/train_pytorch.py b/references/recognition/train_pytorch.py
index ba443684d..70a841dce 100644
--- a/references/recognition/train_pytorch.py
+++ b/references/recognition/train_pytorch.py
@@ -109,7 +109,7 @@ def record_lr(
     return lr_recorder[: len(loss_recorder)], loss_recorder
 
 
-def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, grad_accumulation_steps, amp=False, clearml_log=False):
+def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=False, clearml_log=False):
     if amp:
         scaler = torch.cuda.amp.GradScaler()
 
@@ -121,37 +121,37 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, g
 
     # Iterate over the batches of the dataset
     pbar = tqdm(train_loader, position=1)
-    for step, (images, targets) in enumerate(pbar):
+    for images, targets in pbar:
         if torch.cuda.is_available():
             images = images.cuda()
         images = batch_transforms(images)
 
+        train_loss = model(images, targets)["loss"]
+
+        optimizer.zero_grad()
         if amp:
             with torch.cuda.amp.autocast():
-                train_loss = model(images, targets)["loss"] / grad_accumulation_steps
+                train_loss = model(images, targets)["loss"]
             scaler.scale(train_loss).backward()
             # Gradient clipping
             scaler.unscale_(optimizer)
             torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
+            # Update the params
+            scaler.step(optimizer)
+            scaler.update()
         else:
-            train_loss = model(images, targets)["loss"] / grad_accumulation_steps
+            train_loss = model(images, targets)["loss"]
             train_loss.backward()
             torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
+            optimizer.step()
 
-        if (step + 1) % grad_accumulation_steps == 0 or step + 1 == len(train_loader):
-            if amp:
-                scaler.step(optimizer)
-                scaler.update()
-            else:
-                optimizer.step()
-            optimizer.zero_grad()
-            scheduler.step()
+        scheduler.step()
 
-        pbar.set_description(f"Training loss: {train_loss.item() * grad_accumulation_steps:.6f}")
+        pbar.set_description(f"Training loss: {train_loss.item():.6}")
         if clearml_log:
             global iteration
             logger.report_scalar(
-                title="Training Loss", series="train_loss", value=train_loss.item() * grad_accumulation_steps, iteration=iteration
+                title="Training Loss", series="train_loss", value=train_loss.item(), iteration=iteration
             )
             iteration += 1
 
@@ -376,16 +376,12 @@ def main(args):
         return
 
     # Scheduler
-    # Effective steps per epoch (due to grad accumulation)
-    grad_steps = args.grad_accumulation
-    effective_steps_per_epoch = len(train_loader) // grad_steps
-    total_steps = args.epochs * effective_steps_per_epoch
     if args.sched == "cosine":
-        scheduler = CosineAnnealingLR(optimizer, total_steps, eta_min=args.lr / 25e4)
+        scheduler = CosineAnnealingLR(optimizer, args.epochs * len(train_loader), eta_min=args.lr / 25e4)
     elif args.sched == "onecycle":
-        scheduler = OneCycleLR(optimizer, args.lr, total_steps)
+        scheduler = OneCycleLR(optimizer, args.lr, args.epochs * len(train_loader))
     elif args.sched == "poly":
-        scheduler = PolynomialLR(optimizer, total_steps)
+        scheduler = PolynomialLR(optimizer, args.epochs * len(train_loader))
 
     # Training monitoring
     current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
@@ -433,7 +429,7 @@ def main(args):
         early_stopper = EarlyStopper(patience=args.early_stop_epochs, min_delta=args.early_stop_delta)
     for epoch in range(args.epochs):
         fit_one_epoch(
-            model, train_loader, batch_transforms, optimizer, scheduler, grad_accumulation_steps=grad_steps, amp=args.amp, clearml_log=args.clearml
+            model, train_loader, batch_transforms, optimizer, scheduler, amp=args.amp, clearml_log=args.clearml
         )
 
         # Validation loop at the end of each epoch
@@ -505,7 +501,6 @@ def parse_args():
     parser.add_argument("--name", type=str, default=None, help="Name of your training experiment")
     parser.add_argument("--epochs", type=int, default=10, help="number of epochs to train the model on")
     parser.add_argument("-b", "--batch_size", type=int, default=64, help="batch size for training")
-    parser.add_argument("--grad_accumulation", type=int, default=1, help="gradient accumulation steps")
     parser.add_argument("--device", default=None, type=int, help="device")
     parser.add_argument("--input_size", type=int, default=32, help="input size H for the model, W = 4*H")
     parser.add_argument("--lr", type=float, default=0.001, help="learning rate for the optimizer (Adam or AdamW)")

From 934730ebf789757f0dcbefa374134d3d99e7ab30 Mon Sep 17 00:00:00 2001
From: Olivier Dulcy <olivier.dulcy@mindee.co>
Date: Thu, 16 Jan 2025 18:34:12 +0100
Subject: [PATCH 34/39] Grad accumulation - testing

---
 references/detection/train_pytorch.py | 41 ++++++++++++++++-----------
 1 file changed, 25 insertions(+), 16 deletions(-)

diff --git a/references/detection/train_pytorch.py b/references/detection/train_pytorch.py
index d6af3f69a..80b05d427 100644
--- a/references/detection/train_pytorch.py
+++ b/references/detection/train_pytorch.py
@@ -119,7 +119,7 @@ def record_lr(
     return lr_recorder[: len(loss_recorder)], loss_recorder
 
 
-def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=False, clearml_log=False):
+def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, grad_accumulation_steps, amp=False, clearml_log=False):
     if amp:
         scaler = torch.cuda.amp.GradScaler()
 
@@ -131,34 +131,38 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, a
 
     # Iterate over the batches of the dataset
     pbar = tqdm(train_loader, position=1)
-    for images, targets in pbar:
+    for step, (images, targets) in enumerate(pbar):
         if torch.cuda.is_available():
             images = images.cuda()
         images = batch_transforms(images)
 
-        optimizer.zero_grad()
         if amp:
             with torch.cuda.amp.autocast():
-                train_loss = model(images, targets)["loss"]
+                train_loss = model(images, targets)["loss"] / grad_accumulation_steps
             scaler.scale(train_loss).backward()
             # Gradient clipping
             scaler.unscale_(optimizer)
             torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
-            # Update the params
-            scaler.step(optimizer)
-            scaler.update()
         else:
-            train_loss = model(images, targets)["loss"]
+            train_loss = model(images, targets)["loss"] / grad_accumulation_steps
             train_loss.backward()
             torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
-            optimizer.step()
 
-        scheduler.step()
-        pbar.set_description(f"Training loss: {train_loss.item():.6}")
+        if (step + 1) % grad_accumulation_steps == 0 or step + 1 == len(train_loader):
+            if amp:
+                scaler.step(optimizer)
+                scaler.update()
+            else:
+                optimizer.step()
+
+            optimizer.zero_grad()
+            scheduler.step()
+
+        pbar.set_description(f"Training loss: {train_loss.item() * grad_accumulation_steps:.6f}")
         if clearml_log:
             global iteration
             logger.report_scalar(
-                title="Training Loss", series="train_loss", value=train_loss.item(), iteration=iteration
+                title="Training Loss", series="train_loss", value=train_loss.item() * grad_accumulation_steps, iteration=iteration
             )
             iteration += 1
     send_on_slack(f"Final training loss: {train_loss.item():.6}")
@@ -471,12 +475,16 @@ def main(args):
         return
 
     # Scheduler
+    # Effective steps per epoch (due to grad accumulation)
+    grad_steps = args.grad_accumulation
+    effective_steps_per_epoch = len(train_loader) // grad_steps
+    total_steps = args.epochs * effective_steps_per_epoch
     if args.sched == "cosine":
-        scheduler = CosineAnnealingLR(optimizer, args.epochs * len(train_loader), eta_min=args.lr / 25e4)
+        scheduler = CosineAnnealingLR(optimizer, total_steps, eta_min=args.lr / 25e4)
     elif args.sched == "onecycle":
-        scheduler = OneCycleLR(optimizer, args.lr, args.epochs * len(train_loader))
+        scheduler = OneCycleLR(optimizer, args.lr, total_steps)
     elif args.sched == "poly":
-        scheduler = PolynomialLR(optimizer, args.epochs * len(train_loader))
+        scheduler = PolynomialLR(optimizer, total_steps)
 
     # Training monitoring
     current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
@@ -525,7 +533,7 @@ def main(args):
     # Training loop
     for epoch in range(args.epochs):
         fit_one_epoch(
-            model, train_loader, batch_transforms, optimizer, scheduler, amp=args.amp, clearml_log=args.clearml
+            model, train_loader, batch_transforms, optimizer, scheduler, grad_steps, amp=args.amp, clearml_log=args.clearml
         )
         # Validation loop at the end of each epoch
         val_loss, recall, precision, mean_iou = evaluate(model, val_loader, batch_transforms, val_metric, amp=args.amp)
@@ -606,6 +614,7 @@ def parse_args():
     parser.add_argument("--name", type=str, default=None, help="Name of your training experiment")
     parser.add_argument("--epochs", type=int, default=10, help="number of epochs to train the model on")
     parser.add_argument("-b", "--batch_size", type=int, default=2, help="batch size for training")
+    parser.add_argument("--grad_accumulation", type=int, default=1, help="gradient accumulation steps")
     parser.add_argument("--device", default=None, type=int, help="device")
     parser.add_argument(
         "--save-interval-epoch", dest="save_interval_epoch", action="store_true", help="Save model every epoch"

From d3bcd09f6ea4a77d1bb6b39cd2d044070e75fd6c Mon Sep 17 00:00:00 2001
From: Olivier Dulcy <olivier.dulcy@mindee.co>
Date: Tue, 21 Jan 2025 16:16:22 +0100
Subject: [PATCH 35/39] `power=0.5` for polynomialLR

---
 references/detection/train_pytorch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/references/detection/train_pytorch.py b/references/detection/train_pytorch.py
index 80b05d427..610cbe2ae 100644
--- a/references/detection/train_pytorch.py
+++ b/references/detection/train_pytorch.py
@@ -484,7 +484,7 @@ def main(args):
     elif args.sched == "onecycle":
         scheduler = OneCycleLR(optimizer, args.lr, total_steps)
     elif args.sched == "poly":
-        scheduler = PolynomialLR(optimizer, total_steps)
+        scheduler = PolynomialLR(optimizer, total_steps, power=0.5)
 
     # Training monitoring
     current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

From b912d8fcd81b37ed7ccc6f7c13604ef18eca3d51 Mon Sep 17 00:00:00 2001
From: Olivier Dulcy <olivier.dulcy@mindee.co>
Date: Thu, 23 Jan 2025 15:37:26 +0100
Subject: [PATCH 36/39] clean branch

---
 .../train_pytorch_orientation.py              | 36 -----------------
 .../train_tensorflow_orientation.py           | 40 -------------------
 references/detection/train_pytorch.py         | 19 +--------
 references/detection/train_tensorflow.py      | 32 ---------------
 4 files changed, 1 insertion(+), 126 deletions(-)

diff --git a/references/classification/train_pytorch_orientation.py b/references/classification/train_pytorch_orientation.py
index beb63c8f5..13df5f843 100644
--- a/references/classification/train_pytorch_orientation.py
+++ b/references/classification/train_pytorch_orientation.py
@@ -38,33 +38,6 @@
 from doctr.models.utils import export_model_to_onnx
 from utils import EarlyStopper, plot_recorder, plot_samples
 
-SLACK_WEBHOOK_URL = None
-SLACK_WEBHOOK_PATH = Path(os.path.join(os.path.expanduser("~"), ".config", "doctr", "slack_webhook_url.txt"))
-if SLACK_WEBHOOK_PATH.exists():
-    with open(SLACK_WEBHOOK_PATH) as f:
-        SLACK_WEBHOOK_URL = f.read().strip()
-else:
-    print(f"{SLACK_WEBHOOK_PATH} does not exist, skip Slack integration configuration...")
-
-
-def send_on_slack(text: str):
-    """Send a message on Slack.
-
-    Args:
-        text (str): message to send on Slack
-    """
-    if SLACK_WEBHOOK_URL:
-        try:
-            import requests
-
-            requests.post(
-                url=SLACK_WEBHOOK_URL,
-                json={"text": f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]: {text}"},
-            )
-        except Exception:
-            print("Impossible to send message on Slack, continue...")
-
-
 CLASSES = [0, -90, 180, 90]
 
 
@@ -195,10 +168,6 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, a
 def evaluate(model, val_loader, batch_transforms, amp=False, log=None):
     # Model in eval mode
     model.eval()
-    last_progress = 0
-    interval_progress = 5
-    pbar = tqdm(val_loader)
-    send_on_slack(str(pbar))
     # Validation loop
     val_loss, correct, samples, batch_cnt = 0.0, 0.0, 0.0, 0.0
     pbar = tqdm(val_loader, dynamic_ncols=True)
@@ -226,11 +195,6 @@ def evaluate(model, val_loader, batch_transforms, amp=False, log=None):
         batch_cnt += 1
         samples += images.shape[0]
 
-        current_progress = pbar.n / pbar.total * 100
-        if current_progress - last_progress > interval_progress:
-            send_on_slack(str(pbar))
-            last_progress = int(current_progress)
-
     val_loss /= batch_cnt
     acc = correct / samples
     return val_loss, acc
diff --git a/references/classification/train_tensorflow_orientation.py b/references/classification/train_tensorflow_orientation.py
index b133ccbfe..87c46b62d 100644
--- a/references/classification/train_tensorflow_orientation.py
+++ b/references/classification/train_tensorflow_orientation.py
@@ -38,33 +38,6 @@
 from doctr.transforms.functional import rotated_img_tensor
 from utils import EarlyStopper, plot_recorder, plot_samples
 
-SLACK_WEBHOOK_URL = None
-SLACK_WEBHOOK_PATH = Path(os.path.join(os.path.expanduser("~"), ".config", "doctr", "slack_webhook_url.txt"))
-if SLACK_WEBHOOK_PATH.exists():
-    with open(SLACK_WEBHOOK_PATH) as f:
-        SLACK_WEBHOOK_URL = f.read().strip()
-else:
-    print(f"{SLACK_WEBHOOK_PATH} does not exist, skip Slack integration configuration...")
-
-
-def send_on_slack(text: str):
-    """Send a message on Slack.
-
-    Args:
-        text (str): message to send on Slack
-    """
-    if SLACK_WEBHOOK_URL:
-        try:
-            import requests
-
-            requests.post(
-                url=SLACK_WEBHOOK_URL,
-                json={"text": f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]: {text}"},
-            )
-        except Exception:
-            print("Impossible to send message on Slack, continue...")
-
-
 CLASSES = [0, -90, 180, 90]
 
 
@@ -168,8 +141,6 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False, l
 
 def evaluate(model, val_loader, batch_transforms, log=None):
     # Validation loop
-    last_progress = 0
-    interval_progress = 5
     val_loss, correct, samples, batch_cnt = 0.0, 0.0, 0.0, 0.0
     val_iter = iter(val_loader)
     pbar = tqdm(val_iter, dynamic_ncols=True)
@@ -187,11 +158,6 @@ def evaluate(model, val_loader, batch_transforms, log=None):
         batch_cnt += 1
         samples += images.shape[0]
 
-        current_progress = pbar.n / pbar.total * 100
-        if current_progress - last_progress > interval_progress:
-            send_on_slack(str(pbar))
-            last_progress = int(current_progress)
-
     val_loss /= batch_cnt
     acc = correct / samples
     return val_loss, acc
@@ -239,9 +205,6 @@ def main(args):
     pbar.write(
         f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {val_loader.num_batches} batches)"
     )
-    send_on_slack(
-        f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {val_loader.num_batches} batches)"
-    )
 
     # Load doctr model
     model = classification.__dict__[args.arch](
@@ -296,9 +259,6 @@ def main(args):
     pbar.write(
         f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {train_loader.num_batches} batches)"
     )
-    send_on_slack(
-        f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {train_loader.num_batches} batches)"
-    )
 
     if args.show_samples:
         x, target = next(iter(train_loader))
diff --git a/references/detection/train_pytorch.py b/references/detection/train_pytorch.py
index 29094d0d9..2d72bdde7 100644
--- a/references/detection/train_pytorch.py
+++ b/references/detection/train_pytorch.py
@@ -4,7 +4,6 @@
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import os
-from pathlib import Path
 
 os.environ["USE_TORCH"] = "1"
 
@@ -13,6 +12,7 @@
 import logging
 import multiprocessing as mp
 import time
+from pathlib import Path
 
 import numpy as np
 import torch
@@ -25,8 +25,6 @@
 else:
     from tqdm.auto import tqdm
 
-from slack_sdk import WebClient
-
 from doctr import transforms as T
 from doctr.datasets import DetectionDataset
 from doctr.models import detection, login_to_hub, push_to_hf_hub
@@ -34,20 +32,6 @@
 from utils import EarlyStopper, plot_recorder, plot_samples
 
 
-def send_on_slack(text: str):
-    """Send a message on Slack.
-
-    Args:
-        text (str): message to send on Slack
-    """
-    if os.getenv("TQDM_SLACK_TOKEN") and os.getenv("TQDM_SLACK_CHANNEL"):
-        client = WebClient(token=os.getenv("TQDM_SLACK_TOKEN"))
-        client.chat_postMessage(
-            channel=os.getenv("TQDM_SLACK_CHANNEL"),
-            text=f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]: {text}",
-        )
-
-
 def record_lr(
     model: torch.nn.Module,
     train_loader: DataLoader,
@@ -163,7 +147,6 @@ def evaluate(model, val_loader, batch_transforms, val_metric, amp=False, log=Non
     model.eval()
     # Reset val metric
     val_metric.reset()
-    pbar = tqdm(val_loader)
     # Validation loop
     val_loss, batch_cnt = 0, 0
     pbar = tqdm(val_loader, dynamic_ncols=True)
diff --git a/references/detection/train_tensorflow.py b/references/detection/train_tensorflow.py
index 2fe3de5bb..dde8d5eb5 100644
--- a/references/detection/train_tensorflow.py
+++ b/references/detection/train_tensorflow.py
@@ -38,32 +38,6 @@
 from doctr.utils.metrics import LocalizationConfusion
 from utils import EarlyStopper, plot_recorder, plot_samples
 
-SLACK_WEBHOOK_URL = None
-SLACK_WEBHOOK_PATH = Path(os.path.join(os.path.expanduser("~"), ".config", "doctr", "slack_webhook_url.txt"))
-if SLACK_WEBHOOK_PATH.exists():
-    with open(SLACK_WEBHOOK_PATH) as f:
-        SLACK_WEBHOOK_URL = f.read().strip()
-else:
-    print(f"{SLACK_WEBHOOK_PATH} does not exist, skip Slack integration configuration...")
-
-
-def send_on_slack(text: str):
-    """Send a message on Slack.
-
-    Args:
-        text (str): message to send on Slack
-    """
-    if SLACK_WEBHOOK_URL:
-        try:
-            import requests
-
-            requests.post(
-                url=SLACK_WEBHOOK_URL,
-                json={"text": f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]: {text}"},
-            )
-        except Exception:
-            print("Impossible to send message on Slack, continue...")
-
 
 def record_lr(
     model: Model,
@@ -222,9 +196,6 @@ def main(args):
     pbar.write(
         f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {val_loader.num_batches} batches)"
     )
-    send_on_slack(
-        f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {val_loader.num_batches} batches)"
-    )
     with open(os.path.join(args.val_path, "labels.json"), "rb") as f:
         val_hash = hashlib.sha256(f.read()).hexdigest()
 
@@ -320,9 +291,6 @@ def main(args):
     pbar.write(
         f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {train_loader.num_batches} batches)"
     )
-    send_on_slack(
-        f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {train_loader.num_batches} batches)"
-    )
     with open(os.path.join(args.train_path, "labels.json"), "rb") as f:
         train_hash = hashlib.sha256(f.read()).hexdigest()
 

From 5548afa982599264564723f92a7e03f9acc4fd61 Mon Sep 17 00:00:00 2001
From: Olivier Dulcy <olivier.dulcy@mindee.co>
Date: Mon, 27 Jan 2025 11:53:33 +0100
Subject: [PATCH 37/39] remove grad accumulation

---
 references/detection/train_pytorch.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/references/detection/train_pytorch.py b/references/detection/train_pytorch.py
index 2d72bdde7..78bf92e97 100644
--- a/references/detection/train_pytorch.py
+++ b/references/detection/train_pytorch.py
@@ -116,17 +116,22 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, a
             images = images.cuda()
         images = batch_transforms(images)
 
+        optimizer.zero_grad()
         if amp:
             with torch.cuda.amp.autocast():
-                train_loss = model(images, targets)["loss"] / grad_accumulation_steps
+                train_loss = model(images, targets)["loss"]
             scaler.scale(train_loss).backward()
             # Gradient clipping
             scaler.unscale_(optimizer)
             torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
+            # Update the params
+            scaler.step(optimizer)
+            scaler.update()
         else:
-            train_loss = model(images, targets)["loss"] / grad_accumulation_steps
+            train_loss = model(images, targets)["loss"]
             train_loss.backward()
             torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
+            optimizer.step()
 
         scheduler.step()
         last_lr = scheduler.get_last_lr()[0]
@@ -444,16 +449,12 @@ def main(args):
         return
 
     # Scheduler
-    # Effective steps per epoch (due to grad accumulation)
-    grad_steps = args.grad_accumulation
-    effective_steps_per_epoch = len(train_loader) // grad_steps
-    total_steps = args.epochs * effective_steps_per_epoch
     if args.sched == "cosine":
-        scheduler = CosineAnnealingLR(optimizer, total_steps, eta_min=args.lr / 25e4)
+        scheduler = CosineAnnealingLR(optimizer, args.epochs * len(train_loader), eta_min=args.lr / 25e4)
     elif args.sched == "onecycle":
-        scheduler = OneCycleLR(optimizer, args.lr, total_steps)
+        scheduler = OneCycleLR(optimizer, args.lr, args.epochs * len(train_loader))
     elif args.sched == "poly":
-        scheduler = PolynomialLR(optimizer, total_steps, power=0.5)
+        scheduler = PolynomialLR(optimizer, args.epochs * len(train_loader))
 
     # Training monitoring
     current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
@@ -608,7 +609,6 @@ def parse_args():
     parser.add_argument("--name", type=str, default=None, help="Name of your training experiment")
     parser.add_argument("--epochs", type=int, default=10, help="number of epochs to train the model on")
     parser.add_argument("-b", "--batch_size", type=int, default=2, help="batch size for training")
-    parser.add_argument("--grad_accumulation", type=int, default=1, help="gradient accumulation steps")
     parser.add_argument("--device", default=None, type=int, help="device")
     parser.add_argument(
         "--save-interval-epoch", dest="save_interval_epoch", action="store_true", help="Save model every epoch"

From 1162d8b89b80ef3d53ea57c3449b1bf975eef806 Mon Sep 17 00:00:00 2001
From: Olivier Dulcy <olivier.dulcy@mindee.co>
Date: Mon, 27 Jan 2025 12:11:39 +0100
Subject: [PATCH 38/39] enable tqdm (set disable=False)

---
 references/detection/train_pytorch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/references/detection/train_pytorch.py b/references/detection/train_pytorch.py
index 78bf92e97..d33734821 100644
--- a/references/detection/train_pytorch.py
+++ b/references/detection/train_pytorch.py
@@ -185,7 +185,7 @@ def evaluate(model, val_loader, batch_transforms, val_metric, amp=False, log=Non
 
 
 def main(args):
-    pbar = tqdm(disable=True)
+    pbar = tqdm(disable=False)
     pbar.write(str(args))
 
     if args.push_to_hub:

From 185ce11f61001a69e88c71490013f82161b5aae4 Mon Sep 17 00:00:00 2001
From: Olivier Dulcy <olivier.dulcy@mindee.co>
Date: Mon, 27 Jan 2025 15:10:36 +0100
Subject: [PATCH 39/39] enable pbar.write

---
 references/detection/train_pytorch.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/references/detection/train_pytorch.py b/references/detection/train_pytorch.py
index d33734821..b7a49db21 100644
--- a/references/detection/train_pytorch.py
+++ b/references/detection/train_pytorch.py
@@ -186,6 +186,9 @@ def evaluate(model, val_loader, batch_transforms, val_metric, amp=False, log=Non
 
 def main(args):
     pbar = tqdm(disable=False)
+    # Monkey patch tqdm write method to send messages directly to Slack
+    if os.getenv("TQDM_SLACK_TOKEN") and os.getenv("TQDM_SLACK_CHANNEL"):
+        pbar.write = lambda msg: pbar.sio.client.chat_postMessage(channel=os.getenv("TQDM_SLACK_CHANNEL"), text=msg)
     pbar.write(str(args))
 
     if args.push_to_hub: